diff --git a/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.cpp b/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.cpp index 4b6cd7b8dd66..815f28626940 100644 --- a/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.cpp +++ b/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.cpp @@ -664,9 +664,7 @@ static LogicalResult populateCanonicalOffsetsSizesAndStrides( llvm::zip_equal(subgroupLayout.outer, subgroupLayout.thread, subgroupLayout.element)) { if (outer != 1) { - // TODO: Support this case. Might need a reshape since this makes the - // slice non-contigious. - return failure(); + rankReducedShape.push_back(outer); } rankReducedShape.push_back(thread * element); } @@ -690,6 +688,7 @@ static LogicalResult populateCanonicalOffsetsSizesAndStrides( subgroupLayout.element)) { if (dimSize == 1) { vtids.push_back(zero); + continue; } // ((tid floordiv stride) mod size) * element. @@ -702,7 +701,12 @@ static LogicalResult populateCanonicalOffsetsSizesAndStrides( } int64_t idx = 0; - for (int64_t element : subgroupLayout.element) { + for (auto [element, outer] : + llvm::zip_equal(subgroupLayout.element, subgroupLayout.outer)) { + if (outer != 1) { + canonicalSizes.push_back(builder.getIndexAttr(outer)); + canonicalOffsets.push_back(zero); + } canonicalSizes.push_back(builder.getIndexAttr(element)); canonicalOffsets.push_back(vtids[idx++]); } @@ -716,13 +720,6 @@ LogicalResult MMAAttr::populateOperandOffsetsSizesStrides( Value laneId, ArrayRef permutation, SmallVector &offsets, SmallVector &sizes, SmallVector &strides) const { - switch (getIntrinsic().getValue()) { - case MMAIntrinsic::MFMA_F32_16x16x16_F16: - case MMAIntrinsic::MFMA_I32_16x16x32_I8: - break; - default: - return failure(); - } MMAAttr::SingleSubgroupLayout subgroupLayout; switch (fragment) { @@ -758,47 +755,33 @@ LogicalResult MMAAttr::materializeOperandConcreteShape( std::optional> permutation, SmallVector &reassociations, RankedTensorType &resultType) const { - OpaqueMmaLayout opaqueLayout = - getOpaqueMFMALayout(operand.getContext(), getIntrinsic().getValue()); - // TODO(Max191): The `getConcreteMFMALayout` function creates some - // `PerDimLayoutAttr` that are not used by this function. This means that - // any pass that uses `materializeOperandConcreteShape` needs to be - // dependent on the VectorExt dialect. Ideally, the `getConcreteMFMALayout` - // function should be refactored so we can reuse the shape information of - // the layout without needing to create any `PerDimLayoutAttr`. 
- ConcreteMmaLayout layout = - getConcreteMFMALayout(operand.getContext(), getIntrinsic().getValue()); - SmallVector> concreteSizes; + + SmallVector outerSizes; SmallVector opaqueSizes; + auto [m, n, k] = getMNKShape(); switch (fragment) { case IREE::GPU::MMAFragment::Lhs: { - concreteSizes.push_back(layout.aMLayout.getShapes()); - concreteSizes.push_back(layout.aKLayout.getShapes()); - opaqueSizes.push_back(opaqueLayout.mSize); - opaqueSizes.push_back(opaqueLayout.kSize); + outerSizes = getASingleSubgroupLayout().outer; + opaqueSizes.append({m, k}); break; } case IREE::GPU::MMAFragment::Rhs: { - concreteSizes.push_back(layout.bKLayout.getShapes()); - concreteSizes.push_back(layout.bNLayout.getShapes()); - opaqueSizes.push_back(opaqueLayout.kSize); - opaqueSizes.push_back(opaqueLayout.nSize); + outerSizes = getBSingleSubgroupLayout().outer; + opaqueSizes.append({k, n}); break; } case IREE::GPU::MMAFragment::Acc: { - concreteSizes.push_back(layout.cMLayout.getShapes()); - concreteSizes.push_back(layout.cNLayout.getShapes()); - opaqueSizes.push_back(opaqueLayout.mSize); - opaqueSizes.push_back(opaqueLayout.nSize); + outerSizes = getCSingleSubgroupLayout().outer; + opaqueSizes.append({m, n}); break; } } if (permutation.has_value()) { - if (permutation.value().size() != opaqueSizes.size()) { + if (permutation.value().size() != outerSizes.size()) { return failure(); } - applyPermutationToVector(concreteSizes, permutation.value()); applyPermutationToVector(opaqueSizes, permutation.value()); + applyPermutationToVector(outerSizes, permutation.value()); } // Inner tile must have sizes matching the opaque layout. @@ -819,11 +802,23 @@ LogicalResult MMAAttr::materializeOperandConcreteShape( return ReassociationIndices({idx}); }); int idx = reInds.size(); - for (ArrayRef sizes : concreteSizes) { - resultShape.append(SmallVector(sizes)); - reInds.push_back( - llvm::to_vector(llvm::seq(idx, idx + sizes.size()))); - idx += sizes.size(); + for (auto [outer, native] : llvm::zip_equal(outerSizes, opaqueSizes)) { + // Skip expansion if the outer dim is unit as the SingleSubgroupLayout gives + // a guarantee that the |element| counts are contiguous within the layout, + // and a unit outer implies a single offset and size for that dimension. + if (outer == 1) { + resultShape.push_back(native); + reInds.push_back(ReassociationIndices({idx++})); + continue; + } + + // Reshape to [outer, native / outer] == [outer, thread * element]. This + // corresponds to |outer| repetitions of the thread/element sublayout. + resultShape.push_back(outer); + assert(native % outer == 0 && "invalid mma layout"); + resultShape.push_back(native / outer); + reInds.push_back(ReassociationIndices{idx, idx + 1}); + idx += 2; } reassociations = reInds; diff --git a/compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils/ConfigUtils.cpp b/compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils/ConfigUtils.cpp index 2f5a48b1d986..143cba102c94 100644 --- a/compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils/ConfigUtils.cpp +++ b/compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils/ConfigUtils.cpp @@ -73,16 +73,7 @@ LogicalResult setMatmulLoweringConfig(IREE::GPU::TargetAttr target, lhsElemType, rhsElemType, initElemType}; SmallVector intrinsics; - SmallVector supportedMmas; for (IREE::GPU::MMAAttr mma : target.getWgp().getMma()) { - IREE::GPU::MMAIntrinsic type = mma.getIntrinsic().getValue(); - // TODO: Drop this once all intrinsics are supported. 
- if (type != IREE::GPU::MMAIntrinsic::MFMA_F32_16x16x16_F16 && - type != IREE::GPU::MMAIntrinsic::MFMA_I32_16x16x32_I8) { - continue; - } - supportedMmas.push_back(mma); - auto [mSize, nSize, kSize] = mma.getMNKShape(); auto [aType, bType, cType] = mma.getABCElementTypes(); if (mma.getSubgroupSize() != targetSubgroupSize) @@ -185,7 +176,8 @@ LogicalResult setMatmulLoweringConfig(IREE::GPU::TargetAttr target, // Similarly the reduction tile size is just the post-packing tile count. reductionTileSizes[kDim] = schedule->kTileCount; - IREE::GPU::MmaInterfaceAttr mmaKind = supportedMmas[schedule->index]; + IREE::GPU::MmaInterfaceAttr mmaKind = + target.getWgp().getMma()[schedule->index]; // Attach the MMA schedule as an attribute to the entry point export function // for later access in the pipeline. diff --git a/compiler/src/iree/compiler/Codegen/Dialect/GPU/Transforms/ConcretizeMmaShapes.cpp b/compiler/src/iree/compiler/Codegen/Dialect/GPU/Transforms/ConcretizeMmaShapes.cpp index 9910840bc694..94bcf3dbe593 100644 --- a/compiler/src/iree/compiler/Codegen/Dialect/GPU/Transforms/ConcretizeMmaShapes.cpp +++ b/compiler/src/iree/compiler/Codegen/Dialect/GPU/Transforms/ConcretizeMmaShapes.cpp @@ -9,7 +9,6 @@ #include "iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUOps.h" #include "iree/compiler/Codegen/Dialect/GPU/Transforms/Passes.h" #include "iree/compiler/Codegen/Dialect/GPU/Transforms/Transforms.h" -#include "iree/compiler/Codegen/Dialect/VectorExt/IR/VectorExtDialect.h" #include "mlir/Dialect/Tensor/IR/Tensor.h" #include "mlir/Transforms/GreedyPatternRewriteDriver.h" @@ -66,6 +65,13 @@ struct ConcretizeMmaOperandShape final : OpRewritePattern { return failure(); } + // Early exit if the operand is unaffected. + if (llvm::all_of(reassociations, [](ReassociationIndices reassoc) { + return reassoc.size() == 1; + })) { + return failure(); + } + // Create the expand_shape. 
Location loc = mmaOp->getLoc(); Value concreteOperand = rewriter diff --git a/compiler/src/iree/compiler/Codegen/Dialect/GPU/Transforms/Passes.td b/compiler/src/iree/compiler/Codegen/Dialect/GPU/Transforms/Passes.td index a882b835e4d2..a6eb9737611e 100644 --- a/compiler/src/iree/compiler/Codegen/Dialect/GPU/Transforms/Passes.td +++ b/compiler/src/iree/compiler/Codegen/Dialect/GPU/Transforms/Passes.td @@ -16,6 +16,7 @@ def DistributeMmaToLanesPass : "::mlir::arith::ArithDialect", "::mlir::affine::AffineDialect", "::mlir::scf::SCFDialect", + "::mlir::tensor::TensorDialect", ]; } @@ -25,7 +26,6 @@ def ConcretizeMmaShapesPass : let dependentDialects = [ "::mlir::tensor::TensorDialect", "::mlir::iree_compiler::IREE::GPU::IREEGPUDialect", - "::mlir::iree_compiler::IREE::VectorExt::IREEVectorExtDialect", ]; let options = [ Option<"concretizeInputs", "concretize-inputs", diff --git a/compiler/src/iree/compiler/Codegen/Dialect/GPU/Transforms/Transforms.cpp b/compiler/src/iree/compiler/Codegen/Dialect/GPU/Transforms/Transforms.cpp index 45d8ad188c5b..7fd46e4c130a 100644 --- a/compiler/src/iree/compiler/Codegen/Dialect/GPU/Transforms/Transforms.cpp +++ b/compiler/src/iree/compiler/Codegen/Dialect/GPU/Transforms/Transforms.cpp @@ -463,7 +463,8 @@ convertContractionToMultiMma(RewriterBase &rewriter, linalg::LinalgOp linalgOp, FailureOr distributeMultiMmaOp(RewriterBase &rewriter, IREE::GPU::MultiMmaOp mmaOp) { if (!mmaOp.hasTensorSemantics() || mmaOp.hasThreadSemantics()) { - return failure(); + return rewriter.notifyMatchFailure( + mmaOp, "mmaOp must have vector and subgroup for distribution."); } OpBuilder::InsertionGuard g(rewriter); @@ -508,7 +509,7 @@ FailureOr distributeMultiMmaOp(RewriterBase &rewriter, if (failed(mmaOp.getKind().populateOperandOffsetsSizesStrides( rewriter, loc, IREE::GPU::MMAFragment::Lhs, laneId, lhsPermutation, lhsOffsets, lhsSizes, lhsStrides))) { - return failure(); + return mmaOp->emitOpError("failed to populate lhs offsets"); } // Extract the rank-reduced slice of the lhs based on the expected inner // vector shape. @@ -528,7 +529,7 @@ FailureOr distributeMultiMmaOp(RewriterBase &rewriter, if (failed(mmaOp.getKind().populateOperandOffsetsSizesStrides( rewriter, loc, IREE::GPU::MMAFragment::Rhs, laneId, rhsPermutation, rhsOffsets, rhsSizes, rhsStrides))) { - return failure(); + return mmaOp->emitOpError("failed to populate rhs offsets"); } // Extract the rank-reduced slice of the rhs based on the expected inner // vector shape. @@ -548,7 +549,7 @@ FailureOr distributeMultiMmaOp(RewriterBase &rewriter, if (failed(mmaOp.getKind().populateOperandOffsetsSizesStrides( rewriter, loc, IREE::GPU::MMAFragment::Acc, laneId, accPermutation, accOffsets, accSizes, accStrides))) { - return failure(); + return mmaOp->emitOpError("failed to populate acc offsets"); } // Extract the rank-reduced slice of the accumulator based on the expected // inner vector shape. 
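For reference, the per-lane index math used by populateCanonicalOffsetsSizesAndStrides above can be mirrored outside MLIR. The following standalone C++ sketch is illustrative only and not part of the patch: the SingleSubgroupLayout struct, the sliceForLane helper, and the concrete layout values (outer = [4, 1], thread = [2, 32], tstrides = [32, 1], element = [4, 1]) are assumptions inferred from the 32x32 accumulator tests below. It reproduces the ((tid floordiv stride) mod size) * element offsets and the extra leading slice dimension emitted when an outer count is non-unit.

// Standalone illustration (not IREE code) of the per-lane slice computation.
#include <cstdint>
#include <iostream>
#include <utility>
#include <vector>

// Assumed mirror of MMAAttr::SingleSubgroupLayout for this sketch.
struct SingleSubgroupLayout {
  std::vector<int64_t> outer, thread, tstrides, element;
};

// Returns the (offsets, sizes) of the slice owned by `laneId`, matching the
// expanded [outer, thread * element] shape of each distributed dimension.
static std::pair<std::vector<int64_t>, std::vector<int64_t>>
sliceForLane(const SingleSubgroupLayout &layout, int64_t laneId) {
  std::vector<int64_t> offsets, sizes;
  for (size_t i = 0; i < layout.thread.size(); ++i) {
    // Non-unit outer counts get their own dimension covering all |outer|
    // repetitions of the thread/element sublayout, starting at offset zero.
    if (layout.outer[i] != 1) {
      offsets.push_back(0);
      sizes.push_back(layout.outer[i]);
    }
    // ((laneId floordiv stride) mod size) * element, with unit thread dims
    // pinned to offset zero.
    int64_t vtid = layout.thread[i] == 1
                       ? 0
                       : ((laneId / layout.tstrides[i]) % layout.thread[i]) *
                             layout.element[i];
    offsets.push_back(vtid);
    sizes.push_back(layout.element[i]);
  }
  return {offsets, sizes};
}

int main() {
  // Assumed 32x32 accumulator layout: outer, thread, tstrides, element.
  SingleSubgroupLayout acc32x32 = {{4, 1}, {2, 32}, {32, 1}, {4, 1}};
  auto [offsets, sizes] = sliceForLane(acc32x32, /*laneId=*/37);
  for (int64_t o : offsets) std::cout << o << ' ';
  std::cout << "| ";
  for (int64_t s : sizes) std::cout << s << ' ';
  std::cout << '\n'; // Prints "0 4 5 | 4 4 1".
  return 0;
}

For lane 37 this yields offsets [0, 4, 5] and sizes [4, 4, 1], matching the [4, 4, 1] accumulator slice extracted at [0, %IDZ, %IDX] in the distribute_mma_to_lanes.mlir tests below.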
diff --git a/compiler/src/iree/compiler/Codegen/Dialect/GPU/Transforms/test/concretize_mma_shapes.mlir b/compiler/src/iree/compiler/Codegen/Dialect/GPU/Transforms/test/concretize_mma_shapes.mlir index 990bfea08d6d..facbb846efee 100644 --- a/compiler/src/iree/compiler/Codegen/Dialect/GPU/Transforms/test/concretize_mma_shapes.mlir +++ b/compiler/src/iree/compiler/Codegen/Dialect/GPU/Transforms/test/concretize_mma_shapes.mlir @@ -21,19 +21,15 @@ func.func @concretize_multi_mma_F32_16x16x16_F16(%lhs: tensor<2x2x16x16xf16>, %r // CHECK-SAME: %[[RHS:[A-Za-z0-9]+]]: tensor<2x2x16x16xf16> // CHECK-SAME: %[[ACC:[A-Za-z0-9]+]]: tensor<2x2x16x16xf32> -// CHECK-INPUTS-DAG: %[[EXPANDED_LHS:.+]] = tensor.expand_shape %[[LHS]] {{\[}}[0], [1], [2], [3, 4]] output_shape [2, 2, 16, 4, 4] : tensor<2x2x16x16xf16> into tensor<2x2x16x4x4xf16> -// CHECK-INPUTS-DAG: %[[EXPANDED_RHS:.+]] = tensor.expand_shape %[[RHS]] {{\[}}[0], [1], [2, 3], [4]] output_shape [2, 2, 4, 4, 16] : tensor<2x2x16x16xf16> into tensor<2x2x4x4x16xf16> -// CHECK-INPUTS: %[[MMA:.+]] = iree_gpu.multi_mma %[[EXPANDED_LHS]], %[[EXPANDED_RHS]], %[[ACC]] +// CHECK-INPUTS: %[[MMA:.+]] = iree_gpu.multi_mma %[[LHS]], %[[RHS]], %[[ACC]] // CHECK-INPUTS-SAME: lowering_config = #iree_gpu.lowering_config -// CHECK-INPUTS-SAME: : tensor<2x2x16x4x4xf16>, tensor<2x2x4x4x16xf16> into tensor<2x2x16x16xf32> +// CHECK-INPUTS-SAME: : tensor<2x2x16x16xf16>, tensor<2x2x16x16xf16> into tensor<2x2x16x16xf32> // CHECK-INPUTS: return %[[MMA]] -// CHECK-RESULT-DAG: %[[EXPANDED_ACC:.+]] = tensor.expand_shape %[[ACC]] {{\[}}[0], [1], [2, 3], [4]] output_shape [2, 2, 4, 4, 16] : tensor<2x2x16x16xf32> into tensor<2x2x4x4x16xf32> -// CHECK-RESULT: %[[MMA:.+]] = iree_gpu.multi_mma %[[LHS]], %[[RHS]], %[[EXPANDED_ACC]] +// CHECK-RESULT: %[[MMA:.+]] = iree_gpu.multi_mma %[[LHS]], %[[RHS]], %[[ACC]] // CHECK-RESULT-SAME: lowering_config = #iree_gpu.lowering_config -// CHECK-RESULT-SAME: : tensor<2x2x16x16xf16>, tensor<2x2x16x16xf16> into tensor<2x2x4x4x16xf32> -// CHECK-RESULT: %[[COLLAPSED:.+]] = tensor.collapse_shape %[[MMA]] {{\[}}[0], [1], [2, 3], [4]] : tensor<2x2x4x4x16xf32> into tensor<2x2x16x16xf32> -// CHECK-RESULT: return %[[COLLAPSED]] +// CHECK-RESULT-SAME: : tensor<2x2x16x16xf16>, tensor<2x2x16x16xf16> into tensor<2x2x16x16xf32> +// CHECK-RESULT: return %[[MMA]] // ----- @@ -58,20 +54,16 @@ func.func @concretize_multi_mma_I32_16x16x32_I8(%lhs: tensor<2x2x16x32xi8>, %rhs // CHECK-SAME: %[[RHS:[A-Za-z0-9]+]]: tensor<2x2x16x32xi8> // CHECK-SAME: %[[ACC:[A-Za-z0-9]+]]: tensor<2x2x16x16xi32> -// CHECK-INPUTS-DAG: %[[EXPANDED_LHS:.+]] = tensor.expand_shape %[[LHS]] {{\[}}[0], [1], [2], [3, 4]] output_shape [2, 2, 16, 4, 8] : tensor<2x2x16x32xi8> into tensor<2x2x16x4x8xi8> -// CHECK-INPUTS-DAG: %[[EXPANDED_RHS:.+]] = tensor.expand_shape %[[RHS]] {{\[}}[0], [1], [2], [3, 4]] output_shape [2, 2, 16, 4, 8] : tensor<2x2x16x32xi8> into tensor<2x2x16x4x8xi8> -// CHECK-INPUTS: %[[MMA:.+]] = iree_gpu.multi_mma %[[EXPANDED_LHS]], %[[EXPANDED_RHS]], %[[ACC]] +// CHECK-INPUTS: %[[MMA:.+]] = iree_gpu.multi_mma %[[LHS]], %[[RHS]], %[[ACC]] // CHECK-INPUTS-SAME: lowering_config = #iree_gpu.lowering_config -// CHECK-INPUTS-SAME: rhs_permutation = array -// CHECK-INPUTS-SAME: : tensor<2x2x16x4x8xi8>, tensor<2x2x16x4x8xi8> into tensor<2x2x16x16xi32> +// CHECK-INPUTS-SAME: rhs_permutation = array +// CHECK-INPUTS-SAME: : tensor<2x2x16x32xi8>, tensor<2x2x16x32xi8> into tensor<2x2x16x16xi32> // CHECK-INPUTS: return %[[MMA]] -// CHECK-RESULT-DAG: %[[EXPANDED_ACC:.+]] = tensor.expand_shape 
%[[ACC]] {{\[}}[0], [1], [2, 3], [4]] output_shape [2, 2, 4, 4, 16] : tensor<2x2x16x16xi32> into tensor<2x2x4x4x16xi32> -// CHECK-RESULT: %[[MMA:.+]] = iree_gpu.multi_mma %[[LHS]], %[[RHS]], %[[EXPANDED_ACC]] +// CHECK-RESULT: %[[MMA:.+]] = iree_gpu.multi_mma %[[LHS]], %[[RHS]], %[[ACC]] // CHECK-RESULT-SAME: lowering_config = #iree_gpu.lowering_config -// CHECK-RESULT-SAME: : tensor<2x2x16x32xi8>, tensor<2x2x16x32xi8> into tensor<2x2x4x4x16xi32> -// CHECK-RESULT: %[[COLLAPSED:.+]] = tensor.collapse_shape %[[MMA]] {{\[}}[0], [1], [2, 3], [4]] : tensor<2x2x4x4x16xi32> into tensor<2x2x16x16xi32> -// CHECK-RESULT: return %[[COLLAPSED]] +// CHECK-RESULT-SAME: : tensor<2x2x16x32xi8>, tensor<2x2x16x32xi8> into tensor<2x2x16x16xi32> +// CHECK-RESULT: return %[[MMA]] // ----- @@ -95,16 +87,159 @@ func.func @concretize_multi_mma_F32_32x32x8_F16(%lhs: tensor<2x2x32x8xf16>, %rhs // CHECK-SAME: %[[RHS:[A-Za-z0-9]+]]: tensor<2x2x8x32xf16> // CHECK-SAME: %[[ACC:[A-Za-z0-9]+]]: tensor<2x2x32x32xf32> -// CHECK-INPUTS-DAG: %[[EXPANDED_LHS:.+]] = tensor.expand_shape %[[LHS]] {{\[}}[0], [1], [2], [3, 4]] output_shape [2, 2, 32, 2, 4] : tensor<2x2x32x8xf16> into tensor<2x2x32x2x4xf16> -// CHECK-INPUTS-DAG: %[[EXPANDED_RHS:.+]] = tensor.expand_shape %[[RHS]] {{\[}}[0], [1], [2, 3], [4]] output_shape [2, 2, 2, 4, 32] : tensor<2x2x8x32xf16> into tensor<2x2x2x4x32xf16> -// CHECK-INPUTS: %[[MMA:.+]] = iree_gpu.multi_mma %[[EXPANDED_LHS]], %[[EXPANDED_RHS]], %[[ACC]] +// CHECK-INPUTS: %[[MMA:.+]] = iree_gpu.multi_mma %[[LHS]], %[[RHS]], %[[ACC]] // CHECK-INPUTS-SAME: lowering_config = #iree_gpu.lowering_config -// CHECK-INPUTS-SAME: : tensor<2x2x32x2x4xf16>, tensor<2x2x2x4x32xf16> into tensor<2x2x32x32xf32> +// CHECK-INPUTS-SAME: : tensor<2x2x32x8xf16>, tensor<2x2x8x32xf16> into tensor<2x2x32x32xf32> // CHECK-INPUTS: return %[[MMA]] -// CHECK-RESULT-DAG: %[[EXPANDED_ACC:.+]] = tensor.expand_shape %[[ACC]] {{\[}}[0], [1], [2, 3, 4], [5]] output_shape [2, 2, 4, 2, 4, 32] : tensor<2x2x32x32xf32> into tensor<2x2x4x2x4x32xf32> +// CHECK-RESULT-DAG: %[[EXPANDED_ACC:.+]] = tensor.expand_shape %[[ACC]] {{\[}}[0], [1], [2, 3], [4]] output_shape [2, 2, 4, 8, 32] // CHECK-RESULT: %[[MMA:.+]] = iree_gpu.multi_mma %[[LHS]], %[[RHS]], %[[EXPANDED_ACC]] // CHECK-RESULT-SAME: lowering_config = #iree_gpu.lowering_config -// CHECK-RESULT-SAME: : tensor<2x2x32x8xf16>, tensor<2x2x8x32xf16> into tensor<2x2x4x2x4x32xf32> -// CHECK-RESULT: %[[COLLAPSED:.+]] = tensor.collapse_shape %[[MMA]] {{\[}}[0], [1], [2, 3, 4], [5]] : tensor<2x2x4x2x4x32xf32> into tensor<2x2x32x32xf32> +// CHECK-RESULT-SAME: : tensor<2x2x32x8xf16>, tensor<2x2x8x32xf16> into tensor<2x2x4x8x32xf32> +// CHECK-RESULT: %[[COLLAPSED:.+]] = tensor.collapse_shape %[[MMA]] {{\[}}[0], [1], [2, 3], [4]] +// CHECK-RESULT: return %[[COLLAPSED]] + +// ----- + +#contraction_accesses = [ + affine_map<(i, j, k) -> (i, k)>, + affine_map<(i, j, k) -> (k, j)>, + affine_map<(i, j, k) -> (i, j)> +] +#config = #iree_gpu.lowering_config<{workgroup = [64, 64, 0], reduction = [0, 0, 4], thread = [8, 4]}> +func.func @concretize_multi_mma_F32_32x32x8_F16(%lhs: tensor<2x2x32x8xf16>, %rhs: tensor<2x2x8x32xf16>, %acc: tensor<2x2x32x32xf32>) -> tensor<2x2x32x32xf32> { + %0 = iree_gpu.multi_mma %lhs, %rhs, %acc { + indexing_maps = #contraction_accesses, + iterator_types = [#iree_gpu.iterator_type, #iree_gpu.iterator_type, #iree_gpu.iterator_type], + kind = #iree_gpu.mma_layout, lowering_config = #config, + acc_permutation = array + } : tensor<2x2x32x8xf16>, tensor<2x2x8x32xf16> into 
tensor<2x2x32x32xf32> + return %0 : tensor<2x2x32x32xf32> +} + +// CHECK-LABEL: func @concretize_multi_mma_F32_32x32x8_F16 +// CHECK-SAME: %[[LHS:[A-Za-z0-9]+]]: tensor<2x2x32x8xf16> +// CHECK-SAME: %[[RHS:[A-Za-z0-9]+]]: tensor<2x2x8x32xf16> +// CHECK-SAME: %[[ACC:[A-Za-z0-9]+]]: tensor<2x2x32x32xf32> + +// CHECK-RESULT-DAG: %[[EXPANDED_ACC:.+]] = tensor.expand_shape %[[ACC]] {{\[}}[0], [1], [2], [3, 4]] output_shape [2, 2, 32, 4, 8] +// CHECK-RESULT: %[[MMA:.+]] = iree_gpu.multi_mma %[[LHS]], %[[RHS]], %[[EXPANDED_ACC]] +// CHECK-RESULT-SAME: acc_permutation = array +// CHECK-RESULT-SAME: lowering_config = #iree_gpu.lowering_config +// CHECK-RESULT-SAME: : tensor<2x2x32x8xf16>, tensor<2x2x8x32xf16> into tensor<2x2x32x4x8xf32> +// CHECK-RESULT: %[[COLLAPSED:.+]] = tensor.collapse_shape %[[MMA]] {{\[}}[0], [1], [2], [3, 4]] +// CHECK-RESULT: return %[[COLLAPSED]] + +// ----- + +#contraction_accesses = [ + affine_map<() -> ()>, + affine_map<() -> ()>, + affine_map<() -> ()> +] +func.func @concretize_F32_16x16x4_F32(%lhs: tensor<16x4xf32>, %rhs: tensor<4x16xf32>, %acc: tensor<16x16xf32>) -> tensor<16x16xf32> { + %0 = iree_gpu.multi_mma %lhs, %rhs, %acc { + indexing_maps = #contraction_accesses, + iterator_types = [], + kind = #iree_gpu.mma_layout + } : tensor<16x4xf32>, tensor<4x16xf32> into tensor<16x16xf32> + return %0 : tensor<16x16xf32> +} + +// CHECK-LABEL: func @concretize_F32_16x16x4_F32 + +// CHECK-INPUTS-NOT: tensor.expand_shape +// CHECK-INPUTS: %[[MMA:.+]] = iree_gpu.multi_mma +// CHECK-INPUTS: return %[[MMA]] + +// CHECK-RESULT-NOT: tensor.expand_shape +// CHECK-RESULT: %[[MMA:.+]] = iree_gpu.multi_mma +// CHECK-RESULT: return %[[MMA]] + +// ----- + +#contraction_accesses = [ + affine_map<() -> ()>, + affine_map<() -> ()>, + affine_map<() -> ()> +] +func.func @concretize_F32_16x16x32_F8E4M3FNUZ(%lhs: tensor<16x32xf8E4M3FNUZ>, %rhs: tensor<32x16xf8E4M3FNUZ>, %acc: tensor<16x16xf32>) -> tensor<16x16xf32> { + %0 = iree_gpu.multi_mma %lhs, %rhs, %acc { + indexing_maps = #contraction_accesses, + iterator_types = [], + kind = #iree_gpu.mma_layout + } : tensor<16x32xf8E4M3FNUZ>, tensor<32x16xf8E4M3FNUZ> into tensor<16x16xf32> + return %0 : tensor<16x16xf32> +} + +// CHECK-LABEL: func @concretize_F32_16x16x32_F8E4M3FNUZ + +// CHECK-INPUTS-NOT: tensor.expand_shape +// CHECK-INPUTS: %[[MMA:.+]] = iree_gpu.multi_mma +// CHECK-INPUTS: return %[[MMA]] + +// CHECK-RESULT-NOT: tensor.expand_shape +// CHECK-RESULT: %[[MMA:.+]] = iree_gpu.multi_mma +// CHECK-RESULT: return %[[MMA]] + +// ----- + +#contraction_accesses = [ + affine_map<() -> ()>, + affine_map<() -> ()>, + affine_map<() -> ()> +] +func.func @concretize_I32_32x32x16_I8(%lhs: tensor<32x16xi8>, %rhs: tensor<16x32xi8>, %acc: tensor<32x32xi32>) -> tensor<32x32xi32> { + %0 = iree_gpu.multi_mma %lhs, %rhs, %acc { + indexing_maps = #contraction_accesses, + iterator_types = [], + kind = #iree_gpu.mma_layout + } : tensor<32x16xi8>, tensor<16x32xi8> into tensor<32x32xi32> + return %0 : tensor<32x32xi32> +} + +// CHECK-LABEL: func @concretize_I32_32x32x16_I8 +// CHECK-SAME: %[[LHS:[A-Za-z0-9]+]]: tensor<32x16xi8> +// CHECK-SAME: %[[RHS:[A-Za-z0-9]+]]: tensor<16x32xi8> +// CHECK-SAME: %[[ACC:[A-Za-z0-9]+]]: tensor<32x32xi32> + +// CHECK-INPUTS-NOT: tensor.expand_shape +// CHECK-INPUTS: %[[MMA:.+]] = iree_gpu.multi_mma +// CHECK-INPUTS: return %[[MMA]] + +// CHECK-RESULT: %[[EXPANDED_ACC:.+]] = tensor.expand_shape %[[ACC]] {{\[}}[0, 1], [2]] output_shape [4, 8, 32] +// CHECK-RESULT: %[[MMA:.+]] = iree_gpu.multi_mma %[[LHS]], %[[RHS]], 
%[[EXPANDED_ACC]] +// CHECK-RESULT-SAME: : tensor<32x16xi8>, tensor<16x32xi8> into tensor<4x8x32xi32> +// CHECK-RESULT: %[[COLLAPSED:.+]] = tensor.collapse_shape %[[MMA]] {{\[}}[0, 1], [2]] +// CHECK-RESULT: return %[[COLLAPSED]] + +// ----- + +#contraction_accesses = [ + affine_map<() -> ()>, + affine_map<() -> ()>, + affine_map<() -> ()> +] +func.func @concretize_WMMA_F16_16x16x16_F16(%lhs: tensor<16x16xf16>, %rhs: tensor<16x16xf16>, %acc: tensor<16x16xf16>) -> tensor<16x16xf16> { + %0 = iree_gpu.multi_mma %lhs, %rhs, %acc { + indexing_maps = #contraction_accesses, + iterator_types = [], + kind = #iree_gpu.mma_layout + } : tensor<16x16xf16>, tensor<16x16xf16> into tensor<16x16xf16> + return %0 : tensor<16x16xf16> +} + +// CHECK-LABEL: func @concretize_WMMA_F16_16x16x16_F16 +// CHECK-SAME: %[[LHS:[A-Za-z0-9]+]]: tensor<16x16xf16> +// CHECK-SAME: %[[RHS:[A-Za-z0-9]+]]: tensor<16x16xf16> +// CHECK-SAME: %[[ACC:[A-Za-z0-9]+]]: tensor<16x16xf16> + +// CHECK-INPUTS-NOT: tensor.expand_shape +// CHECK-INPUTS: %[[MMA:.+]] = iree_gpu.multi_mma +// CHECK-INPUTS: return %[[MMA]] + +// CHECK-RESULT: %[[EXPANDED_ACC:.+]] = tensor.expand_shape %[[ACC]] {{\[}}[0, 1], [2]] output_shape [8, 2, 16] +// CHECK-RESULT: %[[MMA:.+]] = iree_gpu.multi_mma %[[LHS]], %[[RHS]], %[[EXPANDED_ACC]] +// CHECK-RESULT-SAME: : tensor<16x16xf16>, tensor<16x16xf16> into tensor<8x2x16xf16> +// CHECK-RESULT: %[[COLLAPSED:.+]] = tensor.collapse_shape %[[MMA]] {{\[}}[0, 1], [2]] // CHECK-RESULT: return %[[COLLAPSED]] diff --git a/compiler/src/iree/compiler/Codegen/Dialect/GPU/Transforms/test/distribute_mma_to_lanes.mlir b/compiler/src/iree/compiler/Codegen/Dialect/GPU/Transforms/test/distribute_mma_to_lanes.mlir index 214b432f652c..5569b3b6247d 100644 --- a/compiler/src/iree/compiler/Codegen/Dialect/GPU/Transforms/test/distribute_mma_to_lanes.mlir +++ b/compiler/src/iree/compiler/Codegen/Dialect/GPU/Transforms/test/distribute_mma_to_lanes.mlir @@ -31,3 +31,227 @@ module { // CHECK-SAME: kind = #iree_gpu.mma_layout // CHECK-SAME: : tensor<2x8x1x4xf16>, tensor<8x2x1x4xf16> into tensor<2x2x4x1xf32> // CHECK: mapping = [#iree_gpu.lane_id<0>] + +// ----- + +#contraction_accesses = [ + affine_map<(i, j, k) -> (i, k)>, + affine_map<(i, j, k) -> (k, j)>, + affine_map<(i, j, k) -> (i, j)> +] +module { + func.func @matmul_32x32x8(%arg0: tensor<2x8x32x8xf16>, %arg1: tensor<8x2x32x8xf16>, %arg2: tensor<2x2x4x8x32xf32>) -> tensor<2x2x4x8x32xf32> { + %mm = iree_gpu.multi_mma %arg0, %arg1, %arg2 { + indexing_maps = #contraction_accesses, + iterator_types = [#iree_gpu.iterator_type, #iree_gpu.iterator_type, #iree_gpu.iterator_type], + kind = #iree_gpu.mma_layout, + rhs_permutation = array + } : tensor<2x8x32x8xf16>, tensor<8x2x32x8xf16> into tensor<2x2x4x8x32xf32> + return %mm : tensor<2x2x4x8x32xf32> + } +} + +// CHECK-DAG: #[[$XMAP:.+]] = affine_map<(d0) -> (d0 mod 32)> +// CHECK-DAG: #[[$YMAP:.+]] = affine_map<(d0) -> ((d0 floordiv 32) * 4 - ((d0 floordiv 32) floordiv 2) * 8)> +// CHECK-DAG: #[[$MAP:.+]] = affine_map<(d0, d1, d2) -> (d0, d2)> +// CHECK-DAG: #[[$MAP1:.+]] = affine_map<(d0, d1, d2) -> (d2, d1)> +// CHECK-DAG: #[[$MAP2:.+]] = affine_map<(d0, d1, d2) -> (d0, d1)> + +// CHECK-LABEL: func @matmul_32x32x8 +// CHECK-SAME: %[[LHS:[A-Za-z0-9]+]]: tensor<2x8x32x8xf16> +// CHECK-SAME: %[[RHS:[A-Za-z0-9]+]]: tensor<8x2x32x8xf16> +// CHECK: scf.forall (%[[LANEID:.+]]) in (64) shared_outs(%[[ACC:.+]] = {{.*}}) -> (tensor<2x2x4x8x32xf32>) +// CHECK-DAG: %[[IDX:.+]] = affine.apply #[[$XMAP]](%[[LANEID]]) +// CHECK-DAG: %[[IDY:.+]] = 
affine.apply #[[$YMAP]](%[[LANEID]]) +// CHECK-DAG: %[[LHS_SLICE:.+]] = tensor.extract_slice %[[LHS]][0, 0, %[[IDX]], %[[IDY]]] [2, 8, 1, 4] +// CHECK-DAG: %[[RHS_SLICE:.+]] = tensor.extract_slice %[[RHS]][0, 0, %[[IDX]], %[[IDY]]] [8, 2, 1, 4] +// CHECK-DAG: %[[ACC_SLICE:.+]] = tensor.extract_slice %[[ACC]][0, 0, 0, %[[IDY]], %[[IDX]]] [2, 2, 4, 4, 1] +// CHECK: %[[MMA:.+]] = iree_gpu.multi_mma %[[LHS_SLICE]], %[[RHS_SLICE]], %[[ACC_SLICE]] +// CHECK-SAME: indexing_maps = [#[[$MAP]], #[[$MAP1]], #[[$MAP2]]] +// CHECK-SAME: kind = #iree_gpu.mma_layout +// CHECK-SAME: : tensor<2x8x1x4xf16>, tensor<8x2x1x4xf16> into tensor<2x2x4x4x1xf32> +// CHECK: tensor.parallel_insert_slice %[[MMA]] into %[[ACC]][0, 0, 0, %[[IDY]], %[[IDX]]] [2, 2, 4, 4, 1] +// CHECK: mapping = [#iree_gpu.lane_id<0>] + +// ----- + +#contraction_accesses = [ + affine_map<(i, j, k) -> (i, k)>, + affine_map<(i, j, k) -> (k, j)>, + affine_map<(i, j, k) -> (i, j)> +] +module { + func.func @matmul_wmma_16x16x16(%arg0: tensor<2x8x16x16xf16>, %arg1: tensor<8x2x16x16xf16>, %arg2: tensor<2x2x8x2x16xf32>) -> tensor<2x2x8x2x16xf32> { + %mm = iree_gpu.multi_mma %arg0, %arg1, %arg2 { + indexing_maps = #contraction_accesses, + iterator_types = [#iree_gpu.iterator_type, #iree_gpu.iterator_type, #iree_gpu.iterator_type], + kind = #iree_gpu.mma_layout, + rhs_permutation = array + } : tensor<2x8x16x16xf16>, tensor<8x2x16x16xf16> into tensor<2x2x8x2x16xf32> + return %mm : tensor<2x2x8x2x16xf32> + } +} + +// CHECK-DAG: #[[$XMAP:.+]] = affine_map<(d0) -> (d0 mod 16)> +// CHECK-DAG: #[[$YMAP:.+]] = affine_map<(d0) -> ((d0 floordiv 16) mod 2)> +// CHECK-DAG: #[[$MAP:.+]] = affine_map<(d0, d1, d2) -> (d0, d2)> +// CHECK-DAG: #[[$MAP1:.+]] = affine_map<(d0, d1, d2) -> (d2, d1)> +// CHECK-DAG: #[[$MAP2:.+]] = affine_map<(d0, d1, d2) -> (d0, d1)> + +// CHECK-LABEL: func @matmul_wmma_16x16x16 +// CHECK-SAME: %[[LHS:[A-Za-z0-9]+]]: tensor<2x8x16x16xf16> +// CHECK-SAME: %[[RHS:[A-Za-z0-9]+]]: tensor<8x2x16x16xf16> +// CHECK: scf.forall (%[[LANEID:.+]]) in (32) shared_outs(%[[ACC:.+]] = {{.*}}) -> (tensor<2x2x8x2x16xf32>) +// CHECK-DAG: %[[IDX:.+]] = affine.apply #[[$XMAP]](%[[LANEID]]) +// CHECK-DAG: %[[LHS_SLICE:.+]] = tensor.extract_slice %[[LHS]][0, 0, %[[IDX]], 0] [2, 8, 1, 16] +// CHECK-DAG: %[[RHS_SLICE:.+]] = tensor.extract_slice %[[RHS]][0, 0, %[[IDX]], 0] [8, 2, 1, 16] +// CHECK-DAG: %[[IDY:.+]] = affine.apply #[[$YMAP]](%[[LANEID]]) +// CHECK-DAG: %[[ACC_SLICE:.+]] = tensor.extract_slice %[[ACC]][0, 0, 0, %[[IDY]], %[[IDX]]] [2, 2, 8, 1, 1] +// CHECK: %[[MMA:.+]] = iree_gpu.multi_mma %[[LHS_SLICE]], %[[RHS_SLICE]], %[[ACC_SLICE]] +// CHECK-SAME: indexing_maps = [#[[$MAP]], #[[$MAP1]], #[[$MAP2]]] +// CHECK-SAME: kind = #iree_gpu.mma_layout +// CHECK-SAME: : tensor<2x8x1x16xf16>, tensor<8x2x1x16xf16> into tensor<2x2x8x1x1xf32> +// CHECK: tensor.parallel_insert_slice %[[MMA]] into %[[ACC]][0, 0, 0, %[[IDY]], %[[IDX]]] [2, 2, 8, 1, 1] +// CHECK: mapping = [#iree_gpu.lane_id<0>] + +// ----- + +#contraction_accesses = [ + affine_map<() -> ()>, + affine_map<() -> ()>, + affine_map<() -> ()> +] +func.func @distribute_MFMA_F32_16x16x4_F32(%lhs: tensor<16x4xf32>, %rhs: tensor<4x16xf32>, %acc: tensor<16x16xf32>) -> tensor<16x16xf32> { + %0 = iree_gpu.multi_mma %lhs, %rhs, %acc { + indexing_maps = #contraction_accesses, + iterator_types = [], + kind = #iree_gpu.mma_layout + } : tensor<16x4xf32>, tensor<4x16xf32> into tensor<16x16xf32> + return %0 : tensor<16x16xf32> +} + +// CHECK-DAG: #[[$XMAP:.+]] = affine_map<(d0) -> (d0 mod 16)> +// CHECK-DAG: 
#[[$YMAP:.+]] = affine_map<(d0) -> ((d0 floordiv 16) mod 4)> +// CHECK-DAG: #[[$ZMAP:.+]] = affine_map<(d0) -> ((d0 floordiv 16) * 4 - ((d0 floordiv 16) floordiv 4) * 16)> + +// CHECK-LABEL: func @distribute_MFMA_F32_16x16x4_F32 +// CHECK-SAME: %[[LHS:[A-Za-z0-9]+]]: tensor<16x4xf32> +// CHECK-SAME: %[[RHS:[A-Za-z0-9]+]]: tensor<4x16xf32> +// CHECK: scf.forall (%[[LANEID:.+]]) in (64) shared_outs(%[[ACC:.+]] = {{.*}}) -> (tensor<16x16xf32>) +// CHECK-DAG: %[[IDX:.+]] = affine.apply #[[$XMAP]](%[[LANEID]]) +// CHECK-DAG: %[[IDY:.+]] = affine.apply #[[$YMAP]](%[[LANEID]]) +// CHECK-DAG: %[[LHS_SLICE:.+]] = tensor.extract_slice %[[LHS]][%[[IDX]], %[[IDY]]] [1, 1] +// CHECK-DAG: %[[RHS_SLICE:.+]] = tensor.extract_slice %[[RHS]][%[[IDY]], %[[IDX]]] [1, 1] +// CHECK-DAG: %[[IDZ:.+]] = affine.apply #[[$ZMAP]](%[[LANEID]]) +// CHECK-DAG: %[[ACC_SLICE:.+]] = tensor.extract_slice %[[ACC]][%[[IDZ]], %[[IDX]]] [4, 1] +// CHECK: %[[MMA:.+]] = iree_gpu.multi_mma %[[LHS_SLICE]], %[[RHS_SLICE]], %[[ACC_SLICE]] +// CHECK-SAME: kind = #iree_gpu.mma_layout +// CHECK-SAME: : tensor<1x1xf32>, tensor<1x1xf32> into tensor<4x1xf32> +// CHECK: tensor.parallel_insert_slice %[[MMA]] into %[[ACC]][%[[IDZ]], %[[IDX]]] [4, 1] +// CHECK: mapping = [#iree_gpu.lane_id<0>] + +// ----- + +#contraction_accesses = [ + affine_map<() -> ()>, + affine_map<() -> ()>, + affine_map<() -> ()> +] +func.func @distribute_F32_16x16x32_F8E4M3FNUZ(%lhs: tensor<16x32xf8E4M3FNUZ>, %rhs: tensor<32x16xf8E4M3FNUZ>, %acc: tensor<16x16xf32>) -> tensor<16x16xf32> { + %0 = iree_gpu.multi_mma %lhs, %rhs, %acc { + indexing_maps = #contraction_accesses, + iterator_types = [], + kind = #iree_gpu.mma_layout + } : tensor<16x32xf8E4M3FNUZ>, tensor<32x16xf8E4M3FNUZ> into tensor<16x16xf32> + return %0 : tensor<16x16xf32> +} + +// CHECK-DAG: #[[$XMAP:.+]] = affine_map<(d0) -> (d0 mod 16)> +// CHECK-DAG: #[[$YMAP:.+]] = affine_map<(d0) -> ((d0 floordiv 16) * 8 - ((d0 floordiv 16) floordiv 4) * 32)> +// CHECK-DAG: #[[$ZMAP:.+]] = affine_map<(d0) -> ((d0 floordiv 16) * 4 - ((d0 floordiv 16) floordiv 4) * 16)> + +// CHECK-LABEL: func @distribute_F32_16x16x32_F8E4M3FNUZ +// CHECK-SAME: %[[LHS:[A-Za-z0-9]+]]: tensor<16x32xf8E4M3FNUZ> +// CHECK-SAME: %[[RHS:[A-Za-z0-9]+]]: tensor<32x16xf8E4M3FNUZ> +// CHECK: scf.forall (%[[LANEID:.+]]) in (64) shared_outs(%[[ACC:.+]] = {{.*}}) -> (tensor<16x16xf32>) +// CHECK-DAG: %[[IDX:.+]] = affine.apply #[[$XMAP]](%[[LANEID]]) +// CHECK-DAG: %[[IDY:.+]] = affine.apply #[[$YMAP]](%[[LANEID]]) +// CHECK-DAG: %[[LHS_SLICE:.+]] = tensor.extract_slice %[[LHS]][%[[IDX]], %[[IDY]]] [1, 8] +// CHECK-DAG: %[[RHS_SLICE:.+]] = tensor.extract_slice %[[RHS]][%[[IDY]], %[[IDX]]] [8, 1] +// CHECK-DAG: %[[IDZ:.+]] = affine.apply #[[$ZMAP]](%[[LANEID]]) +// CHECK-DAG: %[[ACC_SLICE:.+]] = tensor.extract_slice %[[ACC]][%[[IDZ]], %[[IDX]]] [4, 1] +// CHECK: %[[MMA:.+]] = iree_gpu.multi_mma %[[LHS_SLICE]], %[[RHS_SLICE]], %[[ACC_SLICE]] +// CHECK-SAME: kind = #iree_gpu.mma_layout +// CHECK-SAME: : tensor<1x8xf8E4M3FNUZ>, tensor<8x1xf8E4M3FNUZ> into tensor<4x1xf32> +// CHECK: tensor.parallel_insert_slice %[[MMA]] into %[[ACC]][%[[IDZ]], %[[IDX]]] [4, 1] +// CHECK: mapping = [#iree_gpu.lane_id<0>] + +// ----- + +#contraction_accesses = [ + affine_map<() -> ()>, + affine_map<() -> ()>, + affine_map<() -> ()> +] +func.func @distribute_I32_32x32x16_I8(%lhs: tensor<32x16xi8>, %rhs: tensor<16x32xi8>, %acc: tensor<4x8x32xi32>) -> tensor<4x8x32xi32> { + %0 = iree_gpu.multi_mma %lhs, %rhs, %acc { + indexing_maps = #contraction_accesses, + iterator_types = 
[], + kind = #iree_gpu.mma_layout + } : tensor<32x16xi8>, tensor<16x32xi8> into tensor<4x8x32xi32> + return %0 : tensor<4x8x32xi32> +} + +// CHECK-DAG: #[[$XMAP:.+]] = affine_map<(d0) -> (d0 mod 32)> +// CHECK-DAG: #[[$YMAP:.+]] = affine_map<(d0) -> ((d0 floordiv 32) * 8 - ((d0 floordiv 32) floordiv 2) * 16)> +// CHECK-DAG: #[[$ZMAP:.+]] = affine_map<(d0) -> ((d0 floordiv 32) * 4 - ((d0 floordiv 32) floordiv 2) * 8)> + +// CHECK-LABEL: func @distribute_I32_32x32x16_I8 +// CHECK-SAME: %[[LHS:[A-Za-z0-9]+]]: tensor<32x16xi8> +// CHECK-SAME: %[[RHS:[A-Za-z0-9]+]]: tensor<16x32xi8> +// CHECK: scf.forall (%[[LANEID:.+]]) in (64) shared_outs(%[[ACC:.+]] = {{.*}}) -> (tensor<4x8x32xi32>) +// CHECK-DAG: %[[IDX:.+]] = affine.apply #[[$XMAP]](%[[LANEID]]) +// CHECK-DAG: %[[IDY:.+]] = affine.apply #[[$YMAP]](%[[LANEID]]) +// CHECK-DAG: %[[LHS_SLICE:.+]] = tensor.extract_slice %[[LHS]][%[[IDX]], %[[IDY]]] [1, 8] +// CHECK-DAG: %[[RHS_SLICE:.+]] = tensor.extract_slice %[[RHS]][%[[IDY]], %[[IDX]]] [8, 1] +// CHECK-DAG: %[[IDZ:.+]] = affine.apply #[[$ZMAP]](%[[LANEID]]) +// CHECK-DAG: %[[ACC_SLICE:.+]] = tensor.extract_slice %[[ACC]][0, %[[IDZ]], %[[IDX]]] [4, 4, 1] +// CHECK: %[[MMA:.+]] = iree_gpu.multi_mma %[[LHS_SLICE]], %[[RHS_SLICE]], %[[ACC_SLICE]] +// CHECK-SAME: kind = #iree_gpu.mma_layout +// CHECK-SAME: : tensor<1x8xi8>, tensor<8x1xi8> into tensor<4x4x1xi32> +// CHECK: tensor.parallel_insert_slice %[[MMA]] into %[[ACC]][0, %[[IDZ]], %[[IDX]]] [4, 4, 1] +// CHECK: mapping = [#iree_gpu.lane_id<0>] + +// ----- + +#contraction_accesses = [ + affine_map<() -> ()>, + affine_map<() -> ()>, + affine_map<() -> ()> +] +func.func @distribute_WMMA_F16_16x16x16_F16(%lhs: tensor<16x16xf16>, %rhs: tensor<16x16xf16>, %acc: tensor<8x2x16xf16>) -> tensor<8x2x16xf16> { + %0 = iree_gpu.multi_mma %lhs, %rhs, %acc { + indexing_maps = #contraction_accesses, + iterator_types = [], + kind = #iree_gpu.mma_layout + } : tensor<16x16xf16>, tensor<16x16xf16> into tensor<8x2x16xf16> + return %0 : tensor<8x2x16xf16> +} + +// CHECK-DAG: #[[$XMAP:.+]] = affine_map<(d0) -> (d0 mod 16)> +// CHECK-DAG: #[[$YMAP:.+]] = affine_map<(d0) -> ((d0 floordiv 16) mod 2)> + +// CHECK-LABEL: func @distribute_WMMA_F16_16x16x16_F16 +// CHECK-SAME: %[[LHS:[A-Za-z0-9]+]]: tensor<16x16xf16> +// CHECK-SAME: %[[RHS:[A-Za-z0-9]+]]: tensor<16x16xf16> +// CHECK: scf.forall (%[[LANEID:.+]]) in (32) shared_outs(%[[ACC:.+]] = {{.*}}) -> (tensor<8x2x16xf16>) +// CHECK-DAG: %[[IDX:.+]] = affine.apply #[[$XMAP]](%[[LANEID]]) +// CHECK-DAG: %[[LHS_SLICE:.+]] = tensor.extract_slice %[[LHS]][%[[IDX]], 0] [1, 16] +// CHECK-DAG: %[[RHS_SLICE:.+]] = tensor.extract_slice %[[RHS]][0, %[[IDX]]] [16, 1] +// CHECK-DAG: %[[IDY:.+]] = affine.apply #[[$YMAP]](%[[LANEID]]) +// CHECK-DAG: %[[ACC_SLICE:.+]] = tensor.extract_slice %[[ACC]][0, %[[IDY]], %[[IDX]]] [8, 1, 1] +// CHECK: %[[MMA:.+]] = iree_gpu.multi_mma %[[LHS_SLICE]], %[[RHS_SLICE]], %[[ACC_SLICE]] +// CHECK-SAME: kind = #iree_gpu.mma_layout +// CHECK-SAME: : tensor<1x16xf16>, tensor<16x1xf16> into tensor<8x1x1xf16> +// CHECK: tensor.parallel_insert_slice %[[MMA]] into %[[ACC]][0, %[[IDY]], %[[IDX]]] [8, 1, 1] +// CHECK: mapping = [#iree_gpu.lane_id<0>] diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.cpp index 8b74d1b4a3d4..68d1a4f6177c 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.cpp +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.cpp @@ -306,6 +306,14 @@ void addGPUTileAndFusePassPipeline(OpPassManager 
&funcPassManager) { funcPassManager.addPass(createGPUPromoteMatmulOperandsPass()); funcPassManager.addPass(IREE::GPU::createPackToIntrinsicsPass()); + // Step 1.5. Expand result shapes of MultiMmaOps before reduction tiling. + { + IREE::GPU::ConcretizeMmaShapesPassOptions options; + options.concretizeInputs = false; + options.concretizeResult = true; + funcPassManager.addPass(IREE::GPU::createConcretizeMmaShapesPass()); + } + // Step 2. Tile and fuse tileable ops to reduction loops. { GPUApplyTilingLevelPassOptions options; @@ -315,16 +323,26 @@ void addGPUTileAndFusePassPipeline(OpPassManager &funcPassManager) { funcPassManager.addPass(createCSEPass()); } - // Decompose pack and unpack ops and propagte the resulting reshapes. + // Step 3. Decompose pack and unpack ops and propagate the resulting reshapes. funcPassManager.addPass( createDecomposePackUnPackOpsPass(/*tileOuterToOne=*/false)); + + // Step 3.5. Expand the inner dimensions of MultiMma ops in preparation for + // distribution to lanes. + { + IREE::GPU::ConcretizeMmaShapesPassOptions options; + options.concretizeInputs = true; + options.concretizeResult = false; + funcPassManager.addPass(IREE::GPU::createConcretizeMmaShapesPass()); + } + funcPassManager.addPass(createPropagateReshapesByExpansionPass()); funcPassManager.addPass(createCanonicalizerPass()); funcPassManager.addPass(createCSEPass()); funcPassManager.addPass(createConvertToDestinationPassingStylePass( /*useWARForCooperativeMatrixCodegen=*/false)); - // Step 3. Tile and fuse tileable ops to subgroups/threads. + // Step 4. Tile and fuse tileable ops to subgroups/threads. { GPUApplyTilingLevelPassOptions options; options.tilingLevel = IREE::GPU::TilingLevel::Thread; @@ -347,35 +365,35 @@ void addGPUTileAndFusePassPipeline(OpPassManager &funcPassManager) { funcPassManager.addPass(createCSEPass()); funcPassManager.addPass(createLoopInvariantCodeMotionPass()); - // Step 4. Greedily fuse parallel loops and hoist from serial loops. + // Step 5. Greedily fuse parallel loops and hoist from serial loops. funcPassManager.addPass(IREE::GPU::createFuseAndHoistParallelLoopsPass()); funcPassManager.addPass(createCanonicalizerPass()); funcPassManager.addPass(createCSEPass()); funcPassManager.addPass(createLoopInvariantCodeMotionPass()); - // Step 5. Lower special ops and vectorize. + // Step 6. Lower special ops and vectorize. funcPassManager.addPass(IREE::GPU::createVectorizeIREEGPUOpsPass()); addGPUVectorizationPasses(funcPassManager); funcPassManager.addPass(createCleanupBufferAllocViewPass()); - // Step 6. Bufferize. + // Step 7. Bufferize. // TODO: This is a workaround for a bug in the lowering of // `iree_gpu.shuffle_tensor` which does not properly represent the concurrent // nature of the write to the intermediate tensor. addBufferizePasses(funcPassManager, /*allowPrivateAllocations=*/false); - // Step 7. Resolve remaining parallel loops. + // Step 8. Resolve remaining parallel loops. funcPassManager.addPass(createGPUDistributePass()); - // Vectorize copies that came out of vectorization. + // Vectorize copies that came out of bufferization. funcPassManager.addPass(createVectorizeMemrefCopyPass()); - // Step 7. Unroll operations to native intrinsic widths. + // Step 8. Unroll operations to native intrinsic widths. funcPassManager.addPass(IREE::GPU::createUnrollToIntrinsicsPass()); funcPassManager.addPass(createCanonicalizerPass()); funcPassManager.addPass(createCSEPass()); - // Step 8. Remaining post-bufferization optimizations/lowerings. + // Step 9. 
Remaining post-bufferization optimizations/lowerings. funcPassManager.addPass(IREE::GPU::createLowerIREEGPUOpsPass()); funcPassManager.addPass(createLoopInvariantCodeMotionPass()); funcPassManager.addPass(memref::createFoldMemRefAliasOpsPass()); diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_tile_and_fuse.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_tile_and_fuse.mlir index a0e1ce623ea9..edf78ea66848 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_tile_and_fuse.mlir +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_tile_and_fuse.mlir @@ -214,3 +214,287 @@ hal.executable private @main { // CHECK: %[[LOOP_T:.+]] = vector.transpose %[[LOOP]], [0, 1, 3, 2, 4] : vector<1x2x2x4x1xf32> to vector<1x2x4x2x1xf32> // CHECK: %[[EXTRACT:.+]] = vector.extract %[[LOOP_T]][0] : vector<2x4x2x1xf32> from vector<1x2x4x2x1xf32> // CHECK: vector.transfer_write %[[EXTRACT]], %[[B2]] + +// ----- + +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer>, + #hal.descriptor_set.binding<2, storage_buffer> + ]> +]> +#config = #iree_gpu.lowering_config<{ + workgroup = [64, 64, 0], + reduction = [0, 0, 2], + subgroup = [2, 2], + mma_kind = #iree_gpu.mma_layout}> +hal.executable public @main { + hal.executable.variant public @rocm_hsaco_fb target(<"rocm", "rocm-hsaco-fb">) { + hal.executable.export public @matmul_transpose_b_wmma ordinal(0) layout(#pipeline_layout) { + ^bb0(%arg0: !hal.device): + %x, %y, %z = flow.dispatch.workgroup_count_from_slice + hal.return %x, %y, %z : index, index, index + } + builtin.module { + func.func @matmul_transpose_b_wmma() + attributes {translation_info = #iree_codegen.translation_info} { + %cst = arith.constant 0.000000e+00 : f16 + %c0 = arith.constant 0 : index + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 1280], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<2048x1280xf16> + %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [10240, 1280], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<10240x1280xf16> + %5 = tensor.empty() : tensor<2048x10240xf32> + %6 = linalg.fill ins(%cst : f16) outs(%5 : tensor<2048x10240xf32>) -> tensor<2048x10240xf32> + %7 = linalg.matmul_transpose_b {lowering_config = #config} + ins(%3, %4 : tensor<2048x1280xf16>, tensor<10240x1280xf16>) + outs(%6 : tensor<2048x10240xf32>) -> tensor<2048x10240xf32> + flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [2048, 10240], strides = [1, 1] : tensor<2048x10240xf32> -> !flow.dispatch.tensor> + return + } + } + } +} + +// CHECK-LABEL: func @matmul_transpose_b_wmma +// CHECK-DAG: %[[B0:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(0) +// CHECK-DAG: %[[B1:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(1) +// CHECK-DAG: %[[B2:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(2) +// CHECK-DAG: memref.alloc() : memref<64x32xf16, #gpu.address_space> +// CHECK-DAG: memref.alloc() : memref<64x32xf16, #gpu.address_space> +// CHECK: %[[LOOP:.+]] = scf.for %[[IV:.+]] = %c0 
to %c80 step %c2 {{.*}} -> (vector<2x2x8x1x1xf32>) +// CHECK: gpu.barrier +// CHECK: %[[LHS_RD:.+]] = vector.transfer_read %[[B0]]{{.*}} vector<2x8xf16> +// CHECK: vector.transfer_write %[[LHS_RD]] +// CHECK: gpu.barrier +// CHECK: %[[LHS_MM:.+]] = vector.transfer_read {{.*}} vector<2x1x2x16xf16> +// CHECK: gpu.barrier +// CHECK: vector.transpose %[[LHS_MM]], [0, 2, 1, 3] : vector<2x1x2x16xf16> +// CHECK: %[[RHS_RD:.+]] = vector.transfer_read %[[B1]]{{.*}} vector<2x8xf16> +// CHECK: vector.transfer_write %[[RHS_RD]] +// CHECK: gpu.barrier +// CHECK: %[[RHS_MM:.+]] = vector.transfer_read {{.*}} vector<2x1x2x16xf16> +// CHECK: gpu.barrier +// CHECK: vector.transpose %[[RHS_MM]], [0, 2, 1, 3] : vector<2x1x2x16xf16> +// CHECK-COUNT-8: amdgpu.wmma {{.*}} : vector<16xf16>, vector<16xf16>, vector<8xf32> +// CHECK: scf.yield +// CHECK: %[[LOOP_T:.+]] = vector.transpose %[[LOOP]], [0, 2, 3, 1, 4] : vector<2x2x8x1x1xf32> to vector<2x8x1x2x1xf32> +// CHECK: vector.transfer_write %[[LOOP_T]], %[[B2]] + +// ----- + +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer>, + #hal.descriptor_set.binding<2, storage_buffer> + ]> +]> +#config = #iree_gpu.lowering_config<{ + workgroup = [64, 64, 0], + reduction = [0, 0, 2], + subgroup = [2, 2], + mma_kind = #iree_gpu.mma_layout}> + +!eltype = f32 +!aeltype = f32 + +hal.executable public @main { + hal.executable.variant public @rocm_hsaco_fb target(<"rocm", "rocm-hsaco-fb">) { + hal.executable.export public @matmul_transpose_b_mfma_16x16x4 ordinal(0) layout(#pipeline_layout) { + ^bb0(%arg0: !hal.device): + %x, %y, %z = flow.dispatch.workgroup_count_from_slice + hal.return %x, %y, %z : index, index, index + } + builtin.module { + func.func @matmul_transpose_b_mfma_16x16x4() + attributes {translation_info = #iree_codegen.translation_info} { + %cst = arith.constant 0.000000e+00 : f16 + %c0 = arith.constant 0 : index + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 1280], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<2048x1280x!eltype> + %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [10240, 1280], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<10240x1280x!eltype> + %5 = tensor.empty() : tensor<2048x10240x!aeltype> + %6 = linalg.fill ins(%cst : f16) outs(%5 : tensor<2048x10240x!aeltype>) -> tensor<2048x10240x!aeltype> + %7 = linalg.matmul_transpose_b {lowering_config = #config} + ins(%3, %4 : tensor<2048x1280x!eltype>, tensor<10240x1280x!eltype>) + outs(%6 : tensor<2048x10240x!aeltype>) -> tensor<2048x10240x!aeltype> + flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [2048, 10240], strides = [1, 1] : tensor<2048x10240x!aeltype> -> !flow.dispatch.tensor> + return + } + } + } +} + +// CHECK-LABEL: func @matmul_transpose_b_mfma_16x16x4 +// CHECK-DAG: memref.alloc() : memref<64x8xf32, #gpu.address_space> +// CHECK-DAG: memref.alloc() : memref<64x8xf32, #gpu.address_space> +// CHECK: scf.for %{{.*}} = %c0 to %c320 step %c2 {{.*}} -> (vector<2x2x4x1xf32>) +// CHECK-COUNT-8: amdgpu.mfma {{.*}}blocks = 1 : i32, k = 4 : i32, m = 16 : i32, n = 16 : i32 +// CHECK: 
scf.yield + +// ----- + +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer>, + #hal.descriptor_set.binding<2, storage_buffer> + ]> +]> +#config = #iree_gpu.lowering_config<{ + workgroup = [64, 64, 0], + reduction = [0, 0, 2], + subgroup = [2, 2], + mma_kind = #iree_gpu.mma_layout}> + +!eltype = f8E4M3FNUZ +!aeltype = f32 + +hal.executable public @main { + hal.executable.variant public @rocm_hsaco_fb target(<"rocm", "rocm-hsaco-fb">) { + hal.executable.export public @matmul_transpose_b_mfma_16x16x32_f8 ordinal(0) layout(#pipeline_layout) { + ^bb0(%arg0: !hal.device): + %x, %y, %z = flow.dispatch.workgroup_count_from_slice + hal.return %x, %y, %z : index, index, index + } + builtin.module { + func.func @matmul_transpose_b_mfma_16x16x32_f8() + attributes {translation_info = #iree_codegen.translation_info} { + %cst = arith.constant 0.000000e+00 : f16 + %c0 = arith.constant 0 : index + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 1280], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<2048x1280x!eltype> + %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [10240, 1280], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<10240x1280x!eltype> + %5 = tensor.empty() : tensor<2048x10240x!aeltype> + %6 = linalg.fill ins(%cst : f16) outs(%5 : tensor<2048x10240x!aeltype>) -> tensor<2048x10240x!aeltype> + %7 = linalg.matmul_transpose_b {lowering_config = #config} + ins(%3, %4 : tensor<2048x1280x!eltype>, tensor<10240x1280x!eltype>) + outs(%6 : tensor<2048x10240x!aeltype>) -> tensor<2048x10240x!aeltype> + flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [2048, 10240], strides = [1, 1] : tensor<2048x10240x!aeltype> -> !flow.dispatch.tensor> + return + } + } + } +} + +// CHECK-LABEL: func @matmul_transpose_b_mfma_16x16x32_f8 +// CHECK-DAG: memref.alloc() : memref<64x64xf8E4M3FNUZ, #gpu.address_space> +// CHECK-DAG: memref.alloc() : memref<64x64xf8E4M3FNUZ, #gpu.address_space> +// CHECK: scf.for %{{.*}} = %c0 to %c40 step %c2 {{.*}} -> (vector<2x2x4x1xf32>) +// CHECK-COUNT-8: amdgpu.mfma {{.*}}blocks = 1 : i32, k = 32 : i32, m = 16 : i32, n = 16 : i32 +// CHECK: scf.yield + +// ----- + +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer>, + #hal.descriptor_set.binding<2, storage_buffer> + ]> +]> +#config = #iree_gpu.lowering_config<{ + workgroup = [64, 64, 0], + reduction = [0, 0, 2], + subgroup = [2, 2], + mma_kind = #iree_gpu.mma_layout}> + +!eltype = i8 +!aeltype = i32 + +hal.executable public @main { + hal.executable.variant public @rocm_hsaco_fb target(<"rocm", "rocm-hsaco-fb">) { + hal.executable.export public @matmul_transpose_b_mfma_32x32x16_i8 ordinal(0) layout(#pipeline_layout) { + ^bb0(%arg0: !hal.device): + %x, %y, %z = flow.dispatch.workgroup_count_from_slice + hal.return %x, %y, %z : index, index, index + } + builtin.module { + func.func @matmul_transpose_b_mfma_32x32x16_i8() + attributes {translation_info = #iree_codegen.translation_info} { + %cst = arith.constant 0.000000e+00 : f16 + %c0 = arith.constant 0 : index + %0 = hal.interface.binding.subspan 
layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 1280], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<2048x1280x!eltype> + %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [10240, 1280], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<10240x1280x!eltype> + %5 = tensor.empty() : tensor<2048x10240x!aeltype> + %6 = linalg.fill ins(%cst : f16) outs(%5 : tensor<2048x10240x!aeltype>) -> tensor<2048x10240x!aeltype> + %7 = linalg.matmul_transpose_b {lowering_config = #config} + ins(%3, %4 : tensor<2048x1280x!eltype>, tensor<10240x1280x!eltype>) + outs(%6 : tensor<2048x10240x!aeltype>) -> tensor<2048x10240x!aeltype> + flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [2048, 10240], strides = [1, 1] : tensor<2048x10240x!aeltype> -> !flow.dispatch.tensor> + return + } + } + } +} + +// CHECK-LABEL: func @matmul_transpose_b_mfma_32x32x16_i8 +// CHECK-DAG: memref.alloc() : memref<64x32xi8, #gpu.address_space> +// CHECK-DAG: memref.alloc() : memref<64x32xi8, #gpu.address_space> +// CHECK: scf.for %{{.*}} = %c0 to %c80 step %c2 {{.*}} -> (vector<2x2x4x4x1xi32>) +// CHECK-COUNT-8: amdgpu.mfma {{.*}}blocks = 1 : i32, k = 16 : i32, m = 32 : i32, n = 32 : i32 +// CHECK: scf.yield + +// ----- + +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer>, + #hal.descriptor_set.binding<2, storage_buffer> + ]> +]> +#config = #iree_gpu.lowering_config<{ + workgroup = [64, 64, 0], + reduction = [0, 0, 2], + subgroup = [2, 2], + mma_kind = #iree_gpu.mma_layout}> + +!eltype = f16 +!aeltype = f16 + +hal.executable public @main { + hal.executable.variant public @rocm_hsaco_fb target(<"rocm", "rocm-hsaco-fb">) { + hal.executable.export public @matmul_transpose_b_wmma_f16_16x16x16_f16 ordinal(0) layout(#pipeline_layout) { + ^bb0(%arg0: !hal.device): + %x, %y, %z = flow.dispatch.workgroup_count_from_slice + hal.return %x, %y, %z : index, index, index + } + builtin.module { + func.func @matmul_transpose_b_wmma_f16_16x16x16_f16() + attributes {translation_info = #iree_codegen.translation_info} { + %cst = arith.constant 0.000000e+00 : f16 + %c0 = arith.constant 0 : index + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 1280], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<2048x1280x!eltype> + %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [10240, 1280], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<10240x1280x!eltype> + %5 = tensor.empty() : tensor<2048x10240x!aeltype> + %6 = linalg.fill ins(%cst : f16) outs(%5 : tensor<2048x10240x!aeltype>) -> tensor<2048x10240x!aeltype> + %7 = linalg.matmul_transpose_b {lowering_config = #config} + ins(%3, %4 : tensor<2048x1280x!eltype>, 
tensor<10240x1280x!eltype>) + outs(%6 : tensor<2048x10240x!aeltype>) -> tensor<2048x10240x!aeltype> + flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [2048, 10240], strides = [1, 1] : tensor<2048x10240x!aeltype> -> !flow.dispatch.tensor> + return + } + } + } +} + +// CHECK-LABEL: func @matmul_transpose_b_wmma_f16_16x16x16_f16 +// CHECK-DAG: memref.alloc() : memref<64x32xf16, #gpu.address_space> +// CHECK-DAG: memref.alloc() : memref<64x32xf16, #gpu.address_space> +// CHECK: scf.for %{{.*}} = %c0 to %c80 step %c2 {{.*}} -> (vector<2x2x8x1x1xf16>) +// CHECK-COUNT-8: amdgpu.wmma {{.*}} : vector<16xf16>, vector<16xf16>, vector<8xf16> +// CHECK: scf.yield
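The concretization behavior exercised by these tests reduces to a single rule in materializeOperandConcreteShape: a dimension whose outer count is 1 is left untouched, while any other dimension is split into [outer, size / outer]. Below is a minimal standalone C++ sketch of that rule, not part of the patch; expandByOuter is a hypothetical helper and the example sizes are assumptions drawn from the 32x32 accumulator case above.

// Standalone illustration (not IREE code) of the expand_shape decision.
#include <cassert>
#include <cstdint>
#include <iostream>
#include <vector>

using ReassociationIndices = std::vector<int64_t>;

static void expandByOuter(const std::vector<int64_t> &sizes,
                          const std::vector<int64_t> &outers,
                          std::vector<int64_t> &resultShape,
                          std::vector<ReassociationIndices> &reassociations) {
  int64_t idx = 0;
  for (size_t i = 0; i < sizes.size(); ++i) {
    if (outers[i] == 1) {
      // Unit outer count: a single contiguous thread * element block, so the
      // dimension is kept as-is.
      resultShape.push_back(sizes[i]);
      reassociations.push_back({idx++});
      continue;
    }
    assert(sizes[i] % outers[i] == 0 && "invalid mma layout");
    resultShape.push_back(outers[i]);
    resultShape.push_back(sizes[i] / outers[i]);
    reassociations.push_back({idx, idx + 1});
    idx += 2;
  }
}

int main() {
  // Accumulator of a 32x32 intrinsic: m = n = 32 with assumed outer counts
  // [4, 1].
  std::vector<int64_t> shape;
  std::vector<ReassociationIndices> reInds;
  expandByOuter({32, 32}, {4, 1}, shape, reInds);
  for (int64_t d : shape) std::cout << d << ' ';
  std::cout << '\n'; // Prints "4 8 32".
  return 0;
}

With sizes [32, 32] and outer counts [4, 1] this produces the 4x8x32 shape and [[0, 1], [2]] reassociation checked in concretize_mma_shapes.mlir, while intrinsics whose outer counts are all 1 (the 16x16 MFMA variants in the tests above) produce no reshape, which is why those cases check for the absence of tensor.expand_shape.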