From 7812c776d5d57b13d80b6ae27f2dc86c73fddbcf Mon Sep 17 00:00:00 2001
From: Quinn Dawkins
Date: Tue, 13 Aug 2024 11:33:02 -0400
Subject: [PATCH] [Codegen][GPU] Add support for all other intrinsics to
 TileAndFuse (#18179)

This adds the ConcretizeMmaShapes pass to the LLVMGPUTileAndFuse pipeline to
support the remaining intrinsic types, in particular the MFMA and WMMA
variants that require reshaping the accumulator to match the requirements of
the layout.

This also reworks the reshaping code to use SingleSubgroupLayout instead of
VectorExt::PerDimLayoutAttr, dropping an unneeded dialect dependency and
simplifying the IR for cases where reshaping is not needed. In particular,
when a layout has a unit `outer` dimension, no additional reshaping is
required, so the reshapes are omitted in such cases. In the future we could
still perform such reshaping to pre-swizzle the data needed for the MMA
during the store to shared memory, but the details of how best to implement
that are left as a TODO.
---
 .../Codegen/Dialect/GPU/IR/IREEGPUAttrs.cpp   |  77 +++--
 .../Dialect/GPU/TargetUtils/ConfigUtils.cpp   |  12 +-
 .../GPU/Transforms/ConcretizeMmaShapes.cpp    |   8 +-
 .../Codegen/Dialect/GPU/Transforms/Passes.td  |   2 +-
 .../Dialect/GPU/Transforms/Transforms.cpp     |   9 +-
 .../test/concretize_mma_shapes.mlir           | 187 ++++++++++--
 .../test/distribute_mma_to_lanes.mlir         | 224 ++++++++++++++
 .../iree/compiler/Codegen/LLVMGPU/Passes.cpp  |  36 ++-
 .../test/ROCDL/pipeline_tile_and_fuse.mlir    | 284 ++++++++++++++++++
 9 files changed, 747 insertions(+), 92 deletions(-)

diff --git a/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.cpp b/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.cpp
index 4b6cd7b8dd66..815f28626940 100644
--- a/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.cpp
+++ b/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.cpp
@@ -664,9 +664,7 @@ static LogicalResult populateCanonicalOffsetsSizesAndStrides(
       llvm::zip_equal(subgroupLayout.outer, subgroupLayout.thread,
                       subgroupLayout.element)) {
     if (outer != 1) {
-      // TODO: Support this case. Might need a reshape since this makes the
-      // slice non-contigious.
-      return failure();
+      rankReducedShape.push_back(outer);
     }
     rankReducedShape.push_back(thread * element);
   }
@@ -690,6 +688,7 @@ static LogicalResult populateCanonicalOffsetsSizesAndStrides(
                       subgroupLayout.element)) {
     if (dimSize == 1) {
       vtids.push_back(zero);
+      continue;
     }
     // ((tid floordiv stride) mod size) * element.
@@ -702,7 +701,12 @@ static LogicalResult populateCanonicalOffsetsSizesAndStrides( } int64_t idx = 0; - for (int64_t element : subgroupLayout.element) { + for (auto [element, outer] : + llvm::zip_equal(subgroupLayout.element, subgroupLayout.outer)) { + if (outer != 1) { + canonicalSizes.push_back(builder.getIndexAttr(outer)); + canonicalOffsets.push_back(zero); + } canonicalSizes.push_back(builder.getIndexAttr(element)); canonicalOffsets.push_back(vtids[idx++]); } @@ -716,13 +720,6 @@ LogicalResult MMAAttr::populateOperandOffsetsSizesStrides( Value laneId, ArrayRef permutation, SmallVector &offsets, SmallVector &sizes, SmallVector &strides) const { - switch (getIntrinsic().getValue()) { - case MMAIntrinsic::MFMA_F32_16x16x16_F16: - case MMAIntrinsic::MFMA_I32_16x16x32_I8: - break; - default: - return failure(); - } MMAAttr::SingleSubgroupLayout subgroupLayout; switch (fragment) { @@ -758,47 +755,33 @@ LogicalResult MMAAttr::materializeOperandConcreteShape( std::optional> permutation, SmallVector &reassociations, RankedTensorType &resultType) const { - OpaqueMmaLayout opaqueLayout = - getOpaqueMFMALayout(operand.getContext(), getIntrinsic().getValue()); - // TODO(Max191): The `getConcreteMFMALayout` function creates some - // `PerDimLayoutAttr` that are not used by this function. This means that - // any pass that uses `materializeOperandConcreteShape` needs to be - // dependent on the VectorExt dialect. Ideally, the `getConcreteMFMALayout` - // function should be refactored so we can reuse the shape information of - // the layout without needing to create any `PerDimLayoutAttr`. - ConcreteMmaLayout layout = - getConcreteMFMALayout(operand.getContext(), getIntrinsic().getValue()); - SmallVector> concreteSizes; + + SmallVector outerSizes; SmallVector opaqueSizes; + auto [m, n, k] = getMNKShape(); switch (fragment) { case IREE::GPU::MMAFragment::Lhs: { - concreteSizes.push_back(layout.aMLayout.getShapes()); - concreteSizes.push_back(layout.aKLayout.getShapes()); - opaqueSizes.push_back(opaqueLayout.mSize); - opaqueSizes.push_back(opaqueLayout.kSize); + outerSizes = getASingleSubgroupLayout().outer; + opaqueSizes.append({m, k}); break; } case IREE::GPU::MMAFragment::Rhs: { - concreteSizes.push_back(layout.bKLayout.getShapes()); - concreteSizes.push_back(layout.bNLayout.getShapes()); - opaqueSizes.push_back(opaqueLayout.kSize); - opaqueSizes.push_back(opaqueLayout.nSize); + outerSizes = getBSingleSubgroupLayout().outer; + opaqueSizes.append({k, n}); break; } case IREE::GPU::MMAFragment::Acc: { - concreteSizes.push_back(layout.cMLayout.getShapes()); - concreteSizes.push_back(layout.cNLayout.getShapes()); - opaqueSizes.push_back(opaqueLayout.mSize); - opaqueSizes.push_back(opaqueLayout.nSize); + outerSizes = getCSingleSubgroupLayout().outer; + opaqueSizes.append({m, n}); break; } } if (permutation.has_value()) { - if (permutation.value().size() != opaqueSizes.size()) { + if (permutation.value().size() != outerSizes.size()) { return failure(); } - applyPermutationToVector(concreteSizes, permutation.value()); applyPermutationToVector(opaqueSizes, permutation.value()); + applyPermutationToVector(outerSizes, permutation.value()); } // Inner tile must have sizes matching the opaque layout. 
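To make the new non-unit `outer` handling above concrete: for MFMA_I32_32x32x16_I8 the accumulator fragment has outer = [4, 1], thread = [2, 32], and element = [4, 1], so the populated sizes become [4, 4, 1] with a zero offset on the leading outer dimension. The sketch below is illustrative only; %acc and %lane_id are hypothetical values, and the affine maps are unsimplified equivalents of the canonicalized maps in the distribute_mma_to_lanes tests added later in this patch.

// Per-lane offsets into the concretized 4x8x32 accumulator tile.
%m_off = affine.apply affine_map<(d0) -> (((d0 floordiv 32) mod 2) * 4)>(%lane_id)
%n_off = affine.apply affine_map<(d0) -> (d0 mod 32)>(%lane_id)
// The outer dimension is taken whole at offset 0 because each lane's four
// outer blocks are not contiguous in the original 32x32 tile.
%acc_slice = tensor.extract_slice %acc[0, %m_off, %n_off] [4, 4, 1] [1, 1, 1]
    : tensor<4x8x32xi32> to tensor<4x4x1xi32>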
@@ -819,11 +802,23 @@ LogicalResult MMAAttr::materializeOperandConcreteShape( return ReassociationIndices({idx}); }); int idx = reInds.size(); - for (ArrayRef sizes : concreteSizes) { - resultShape.append(SmallVector(sizes)); - reInds.push_back( - llvm::to_vector(llvm::seq(idx, idx + sizes.size()))); - idx += sizes.size(); + for (auto [outer, native] : llvm::zip_equal(outerSizes, opaqueSizes)) { + // Skip expansion if the outer dim is unit as the SingleSubgroupLayout gives + // a guarantee that the |element| counts are contiguous within the layout, + // and a unit outer implies a single offset and size for that dimension. + if (outer == 1) { + resultShape.push_back(native); + reInds.push_back(ReassociationIndices({idx++})); + continue; + } + + // Reshape to [outer, native / outer] == [outer, thread * element]. This + // corresponds to |outer| repetitions of the thread/element sublayout. + resultShape.push_back(outer); + assert(native % outer == 0 && "invalid mma layout"); + resultShape.push_back(native / outer); + reInds.push_back(ReassociationIndices{idx, idx + 1}); + idx += 2; } reassociations = reInds; diff --git a/compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils/ConfigUtils.cpp b/compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils/ConfigUtils.cpp index 2f5a48b1d986..143cba102c94 100644 --- a/compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils/ConfigUtils.cpp +++ b/compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils/ConfigUtils.cpp @@ -73,16 +73,7 @@ LogicalResult setMatmulLoweringConfig(IREE::GPU::TargetAttr target, lhsElemType, rhsElemType, initElemType}; SmallVector intrinsics; - SmallVector supportedMmas; for (IREE::GPU::MMAAttr mma : target.getWgp().getMma()) { - IREE::GPU::MMAIntrinsic type = mma.getIntrinsic().getValue(); - // TODO: Drop this once all intrinsics are supported. - if (type != IREE::GPU::MMAIntrinsic::MFMA_F32_16x16x16_F16 && - type != IREE::GPU::MMAIntrinsic::MFMA_I32_16x16x32_I8) { - continue; - } - supportedMmas.push_back(mma); - auto [mSize, nSize, kSize] = mma.getMNKShape(); auto [aType, bType, cType] = mma.getABCElementTypes(); if (mma.getSubgroupSize() != targetSubgroupSize) @@ -185,7 +176,8 @@ LogicalResult setMatmulLoweringConfig(IREE::GPU::TargetAttr target, // Similarly the reduction tile size is just the post-packing tile count. reductionTileSizes[kDim] = schedule->kTileCount; - IREE::GPU::MmaInterfaceAttr mmaKind = supportedMmas[schedule->index]; + IREE::GPU::MmaInterfaceAttr mmaKind = + target.getWgp().getMma()[schedule->index]; // Attach the MMA schedule as an attribute to the entry point export function // for later access in the pipeline. 
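To see the [outer, native / outer] split that materializeOperandConcreteShape now produces, consider a single MFMA_F32_32x32x8_F16 accumulator tile, whose C fragment has outer = [4, 1]. Only the M dimension has a non-unit outer, so 32 splits into [4, 8] while N stays intact. This is a minimal sketch with hypothetical %acc and %mma_result values; the shapes match the concretize_mma_shapes tests updated below.

%expanded = tensor.expand_shape %acc [[0, 1], [2]] output_shape [4, 8, 32]
    : tensor<32x32xf32> into tensor<4x8x32xf32>
// ... the iree_gpu.multi_mma then consumes the expanded accumulator ...
%collapsed = tensor.collapse_shape %mma_result [[0, 1], [2]]
    : tensor<4x8x32xf32> into tensor<32x32xf32>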
diff --git a/compiler/src/iree/compiler/Codegen/Dialect/GPU/Transforms/ConcretizeMmaShapes.cpp b/compiler/src/iree/compiler/Codegen/Dialect/GPU/Transforms/ConcretizeMmaShapes.cpp index 9910840bc694..94bcf3dbe593 100644 --- a/compiler/src/iree/compiler/Codegen/Dialect/GPU/Transforms/ConcretizeMmaShapes.cpp +++ b/compiler/src/iree/compiler/Codegen/Dialect/GPU/Transforms/ConcretizeMmaShapes.cpp @@ -9,7 +9,6 @@ #include "iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUOps.h" #include "iree/compiler/Codegen/Dialect/GPU/Transforms/Passes.h" #include "iree/compiler/Codegen/Dialect/GPU/Transforms/Transforms.h" -#include "iree/compiler/Codegen/Dialect/VectorExt/IR/VectorExtDialect.h" #include "mlir/Dialect/Tensor/IR/Tensor.h" #include "mlir/Transforms/GreedyPatternRewriteDriver.h" @@ -66,6 +65,13 @@ struct ConcretizeMmaOperandShape final : OpRewritePattern { return failure(); } + // Early exit if the operand is unaffected. + if (llvm::all_of(reassociations, [](ReassociationIndices reassoc) { + return reassoc.size() == 1; + })) { + return failure(); + } + // Create the expand_shape. Location loc = mmaOp->getLoc(); Value concreteOperand = rewriter diff --git a/compiler/src/iree/compiler/Codegen/Dialect/GPU/Transforms/Passes.td b/compiler/src/iree/compiler/Codegen/Dialect/GPU/Transforms/Passes.td index a882b835e4d2..a6eb9737611e 100644 --- a/compiler/src/iree/compiler/Codegen/Dialect/GPU/Transforms/Passes.td +++ b/compiler/src/iree/compiler/Codegen/Dialect/GPU/Transforms/Passes.td @@ -16,6 +16,7 @@ def DistributeMmaToLanesPass : "::mlir::arith::ArithDialect", "::mlir::affine::AffineDialect", "::mlir::scf::SCFDialect", + "::mlir::tensor::TensorDialect", ]; } @@ -25,7 +26,6 @@ def ConcretizeMmaShapesPass : let dependentDialects = [ "::mlir::tensor::TensorDialect", "::mlir::iree_compiler::IREE::GPU::IREEGPUDialect", - "::mlir::iree_compiler::IREE::VectorExt::IREEVectorExtDialect", ]; let options = [ Option<"concretizeInputs", "concretize-inputs", diff --git a/compiler/src/iree/compiler/Codegen/Dialect/GPU/Transforms/Transforms.cpp b/compiler/src/iree/compiler/Codegen/Dialect/GPU/Transforms/Transforms.cpp index 45d8ad188c5b..7fd46e4c130a 100644 --- a/compiler/src/iree/compiler/Codegen/Dialect/GPU/Transforms/Transforms.cpp +++ b/compiler/src/iree/compiler/Codegen/Dialect/GPU/Transforms/Transforms.cpp @@ -463,7 +463,8 @@ convertContractionToMultiMma(RewriterBase &rewriter, linalg::LinalgOp linalgOp, FailureOr distributeMultiMmaOp(RewriterBase &rewriter, IREE::GPU::MultiMmaOp mmaOp) { if (!mmaOp.hasTensorSemantics() || mmaOp.hasThreadSemantics()) { - return failure(); + return rewriter.notifyMatchFailure( + mmaOp, "mmaOp must have vector and subgroup for distribution."); } OpBuilder::InsertionGuard g(rewriter); @@ -508,7 +509,7 @@ FailureOr distributeMultiMmaOp(RewriterBase &rewriter, if (failed(mmaOp.getKind().populateOperandOffsetsSizesStrides( rewriter, loc, IREE::GPU::MMAFragment::Lhs, laneId, lhsPermutation, lhsOffsets, lhsSizes, lhsStrides))) { - return failure(); + return mmaOp->emitOpError("failed to populate lhs offsets"); } // Extract the rank-reduced slice of the lhs based on the expected inner // vector shape. 
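As a sketch of what the populated LHS offsets and sizes distribute to, take MFMA_F32_16x16x4_F32, where each lane of the 64-wide subgroup reads a single f32 from the 16x4 A fragment. The values mirror the distribute_mma_to_lanes test added in this patch; %lhs and %lane_id are hypothetical.

%m = affine.apply affine_map<(d0) -> (d0 mod 16)>(%lane_id)
%k = affine.apply affine_map<(d0) -> ((d0 floordiv 16) mod 4)>(%lane_id)
%lhs_slice = tensor.extract_slice %lhs[%m, %k] [1, 1] [1, 1]
    : tensor<16x4xf32> to tensor<1x1xf32>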
@@ -528,7 +529,7 @@ FailureOr distributeMultiMmaOp(RewriterBase &rewriter, if (failed(mmaOp.getKind().populateOperandOffsetsSizesStrides( rewriter, loc, IREE::GPU::MMAFragment::Rhs, laneId, rhsPermutation, rhsOffsets, rhsSizes, rhsStrides))) { - return failure(); + return mmaOp->emitOpError("failed to populate rhs offsets"); } // Extract the rank-reduced slice of the rhs based on the expected inner // vector shape. @@ -548,7 +549,7 @@ FailureOr distributeMultiMmaOp(RewriterBase &rewriter, if (failed(mmaOp.getKind().populateOperandOffsetsSizesStrides( rewriter, loc, IREE::GPU::MMAFragment::Acc, laneId, accPermutation, accOffsets, accSizes, accStrides))) { - return failure(); + return mmaOp->emitOpError("failed to populate acc offsets"); } // Extract the rank-reduced slice of the accumulator based on the expected // inner vector shape. diff --git a/compiler/src/iree/compiler/Codegen/Dialect/GPU/Transforms/test/concretize_mma_shapes.mlir b/compiler/src/iree/compiler/Codegen/Dialect/GPU/Transforms/test/concretize_mma_shapes.mlir index 990bfea08d6d..facbb846efee 100644 --- a/compiler/src/iree/compiler/Codegen/Dialect/GPU/Transforms/test/concretize_mma_shapes.mlir +++ b/compiler/src/iree/compiler/Codegen/Dialect/GPU/Transforms/test/concretize_mma_shapes.mlir @@ -21,19 +21,15 @@ func.func @concretize_multi_mma_F32_16x16x16_F16(%lhs: tensor<2x2x16x16xf16>, %r // CHECK-SAME: %[[RHS:[A-Za-z0-9]+]]: tensor<2x2x16x16xf16> // CHECK-SAME: %[[ACC:[A-Za-z0-9]+]]: tensor<2x2x16x16xf32> -// CHECK-INPUTS-DAG: %[[EXPANDED_LHS:.+]] = tensor.expand_shape %[[LHS]] {{\[}}[0], [1], [2], [3, 4]] output_shape [2, 2, 16, 4, 4] : tensor<2x2x16x16xf16> into tensor<2x2x16x4x4xf16> -// CHECK-INPUTS-DAG: %[[EXPANDED_RHS:.+]] = tensor.expand_shape %[[RHS]] {{\[}}[0], [1], [2, 3], [4]] output_shape [2, 2, 4, 4, 16] : tensor<2x2x16x16xf16> into tensor<2x2x4x4x16xf16> -// CHECK-INPUTS: %[[MMA:.+]] = iree_gpu.multi_mma %[[EXPANDED_LHS]], %[[EXPANDED_RHS]], %[[ACC]] +// CHECK-INPUTS: %[[MMA:.+]] = iree_gpu.multi_mma %[[LHS]], %[[RHS]], %[[ACC]] // CHECK-INPUTS-SAME: lowering_config = #iree_gpu.lowering_config -// CHECK-INPUTS-SAME: : tensor<2x2x16x4x4xf16>, tensor<2x2x4x4x16xf16> into tensor<2x2x16x16xf32> +// CHECK-INPUTS-SAME: : tensor<2x2x16x16xf16>, tensor<2x2x16x16xf16> into tensor<2x2x16x16xf32> // CHECK-INPUTS: return %[[MMA]] -// CHECK-RESULT-DAG: %[[EXPANDED_ACC:.+]] = tensor.expand_shape %[[ACC]] {{\[}}[0], [1], [2, 3], [4]] output_shape [2, 2, 4, 4, 16] : tensor<2x2x16x16xf32> into tensor<2x2x4x4x16xf32> -// CHECK-RESULT: %[[MMA:.+]] = iree_gpu.multi_mma %[[LHS]], %[[RHS]], %[[EXPANDED_ACC]] +// CHECK-RESULT: %[[MMA:.+]] = iree_gpu.multi_mma %[[LHS]], %[[RHS]], %[[ACC]] // CHECK-RESULT-SAME: lowering_config = #iree_gpu.lowering_config -// CHECK-RESULT-SAME: : tensor<2x2x16x16xf16>, tensor<2x2x16x16xf16> into tensor<2x2x4x4x16xf32> -// CHECK-RESULT: %[[COLLAPSED:.+]] = tensor.collapse_shape %[[MMA]] {{\[}}[0], [1], [2, 3], [4]] : tensor<2x2x4x4x16xf32> into tensor<2x2x16x16xf32> -// CHECK-RESULT: return %[[COLLAPSED]] +// CHECK-RESULT-SAME: : tensor<2x2x16x16xf16>, tensor<2x2x16x16xf16> into tensor<2x2x16x16xf32> +// CHECK-RESULT: return %[[MMA]] // ----- @@ -58,20 +54,16 @@ func.func @concretize_multi_mma_I32_16x16x32_I8(%lhs: tensor<2x2x16x32xi8>, %rhs // CHECK-SAME: %[[RHS:[A-Za-z0-9]+]]: tensor<2x2x16x32xi8> // CHECK-SAME: %[[ACC:[A-Za-z0-9]+]]: tensor<2x2x16x16xi32> -// CHECK-INPUTS-DAG: %[[EXPANDED_LHS:.+]] = tensor.expand_shape %[[LHS]] {{\[}}[0], [1], [2], [3, 4]] output_shape [2, 2, 16, 4, 8] : 
tensor<2x2x16x32xi8> into tensor<2x2x16x4x8xi8> -// CHECK-INPUTS-DAG: %[[EXPANDED_RHS:.+]] = tensor.expand_shape %[[RHS]] {{\[}}[0], [1], [2], [3, 4]] output_shape [2, 2, 16, 4, 8] : tensor<2x2x16x32xi8> into tensor<2x2x16x4x8xi8> -// CHECK-INPUTS: %[[MMA:.+]] = iree_gpu.multi_mma %[[EXPANDED_LHS]], %[[EXPANDED_RHS]], %[[ACC]] +// CHECK-INPUTS: %[[MMA:.+]] = iree_gpu.multi_mma %[[LHS]], %[[RHS]], %[[ACC]] // CHECK-INPUTS-SAME: lowering_config = #iree_gpu.lowering_config -// CHECK-INPUTS-SAME: rhs_permutation = array -// CHECK-INPUTS-SAME: : tensor<2x2x16x4x8xi8>, tensor<2x2x16x4x8xi8> into tensor<2x2x16x16xi32> +// CHECK-INPUTS-SAME: rhs_permutation = array +// CHECK-INPUTS-SAME: : tensor<2x2x16x32xi8>, tensor<2x2x16x32xi8> into tensor<2x2x16x16xi32> // CHECK-INPUTS: return %[[MMA]] -// CHECK-RESULT-DAG: %[[EXPANDED_ACC:.+]] = tensor.expand_shape %[[ACC]] {{\[}}[0], [1], [2, 3], [4]] output_shape [2, 2, 4, 4, 16] : tensor<2x2x16x16xi32> into tensor<2x2x4x4x16xi32> -// CHECK-RESULT: %[[MMA:.+]] = iree_gpu.multi_mma %[[LHS]], %[[RHS]], %[[EXPANDED_ACC]] +// CHECK-RESULT: %[[MMA:.+]] = iree_gpu.multi_mma %[[LHS]], %[[RHS]], %[[ACC]] // CHECK-RESULT-SAME: lowering_config = #iree_gpu.lowering_config -// CHECK-RESULT-SAME: : tensor<2x2x16x32xi8>, tensor<2x2x16x32xi8> into tensor<2x2x4x4x16xi32> -// CHECK-RESULT: %[[COLLAPSED:.+]] = tensor.collapse_shape %[[MMA]] {{\[}}[0], [1], [2, 3], [4]] : tensor<2x2x4x4x16xi32> into tensor<2x2x16x16xi32> -// CHECK-RESULT: return %[[COLLAPSED]] +// CHECK-RESULT-SAME: : tensor<2x2x16x32xi8>, tensor<2x2x16x32xi8> into tensor<2x2x16x16xi32> +// CHECK-RESULT: return %[[MMA]] // ----- @@ -95,16 +87,159 @@ func.func @concretize_multi_mma_F32_32x32x8_F16(%lhs: tensor<2x2x32x8xf16>, %rhs // CHECK-SAME: %[[RHS:[A-Za-z0-9]+]]: tensor<2x2x8x32xf16> // CHECK-SAME: %[[ACC:[A-Za-z0-9]+]]: tensor<2x2x32x32xf32> -// CHECK-INPUTS-DAG: %[[EXPANDED_LHS:.+]] = tensor.expand_shape %[[LHS]] {{\[}}[0], [1], [2], [3, 4]] output_shape [2, 2, 32, 2, 4] : tensor<2x2x32x8xf16> into tensor<2x2x32x2x4xf16> -// CHECK-INPUTS-DAG: %[[EXPANDED_RHS:.+]] = tensor.expand_shape %[[RHS]] {{\[}}[0], [1], [2, 3], [4]] output_shape [2, 2, 2, 4, 32] : tensor<2x2x8x32xf16> into tensor<2x2x2x4x32xf16> -// CHECK-INPUTS: %[[MMA:.+]] = iree_gpu.multi_mma %[[EXPANDED_LHS]], %[[EXPANDED_RHS]], %[[ACC]] +// CHECK-INPUTS: %[[MMA:.+]] = iree_gpu.multi_mma %[[LHS]], %[[RHS]], %[[ACC]] // CHECK-INPUTS-SAME: lowering_config = #iree_gpu.lowering_config -// CHECK-INPUTS-SAME: : tensor<2x2x32x2x4xf16>, tensor<2x2x2x4x32xf16> into tensor<2x2x32x32xf32> +// CHECK-INPUTS-SAME: : tensor<2x2x32x8xf16>, tensor<2x2x8x32xf16> into tensor<2x2x32x32xf32> // CHECK-INPUTS: return %[[MMA]] -// CHECK-RESULT-DAG: %[[EXPANDED_ACC:.+]] = tensor.expand_shape %[[ACC]] {{\[}}[0], [1], [2, 3, 4], [5]] output_shape [2, 2, 4, 2, 4, 32] : tensor<2x2x32x32xf32> into tensor<2x2x4x2x4x32xf32> +// CHECK-RESULT-DAG: %[[EXPANDED_ACC:.+]] = tensor.expand_shape %[[ACC]] {{\[}}[0], [1], [2, 3], [4]] output_shape [2, 2, 4, 8, 32] // CHECK-RESULT: %[[MMA:.+]] = iree_gpu.multi_mma %[[LHS]], %[[RHS]], %[[EXPANDED_ACC]] // CHECK-RESULT-SAME: lowering_config = #iree_gpu.lowering_config -// CHECK-RESULT-SAME: : tensor<2x2x32x8xf16>, tensor<2x2x8x32xf16> into tensor<2x2x4x2x4x32xf32> -// CHECK-RESULT: %[[COLLAPSED:.+]] = tensor.collapse_shape %[[MMA]] {{\[}}[0], [1], [2, 3, 4], [5]] : tensor<2x2x4x2x4x32xf32> into tensor<2x2x32x32xf32> +// CHECK-RESULT-SAME: : tensor<2x2x32x8xf16>, tensor<2x2x8x32xf16> into tensor<2x2x4x8x32xf32> +// CHECK-RESULT: 
%[[COLLAPSED:.+]] = tensor.collapse_shape %[[MMA]] {{\[}}[0], [1], [2, 3], [4]] +// CHECK-RESULT: return %[[COLLAPSED]] + +// ----- + +#contraction_accesses = [ + affine_map<(i, j, k) -> (i, k)>, + affine_map<(i, j, k) -> (k, j)>, + affine_map<(i, j, k) -> (i, j)> +] +#config = #iree_gpu.lowering_config<{workgroup = [64, 64, 0], reduction = [0, 0, 4], thread = [8, 4]}> +func.func @concretize_multi_mma_F32_32x32x8_F16(%lhs: tensor<2x2x32x8xf16>, %rhs: tensor<2x2x8x32xf16>, %acc: tensor<2x2x32x32xf32>) -> tensor<2x2x32x32xf32> { + %0 = iree_gpu.multi_mma %lhs, %rhs, %acc { + indexing_maps = #contraction_accesses, + iterator_types = [#iree_gpu.iterator_type, #iree_gpu.iterator_type, #iree_gpu.iterator_type], + kind = #iree_gpu.mma_layout, lowering_config = #config, + acc_permutation = array + } : tensor<2x2x32x8xf16>, tensor<2x2x8x32xf16> into tensor<2x2x32x32xf32> + return %0 : tensor<2x2x32x32xf32> +} + +// CHECK-LABEL: func @concretize_multi_mma_F32_32x32x8_F16 +// CHECK-SAME: %[[LHS:[A-Za-z0-9]+]]: tensor<2x2x32x8xf16> +// CHECK-SAME: %[[RHS:[A-Za-z0-9]+]]: tensor<2x2x8x32xf16> +// CHECK-SAME: %[[ACC:[A-Za-z0-9]+]]: tensor<2x2x32x32xf32> + +// CHECK-RESULT-DAG: %[[EXPANDED_ACC:.+]] = tensor.expand_shape %[[ACC]] {{\[}}[0], [1], [2], [3, 4]] output_shape [2, 2, 32, 4, 8] +// CHECK-RESULT: %[[MMA:.+]] = iree_gpu.multi_mma %[[LHS]], %[[RHS]], %[[EXPANDED_ACC]] +// CHECK-RESULT-SAME: acc_permutation = array +// CHECK-RESULT-SAME: lowering_config = #iree_gpu.lowering_config +// CHECK-RESULT-SAME: : tensor<2x2x32x8xf16>, tensor<2x2x8x32xf16> into tensor<2x2x32x4x8xf32> +// CHECK-RESULT: %[[COLLAPSED:.+]] = tensor.collapse_shape %[[MMA]] {{\[}}[0], [1], [2], [3, 4]] +// CHECK-RESULT: return %[[COLLAPSED]] + +// ----- + +#contraction_accesses = [ + affine_map<() -> ()>, + affine_map<() -> ()>, + affine_map<() -> ()> +] +func.func @concretize_F32_16x16x4_F32(%lhs: tensor<16x4xf32>, %rhs: tensor<4x16xf32>, %acc: tensor<16x16xf32>) -> tensor<16x16xf32> { + %0 = iree_gpu.multi_mma %lhs, %rhs, %acc { + indexing_maps = #contraction_accesses, + iterator_types = [], + kind = #iree_gpu.mma_layout + } : tensor<16x4xf32>, tensor<4x16xf32> into tensor<16x16xf32> + return %0 : tensor<16x16xf32> +} + +// CHECK-LABEL: func @concretize_F32_16x16x4_F32 + +// CHECK-INPUTS-NOT: tensor.expand_shape +// CHECK-INPUTS: %[[MMA:.+]] = iree_gpu.multi_mma +// CHECK-INPUTS: return %[[MMA]] + +// CHECK-RESULT-NOT: tensor.expand_shape +// CHECK-RESULT: %[[MMA:.+]] = iree_gpu.multi_mma +// CHECK-RESULT: return %[[MMA]] + +// ----- + +#contraction_accesses = [ + affine_map<() -> ()>, + affine_map<() -> ()>, + affine_map<() -> ()> +] +func.func @concretize_F32_16x16x32_F8E4M3FNUZ(%lhs: tensor<16x32xf8E4M3FNUZ>, %rhs: tensor<32x16xf8E4M3FNUZ>, %acc: tensor<16x16xf32>) -> tensor<16x16xf32> { + %0 = iree_gpu.multi_mma %lhs, %rhs, %acc { + indexing_maps = #contraction_accesses, + iterator_types = [], + kind = #iree_gpu.mma_layout + } : tensor<16x32xf8E4M3FNUZ>, tensor<32x16xf8E4M3FNUZ> into tensor<16x16xf32> + return %0 : tensor<16x16xf32> +} + +// CHECK-LABEL: func @concretize_F32_16x16x32_F8E4M3FNUZ + +// CHECK-INPUTS-NOT: tensor.expand_shape +// CHECK-INPUTS: %[[MMA:.+]] = iree_gpu.multi_mma +// CHECK-INPUTS: return %[[MMA]] + +// CHECK-RESULT-NOT: tensor.expand_shape +// CHECK-RESULT: %[[MMA:.+]] = iree_gpu.multi_mma +// CHECK-RESULT: return %[[MMA]] + +// ----- + +#contraction_accesses = [ + affine_map<() -> ()>, + affine_map<() -> ()>, + affine_map<() -> ()> +] +func.func @concretize_I32_32x32x16_I8(%lhs: tensor<32x16xi8>, 
%rhs: tensor<16x32xi8>, %acc: tensor<32x32xi32>) -> tensor<32x32xi32> { + %0 = iree_gpu.multi_mma %lhs, %rhs, %acc { + indexing_maps = #contraction_accesses, + iterator_types = [], + kind = #iree_gpu.mma_layout + } : tensor<32x16xi8>, tensor<16x32xi8> into tensor<32x32xi32> + return %0 : tensor<32x32xi32> +} + +// CHECK-LABEL: func @concretize_I32_32x32x16_I8 +// CHECK-SAME: %[[LHS:[A-Za-z0-9]+]]: tensor<32x16xi8> +// CHECK-SAME: %[[RHS:[A-Za-z0-9]+]]: tensor<16x32xi8> +// CHECK-SAME: %[[ACC:[A-Za-z0-9]+]]: tensor<32x32xi32> + +// CHECK-INPUTS-NOT: tensor.expand_shape +// CHECK-INPUTS: %[[MMA:.+]] = iree_gpu.multi_mma +// CHECK-INPUTS: return %[[MMA]] + +// CHECK-RESULT: %[[EXPANDED_ACC:.+]] = tensor.expand_shape %[[ACC]] {{\[}}[0, 1], [2]] output_shape [4, 8, 32] +// CHECK-RESULT: %[[MMA:.+]] = iree_gpu.multi_mma %[[LHS]], %[[RHS]], %[[EXPANDED_ACC]] +// CHECK-RESULT-SAME: : tensor<32x16xi8>, tensor<16x32xi8> into tensor<4x8x32xi32> +// CHECK-RESULT: %[[COLLAPSED:.+]] = tensor.collapse_shape %[[MMA]] {{\[}}[0, 1], [2]] +// CHECK-RESULT: return %[[COLLAPSED]] + +// ----- + +#contraction_accesses = [ + affine_map<() -> ()>, + affine_map<() -> ()>, + affine_map<() -> ()> +] +func.func @concretize_WMMA_F16_16x16x16_F16(%lhs: tensor<16x16xf16>, %rhs: tensor<16x16xf16>, %acc: tensor<16x16xf16>) -> tensor<16x16xf16> { + %0 = iree_gpu.multi_mma %lhs, %rhs, %acc { + indexing_maps = #contraction_accesses, + iterator_types = [], + kind = #iree_gpu.mma_layout + } : tensor<16x16xf16>, tensor<16x16xf16> into tensor<16x16xf16> + return %0 : tensor<16x16xf16> +} + +// CHECK-LABEL: func @concretize_WMMA_F16_16x16x16_F16 +// CHECK-SAME: %[[LHS:[A-Za-z0-9]+]]: tensor<16x16xf16> +// CHECK-SAME: %[[RHS:[A-Za-z0-9]+]]: tensor<16x16xf16> +// CHECK-SAME: %[[ACC:[A-Za-z0-9]+]]: tensor<16x16xf16> + +// CHECK-INPUTS-NOT: tensor.expand_shape +// CHECK-INPUTS: %[[MMA:.+]] = iree_gpu.multi_mma +// CHECK-INPUTS: return %[[MMA]] + +// CHECK-RESULT: %[[EXPANDED_ACC:.+]] = tensor.expand_shape %[[ACC]] {{\[}}[0, 1], [2]] output_shape [8, 2, 16] +// CHECK-RESULT: %[[MMA:.+]] = iree_gpu.multi_mma %[[LHS]], %[[RHS]], %[[EXPANDED_ACC]] +// CHECK-RESULT-SAME: : tensor<16x16xf16>, tensor<16x16xf16> into tensor<8x2x16xf16> +// CHECK-RESULT: %[[COLLAPSED:.+]] = tensor.collapse_shape %[[MMA]] {{\[}}[0, 1], [2]] // CHECK-RESULT: return %[[COLLAPSED]] diff --git a/compiler/src/iree/compiler/Codegen/Dialect/GPU/Transforms/test/distribute_mma_to_lanes.mlir b/compiler/src/iree/compiler/Codegen/Dialect/GPU/Transforms/test/distribute_mma_to_lanes.mlir index 214b432f652c..5569b3b6247d 100644 --- a/compiler/src/iree/compiler/Codegen/Dialect/GPU/Transforms/test/distribute_mma_to_lanes.mlir +++ b/compiler/src/iree/compiler/Codegen/Dialect/GPU/Transforms/test/distribute_mma_to_lanes.mlir @@ -31,3 +31,227 @@ module { // CHECK-SAME: kind = #iree_gpu.mma_layout // CHECK-SAME: : tensor<2x8x1x4xf16>, tensor<8x2x1x4xf16> into tensor<2x2x4x1xf32> // CHECK: mapping = [#iree_gpu.lane_id<0>] + +// ----- + +#contraction_accesses = [ + affine_map<(i, j, k) -> (i, k)>, + affine_map<(i, j, k) -> (k, j)>, + affine_map<(i, j, k) -> (i, j)> +] +module { + func.func @matmul_32x32x8(%arg0: tensor<2x8x32x8xf16>, %arg1: tensor<8x2x32x8xf16>, %arg2: tensor<2x2x4x8x32xf32>) -> tensor<2x2x4x8x32xf32> { + %mm = iree_gpu.multi_mma %arg0, %arg1, %arg2 { + indexing_maps = #contraction_accesses, + iterator_types = [#iree_gpu.iterator_type, #iree_gpu.iterator_type, #iree_gpu.iterator_type], + kind = #iree_gpu.mma_layout, + rhs_permutation = array + } : 
tensor<2x8x32x8xf16>, tensor<8x2x32x8xf16> into tensor<2x2x4x8x32xf32> + return %mm : tensor<2x2x4x8x32xf32> + } +} + +// CHECK-DAG: #[[$XMAP:.+]] = affine_map<(d0) -> (d0 mod 32)> +// CHECK-DAG: #[[$YMAP:.+]] = affine_map<(d0) -> ((d0 floordiv 32) * 4 - ((d0 floordiv 32) floordiv 2) * 8)> +// CHECK-DAG: #[[$MAP:.+]] = affine_map<(d0, d1, d2) -> (d0, d2)> +// CHECK-DAG: #[[$MAP1:.+]] = affine_map<(d0, d1, d2) -> (d2, d1)> +// CHECK-DAG: #[[$MAP2:.+]] = affine_map<(d0, d1, d2) -> (d0, d1)> + +// CHECK-LABEL: func @matmul_32x32x8 +// CHECK-SAME: %[[LHS:[A-Za-z0-9]+]]: tensor<2x8x32x8xf16> +// CHECK-SAME: %[[RHS:[A-Za-z0-9]+]]: tensor<8x2x32x8xf16> +// CHECK: scf.forall (%[[LANEID:.+]]) in (64) shared_outs(%[[ACC:.+]] = {{.*}}) -> (tensor<2x2x4x8x32xf32>) +// CHECK-DAG: %[[IDX:.+]] = affine.apply #[[$XMAP]](%[[LANEID]]) +// CHECK-DAG: %[[IDY:.+]] = affine.apply #[[$YMAP]](%[[LANEID]]) +// CHECK-DAG: %[[LHS_SLICE:.+]] = tensor.extract_slice %[[LHS]][0, 0, %[[IDX]], %[[IDY]]] [2, 8, 1, 4] +// CHECK-DAG: %[[RHS_SLICE:.+]] = tensor.extract_slice %[[RHS]][0, 0, %[[IDX]], %[[IDY]]] [8, 2, 1, 4] +// CHECK-DAG: %[[ACC_SLICE:.+]] = tensor.extract_slice %[[ACC]][0, 0, 0, %[[IDY]], %[[IDX]]] [2, 2, 4, 4, 1] +// CHECK: %[[MMA:.+]] = iree_gpu.multi_mma %[[LHS_SLICE]], %[[RHS_SLICE]], %[[ACC_SLICE]] +// CHECK-SAME: indexing_maps = [#[[$MAP]], #[[$MAP1]], #[[$MAP2]]] +// CHECK-SAME: kind = #iree_gpu.mma_layout +// CHECK-SAME: : tensor<2x8x1x4xf16>, tensor<8x2x1x4xf16> into tensor<2x2x4x4x1xf32> +// CHECK: tensor.parallel_insert_slice %[[MMA]] into %[[ACC]][0, 0, 0, %[[IDY]], %[[IDX]]] [2, 2, 4, 4, 1] +// CHECK: mapping = [#iree_gpu.lane_id<0>] + +// ----- + +#contraction_accesses = [ + affine_map<(i, j, k) -> (i, k)>, + affine_map<(i, j, k) -> (k, j)>, + affine_map<(i, j, k) -> (i, j)> +] +module { + func.func @matmul_wmma_16x16x16(%arg0: tensor<2x8x16x16xf16>, %arg1: tensor<8x2x16x16xf16>, %arg2: tensor<2x2x8x2x16xf32>) -> tensor<2x2x8x2x16xf32> { + %mm = iree_gpu.multi_mma %arg0, %arg1, %arg2 { + indexing_maps = #contraction_accesses, + iterator_types = [#iree_gpu.iterator_type, #iree_gpu.iterator_type, #iree_gpu.iterator_type], + kind = #iree_gpu.mma_layout, + rhs_permutation = array + } : tensor<2x8x16x16xf16>, tensor<8x2x16x16xf16> into tensor<2x2x8x2x16xf32> + return %mm : tensor<2x2x8x2x16xf32> + } +} + +// CHECK-DAG: #[[$XMAP:.+]] = affine_map<(d0) -> (d0 mod 16)> +// CHECK-DAG: #[[$YMAP:.+]] = affine_map<(d0) -> ((d0 floordiv 16) mod 2)> +// CHECK-DAG: #[[$MAP:.+]] = affine_map<(d0, d1, d2) -> (d0, d2)> +// CHECK-DAG: #[[$MAP1:.+]] = affine_map<(d0, d1, d2) -> (d2, d1)> +// CHECK-DAG: #[[$MAP2:.+]] = affine_map<(d0, d1, d2) -> (d0, d1)> + +// CHECK-LABEL: func @matmul_wmma_16x16x16 +// CHECK-SAME: %[[LHS:[A-Za-z0-9]+]]: tensor<2x8x16x16xf16> +// CHECK-SAME: %[[RHS:[A-Za-z0-9]+]]: tensor<8x2x16x16xf16> +// CHECK: scf.forall (%[[LANEID:.+]]) in (32) shared_outs(%[[ACC:.+]] = {{.*}}) -> (tensor<2x2x8x2x16xf32>) +// CHECK-DAG: %[[IDX:.+]] = affine.apply #[[$XMAP]](%[[LANEID]]) +// CHECK-DAG: %[[LHS_SLICE:.+]] = tensor.extract_slice %[[LHS]][0, 0, %[[IDX]], 0] [2, 8, 1, 16] +// CHECK-DAG: %[[RHS_SLICE:.+]] = tensor.extract_slice %[[RHS]][0, 0, %[[IDX]], 0] [8, 2, 1, 16] +// CHECK-DAG: %[[IDY:.+]] = affine.apply #[[$YMAP]](%[[LANEID]]) +// CHECK-DAG: %[[ACC_SLICE:.+]] = tensor.extract_slice %[[ACC]][0, 0, 0, %[[IDY]], %[[IDX]]] [2, 2, 8, 1, 1] +// CHECK: %[[MMA:.+]] = iree_gpu.multi_mma %[[LHS_SLICE]], %[[RHS_SLICE]], %[[ACC_SLICE]] +// CHECK-SAME: indexing_maps = [#[[$MAP]], #[[$MAP1]], #[[$MAP2]]] +// 
CHECK-SAME: kind = #iree_gpu.mma_layout +// CHECK-SAME: : tensor<2x8x1x16xf16>, tensor<8x2x1x16xf16> into tensor<2x2x8x1x1xf32> +// CHECK: tensor.parallel_insert_slice %[[MMA]] into %[[ACC]][0, 0, 0, %[[IDY]], %[[IDX]]] [2, 2, 8, 1, 1] +// CHECK: mapping = [#iree_gpu.lane_id<0>] + +// ----- + +#contraction_accesses = [ + affine_map<() -> ()>, + affine_map<() -> ()>, + affine_map<() -> ()> +] +func.func @distribute_MFMA_F32_16x16x4_F32(%lhs: tensor<16x4xf32>, %rhs: tensor<4x16xf32>, %acc: tensor<16x16xf32>) -> tensor<16x16xf32> { + %0 = iree_gpu.multi_mma %lhs, %rhs, %acc { + indexing_maps = #contraction_accesses, + iterator_types = [], + kind = #iree_gpu.mma_layout + } : tensor<16x4xf32>, tensor<4x16xf32> into tensor<16x16xf32> + return %0 : tensor<16x16xf32> +} + +// CHECK-DAG: #[[$XMAP:.+]] = affine_map<(d0) -> (d0 mod 16)> +// CHECK-DAG: #[[$YMAP:.+]] = affine_map<(d0) -> ((d0 floordiv 16) mod 4)> +// CHECK-DAG: #[[$ZMAP:.+]] = affine_map<(d0) -> ((d0 floordiv 16) * 4 - ((d0 floordiv 16) floordiv 4) * 16)> + +// CHECK-LABEL: func @distribute_MFMA_F32_16x16x4_F32 +// CHECK-SAME: %[[LHS:[A-Za-z0-9]+]]: tensor<16x4xf32> +// CHECK-SAME: %[[RHS:[A-Za-z0-9]+]]: tensor<4x16xf32> +// CHECK: scf.forall (%[[LANEID:.+]]) in (64) shared_outs(%[[ACC:.+]] = {{.*}}) -> (tensor<16x16xf32>) +// CHECK-DAG: %[[IDX:.+]] = affine.apply #[[$XMAP]](%[[LANEID]]) +// CHECK-DAG: %[[IDY:.+]] = affine.apply #[[$YMAP]](%[[LANEID]]) +// CHECK-DAG: %[[LHS_SLICE:.+]] = tensor.extract_slice %[[LHS]][%[[IDX]], %[[IDY]]] [1, 1] +// CHECK-DAG: %[[RHS_SLICE:.+]] = tensor.extract_slice %[[RHS]][%[[IDY]], %[[IDX]]] [1, 1] +// CHECK-DAG: %[[IDZ:.+]] = affine.apply #[[$ZMAP]](%[[LANEID]]) +// CHECK-DAG: %[[ACC_SLICE:.+]] = tensor.extract_slice %[[ACC]][%[[IDZ]], %[[IDX]]] [4, 1] +// CHECK: %[[MMA:.+]] = iree_gpu.multi_mma %[[LHS_SLICE]], %[[RHS_SLICE]], %[[ACC_SLICE]] +// CHECK-SAME: kind = #iree_gpu.mma_layout +// CHECK-SAME: : tensor<1x1xf32>, tensor<1x1xf32> into tensor<4x1xf32> +// CHECK: tensor.parallel_insert_slice %[[MMA]] into %[[ACC]][%[[IDZ]], %[[IDX]]] [4, 1] +// CHECK: mapping = [#iree_gpu.lane_id<0>] + +// ----- + +#contraction_accesses = [ + affine_map<() -> ()>, + affine_map<() -> ()>, + affine_map<() -> ()> +] +func.func @distribute_F32_16x16x32_F8E4M3FNUZ(%lhs: tensor<16x32xf8E4M3FNUZ>, %rhs: tensor<32x16xf8E4M3FNUZ>, %acc: tensor<16x16xf32>) -> tensor<16x16xf32> { + %0 = iree_gpu.multi_mma %lhs, %rhs, %acc { + indexing_maps = #contraction_accesses, + iterator_types = [], + kind = #iree_gpu.mma_layout + } : tensor<16x32xf8E4M3FNUZ>, tensor<32x16xf8E4M3FNUZ> into tensor<16x16xf32> + return %0 : tensor<16x16xf32> +} + +// CHECK-DAG: #[[$XMAP:.+]] = affine_map<(d0) -> (d0 mod 16)> +// CHECK-DAG: #[[$YMAP:.+]] = affine_map<(d0) -> ((d0 floordiv 16) * 8 - ((d0 floordiv 16) floordiv 4) * 32)> +// CHECK-DAG: #[[$ZMAP:.+]] = affine_map<(d0) -> ((d0 floordiv 16) * 4 - ((d0 floordiv 16) floordiv 4) * 16)> + +// CHECK-LABEL: func @distribute_F32_16x16x32_F8E4M3FNUZ +// CHECK-SAME: %[[LHS:[A-Za-z0-9]+]]: tensor<16x32xf8E4M3FNUZ> +// CHECK-SAME: %[[RHS:[A-Za-z0-9]+]]: tensor<32x16xf8E4M3FNUZ> +// CHECK: scf.forall (%[[LANEID:.+]]) in (64) shared_outs(%[[ACC:.+]] = {{.*}}) -> (tensor<16x16xf32>) +// CHECK-DAG: %[[IDX:.+]] = affine.apply #[[$XMAP]](%[[LANEID]]) +// CHECK-DAG: %[[IDY:.+]] = affine.apply #[[$YMAP]](%[[LANEID]]) +// CHECK-DAG: %[[LHS_SLICE:.+]] = tensor.extract_slice %[[LHS]][%[[IDX]], %[[IDY]]] [1, 8] +// CHECK-DAG: %[[RHS_SLICE:.+]] = tensor.extract_slice %[[RHS]][%[[IDY]], %[[IDX]]] [8, 1] +// CHECK-DAG: 
%[[IDZ:.+]] = affine.apply #[[$ZMAP]](%[[LANEID]]) +// CHECK-DAG: %[[ACC_SLICE:.+]] = tensor.extract_slice %[[ACC]][%[[IDZ]], %[[IDX]]] [4, 1] +// CHECK: %[[MMA:.+]] = iree_gpu.multi_mma %[[LHS_SLICE]], %[[RHS_SLICE]], %[[ACC_SLICE]] +// CHECK-SAME: kind = #iree_gpu.mma_layout +// CHECK-SAME: : tensor<1x8xf8E4M3FNUZ>, tensor<8x1xf8E4M3FNUZ> into tensor<4x1xf32> +// CHECK: tensor.parallel_insert_slice %[[MMA]] into %[[ACC]][%[[IDZ]], %[[IDX]]] [4, 1] +// CHECK: mapping = [#iree_gpu.lane_id<0>] + +// ----- + +#contraction_accesses = [ + affine_map<() -> ()>, + affine_map<() -> ()>, + affine_map<() -> ()> +] +func.func @distribute_I32_32x32x16_I8(%lhs: tensor<32x16xi8>, %rhs: tensor<16x32xi8>, %acc: tensor<4x8x32xi32>) -> tensor<4x8x32xi32> { + %0 = iree_gpu.multi_mma %lhs, %rhs, %acc { + indexing_maps = #contraction_accesses, + iterator_types = [], + kind = #iree_gpu.mma_layout + } : tensor<32x16xi8>, tensor<16x32xi8> into tensor<4x8x32xi32> + return %0 : tensor<4x8x32xi32> +} + +// CHECK-DAG: #[[$XMAP:.+]] = affine_map<(d0) -> (d0 mod 32)> +// CHECK-DAG: #[[$YMAP:.+]] = affine_map<(d0) -> ((d0 floordiv 32) * 8 - ((d0 floordiv 32) floordiv 2) * 16)> +// CHECK-DAG: #[[$ZMAP:.+]] = affine_map<(d0) -> ((d0 floordiv 32) * 4 - ((d0 floordiv 32) floordiv 2) * 8)> + +// CHECK-LABEL: func @distribute_I32_32x32x16_I8 +// CHECK-SAME: %[[LHS:[A-Za-z0-9]+]]: tensor<32x16xi8> +// CHECK-SAME: %[[RHS:[A-Za-z0-9]+]]: tensor<16x32xi8> +// CHECK: scf.forall (%[[LANEID:.+]]) in (64) shared_outs(%[[ACC:.+]] = {{.*}}) -> (tensor<4x8x32xi32>) +// CHECK-DAG: %[[IDX:.+]] = affine.apply #[[$XMAP]](%[[LANEID]]) +// CHECK-DAG: %[[IDY:.+]] = affine.apply #[[$YMAP]](%[[LANEID]]) +// CHECK-DAG: %[[LHS_SLICE:.+]] = tensor.extract_slice %[[LHS]][%[[IDX]], %[[IDY]]] [1, 8] +// CHECK-DAG: %[[RHS_SLICE:.+]] = tensor.extract_slice %[[RHS]][%[[IDY]], %[[IDX]]] [8, 1] +// CHECK-DAG: %[[IDZ:.+]] = affine.apply #[[$ZMAP]](%[[LANEID]]) +// CHECK-DAG: %[[ACC_SLICE:.+]] = tensor.extract_slice %[[ACC]][0, %[[IDZ]], %[[IDX]]] [4, 4, 1] +// CHECK: %[[MMA:.+]] = iree_gpu.multi_mma %[[LHS_SLICE]], %[[RHS_SLICE]], %[[ACC_SLICE]] +// CHECK-SAME: kind = #iree_gpu.mma_layout +// CHECK-SAME: : tensor<1x8xi8>, tensor<8x1xi8> into tensor<4x4x1xi32> +// CHECK: tensor.parallel_insert_slice %[[MMA]] into %[[ACC]][0, %[[IDZ]], %[[IDX]]] [4, 4, 1] +// CHECK: mapping = [#iree_gpu.lane_id<0>] + +// ----- + +#contraction_accesses = [ + affine_map<() -> ()>, + affine_map<() -> ()>, + affine_map<() -> ()> +] +func.func @distribute_WMMA_F16_16x16x16_F16(%lhs: tensor<16x16xf16>, %rhs: tensor<16x16xf16>, %acc: tensor<8x2x16xf16>) -> tensor<8x2x16xf16> { + %0 = iree_gpu.multi_mma %lhs, %rhs, %acc { + indexing_maps = #contraction_accesses, + iterator_types = [], + kind = #iree_gpu.mma_layout + } : tensor<16x16xf16>, tensor<16x16xf16> into tensor<8x2x16xf16> + return %0 : tensor<8x2x16xf16> +} + +// CHECK-DAG: #[[$XMAP:.+]] = affine_map<(d0) -> (d0 mod 16)> +// CHECK-DAG: #[[$YMAP:.+]] = affine_map<(d0) -> ((d0 floordiv 16) mod 2)> + +// CHECK-LABEL: func @distribute_WMMA_F16_16x16x16_F16 +// CHECK-SAME: %[[LHS:[A-Za-z0-9]+]]: tensor<16x16xf16> +// CHECK-SAME: %[[RHS:[A-Za-z0-9]+]]: tensor<16x16xf16> +// CHECK: scf.forall (%[[LANEID:.+]]) in (32) shared_outs(%[[ACC:.+]] = {{.*}}) -> (tensor<8x2x16xf16>) +// CHECK-DAG: %[[IDX:.+]] = affine.apply #[[$XMAP]](%[[LANEID]]) +// CHECK-DAG: %[[LHS_SLICE:.+]] = tensor.extract_slice %[[LHS]][%[[IDX]], 0] [1, 16] +// CHECK-DAG: %[[RHS_SLICE:.+]] = tensor.extract_slice %[[RHS]][0, %[[IDX]]] [16, 1] +// CHECK-DAG: 
%[[IDY:.+]] = affine.apply #[[$YMAP]](%[[LANEID]]) +// CHECK-DAG: %[[ACC_SLICE:.+]] = tensor.extract_slice %[[ACC]][0, %[[IDY]], %[[IDX]]] [8, 1, 1] +// CHECK: %[[MMA:.+]] = iree_gpu.multi_mma %[[LHS_SLICE]], %[[RHS_SLICE]], %[[ACC_SLICE]] +// CHECK-SAME: kind = #iree_gpu.mma_layout +// CHECK-SAME: : tensor<1x16xf16>, tensor<16x1xf16> into tensor<8x1x1xf16> +// CHECK: tensor.parallel_insert_slice %[[MMA]] into %[[ACC]][0, %[[IDY]], %[[IDX]]] [8, 1, 1] +// CHECK: mapping = [#iree_gpu.lane_id<0>] diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.cpp index 8b74d1b4a3d4..68d1a4f6177c 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.cpp +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.cpp @@ -306,6 +306,14 @@ void addGPUTileAndFusePassPipeline(OpPassManager &funcPassManager) { funcPassManager.addPass(createGPUPromoteMatmulOperandsPass()); funcPassManager.addPass(IREE::GPU::createPackToIntrinsicsPass()); + // Step 1.5. Expand result shapes of MultiMmaOps before reduction tiling. + { + IREE::GPU::ConcretizeMmaShapesPassOptions options; + options.concretizeInputs = false; + options.concretizeResult = true; + funcPassManager.addPass(IREE::GPU::createConcretizeMmaShapesPass()); + } + // Step 2. Tile and fuse tileable ops to reduction loops. { GPUApplyTilingLevelPassOptions options; @@ -315,16 +323,26 @@ void addGPUTileAndFusePassPipeline(OpPassManager &funcPassManager) { funcPassManager.addPass(createCSEPass()); } - // Decompose pack and unpack ops and propagte the resulting reshapes. + // Step 3. Decompose pack and unpack ops and propagate the resulting reshapes. funcPassManager.addPass( createDecomposePackUnPackOpsPass(/*tileOuterToOne=*/false)); + + // Step 3.5. Expand the inner dimensions of MultiMma ops in preparation for + // distribution to lanes. + { + IREE::GPU::ConcretizeMmaShapesPassOptions options; + options.concretizeInputs = true; + options.concretizeResult = false; + funcPassManager.addPass(IREE::GPU::createConcretizeMmaShapesPass()); + } + funcPassManager.addPass(createPropagateReshapesByExpansionPass()); funcPassManager.addPass(createCanonicalizerPass()); funcPassManager.addPass(createCSEPass()); funcPassManager.addPass(createConvertToDestinationPassingStylePass( /*useWARForCooperativeMatrixCodegen=*/false)); - // Step 3. Tile and fuse tileable ops to subgroups/threads. + // Step 4. Tile and fuse tileable ops to subgroups/threads. { GPUApplyTilingLevelPassOptions options; options.tilingLevel = IREE::GPU::TilingLevel::Thread; @@ -347,35 +365,35 @@ void addGPUTileAndFusePassPipeline(OpPassManager &funcPassManager) { funcPassManager.addPass(createCSEPass()); funcPassManager.addPass(createLoopInvariantCodeMotionPass()); - // Step 4. Greedily fuse parallel loops and hoist from serial loops. + // Step 5. Greedily fuse parallel loops and hoist from serial loops. funcPassManager.addPass(IREE::GPU::createFuseAndHoistParallelLoopsPass()); funcPassManager.addPass(createCanonicalizerPass()); funcPassManager.addPass(createCSEPass()); funcPassManager.addPass(createLoopInvariantCodeMotionPass()); - // Step 5. Lower special ops and vectorize. + // Step 6. Lower special ops and vectorize. funcPassManager.addPass(IREE::GPU::createVectorizeIREEGPUOpsPass()); addGPUVectorizationPasses(funcPassManager); funcPassManager.addPass(createCleanupBufferAllocViewPass()); - // Step 6. Bufferize. + // Step 7. Bufferize. 
// TODO: This is a workaround for a bug in the lowering of // `iree_gpu.shuffle_tensor` which does not properly represent the concurrent // nature of the write to the intermediate tensor. addBufferizePasses(funcPassManager, /*allowPrivateAllocations=*/false); - // Step 7. Resolve remaining parallel loops. + // Step 8. Resolve remaining parallel loops. funcPassManager.addPass(createGPUDistributePass()); - // Vectorize copies that came out of vectorization. + // Vectorize copies that came out of bufferization. funcPassManager.addPass(createVectorizeMemrefCopyPass()); - // Step 7. Unroll operations to native intrinsic widths. + // Step 8. Unroll operations to native intrinsic widths. funcPassManager.addPass(IREE::GPU::createUnrollToIntrinsicsPass()); funcPassManager.addPass(createCanonicalizerPass()); funcPassManager.addPass(createCSEPass()); - // Step 8. Remaining post-bufferization optimizations/lowerings. + // Step 9. Remaining post-bufferization optimizations/lowerings. funcPassManager.addPass(IREE::GPU::createLowerIREEGPUOpsPass()); funcPassManager.addPass(createLoopInvariantCodeMotionPass()); funcPassManager.addPass(memref::createFoldMemRefAliasOpsPass()); diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_tile_and_fuse.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_tile_and_fuse.mlir index a0e1ce623ea9..edf78ea66848 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_tile_and_fuse.mlir +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_tile_and_fuse.mlir @@ -214,3 +214,287 @@ hal.executable private @main { // CHECK: %[[LOOP_T:.+]] = vector.transpose %[[LOOP]], [0, 1, 3, 2, 4] : vector<1x2x2x4x1xf32> to vector<1x2x4x2x1xf32> // CHECK: %[[EXTRACT:.+]] = vector.extract %[[LOOP_T]][0] : vector<2x4x2x1xf32> from vector<1x2x4x2x1xf32> // CHECK: vector.transfer_write %[[EXTRACT]], %[[B2]] + +// ----- + +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer>, + #hal.descriptor_set.binding<2, storage_buffer> + ]> +]> +#config = #iree_gpu.lowering_config<{ + workgroup = [64, 64, 0], + reduction = [0, 0, 2], + subgroup = [2, 2], + mma_kind = #iree_gpu.mma_layout}> +hal.executable public @main { + hal.executable.variant public @rocm_hsaco_fb target(<"rocm", "rocm-hsaco-fb">) { + hal.executable.export public @matmul_transpose_b_wmma ordinal(0) layout(#pipeline_layout) { + ^bb0(%arg0: !hal.device): + %x, %y, %z = flow.dispatch.workgroup_count_from_slice + hal.return %x, %y, %z : index, index, index + } + builtin.module { + func.func @matmul_transpose_b_wmma() + attributes {translation_info = #iree_codegen.translation_info} { + %cst = arith.constant 0.000000e+00 : f16 + %c0 = arith.constant 0 : index + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 1280], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<2048x1280xf16> + %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [10240, 1280], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<10240x1280xf16> + %5 = tensor.empty() : tensor<2048x10240xf32> + %6 = linalg.fill ins(%cst : 
f16) outs(%5 : tensor<2048x10240xf32>) -> tensor<2048x10240xf32> + %7 = linalg.matmul_transpose_b {lowering_config = #config} + ins(%3, %4 : tensor<2048x1280xf16>, tensor<10240x1280xf16>) + outs(%6 : tensor<2048x10240xf32>) -> tensor<2048x10240xf32> + flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [2048, 10240], strides = [1, 1] : tensor<2048x10240xf32> -> !flow.dispatch.tensor> + return + } + } + } +} + +// CHECK-LABEL: func @matmul_transpose_b_wmma +// CHECK-DAG: %[[B0:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(0) +// CHECK-DAG: %[[B1:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(1) +// CHECK-DAG: %[[B2:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(2) +// CHECK-DAG: memref.alloc() : memref<64x32xf16, #gpu.address_space> +// CHECK-DAG: memref.alloc() : memref<64x32xf16, #gpu.address_space> +// CHECK: %[[LOOP:.+]] = scf.for %[[IV:.+]] = %c0 to %c80 step %c2 {{.*}} -> (vector<2x2x8x1x1xf32>) +// CHECK: gpu.barrier +// CHECK: %[[LHS_RD:.+]] = vector.transfer_read %[[B0]]{{.*}} vector<2x8xf16> +// CHECK: vector.transfer_write %[[LHS_RD]] +// CHECK: gpu.barrier +// CHECK: %[[LHS_MM:.+]] = vector.transfer_read {{.*}} vector<2x1x2x16xf16> +// CHECK: gpu.barrier +// CHECK: vector.transpose %[[LHS_MM]], [0, 2, 1, 3] : vector<2x1x2x16xf16> +// CHECK: %[[RHS_RD:.+]] = vector.transfer_read %[[B1]]{{.*}} vector<2x8xf16> +// CHECK: vector.transfer_write %[[RHS_RD]] +// CHECK: gpu.barrier +// CHECK: %[[RHS_MM:.+]] = vector.transfer_read {{.*}} vector<2x1x2x16xf16> +// CHECK: gpu.barrier +// CHECK: vector.transpose %[[RHS_MM]], [0, 2, 1, 3] : vector<2x1x2x16xf16> +// CHECK-COUNT-8: amdgpu.wmma {{.*}} : vector<16xf16>, vector<16xf16>, vector<8xf32> +// CHECK: scf.yield +// CHECK: %[[LOOP_T:.+]] = vector.transpose %[[LOOP]], [0, 2, 3, 1, 4] : vector<2x2x8x1x1xf32> to vector<2x8x1x2x1xf32> +// CHECK: vector.transfer_write %[[LOOP_T]], %[[B2]] + +// ----- + +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer>, + #hal.descriptor_set.binding<2, storage_buffer> + ]> +]> +#config = #iree_gpu.lowering_config<{ + workgroup = [64, 64, 0], + reduction = [0, 0, 2], + subgroup = [2, 2], + mma_kind = #iree_gpu.mma_layout}> + +!eltype = f32 +!aeltype = f32 + +hal.executable public @main { + hal.executable.variant public @rocm_hsaco_fb target(<"rocm", "rocm-hsaco-fb">) { + hal.executable.export public @matmul_transpose_b_mfma_16x16x4 ordinal(0) layout(#pipeline_layout) { + ^bb0(%arg0: !hal.device): + %x, %y, %z = flow.dispatch.workgroup_count_from_slice + hal.return %x, %y, %z : index, index, index + } + builtin.module { + func.func @matmul_transpose_b_mfma_16x16x4() + attributes {translation_info = #iree_codegen.translation_info} { + %cst = arith.constant 0.000000e+00 : f16 + %c0 = arith.constant 0 : index + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 1280], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<2048x1280x!eltype> + %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [10240, 1280], strides = [1, 1] : !flow.dispatch.tensor> -> 
tensor<10240x1280x!eltype> + %5 = tensor.empty() : tensor<2048x10240x!aeltype> + %6 = linalg.fill ins(%cst : f16) outs(%5 : tensor<2048x10240x!aeltype>) -> tensor<2048x10240x!aeltype> + %7 = linalg.matmul_transpose_b {lowering_config = #config} + ins(%3, %4 : tensor<2048x1280x!eltype>, tensor<10240x1280x!eltype>) + outs(%6 : tensor<2048x10240x!aeltype>) -> tensor<2048x10240x!aeltype> + flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [2048, 10240], strides = [1, 1] : tensor<2048x10240x!aeltype> -> !flow.dispatch.tensor> + return + } + } + } +} + +// CHECK-LABEL: func @matmul_transpose_b_mfma_16x16x4 +// CHECK-DAG: memref.alloc() : memref<64x8xf32, #gpu.address_space> +// CHECK-DAG: memref.alloc() : memref<64x8xf32, #gpu.address_space> +// CHECK: scf.for %{{.*}} = %c0 to %c320 step %c2 {{.*}} -> (vector<2x2x4x1xf32>) +// CHECK-COUNT-8: amdgpu.mfma {{.*}}blocks = 1 : i32, k = 4 : i32, m = 16 : i32, n = 16 : i32 +// CHECK: scf.yield + +// ----- + +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer>, + #hal.descriptor_set.binding<2, storage_buffer> + ]> +]> +#config = #iree_gpu.lowering_config<{ + workgroup = [64, 64, 0], + reduction = [0, 0, 2], + subgroup = [2, 2], + mma_kind = #iree_gpu.mma_layout}> + +!eltype = f8E4M3FNUZ +!aeltype = f32 + +hal.executable public @main { + hal.executable.variant public @rocm_hsaco_fb target(<"rocm", "rocm-hsaco-fb">) { + hal.executable.export public @matmul_transpose_b_mfma_16x16x32_f8 ordinal(0) layout(#pipeline_layout) { + ^bb0(%arg0: !hal.device): + %x, %y, %z = flow.dispatch.workgroup_count_from_slice + hal.return %x, %y, %z : index, index, index + } + builtin.module { + func.func @matmul_transpose_b_mfma_16x16x32_f8() + attributes {translation_info = #iree_codegen.translation_info} { + %cst = arith.constant 0.000000e+00 : f16 + %c0 = arith.constant 0 : index + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 1280], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<2048x1280x!eltype> + %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [10240, 1280], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<10240x1280x!eltype> + %5 = tensor.empty() : tensor<2048x10240x!aeltype> + %6 = linalg.fill ins(%cst : f16) outs(%5 : tensor<2048x10240x!aeltype>) -> tensor<2048x10240x!aeltype> + %7 = linalg.matmul_transpose_b {lowering_config = #config} + ins(%3, %4 : tensor<2048x1280x!eltype>, tensor<10240x1280x!eltype>) + outs(%6 : tensor<2048x10240x!aeltype>) -> tensor<2048x10240x!aeltype> + flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [2048, 10240], strides = [1, 1] : tensor<2048x10240x!aeltype> -> !flow.dispatch.tensor> + return + } + } + } +} + +// CHECK-LABEL: func @matmul_transpose_b_mfma_16x16x32_f8 +// CHECK-DAG: memref.alloc() : memref<64x64xf8E4M3FNUZ, #gpu.address_space> +// CHECK-DAG: memref.alloc() : memref<64x64xf8E4M3FNUZ, #gpu.address_space> +// CHECK: scf.for %{{.*}} = %c0 to %c40 step %c2 {{.*}} -> (vector<2x2x4x1xf32>) +// CHECK-COUNT-8: amdgpu.mfma {{.*}}blocks = 1 : i32, k = 32 : i32, m = 16 : i32, n = 16 : i32 +// CHECK: scf.yield + +// 
----- + +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer>, + #hal.descriptor_set.binding<2, storage_buffer> + ]> +]> +#config = #iree_gpu.lowering_config<{ + workgroup = [64, 64, 0], + reduction = [0, 0, 2], + subgroup = [2, 2], + mma_kind = #iree_gpu.mma_layout}> + +!eltype = i8 +!aeltype = i32 + +hal.executable public @main { + hal.executable.variant public @rocm_hsaco_fb target(<"rocm", "rocm-hsaco-fb">) { + hal.executable.export public @matmul_transpose_b_mfma_32x32x16_i8 ordinal(0) layout(#pipeline_layout) { + ^bb0(%arg0: !hal.device): + %x, %y, %z = flow.dispatch.workgroup_count_from_slice + hal.return %x, %y, %z : index, index, index + } + builtin.module { + func.func @matmul_transpose_b_mfma_32x32x16_i8() + attributes {translation_info = #iree_codegen.translation_info} { + %cst = arith.constant 0.000000e+00 : f16 + %c0 = arith.constant 0 : index + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 1280], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<2048x1280x!eltype> + %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [10240, 1280], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<10240x1280x!eltype> + %5 = tensor.empty() : tensor<2048x10240x!aeltype> + %6 = linalg.fill ins(%cst : f16) outs(%5 : tensor<2048x10240x!aeltype>) -> tensor<2048x10240x!aeltype> + %7 = linalg.matmul_transpose_b {lowering_config = #config} + ins(%3, %4 : tensor<2048x1280x!eltype>, tensor<10240x1280x!eltype>) + outs(%6 : tensor<2048x10240x!aeltype>) -> tensor<2048x10240x!aeltype> + flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [2048, 10240], strides = [1, 1] : tensor<2048x10240x!aeltype> -> !flow.dispatch.tensor> + return + } + } + } +} + +// CHECK-LABEL: func @matmul_transpose_b_mfma_32x32x16_i8 +// CHECK-DAG: memref.alloc() : memref<64x32xi8, #gpu.address_space> +// CHECK-DAG: memref.alloc() : memref<64x32xi8, #gpu.address_space> +// CHECK: scf.for %{{.*}} = %c0 to %c80 step %c2 {{.*}} -> (vector<2x2x4x4x1xi32>) +// CHECK-COUNT-8: amdgpu.mfma {{.*}}blocks = 1 : i32, k = 16 : i32, m = 32 : i32, n = 32 : i32 +// CHECK: scf.yield + +// ----- + +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer>, + #hal.descriptor_set.binding<2, storage_buffer> + ]> +]> +#config = #iree_gpu.lowering_config<{ + workgroup = [64, 64, 0], + reduction = [0, 0, 2], + subgroup = [2, 2], + mma_kind = #iree_gpu.mma_layout}> + +!eltype = f16 +!aeltype = f16 + +hal.executable public @main { + hal.executable.variant public @rocm_hsaco_fb target(<"rocm", "rocm-hsaco-fb">) { + hal.executable.export public @matmul_transpose_b_wmma_f16_16x16x16_f16 ordinal(0) layout(#pipeline_layout) { + ^bb0(%arg0: !hal.device): + %x, %y, %z = flow.dispatch.workgroup_count_from_slice + hal.return %x, %y, %z : index, index, index + } + builtin.module { + func.func @matmul_transpose_b_wmma_f16_16x16x16_f16() + attributes {translation_info = #iree_codegen.translation_info} { + %cst = arith.constant 0.000000e+00 : f16 + %c0 = arith.constant 0 : index + %0 = hal.interface.binding.subspan layout(#pipeline_layout) 
set(0) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 1280], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<2048x1280x!eltype> + %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [10240, 1280], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<10240x1280x!eltype> + %5 = tensor.empty() : tensor<2048x10240x!aeltype> + %6 = linalg.fill ins(%cst : f16) outs(%5 : tensor<2048x10240x!aeltype>) -> tensor<2048x10240x!aeltype> + %7 = linalg.matmul_transpose_b {lowering_config = #config} + ins(%3, %4 : tensor<2048x1280x!eltype>, tensor<10240x1280x!eltype>) + outs(%6 : tensor<2048x10240x!aeltype>) -> tensor<2048x10240x!aeltype> + flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [2048, 10240], strides = [1, 1] : tensor<2048x10240x!aeltype> -> !flow.dispatch.tensor> + return + } + } + } +} + +// CHECK-LABEL: func @matmul_transpose_b_wmma_f16_16x16x16_f16 +// CHECK-DAG: memref.alloc() : memref<64x32xf16, #gpu.address_space> +// CHECK-DAG: memref.alloc() : memref<64x32xf16, #gpu.address_space> +// CHECK: scf.for %{{.*}} = %c0 to %c80 step %c2 {{.*}} -> (vector<2x2x8x1x1xf16>) +// CHECK-COUNT-8: amdgpu.wmma {{.*}} : vector<16xf16>, vector<16xf16>, vector<8xf16> +// CHECK: scf.yield
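As a closing illustration of why accumulator concretization matters for WMMA: for WMMA_F16_16x16x16_F16 the C fragment has outer = [8, 1], so after expanding the accumulator to tensor<8x2x16xf16> each lane of the 32-wide subgroup owns the eight outer elements of one (row-half, column) pair, which lines up with the vector<8xf16> accumulator operand of the amdgpu.wmma ops checked above. This is a minimal sketch with hypothetical %acc and %lane_id values, mirroring the distribute_mma_to_lanes test earlier in this patch.

%col = affine.apply affine_map<(d0) -> (d0 mod 16)>(%lane_id)
%row_half = affine.apply affine_map<(d0) -> ((d0 floordiv 16) mod 2)>(%lane_id)
%acc_slice = tensor.extract_slice %acc[0, %row_half, %col] [8, 1, 1] [1, 1, 1]
    : tensor<8x2x16xf16> to tensor<8x1x1xf16>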