From a5537bc683b8476d12917ced00f3540859894de1 Mon Sep 17 00:00:00 2001 From: Stanley Winata <68087699+raikonenfnu@users.noreply.github.com> Date: Mon, 4 Nov 2024 10:30:53 -0800 Subject: [PATCH] [LLVMGPU] Teach KernelConfig to set MMA schedules per op in LoweringConfig (#18984) The main motivation for this change is to enable different intrinsics/layout on different ops inside the same function/dispatch, especially for attention. To that extent, we move the scheduling MMA information such as mma_intrinsic, subgroup_m_count, and subgroup_n_count, from the translation info attached to the function onto the lowering_config per op. Here is a quick summary of things we needed to do to achieve that: 1. Introduce setMmaKind, set/get subgroupMCount, and set/get subgroupNCount on IREE::GPU::LoweringConfigAttr 2. Move configuring of QK matmul's schedule into KernelConfig from LLVMGPUConfigureTensorLayout. 3. Now that qk and pv may have different intrinsic, update information used to decide transposeIntrinsic and reuseIntrinsic in LLVMGPUConfigureTensorLayout 4. Update a bunch of tests to use lowering config to configure MMAs now. 
--------- Signed-off-by: Stanley Winata --- .../Codegen/Dialect/GPU/IR/IREEGPUAttrs.cpp | 45 ++++++++++ .../Codegen/Dialect/GPU/IR/IREEGPUAttrs.td | 17 +++- .../compiler/Codegen/LLVMGPU/KernelConfig.cpp | 54 +++++++----- .../LLVMGPU/LLVMGPUConfigureTensorLayouts.cpp | 83 ++++++++++++------- .../ROCDL/config_user_vector_distribute.mlir | 20 ++--- .../config_vector_distribute_gfx1100.mlir | 8 +- .../config_vector_distribute_gfx940.mlir | 42 +++++----- .../pipeline_vector_distribute_gfx1100.mlir | 8 +- .../pipeline_vector_distribute_gfx940.mlir | 70 ++++++++-------- .../LLVMGPU/test/config_custom_op.mlir | 3 +- .../LLVMGPU/test/configure_tensor_layout.mlir | 42 ++++------ tests/e2e/matmul/generate_e2e_matmul_tests.py | 9 +- 12 files changed, 244 insertions(+), 157 deletions(-) diff --git a/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.cpp b/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.cpp index dfa6745a926e..0bbb8d1f4218 100644 --- a/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.cpp +++ b/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.cpp @@ -1618,6 +1618,51 @@ IREE::GPU::MmaInterfaceAttr LoweringConfigAttr::getMmaKind() const { return getAttributes().getAs(kMmaKindName); } +void LoweringConfigAttr::setMmaKind(MLIRContext *context, + SmallVectorImpl &attrs, + IREE::GPU::MmaInterfaceAttr kind) { + attrs.emplace_back(StringAttr::get(context, kMmaKindName), kind); +} + +// TODO: Merge subgroup counts functionality into subgroup tiling level +// lowering, when we have it implemented. 
+constexpr StringLiteral kSubgroupMCountName = "subgroup_m_count"; +constexpr StringLiteral kSubgroupNCountName = "subgroup_n_count"; + +std::optional LoweringConfigAttr::getSubgroupMCount() const { + auto subgroup_m_count_attr = + getAttributes().getAs(kSubgroupMCountName); + if (!subgroup_m_count_attr) { + return std::nullopt; + } + return subgroup_m_count_attr.getInt(); +} + +std::optional LoweringConfigAttr::getSubgroupNCount() const { + auto subgroup_n_count_attr = + getAttributes().getAs(kSubgroupNCountName); + if (!subgroup_n_count_attr) { + return std::nullopt; + } + return subgroup_n_count_attr.getInt(); +} + +void LoweringConfigAttr::setSubgroupMCount( + MLIRContext *context, SmallVectorImpl &attrs, + int64_t subgroup_m_count) { + attrs.emplace_back( + StringAttr::get(context, kSubgroupMCountName), + IntegerAttr::get(IntegerType::get(context, 64), subgroup_m_count)); +} + +void LoweringConfigAttr::setSubgroupNCount( + MLIRContext *context, SmallVectorImpl &attrs, + int64_t subgroup_n_count) { + attrs.emplace_back( + StringAttr::get(context, kSubgroupNCountName), + IntegerAttr::get(IntegerType::get(context, 64), subgroup_n_count)); +} + constexpr StringLiteral kPromoteOperandsName = "promote_operands"; std::optional> diff --git a/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.td b/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.td index 411d3827888d..ee4cb932ded4 100644 --- a/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.td +++ b/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.td @@ -57,8 +57,23 @@ def IREEGPU_LoweringConfigAttr : "The configured fields, including tiling levels">:$attributes ); let extraClassDeclaration = [{ - /// Helper to retrieve a target mma intrinsic if present. + /// Helper to retrieve/set a target mma intrinsic. 
::mlir::iree_compiler::IREE::GPU::MmaInterfaceAttr getMmaKind() const; + static void setMmaKind(MLIRContext *context, + SmallVectorImpl &attrs, + ::mlir::iree_compiler::IREE::GPU::MmaInterfaceAttr kind); + + // TODO: Merge subgroup counts functionality into subgroup tiling level + // lowering, when we have it implemented. + /// Helper to retrieve/set a target subgroup M/N counts. + std::optional getSubgroupMCount() const; + std::optional getSubgroupNCount() const; + static void setSubgroupMCount(MLIRContext *context, + SmallVectorImpl &attrs, + int64_t subgroup_m_count); + static void setSubgroupNCount(MLIRContext *context, + SmallVectorImpl &attrs, + int64_t subgroup_n_count); /// Helper to retrieve/set a list of operand indices to promote. std::optional> getPromotedOperandList() const; diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp index e8926eb6cb4d..6271fe458806 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp @@ -416,18 +416,17 @@ setConvolutionVectorDistributionConfig(IREE::GPU::TargetAttr target, attrs.emplace_back(StringAttr::get(context, "reduction"), b.getI64ArrayAttr(reductionTileSizes)); IREE::GPU::LoweringConfigAttr::setPromotedOperandList(context, attrs, {0, 1}); + IREE::GPU::LoweringConfigAttr::setMmaKind(context, attrs, + mmaAttrs[schedule->index]); + IREE::GPU::LoweringConfigAttr::setSubgroupMCount( + context, attrs, schedule->mSubgroupCounts[0]); + IREE::GPU::LoweringConfigAttr::setSubgroupNCount( + context, attrs, schedule->nSubgroupCounts[0]); auto configDict = DictionaryAttr::get(context, attrs); auto loweringConfig = IREE::GPU::LoweringConfigAttr::get(context, configDict); - // Attach the MMA schedule as an attribute to the entry point export function - // for later access in the pipeline. 
SmallVector pipelineAttrs; - auto scheduleAttr = IREE::GPU::MMAScheduleAttr::get( - context, mmaAttrs[schedule->index], schedule->mSubgroupCounts[0], - schedule->nSubgroupCounts[0]); - pipelineAttrs.emplace_back(StringAttr::get(context, "mma_schedule"), - scheduleAttr); // Prefetch shared memory if requested. if (clLLVMGPUEnablePrefetch) { @@ -682,6 +681,12 @@ setMatmulVectorDistributionConfig(IREE::GPU::TargetAttr target, attrs.emplace_back(StringAttr::get(context, "reduction"), b.getI64ArrayAttr(reductionTileSizes)); IREE::GPU::LoweringConfigAttr::setPromotedOperandList(context, attrs, {0, 1}); + IREE::GPU::LoweringConfigAttr::setMmaKind(context, attrs, + mmaAttrs[schedule->index]); + IREE::GPU::LoweringConfigAttr::setSubgroupMCount( + context, attrs, schedule->mSubgroupCounts[0]); + IREE::GPU::LoweringConfigAttr::setSubgroupNCount( + context, attrs, schedule->nSubgroupCounts[0]); auto configDict = DictionaryAttr::get(context, attrs); auto loweringConfig = IREE::GPU::LoweringConfigAttr::get(context, configDict); @@ -689,11 +694,6 @@ setMatmulVectorDistributionConfig(IREE::GPU::TargetAttr target, // Attach the MMA schedule as an attribute to the entry point export function // for later access in the pipeline. SmallVector pipelineAttrs; - auto scheduleAttr = IREE::GPU::MMAScheduleAttr::get( - context, mmaAttrs[schedule->index], schedule->mSubgroupCounts[0], - schedule->nSubgroupCounts[0]); - pipelineAttrs.emplace_back(StringAttr::get(context, "mma_schedule"), - scheduleAttr); // Prefetch shared memory if requested. if (clLLVMGPUEnablePrefetch) { @@ -902,9 +902,32 @@ setAttentionVectorDistributionConfig(IREE::GPU::TargetAttr target, SmallVector qkConfig; SmallVector pvConfig; + // On attention subgroup distribution: + // The subgroup distribution in attention is controlled by the second matmul + // (Parallel dimension distribution is usually (almost always) controlled by + // the last reduction operation in a dispatch). 
Since VectorDistribution + // doesn't have logic to set subgroup and thread layouts seperately, we + // explicitly set the subgroup count for the first matmul as well, + // corresponding to what the second matmul dictates. + + // Configuring for qk matmul. + // subgroup_n count for qk matmul is always 1, since we do not tile K1. IREE::GPU::LoweringConfigAttr::setPromotedOperandList(context, qkConfig, {0, 1}); + IREE::GPU::LoweringConfigAttr::setMmaKind(context, qkConfig, + mmaAttrs[schedule->index]); + IREE::GPU::LoweringConfigAttr::setSubgroupMCount( + context, qkConfig, schedule->mSubgroupCounts[0]); + IREE::GPU::LoweringConfigAttr::setSubgroupNCount(context, qkConfig, 1); + + // Configuring for pv matmul. IREE::GPU::LoweringConfigAttr::setPromotedOperandList(context, pvConfig, {1}); + IREE::GPU::LoweringConfigAttr::setMmaKind(context, pvConfig, + mmaAttrs[schedule->index]); + IREE::GPU::LoweringConfigAttr::setSubgroupMCount( + context, pvConfig, schedule->mSubgroupCounts[0]); + IREE::GPU::LoweringConfigAttr::setSubgroupNCount( + context, pvConfig, schedule->nSubgroupCounts[0]); SmallVector qkAttrs; SmallVector pvAttrs; @@ -938,14 +961,7 @@ setAttentionVectorDistributionConfig(IREE::GPU::TargetAttr target, auto configDict = b.getDictionaryAttr(attrs); auto loweringConfig = IREE::GPU::LoweringConfigAttr::get(context, configDict); - // Attach the MMA schedule as an attribute to the entry point export function - // for later access in the pipeline. 
SmallVector pipelineAttrs; - auto scheduleAttr = IREE::GPU::MMAScheduleAttr::get( - context, mmaAttrs[schedule->index], schedule->mSubgroupCounts[0], - schedule->nSubgroupCounts[0]); - pipelineAttrs.emplace_back(StringAttr::get(context, "mma_schedule"), - scheduleAttr); // TODO: We do not turn prefetching on even when requested by the prefetching // flag because there is a shared memory allocation the two matmuls, which diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUConfigureTensorLayouts.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUConfigureTensorLayouts.cpp index 4945e6677e83..7008f3e1376e 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUConfigureTensorLayouts.cpp +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUConfigureTensorLayouts.cpp @@ -48,6 +48,29 @@ static SmallVector getPromotedOperands(Operation *op) { return promotedOperands; } +static IREE::GPU::MmaInterfaceAttr getIntrinsic(Operation *op) { + auto config = getLoweringConfig(op); + assert(config && "Cannot find intrinsic from unconfigured op."); + + IREE::GPU::MmaInterfaceAttr mmaIntrinsic = config.getMmaKind(); + assert(mmaIntrinsic && "Cannot find intrinsic in lowering config."); + return mmaIntrinsic; +} + +static int64_t getSubgroupMCount(Operation *op) { + auto config = getLoweringConfig(op); + assert(config && "Cannot find intrinsic from unconfigured op."); + + return *config.getSubgroupMCount(); +} + +static int64_t getSubgroupNCount(Operation *op) { + auto config = getLoweringConfig(op); + assert(config && "Cannot find intrinsic from unconfigured op."); + + return *config.getSubgroupNCount(); +} + static LogicalResult setContractionAnchor(IREE::GPU::MMAScheduleAttr schedule, SmallVector promotedOperands, RewriterBase &rewriter, @@ -264,14 +287,19 @@ transposeSchedule(RewriterBase &rewriter, IREE::GPU::MMAScheduleAttr schedule) { schedule.getSubgroupMCount()); } -static LogicalResult -setAttentionMatmulAnchor(IREE::GPU::MMAScheduleAttr schedule, - 
RewriterBase &rewriter, linalg::LinalgOp qkMatmul, - linalg::LinalgOp pvMatmul) { - // TODO: Add SIMT fallback. - if (!schedule) { - return pvMatmul->emitError("missing mma schedule for contraction"); - } +static LogicalResult setAttentionMatmulAnchor(RewriterBase &rewriter, + linalg::LinalgOp qkMatmul, + linalg::LinalgOp pvMatmul) { + + IREE::GPU::MMAScheduleAttr qkSchedule = + rewriter.getAttr(getIntrinsic(qkMatmul), + getSubgroupMCount(qkMatmul), + getSubgroupNCount(qkMatmul)); + + IREE::GPU::MMAScheduleAttr pvSchedule = + rewriter.getAttr(getIntrinsic(pvMatmul), + getSubgroupMCount(pvMatmul), + getSubgroupNCount(pvMatmul)); // Check if the intrinsic output for qkMatmul can be reused for pvMatmul. // We know that pvMatmul takes result of qkMatmul as it's lhs. @@ -280,13 +308,14 @@ setAttentionMatmulAnchor(IREE::GPU::MMAScheduleAttr schedule, bool reuseIntrinsicOutput = false; bool transposeIntrinsic = false; - auto intrinsic = cast(schedule.getIntrinsic()); + auto qkIntrinsic = cast(qkSchedule.getIntrinsic()); + auto pvIntrinsic = cast(pvSchedule.getIntrinsic()); IREE::GPU::MMASingleSubgroupLayout lhsLayout = - intrinsic.getASingleSubgroupLayout(); + pvIntrinsic.getASingleSubgroupLayout(); IREE::GPU::MMASingleSubgroupLayout rhsLayout = - intrinsic.getBSingleSubgroupLayout(); + pvIntrinsic.getBSingleSubgroupLayout(); IREE::GPU::MMASingleSubgroupLayout outLayout = - intrinsic.getCSingleSubgroupLayout(); + qkIntrinsic.getCSingleSubgroupLayout(); auto matchLayout = [](IREE::GPU::MMASingleSubgroupLayout layoutA, IREE::GPU::MMASingleSubgroupLayout layoutB) -> bool { @@ -305,15 +334,6 @@ setAttentionMatmulAnchor(IREE::GPU::MMAScheduleAttr schedule, transposeIntrinsic = true; } - // subgroup_n count for attention matmul is always 1, because it is the - // reduction dimension. The subgroup_n count is in reality, for the pvMatmul. 
- IREE::GPU::MMAScheduleAttr qkSchedule = - rewriter.getAttr( - schedule.getIntrinsic(), - /*subgroup_m_count=*/schedule.getSubgroupMCount(), - /*subgroup_n_count=*/1); - IREE::GPU::MMAScheduleAttr pvSchedule = schedule; - SmallVector promotedQKOperands = getPromotedOperands(qkMatmul); SmallVector promotedPVOperands = getPromotedOperands(pvMatmul); @@ -488,12 +508,6 @@ struct LLVMGPUConfigureTensorLayoutsPass final return signalPassFailure(); } - llvm::StringLiteral scheduleAttrName = - IREE::GPU::MMAScheduleAttr::getMnemonic(); - DictionaryAttr configDict = getTranslationInfo(func).getConfiguration(); - auto scheduleAttr = dyn_cast_or_null( - configDict.get(scheduleAttrName)); - // Vector layout option setter aimed at contractions and convolutions. For // now, layout setting for other problems like reductions is TODO. SmallVector contracts; @@ -529,23 +543,28 @@ struct LLVMGPUConfigureTensorLayoutsPass final for (linalg::LinalgOp contract : contracts) { SmallVector promotedOperands = getPromotedOperands(contract); - if (failed(setContractionAnchor(scheduleAttr, promotedOperands, rewriter, - contract))) { + auto contractionSchedule = rewriter.getAttr( + getIntrinsic(contract), getSubgroupMCount(contract), + getSubgroupNCount(contract)); + if (failed(setContractionAnchor(contractionSchedule, promotedOperands, + rewriter, contract))) { return signalPassFailure(); } } for (linalg::LinalgOp conv : convs) { SmallVector promotedOperands = getPromotedOperands(conv); - if (failed(setConvolutionAnchor(scheduleAttr, promotedOperands, rewriter, + auto convSchedule = rewriter.getAttr( + getIntrinsic(conv), getSubgroupMCount(conv), getSubgroupNCount(conv)); + if (failed(setConvolutionAnchor(convSchedule, promotedOperands, rewriter, conv))) { return signalPassFailure(); } } if (attentionQKMatmul && attentionPVMatmul) { - if (failed(setAttentionMatmulAnchor( - scheduleAttr, rewriter, attentionQKMatmul, attentionPVMatmul))) { + if (failed(setAttentionMatmulAnchor(rewriter, 
attentionQKMatmul, + attentionPVMatmul))) { return signalPassFailure(); } } diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/config_user_vector_distribute.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/config_user_vector_distribute.mlir index a1b1627a84dd..28201626af15 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/config_user_vector_distribute.mlir +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/config_user_vector_distribute.mlir @@ -11,12 +11,12 @@ // OPT-OUT: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info -// OPT-OUT-SAME: mma_schedule = #iree_gpu.mma_schedule, // OPT-IN: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info -// OPT-IN-SAME: mma_schedule = #iree_gpu.mma_schedule, -#config = #iree_gpu.lowering_config<{workgroup = [128, 128, 0], reduction = [0, 0, 32], promote_operands = [0, 1]}> +#config = #iree_gpu.lowering_config<{workgroup = [128, 128, 0], reduction = [0, 0, 32], promote_operands = [0, 1], + mma_kind = #iree_gpu.mma_layout, + subgroup_m_count = 2, subgroup_n_count = 2}> #pipeline_layout = #hal.pipeline.layout, #hal.pipeline.binding, @@ -48,7 +48,6 @@ hal.executable public @main_0_dispatch_0 { func.func @main_0_dispatch_0_matmul_transpose_b_2048x10240x1280_f16xf16xf32() attributes {translation_info = #iree_codegen.translation_info, subgroup_m_count = 2, subgroup_n_count = 2>, gpu_pipeline_options = #iree_gpu.pipeline_options // Disable the 'reduceSharedMemoryBankConflicts' pass. 
}>} { %cst = arith.constant 0.000000e+00 : f16 @@ -87,12 +86,12 @@ hal.executable public @main_0_dispatch_0 { // OPT-OUT: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info> -// OPT-OUT-SAME: mma_schedule = #iree_gpu.mma_schedule, // OPT-IN: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info> -// OPT-IN-SAME: mma_schedule = #iree_gpu.mma_schedule, -#config = #iree_gpu.lowering_config<{workgroup = [128, 128, 0], reduction = [0, 0, 32], promote_operands = [0, 1]}> +#config = #iree_gpu.lowering_config<{workgroup = [128, 128, 0], reduction = [0, 0, 32], promote_operands = [0, 1], + mma_kind = #iree_gpu.mma_layout, + subgroup_m_count = 2, subgroup_n_count = 2}> #pipeline_layout = #hal.pipeline.layout, #hal.pipeline.binding, @@ -125,7 +124,6 @@ hal.executable public @main_0_dispatch_0 { // OPT-IN: scf.for func.func @main_0_dispatch_0_matmul_transpose_b_2048x10240x1280_f16xf16xf32() attributes {translation_info = #iree_codegen.translation_info, subgroup_m_count = 2, subgroup_n_count = 2>, gpu_pipeline_options = #iree_gpu.pipeline_options> // enable the 'reorderWorkgroups' pass. 
}>} { %cst = arith.constant 0.000000e+00 : f16 @@ -163,8 +161,9 @@ hal.executable public @main_0_dispatch_0 { // OPT-OUT: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info> -// OPT-OUT-SAME: mma_schedule = #iree_gpu.mma_schedule, -#config = #iree_gpu.lowering_config<{workgroup = [128, 128, 0], reduction = [0, 0, 32], promote_operands = [0, 1]}> +#config = #iree_gpu.lowering_config<{workgroup = [128, 128, 0], reduction = [0, 0, 32], promote_operands = [0, 1], + mma_kind = #iree_gpu.mma_layout, + subgroup_m_count = 2, subgroup_n_count = 2}> #pipeline_layout = #hal.pipeline.layout, #hal.pipeline.binding, @@ -186,7 +185,6 @@ hal.executable public @main_0_dispatch_0 { // OPT-OUT-NEXT: scf.for func.func @main_0_dispatch_0_matmul_transpose_b_2048x10240x1280_f16xf16xf32() attributes {translation_info = #iree_codegen.translation_info, subgroup_m_count = 2, subgroup_n_count = 2>, gpu_pipeline_options = #iree_gpu.pipeline_options> // Disable the 'reorderWorkgroups' pass. }>} { %cst = arith.constant 0.000000e+00 : f16 diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/config_vector_distribute_gfx1100.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/config_vector_distribute_gfx1100.mlir index 9d45ea04490a..6028b4790552 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/config_vector_distribute_gfx1100.mlir +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/config_vector_distribute_gfx1100.mlir @@ -6,9 +6,8 @@ // located here. 
// WMMA: #iree_codegen.translation_info -// WMMA-SAME: subgroup_m_count = 2, subgroup_n_count = 2 +// WMMA-SAME: workgroup_size = [128, 1, 1] +// WMMA-SAME: subgroup_size = 32 #pipeline_layout = #hal.pipeline.layout, @@ -32,5 +31,8 @@ func.func @wmma_matmul_1024x1024x1024() { // WMMA-LABEL: func.func @wmma_matmul_1024x1024x1024() // WMMA: linalg.matmul {{.*}}lowering_config = #iree_gpu.lowering_config +// WMMA-SAME: mma_kind = #iree_gpu.mma_layout // WMMA-SAME: reduction = [0, 0, 64] +// WMMA-SAME: subgroup_m_count = 2 +// WMMA-SAME: subgroup_n_count = 2 // WMMA-SAME: workgroup = [64, 128, 0] diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/config_vector_distribute_gfx940.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/config_vector_distribute_gfx940.mlir index da6a563c676d..46b82923fe88 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/config_vector_distribute_gfx940.mlir +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/config_vector_distribute_gfx940.mlir @@ -7,9 +7,6 @@ // located here. 
// CHECK: #iree_codegen.translation_info -// CHECK-SAME: subgroup_m_count = 1, subgroup_n_count = 4 #pipeline_layout = #hal.pipeline.layout, @@ -41,15 +38,15 @@ func.func @expanded_matmul_transpose_b() { // CHECK-LABEL: func.func @expanded_matmul_transpose_b() // CHECK: linalg.generic {{.*}}lowering_config = #iree_gpu.lowering_config +// CHECK-SAME: mma_kind = #iree_gpu.mma_layout // CHECK-SAME: reduction = [0, 0, 0, 0, 128] +// CHECK-SAME: subgroup_m_count = 1 +// CHECK-SAME: subgroup_n_count = 4 // CHECK-SAME: workgroup = [1, 1, 64, 64, 0] // ----- // CHECK: #iree_codegen.translation_info -// CHECK-SAME: subgroup_m_count = 2, subgroup_n_count = 2 #pipeline_layout = #hal.pipeline.layout, @@ -73,7 +70,10 @@ func.func @conv_nhwc() { // CHECK-LABEL: func.func @conv_nhwc() // CHECK: linalg.conv_2d_nhwc_hwcf {{.*}} lowering_config = #iree_gpu.lowering_config +// CHECK-SAME: mma_kind = #iree_gpu.mma_layout // CHECK-SAME: reduction = [0, 0, 0, 0, 1, 1, 32] +// CHECK-SAME: subgroup_m_count = 2 +// CHECK-SAME: subgroup_n_count = 2 // CHECK-SAME: workgroup = [1, 1, 64, 128, 0, 0, 0] // ----- @@ -113,9 +113,6 @@ func.func @matmul_256x256x256() attributes {hal.executable.target = #executable_ // ----- // CHECK: #iree_codegen.translation_info -// CHECK-SAME: subgroup_m_count = 2, subgroup_n_count = 2 #pipeline_layout = #hal.pipeline.layout, @@ -139,15 +136,15 @@ func.func @mfma_matmul_1024x1024x1024() { // CHECK-LABEL: func.func @mfma_matmul_1024x1024x1024() // CHECK: linalg.matmul {{.*}}lowering_config = #iree_gpu.lowering_config +// CHECK-SAME: mma_kind = #iree_gpu.mma_layout // CHECK-SAME: reduction = [0, 0, 64] +// CHECK-SAME: subgroup_m_count = 2 +// CHECK-SAME: subgroup_n_count = 2 // CHECK-SAME: workgroup = [64, 128, 0] // ----- // CHECK: #iree_codegen.translation_info -// CHECK-SAME: subgroup_m_count = 2, subgroup_n_count = 2 #pipeline_layout = #hal.pipeline.layout, @@ -190,15 +187,15 @@ func.func @conv_nchwc() { // CHECK-LABEL: func.func @conv_nchwc() // CHECK: 
linalg.generic {{.*}}lowering_config = #iree_gpu.lowering_config +// CHECK-SAME: mma_kind = #iree_gpu.mma_layout // CHECK-SAME: reduction = [0, 0, 0, 0, 0, 1, 1, 1, 32] +// CHECK-SAME: subgroup_m_count = 2 +// CHECK-SAME: subgroup_n_count = 2 // CHECK-SAME: workgroup = [1, 1, 1, 32, 32, 0, 0, 0, 0] // ----- // CHECK: #iree_codegen.translation_info -// CHECK-SAME: subgroup_m_count = 1, subgroup_n_count = 1 #pipeline_layout = #hal.pipeline.layout, @@ -222,15 +219,15 @@ func.func @unaligned_mk_batch_matmul() { // CHECK-LABEL: func.func @unaligned_mk_batch_matmul() // CHECK: linalg.batch_matmul // CHECK-SAME: lowering_config = #iree_gpu.lowering_config +// CHECK-SAME: mma_kind = #iree_gpu.mma_layout // CHECK-SAME: reduction = [0, 0, 0, 16] +// CHECK-SAME: subgroup_m_count = 1 +// CHECK-SAME: subgroup_n_count = 1 // CHECK-SAME: workgroup = [1, 16, 16, 0] // ----- // CHECK: #iree_codegen.translation_info -// CHECK-SAME: subgroup_m_count = 1, subgroup_n_count = 4 #pipeline_layout = #hal.pipeline.layout, @@ -254,7 +251,10 @@ func.func @unaligned_m_batch_matmul_64x72x1280x1280() { // CHECK-LABEL: func.func @unaligned_m_batch_matmul_64x72x1280x1280() // CHECK: linalg.batch_matmul // CHECK-SAME: lowering_config = #iree_gpu.lowering_config +// CHECK-SAME: mma_kind = #iree_gpu.mma_layout // CHECK-SAME: reduction = [0, 0, 0, 128] +// CHECK-SAME: subgroup_m_count = 1 +// CHECK-SAME: subgroup_n_count = 4 // CHECK-SAME: workgroup = [1, 16, 128, 0] // ----- @@ -318,7 +318,6 @@ func.func @matmul_dynamic_dim() { // ----- // CHECK: #iree_codegen.translation_info -#translation = #iree_codegen.translation_info, mma_schedule = #iree_gpu.mma_schedule, subgroup_m_count = 2, subgroup_n_count = 2>}> +#config = #iree_gpu.lowering_config<{workgroup = [64, 64, 0], reduction = [0, 0, 128], mma_kind = #iree_gpu.mma_layout, subgroup_m_count = 2, subgroup_n_count = 2}> +#translation = #iree_codegen.translation_info}> #pipeline_layout = #hal.pipeline.layout, @@ -50,8 +50,8 @@ hal.executable.variant 
@rocm target(<"rocm", "rocm-hsaco-fb">) { // ----- -#config = #iree_gpu.lowering_config<{workgroup = [64, 64, 0], reduction = [0, 0, 128]}> -#translation = #iree_codegen.translation_info, mma_schedule = #iree_gpu.mma_schedule, subgroup_m_count = 2, subgroup_n_count = 2>}> +#config = #iree_gpu.lowering_config<{workgroup = [64, 64, 0], reduction = [0, 0, 128], mma_kind = #iree_gpu.mma_layout, subgroup_m_count = 2, subgroup_n_count = 2}> +#translation = #iree_codegen.translation_info}> #pipeline_layout = #hal.pipeline.layout, diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_vector_distribute_gfx940.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_vector_distribute_gfx940.mlir index cedec2d21f2f..1722b894657e 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_vector_distribute_gfx940.mlir +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_vector_distribute_gfx940.mlir @@ -8,8 +8,8 @@ // RUN: --pass-pipeline="builtin.module(hal.executable(hal.executable.variant(builtin.module(func.func(iree-llvmgpu-lower-executable-target)))))" \ // RUN: %s | FileCheck %s --check-prefix=MEMORY -#config = #iree_gpu.lowering_config<{workgroup = [64, 64, 0], reduction = [0, 0, 128], promote_operands = [0, 1]}> -#translation = #iree_codegen.translation_info, mma_schedule = #iree_gpu.mma_schedule, subgroup_m_count = 2, subgroup_n_count = 2>}> +#config = #iree_gpu.lowering_config<{workgroup = [64, 64, 0], reduction = [0, 0, 128], promote_operands = [0, 1], mma_kind = #iree_gpu.mma_layout, subgroup_m_count = 2, subgroup_n_count = 2}> +#translation = #iree_codegen.translation_info}> #pipeline_layout = #hal.pipeline.layout, @@ -54,8 +54,8 @@ hal.executable.variant @rocm target(<"rocm", "rocm-hsaco-fb">) { // ----- -#config = #iree_gpu.lowering_config<{workgroup = [64, 64, 0], reduction = [0, 0, 128], promote_operands = [0, 1]}> -#translation = #iree_codegen.translation_info, mma_schedule = 
#iree_gpu.mma_schedule, subgroup_m_count = 2, subgroup_n_count = 2>}> +#config = #iree_gpu.lowering_config<{workgroup = [64, 64, 0], reduction = [0, 0, 128], promote_operands = [0, 1], mma_kind = #iree_gpu.mma_layout, subgroup_m_count = 2, subgroup_n_count = 2}> +#translation = #iree_codegen.translation_info}> #pipeline_layout = #hal.pipeline.layout, @@ -98,8 +98,8 @@ hal.executable.variant @rocm target(<"rocm", "rocm-hsaco-fb">) { // ----- -#config = #iree_gpu.lowering_config<{workgroup = [1, 1, 64, 64, 0], reduction = [0, 0, 0, 0, 128], promote_operands = [0, 1]}> -#translation = #iree_codegen.translation_info, mma_schedule = #iree_gpu.mma_schedule, subgroup_m_count = 1, subgroup_n_count = 4>}> +#config = #iree_gpu.lowering_config<{workgroup = [1, 1, 64, 64, 0], reduction = [0, 0, 0, 0, 128], promote_operands = [0, 1], mma_kind = #iree_gpu.mma_layout, subgroup_m_count = 1, subgroup_n_count = 4}> +#translation = #iree_codegen.translation_info}> #pipeline_layout = #hal.pipeline.layout, @@ -180,7 +180,7 @@ hal.executable @matmul_multiple_k { hal.return %x, %y, %z : index, index, index } builtin.module { - func.func @matmul_multiple_k() attributes {translation_info = #iree_codegen.translation_info, subgroup_m_count = 1, subgroup_n_count = 4>}>} { + func.func @matmul_multiple_k() attributes {translation_info = #iree_codegen.translation_info} { %cst = arith.constant 0.000000e+00 : f16 %c0 = arith.constant 0 : index %0 = hal.interface.binding.subspan layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor> @@ -190,7 +190,7 @@ hal.executable @matmul_multiple_k { %4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [10, 128, 64, 2048], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<10x128x64x2048xf16> %5 = tensor.empty() : tensor<2x10x64x64xf16> %6 = linalg.fill ins(%cst : f16) outs(%5 : tensor<2x10x64x64xf16>) -> tensor<2x10x64x64xf16> - %7 
= linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d4, d2, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d1, d4, d3, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction"]} ins(%3, %4 : tensor<2x128x64x2048xf16>, tensor<10x128x64x2048xf16>) outs(%6 : tensor<2x10x64x64xf16>) attrs = {lowering_config = #iree_gpu.lowering_config<{reduction = [0, 0, 0, 0, 1, 128], workgroup = [1, 1, 64, 64, 0, 0], promote_operands = [0, 1]}>} { + %7 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d4, d2, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d1, d4, d3, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction"]} ins(%3, %4 : tensor<2x128x64x2048xf16>, tensor<10x128x64x2048xf16>) outs(%6 : tensor<2x10x64x64xf16>) attrs = {lowering_config = #iree_gpu.lowering_config<{reduction = [0, 0, 0, 0, 1, 128], workgroup = [1, 1, 64, 64, 0, 0], promote_operands = [0, 1], mma_kind = #iree_gpu.mma_layout, subgroup_m_count = 1, subgroup_n_count = 4}>} { ^bb0(%in: f16, %in_0: f16, %out: f16): %8 = arith.mulf %in, %in_0 : f16 %9 = arith.addf %8, %out : f16 @@ -217,8 +217,8 @@ hal.executable @matmul_multiple_k { // Basic f8, f8 -> f32 matmul. 
-#config = #iree_gpu.lowering_config<{workgroup = [64, 64, 0], reduction = [0, 0, 256], promote_operands = [0, 1]}> -#translation = #iree_codegen.translation_info, mma_schedule = #iree_gpu.mma_schedule, subgroup_m_count = 2, subgroup_n_count = 2>}> +#config = #iree_gpu.lowering_config<{workgroup = [64, 64, 0], reduction = [0, 0, 256], promote_operands = [0, 1], mma_kind = #iree_gpu.mma_layout, subgroup_m_count = 2, subgroup_n_count = 2}> +#translation = #iree_codegen.translation_info}> #pipeline_layout = #hal.pipeline.layout, @@ -263,8 +263,8 @@ hal.executable.variant @rocm target(<"rocm", "rocm-hsaco-fb">) { // Basic i8, i8 -> i32 matmul. -#config = #iree_gpu.lowering_config<{workgroup = [64, 64, 0], reduction = [0, 0, 256], promote_operands = [0, 1]}> -#translation = #iree_codegen.translation_info, mma_schedule = #iree_gpu.mma_schedule, subgroup_m_count = 2, subgroup_n_count = 2>}> +#config = #iree_gpu.lowering_config<{workgroup = [64, 64, 0], reduction = [0, 0, 256], promote_operands = [0, 1], mma_kind = #iree_gpu.mma_layout, subgroup_m_count = 2, subgroup_n_count = 2}> +#translation = #iree_codegen.translation_info}> #pipeline_layout = #hal.pipeline.layout, @@ -309,8 +309,8 @@ hal.executable.variant @rocm target(<"rocm", "rocm-hsaco-fb">) { // Basic i8, i8 -> i32 matmul_transpose_b. 
-#config = #iree_gpu.lowering_config<{workgroup = [64, 64, 0], reduction = [0, 0, 256], promote_operands = [0, 1]}> -#translation = #iree_codegen.translation_info, mma_schedule = #iree_gpu.mma_schedule, subgroup_m_count = 2, subgroup_n_count = 2>}> +#config = #iree_gpu.lowering_config<{workgroup = [64, 64, 0], reduction = [0, 0, 256], promote_operands = [0, 1], mma_kind = #iree_gpu.mma_layout, subgroup_m_count = 2, subgroup_n_count = 2}> +#translation = #iree_codegen.translation_info}> #pipeline_layout = #hal.pipeline.layout, @@ -353,8 +353,8 @@ hal.executable.variant @rocm target(<"rocm", "rocm-hsaco-fb">) { // ----- -#config = #iree_gpu.lowering_config<{workgroup = [1, 1, 64, 128, 0, 0, 0], reduction = [0, 0, 0, 0, 1, 1, 32], promote_operands = [0, 1]}> -#translation = #iree_codegen.translation_info, mma_schedule = #iree_gpu.mma_schedule, subgroup_m_count = 2, subgroup_n_count = 2>}> +#config = #iree_gpu.lowering_config<{workgroup = [1, 1, 64, 128, 0, 0, 0], reduction = [0, 0, 0, 0, 1, 1, 32], promote_operands = [0, 1], mma_kind = #iree_gpu.mma_layout, subgroup_m_count = 2, subgroup_n_count = 2}> +#translation = #iree_codegen.translation_info}> #pipeline_layout = #hal.pipeline.layout, @@ -396,8 +396,8 @@ hal.executable.variant @rocm target(<"rocm", "rocm-hsaco-fb">) { // ----- -#config = #iree_gpu.lowering_config<{workgroup = [1, 64, 1, 64, 0], reduction = [0, 0, 0, 0, 128], promote_operands = [0, 1]}> -#translation = #iree_codegen.translation_info, mma_schedule = #iree_gpu.mma_schedule, subgroup_m_count = 2, subgroup_n_count = 2>}> +#config = #iree_gpu.lowering_config<{workgroup = [1, 64, 1, 64, 0], reduction = [0, 0, 0, 0, 128], promote_operands = [0, 1], mma_kind = #iree_gpu.mma_layout, subgroup_m_count = 2, subgroup_n_count = 2}> +#translation = #iree_codegen.translation_info}> #pipeline_layout = #hal.pipeline.layout, @@ -462,8 +462,8 @@ hal.executable public @main_dispatch_expanded_matmul { // ----- -#config = #iree_gpu.lowering_config<{workgroup = [1, 16, 
16, 0], reduction = [0, 0, 0, 16], promote_operands = [0, 1]}> -#translation = #iree_codegen.translation_info, mma_schedule = #iree_gpu.mma_schedule, subgroup_m_count = 1, subgroup_n_count = 1>}> +#config = #iree_gpu.lowering_config<{workgroup = [1, 16, 16, 0], reduction = [0, 0, 0, 16], promote_operands = [0, 1], mma_kind = #iree_gpu.mma_layout, subgroup_m_count = 1, subgroup_n_count = 1}> +#translation = #iree_codegen.translation_info}> #pipeline_layout = #hal.pipeline.layout, @@ -533,8 +533,8 @@ hal.executable.variant @rocm target(<"rocm", "rocm-hsaco-fb">) { // ----- -#config = #iree_gpu.lowering_config<{workgroup = [1, 16, 32, 0], reduction = [0, 0, 0, 8], promote_operands = [0, 1]}> -#translation = #iree_codegen.translation_info, subgroup_m_count = 1, subgroup_n_count = 2>}> +#config = #iree_gpu.lowering_config<{workgroup = [1, 16, 32, 0], reduction = [0, 0, 0, 8], promote_operands = [0, 1], mma_kind = #iree_gpu.mma_layout, subgroup_m_count = 1, subgroup_n_count = 2}> +#translation = #iree_codegen.translation_info #pipeline_layout = #hal.pipeline.layout, @@ -604,8 +604,8 @@ hal.executable public @pad_batch_matmul { // NOTE: This test is not exhaustive of all possible ways the above condition is breaking, // but rather is an example of a matmul shape from a model that broke our compilation heuristic. 
-#config = #iree_gpu.lowering_config<{workgroup = [1, 16, 128, 0], reduction = [0, 0, 0, 128], promote_operands = [0, 1]}> -#translation = #iree_codegen.translation_info, mma_schedule = #iree_gpu.mma_schedule, subgroup_m_count = 1, subgroup_n_count = 4>}> +#config = #iree_gpu.lowering_config<{workgroup = [1, 16, 128, 0], reduction = [0, 0, 0, 128], promote_operands = [0, 1], mma_kind = #iree_gpu.mma_layout, subgroup_m_count = 1, subgroup_n_count = 4}> +#translation = #iree_codegen.translation_info}> #pipeline_layout = #hal.pipeline.layout, @@ -657,8 +657,8 @@ hal.executable public @contract_schedule_considering_read_layout { // This test ensures that we can generate and decompose the right instructions from V(Virtual) MFMAs. -#config = #iree_gpu.lowering_config<{workgroup = [64, 64, 0], reduction = [0, 0, 128], promote_operands = [0, 1]}> -#translation = #iree_codegen.translation_info, mma_schedule = #iree_gpu.mma_schedule, subgroup_m_count = 2, subgroup_n_count = 2>}> +#config = #iree_gpu.lowering_config<{workgroup = [64, 64, 0], reduction = [0, 0, 128], promote_operands = [0, 1], mma_kind = #iree_gpu.mma_layout, subgroup_m_count = 2, subgroup_n_count = 2}> +#translation = #iree_codegen.translation_info}> #pipeline_layout = #hal.pipeline.layout, @@ -719,7 +719,7 @@ hal.executable.variant @rocm target(<"rocm", "rocm-hsaco-fb">) { // ----- #config = #iree_gpu.lowering_config<{workgroup = [1, 64, 0, 0, 64], reduction = [0, 0, 0, 64, 0], promote_operands = [0, 1, 2]}> -#translation = #iree_codegen.translation_info, subgroup_m_count = 2, subgroup_n_count = 1>}> +#translation = #iree_codegen.translation_info #pipeline_layout = #hal.pipeline.layout, @@ -753,8 +753,8 @@ hal.executable private @attention_20x4096x64x4096x64 { affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d4)>], lowering_config = #config, decomposition_config = { - qk_attrs = {attention_qk_matmul, lowering_config = #iree_gpu.lowering_config<{promote_operands = [0, 1]}>}, - pv_attrs = {attention_pv_matmul, 
lowering_config = #iree_gpu.lowering_config<{promote_operands = [1]}>} + qk_attrs = {attention_qk_matmul, lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout, subgroup_m_count = 2, subgroup_n_count = 1, promote_operands = [0, 1]}>}, + pv_attrs = {attention_pv_matmul, lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout, subgroup_m_count = 2, subgroup_n_count = 1, promote_operands = [1]}>} }} ins(%4, %5, %6, %cst : tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, f16) outs(%7 : tensor<20x4096x64xf16>) { ^bb0(%score: f32): @@ -792,7 +792,7 @@ hal.executable private @attention_20x4096x64x4096x64 { // ----- #config = #iree_gpu.lowering_config<{workgroup = [1, 1, 64, 0, 0, 64], reduction = [0, 0, 0, 0, 64, 0], promote_operands = [0, 1, 2]}> -#translation = #iree_codegen.translation_info, subgroup_m_count = 2, subgroup_n_count = 1>}> +#translation = #iree_codegen.translation_info #pipeline_layout = #hal.pipeline.layout, @@ -827,8 +827,8 @@ hal.executable private @attention_multiple_m_transpose { affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d5)>], lowering_config = #config, decomposition_config = { - qk_attrs = {attention_qk_matmul, lowering_config = #iree_gpu.lowering_config<{promote_operands = [0, 1]}>}, - pv_attrs = {attention_pv_matmul, lowering_config = #iree_gpu.lowering_config<{promote_operands = [1]}>} + qk_attrs = {attention_qk_matmul, lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout, subgroup_m_count = 2, subgroup_n_count = 1, promote_operands = [0, 1]}>}, + pv_attrs = {attention_pv_matmul, lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout, subgroup_m_count = 2, subgroup_n_count = 1, promote_operands = [1]}>} }} ins(%4, %5, %6, %cst : tensor<24x64x4608x128xf16>, tensor<24x4608x128xf16>, tensor<24x4608x128xf16>, f16) outs(%8 : tensor<24x64x4608x128xf16>) { ^bb0(%score: f32): @@ -860,7 +860,7 @@ hal.executable private 
@attention_multiple_m_transpose { // ----- #config = #iree_gpu.lowering_config<{workgroup = [1, 1, 128, 0, 0, 64], reduction = [0, 0, 0, 0, 32, 0], promote_operands = [0, 1, 2]}> -#translation = #iree_codegen.translation_info, subgroup_m_count = 4, subgroup_n_count = 1>}> +#translation = #iree_codegen.translation_info #pipeline_layout = #hal.pipeline.layout, @@ -895,8 +895,8 @@ hal.executable private @attention_mfma_32x32x8 { affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d5)>], lowering_config = #config, decomposition_config = { - qk_attrs = {attention_qk_matmul, lowering_config = #iree_gpu.lowering_config<{promote_operands = [0, 1]}>}, - pv_attrs = {attention_pv_matmul, lowering_config = #iree_gpu.lowering_config<{promote_operands = [1]}>} + qk_attrs = {attention_qk_matmul, lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout, subgroup_m_count = 4, subgroup_n_count = 1, promote_operands = [0, 1]}>}, + pv_attrs = {attention_pv_matmul, lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout, subgroup_m_count = 4, subgroup_n_count = 1, promote_operands = [1]}>} }} ins(%4, %5, %6, %cst : tensor<24x64x4608x128xf16>, tensor<24x4608x128xf16>, tensor<24x4608x128xf16>, f16) outs(%8 : tensor<24x64x4608x128xf16>) { ^bb0(%score: f32): diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/config_custom_op.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/config_custom_op.mlir index 032bd68ce040..2dd57f30da4a 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/config_custom_op.mlir +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/config_custom_op.mlir @@ -34,14 +34,13 @@ func.func @custom_op(%arg0 : tensor<384x512xf32>, %arg1 : tensor<512x128xf32>, } // CHECK: #[[CONFIG:.+]] = #iree_codegen.lowering_config // CHECK: #[[TRANSLATION:.+]] = #iree_codegen.translation_info, subgroup_m_count = 2, subgroup_n_count = 2> // CHECK: func @custom_op // CHECK-SAME: translation_info = #[[TRANSLATION]] // CHECK: 
iree_linalg_ext.custom_op // CHECK-SAME: lowering_config = #[[CONFIG]] // CHECK: ^bb // CHECK: linalg.matmul -// CHECK-SAME: lowering_config = #iree_gpu.lowering_config<{promote_operands = [0, 1], reduction = [0, 0, 32], workgroup = [64, 64, 0]}> +// CHECK-SAME: lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout, promote_operands = [0, 1], reduction = [0, 0, 32], subgroup_m_count = 2 : i64, subgroup_n_count = 2 : i64, workgroup = [64, 64, 0]}> // CHECK: iree_linalg_ext.yield // ----- diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/configure_tensor_layout.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/configure_tensor_layout.mlir index 6c96d1505e36..937174d13fb4 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/configure_tensor_layout.mlir +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/configure_tensor_layout.mlir @@ -2,10 +2,7 @@ #translation = #iree_codegen.translation_info, - subgroup_m_count = 1, - subgroup_n_count = 1>}> + subgroup_size = 64> #maps = [ affine_map<(m, n, k) -> (m, k)>, @@ -15,7 +12,9 @@ #traits = { indexing_maps = #maps, - iterator_types = ["parallel", "parallel", "reduction"] + iterator_types = ["parallel", "parallel", "reduction"], + lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout, + subgroup_m_count = 1, subgroup_n_count = 1}> } func.func @matmul_96x64x16_mfma(%lhs: tensor<96x16xf16>, @@ -53,10 +52,7 @@ func.func @matmul_96x64x16_mfma(%lhs: tensor<96x16xf16>, #translation = #iree_codegen.translation_info, - subgroup_m_count = 1, - subgroup_n_count = 1>}> + subgroup_size = 64> #maps = [ affine_map<(m, n, k) -> (m, k)>, @@ -66,7 +62,9 @@ func.func @matmul_96x64x16_mfma(%lhs: tensor<96x16xf16>, #traits = { indexing_maps = #maps, - iterator_types = ["parallel", "parallel", "reduction"] + iterator_types = ["parallel", "parallel", "reduction"], + lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout, + subgroup_m_count = 1 : i64, 
subgroup_n_count = 1 : i64}> } func.func @matmul_96x64x16_wmma(%lhs: tensor<96x16xf16>, @@ -117,7 +115,9 @@ func.func @matmul_96x64x16_wmma(%lhs: tensor<96x16xf16>, #traits = { indexing_maps = #maps, - iterator_types = ["parallel", "parallel", "reduction"] + iterator_types = ["parallel", "parallel", "reduction"], + lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout, + subgroup_m_count = 4, subgroup_n_count = 1}> } func.func @matmul_128x64x16_multi_subgroup(%lhs: tensor<128x16xf16>, @@ -155,10 +155,7 @@ func.func @matmul_128x64x16_multi_subgroup(%lhs: tensor<128x16xf16>, #translation = #iree_codegen.translation_info, - subgroup_m_count = 2, - subgroup_n_count = 2>}> + subgroup_size = 64> #maps = [ affine_map<(bm, bn, m, n, k) -> (bm, m, k)>, @@ -169,7 +166,9 @@ func.func @matmul_128x64x16_multi_subgroup(%lhs: tensor<128x16xf16>, #traits = { indexing_maps = #maps, iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction"], - lowering_config = #iree_gpu.lowering_config<{promote_operands = [0]}> + lowering_config = #iree_gpu.lowering_config<{promote_operands = [0], + mma_kind = #iree_gpu.mma_layout, + subgroup_m_count = 2, subgroup_n_count = 2}> } func.func @packed_matmul_128x128x128(%lhs: tensor<8x16x16xf16>, @@ -205,13 +204,9 @@ func.func @packed_matmul_128x128x128(%lhs: tensor<8x16x16xf16>, // ----- -// TODO: We shouldn't have to specify mma_schedule here. 
#translation = #iree_codegen.translation_info, - subgroup_m_count = 1, - subgroup_n_count = 1>}> + subgroup_size = 64> func.func @linalg_copy(%in : tensor<16x16x16xf16>) -> tensor<16x16x16xf16> attributes { translation_info = #translation } { @@ -233,10 +228,7 @@ func.func @linalg_copy(%in : tensor<16x16x16xf16>) -> tensor<16x16x16xf16> #translation = #iree_codegen.translation_info, - subgroup_m_count = 1, - subgroup_n_count = 1>}> + subgroup_size = 64> #map = affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2)> #map1 = affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3, d4, d5)> diff --git a/tests/e2e/matmul/generate_e2e_matmul_tests.py b/tests/e2e/matmul/generate_e2e_matmul_tests.py index dd387f31141f..8761f1f3dc8b 100644 --- a/tests/e2e/matmul/generate_e2e_matmul_tests.py +++ b/tests/e2e/matmul/generate_e2e_matmul_tests.py @@ -135,9 +135,6 @@ def get_compilation_info_attr(self) -> str: requested_pipeline = self.dispatch_lowering_pass_pipeline compiler_pipeline = requested_pipeline - mma_schedule = "" - if self.mma_schedule is not None: - mma_schedule = "{}".format(self.mma_schedule) subgroup_size_str = "" if self.subgroup_size is not None: subgroup_size_str = f"subgroup_size = {self.subgroup_size}" @@ -145,11 +142,13 @@ def get_compilation_info_attr(self) -> str: return ( "#iree_codegen.compilation_info<\n" f" lowering_config = #iree_gpu.lowering_config<{{" + f" mma_kind = #iree_gpu.mma_layout<{self.mma_schedule.intrinsic}>, " + f" subgroup_m_count = {self.mma_schedule.m_count}, " + f" subgroup_n_count = {self.mma_schedule.n_count}, " f" workgroup = {self.workgroup_tile}, " f" reduction = {self.reduction_tile} }}>,\n" f" translation_info = <{compiler_pipeline} {self.workgroup_size_str()}\n" - f" {subgroup_size_str},\n" - f" {{ {mma_schedule} }}>>\n" + f" {subgroup_size_str}>>\n" )