From a5537bc683b8476d12917ced00f3540859894de1 Mon Sep 17 00:00:00 2001 From: Stanley Winata <68087699+raikonenfnu@users.noreply.github.com> Date: Mon, 4 Nov 2024 10:30:53 -0800 Subject: [PATCH] [LLVMGPU] Teach KernelConfig to set MMA schedules per op in LoweringConfig (#18984) The main motivation for this change is to enable different intrinsics/layout on different ops inside the same function/dispatch, especially for attention. To that extent, we move the scheduling MMA information such as mma_intrinsic, subgroup_m_count, and subgroup_n_count, from the translation info attached to the function onto the lowering_config per op. Here is a quick summary of things we needed to do to achieve that: 1. Introduce setMmaKind, set/get subgroupMCount, and set/get subgroupNCount on IREE::GPU::LoweringConfigAttr 2. Move configuring of QK matmul's schedule into KernelConfig from LLVMGPUConfigureTensorLayout. 3. Now that qk and pv may have different intrinsic, update information used to decide transposeIntrinsic and reuseIntrinsic in LLVMGPUConfigureTensorLayout 4. Update a bunch of tests to use lowering config to configure MMAs now. 
--------- Signed-off-by: Stanley Winata --- .../Codegen/Dialect/GPU/IR/IREEGPUAttrs.cpp | 45 ++++++++++ .../Codegen/Dialect/GPU/IR/IREEGPUAttrs.td | 17 +++- .../compiler/Codegen/LLVMGPU/KernelConfig.cpp | 54 +++++++----- .../LLVMGPU/LLVMGPUConfigureTensorLayouts.cpp | 83 ++++++++++++------- .../ROCDL/config_user_vector_distribute.mlir | 20 ++--- .../config_vector_distribute_gfx1100.mlir | 8 +- .../config_vector_distribute_gfx940.mlir | 42 +++++----- .../pipeline_vector_distribute_gfx1100.mlir | 8 +- .../pipeline_vector_distribute_gfx940.mlir | 70 ++++++++-------- .../LLVMGPU/test/config_custom_op.mlir | 3 +- .../LLVMGPU/test/configure_tensor_layout.mlir | 42 ++++------ tests/e2e/matmul/generate_e2e_matmul_tests.py | 9 +- 12 files changed, 244 insertions(+), 157 deletions(-) diff --git a/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.cpp b/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.cpp index dfa6745a926e..0bbb8d1f4218 100644 --- a/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.cpp +++ b/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.cpp @@ -1618,6 +1618,51 @@ IREE::GPU::MmaInterfaceAttr LoweringConfigAttr::getMmaKind() const { return getAttributes().getAs(kMmaKindName); } +void LoweringConfigAttr::setMmaKind(MLIRContext *context, + SmallVectorImpl &attrs, + IREE::GPU::MmaInterfaceAttr kind) { + attrs.emplace_back(StringAttr::get(context, kMmaKindName), kind); +} + +// TODO: Merge subgroup counts functionality into subgroup tiling level +// lowering, when we have it implemented. 
+constexpr StringLiteral kSubgroupMCountName = "subgroup_m_count"; +constexpr StringLiteral kSubgroupNCountName = "subgroup_n_count"; + +std::optional LoweringConfigAttr::getSubgroupMCount() const { + auto subgroup_m_count_attr = + getAttributes().getAs(kSubgroupMCountName); + if (!subgroup_m_count_attr) { + return std::nullopt; + } + return subgroup_m_count_attr.getInt(); +} + +std::optional LoweringConfigAttr::getSubgroupNCount() const { + auto subgroup_n_count_attr = + getAttributes().getAs(kSubgroupNCountName); + if (!subgroup_n_count_attr) { + return std::nullopt; + } + return subgroup_n_count_attr.getInt(); +} + +void LoweringConfigAttr::setSubgroupMCount( + MLIRContext *context, SmallVectorImpl &attrs, + int64_t subgroup_m_count) { + attrs.emplace_back( + StringAttr::get(context, kSubgroupMCountName), + IntegerAttr::get(IntegerType::get(context, 64), subgroup_m_count)); +} + +void LoweringConfigAttr::setSubgroupNCount( + MLIRContext *context, SmallVectorImpl &attrs, + int64_t subgroup_n_count) { + attrs.emplace_back( + StringAttr::get(context, kSubgroupNCountName), + IntegerAttr::get(IntegerType::get(context, 64), subgroup_n_count)); +} + constexpr StringLiteral kPromoteOperandsName = "promote_operands"; std::optional> diff --git a/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.td b/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.td index 411d3827888d..ee4cb932ded4 100644 --- a/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.td +++ b/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.td @@ -57,8 +57,23 @@ def IREEGPU_LoweringConfigAttr : "The configured fields, including tiling levels">:$attributes ); let extraClassDeclaration = [{ - /// Helper to retrieve a target mma intrinsic if present. + /// Helper to retrieve/set a target mma intrinsic. 
::mlir::iree_compiler::IREE::GPU::MmaInterfaceAttr getMmaKind() const; + static void setMmaKind(MLIRContext *context, + SmallVectorImpl &attrs, + ::mlir::iree_compiler::IREE::GPU::MmaInterfaceAttr kind); + + // TODO: Merge subgroup counts functionality into subgroup tiling level + // lowering, when we have it implemented. + /// Helper to retrieve/set a target subgroup M/N counts. + std::optional getSubgroupMCount() const; + std::optional getSubgroupNCount() const; + static void setSubgroupMCount(MLIRContext *context, + SmallVectorImpl &attrs, + int64_t subgroup_m_count); + static void setSubgroupNCount(MLIRContext *context, + SmallVectorImpl &attrs, + int64_t subgroup_n_count); /// Helper to retrieve/set a list of operand indices to promote. std::optional> getPromotedOperandList() const; diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp index e8926eb6cb4d..6271fe458806 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp @@ -416,18 +416,17 @@ setConvolutionVectorDistributionConfig(IREE::GPU::TargetAttr target, attrs.emplace_back(StringAttr::get(context, "reduction"), b.getI64ArrayAttr(reductionTileSizes)); IREE::GPU::LoweringConfigAttr::setPromotedOperandList(context, attrs, {0, 1}); + IREE::GPU::LoweringConfigAttr::setMmaKind(context, attrs, + mmaAttrs[schedule->index]); + IREE::GPU::LoweringConfigAttr::setSubgroupMCount( + context, attrs, schedule->mSubgroupCounts[0]); + IREE::GPU::LoweringConfigAttr::setSubgroupNCount( + context, attrs, schedule->nSubgroupCounts[0]); auto configDict = DictionaryAttr::get(context, attrs); auto loweringConfig = IREE::GPU::LoweringConfigAttr::get(context, configDict); - // Attach the MMA schedule as an attribute to the entry point export function - // for later access in the pipeline. 
SmallVector pipelineAttrs; - auto scheduleAttr = IREE::GPU::MMAScheduleAttr::get( - context, mmaAttrs[schedule->index], schedule->mSubgroupCounts[0], - schedule->nSubgroupCounts[0]); - pipelineAttrs.emplace_back(StringAttr::get(context, "mma_schedule"), - scheduleAttr); // Prefetch shared memory if requested. if (clLLVMGPUEnablePrefetch) { @@ -682,6 +681,12 @@ setMatmulVectorDistributionConfig(IREE::GPU::TargetAttr target, attrs.emplace_back(StringAttr::get(context, "reduction"), b.getI64ArrayAttr(reductionTileSizes)); IREE::GPU::LoweringConfigAttr::setPromotedOperandList(context, attrs, {0, 1}); + IREE::GPU::LoweringConfigAttr::setMmaKind(context, attrs, + mmaAttrs[schedule->index]); + IREE::GPU::LoweringConfigAttr::setSubgroupMCount( + context, attrs, schedule->mSubgroupCounts[0]); + IREE::GPU::LoweringConfigAttr::setSubgroupNCount( + context, attrs, schedule->nSubgroupCounts[0]); auto configDict = DictionaryAttr::get(context, attrs); auto loweringConfig = IREE::GPU::LoweringConfigAttr::get(context, configDict); @@ -689,11 +694,6 @@ setMatmulVectorDistributionConfig(IREE::GPU::TargetAttr target, // Attach the MMA schedule as an attribute to the entry point export function // for later access in the pipeline. SmallVector pipelineAttrs; - auto scheduleAttr = IREE::GPU::MMAScheduleAttr::get( - context, mmaAttrs[schedule->index], schedule->mSubgroupCounts[0], - schedule->nSubgroupCounts[0]); - pipelineAttrs.emplace_back(StringAttr::get(context, "mma_schedule"), - scheduleAttr); // Prefetch shared memory if requested. if (clLLVMGPUEnablePrefetch) { @@ -902,9 +902,32 @@ setAttentionVectorDistributionConfig(IREE::GPU::TargetAttr target, SmallVector qkConfig; SmallVector pvConfig; + // On attention subgroup distribution: + // The subgroup distribution in attention is controlled by the second matmul + // (Parallel dimension distribution is usually (almost always) controlled by + // the last reduction operation in a dispatch). 
Since VectorDistribution + // doesn't have logic to set subgroup and thread layouts seperately, we + // explicitly set the subgroup count for the first matmul as well, + // corresponding to what the second matmul dictates. + + // Configuring for qk matmul. + // subgroup_n count for qk matmul is always 1, since we do not tile K1. IREE::GPU::LoweringConfigAttr::setPromotedOperandList(context, qkConfig, {0, 1}); + IREE::GPU::LoweringConfigAttr::setMmaKind(context, qkConfig, + mmaAttrs[schedule->index]); + IREE::GPU::LoweringConfigAttr::setSubgroupMCount( + context, qkConfig, schedule->mSubgroupCounts[0]); + IREE::GPU::LoweringConfigAttr::setSubgroupNCount(context, qkConfig, 1); + + // Configuring for pv matmul. IREE::GPU::LoweringConfigAttr::setPromotedOperandList(context, pvConfig, {1}); + IREE::GPU::LoweringConfigAttr::setMmaKind(context, pvConfig, + mmaAttrs[schedule->index]); + IREE::GPU::LoweringConfigAttr::setSubgroupMCount( + context, pvConfig, schedule->mSubgroupCounts[0]); + IREE::GPU::LoweringConfigAttr::setSubgroupNCount( + context, pvConfig, schedule->nSubgroupCounts[0]); SmallVector qkAttrs; SmallVector pvAttrs; @@ -938,14 +961,7 @@ setAttentionVectorDistributionConfig(IREE::GPU::TargetAttr target, auto configDict = b.getDictionaryAttr(attrs); auto loweringConfig = IREE::GPU::LoweringConfigAttr::get(context, configDict); - // Attach the MMA schedule as an attribute to the entry point export function - // for later access in the pipeline. 
SmallVector pipelineAttrs; - auto scheduleAttr = IREE::GPU::MMAScheduleAttr::get( - context, mmaAttrs[schedule->index], schedule->mSubgroupCounts[0], - schedule->nSubgroupCounts[0]); - pipelineAttrs.emplace_back(StringAttr::get(context, "mma_schedule"), - scheduleAttr); // TODO: We do not turn prefetching on even when requested by the prefetching // flag because there is a shared memory allocation the two matmuls, which diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUConfigureTensorLayouts.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUConfigureTensorLayouts.cpp index 4945e6677e83..7008f3e1376e 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUConfigureTensorLayouts.cpp +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUConfigureTensorLayouts.cpp @@ -48,6 +48,29 @@ static SmallVector getPromotedOperands(Operation *op) { return promotedOperands; } +static IREE::GPU::MmaInterfaceAttr getIntrinsic(Operation *op) { + auto config = getLoweringConfig(op); + assert(config && "Cannot find intrinsic from unconfigured op."); + + IREE::GPU::MmaInterfaceAttr mmaIntrinsic = config.getMmaKind(); + assert(mmaIntrinsic && "Cannot find intrinsic in lowering config."); + return mmaIntrinsic; +} + +static int64_t getSubgroupMCount(Operation *op) { + auto config = getLoweringConfig(op); + assert(config && "Cannot find intrinsic from unconfigured op."); + + return *config.getSubgroupMCount(); +} + +static int64_t getSubgroupNCount(Operation *op) { + auto config = getLoweringConfig(op); + assert(config && "Cannot find intrinsic from unconfigured op."); + + return *config.getSubgroupNCount(); +} + static LogicalResult setContractionAnchor(IREE::GPU::MMAScheduleAttr schedule, SmallVector promotedOperands, RewriterBase &rewriter, @@ -264,14 +287,19 @@ transposeSchedule(RewriterBase &rewriter, IREE::GPU::MMAScheduleAttr schedule) { schedule.getSubgroupMCount()); } -static LogicalResult -setAttentionMatmulAnchor(IREE::GPU::MMAScheduleAttr schedule, - 
RewriterBase &rewriter, linalg::LinalgOp qkMatmul, - linalg::LinalgOp pvMatmul) { - // TODO: Add SIMT fallback. - if (!schedule) { - return pvMatmul->emitError("missing mma schedule for contraction"); - } +static LogicalResult setAttentionMatmulAnchor(RewriterBase &rewriter, + linalg::LinalgOp qkMatmul, + linalg::LinalgOp pvMatmul) { + + IREE::GPU::MMAScheduleAttr qkSchedule = + rewriter.getAttr(getIntrinsic(qkMatmul), + getSubgroupMCount(qkMatmul), + getSubgroupNCount(qkMatmul)); + + IREE::GPU::MMAScheduleAttr pvSchedule = + rewriter.getAttr(getIntrinsic(pvMatmul), + getSubgroupMCount(pvMatmul), + getSubgroupNCount(pvMatmul)); // Check if the intrinsic output for qkMatmul can be reused for pvMatmul. // We know that pvMatmul takes result of qkMatmul as it's lhs. @@ -280,13 +308,14 @@ setAttentionMatmulAnchor(IREE::GPU::MMAScheduleAttr schedule, bool reuseIntrinsicOutput = false; bool transposeIntrinsic = false; - auto intrinsic = cast(schedule.getIntrinsic()); + auto qkIntrinsic = cast(qkSchedule.getIntrinsic()); + auto pvIntrinsic = cast(pvSchedule.getIntrinsic()); IREE::GPU::MMASingleSubgroupLayout lhsLayout = - intrinsic.getASingleSubgroupLayout(); + pvIntrinsic.getASingleSubgroupLayout(); IREE::GPU::MMASingleSubgroupLayout rhsLayout = - intrinsic.getBSingleSubgroupLayout(); + pvIntrinsic.getBSingleSubgroupLayout(); IREE::GPU::MMASingleSubgroupLayout outLayout = - intrinsic.getCSingleSubgroupLayout(); + qkIntrinsic.getCSingleSubgroupLayout(); auto matchLayout = [](IREE::GPU::MMASingleSubgroupLayout layoutA, IREE::GPU::MMASingleSubgroupLayout layoutB) -> bool { @@ -305,15 +334,6 @@ setAttentionMatmulAnchor(IREE::GPU::MMAScheduleAttr schedule, transposeIntrinsic = true; } - // subgroup_n count for attention matmul is always 1, because it is the - // reduction dimension. The subgroup_n count is in reality, for the pvMatmul. 
- IREE::GPU::MMAScheduleAttr qkSchedule = - rewriter.getAttr( - schedule.getIntrinsic(), - /*subgroup_m_count=*/schedule.getSubgroupMCount(), - /*subgroup_n_count=*/1); - IREE::GPU::MMAScheduleAttr pvSchedule = schedule; - SmallVector promotedQKOperands = getPromotedOperands(qkMatmul); SmallVector promotedPVOperands = getPromotedOperands(pvMatmul); @@ -488,12 +508,6 @@ struct LLVMGPUConfigureTensorLayoutsPass final return signalPassFailure(); } - llvm::StringLiteral scheduleAttrName = - IREE::GPU::MMAScheduleAttr::getMnemonic(); - DictionaryAttr configDict = getTranslationInfo(func).getConfiguration(); - auto scheduleAttr = dyn_cast_or_null( - configDict.get(scheduleAttrName)); - // Vector layout option setter aimed at contractions and convolutions. For // now, layout setting for other problems like reductions is TODO. SmallVector contracts; @@ -529,23 +543,28 @@ struct LLVMGPUConfigureTensorLayoutsPass final for (linalg::LinalgOp contract : contracts) { SmallVector promotedOperands = getPromotedOperands(contract); - if (failed(setContractionAnchor(scheduleAttr, promotedOperands, rewriter, - contract))) { + auto contractionSchedule = rewriter.getAttr( + getIntrinsic(contract), getSubgroupMCount(contract), + getSubgroupNCount(contract)); + if (failed(setContractionAnchor(contractionSchedule, promotedOperands, + rewriter, contract))) { return signalPassFailure(); } } for (linalg::LinalgOp conv : convs) { SmallVector promotedOperands = getPromotedOperands(conv); - if (failed(setConvolutionAnchor(scheduleAttr, promotedOperands, rewriter, + auto convSchedule = rewriter.getAttr( + getIntrinsic(conv), getSubgroupMCount(conv), getSubgroupNCount(conv)); + if (failed(setConvolutionAnchor(convSchedule, promotedOperands, rewriter, conv))) { return signalPassFailure(); } } if (attentionQKMatmul && attentionPVMatmul) { - if (failed(setAttentionMatmulAnchor( - scheduleAttr, rewriter, attentionQKMatmul, attentionPVMatmul))) { + if (failed(setAttentionMatmulAnchor(rewriter, 
attentionQKMatmul, + attentionPVMatmul))) { return signalPassFailure(); } } diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/config_user_vector_distribute.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/config_user_vector_distribute.mlir index a1b1627a84dd..28201626af15 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/config_user_vector_distribute.mlir +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/config_user_vector_distribute.mlir @@ -11,12 +11,12 @@ // OPT-OUT: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info -// OPT-OUT-SAME: mma_schedule = #iree_gpu.mma_schedule, // OPT-IN: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info -// OPT-IN-SAME: mma_schedule = #iree_gpu.mma_schedule, -#config = #iree_gpu.lowering_config<{workgroup = [128, 128, 0], reduction = [0, 0, 32], promote_operands = [0, 1]}> +#config = #iree_gpu.lowering_config<{workgroup = [128, 128, 0], reduction = [0, 0, 32], promote_operands = [0, 1], + mma_kind = #iree_gpu.mma_layout, + subgroup_m_count = 2, subgroup_n_count = 2}> #pipeline_layout = #hal.pipeline.layout, #hal.pipeline.binding, @@ -48,7 +48,6 @@ hal.executable public @main_0_dispatch_0 { func.func @main_0_dispatch_0_matmul_transpose_b_2048x10240x1280_f16xf16xf32() attributes {translation_info = #iree_codegen.translation_info, subgroup_m_count = 2, subgroup_n_count = 2>, gpu_pipeline_options = #iree_gpu.pipeline_options // Disable the 'reduceSharedMemoryBankConflicts' pass. 
}>} { %cst = arith.constant 0.000000e+00 : f16 @@ -87,12 +86,12 @@ hal.executable public @main_0_dispatch_0 { // OPT-OUT: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info> -// OPT-OUT-SAME: mma_schedule = #iree_gpu.mma_schedule, // OPT-IN: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info> -// OPT-IN-SAME: mma_schedule = #iree_gpu.mma_schedule, -#config = #iree_gpu.lowering_config<{workgroup = [128, 128, 0], reduction = [0, 0, 32], promote_operands = [0, 1]}> +#config = #iree_gpu.lowering_config<{workgroup = [128, 128, 0], reduction = [0, 0, 32], promote_operands = [0, 1], + mma_kind = #iree_gpu.mma_layout, + subgroup_m_count = 2, subgroup_n_count = 2}> #pipeline_layout = #hal.pipeline.layout, #hal.pipeline.binding, @@ -125,7 +124,6 @@ hal.executable public @main_0_dispatch_0 { // OPT-IN: scf.for func.func @main_0_dispatch_0_matmul_transpose_b_2048x10240x1280_f16xf16xf32() attributes {translation_info = #iree_codegen.translation_info, subgroup_m_count = 2, subgroup_n_count = 2>, gpu_pipeline_options = #iree_gpu.pipeline_options> // enable the 'reorderWorkgroups' pass. 
}>} { %cst = arith.constant 0.000000e+00 : f16 @@ -163,8 +161,9 @@ hal.executable public @main_0_dispatch_0 { // OPT-OUT: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info> -// OPT-OUT-SAME: mma_schedule = #iree_gpu.mma_schedule, -#config = #iree_gpu.lowering_config<{workgroup = [128, 128, 0], reduction = [0, 0, 32], promote_operands = [0, 1]}> +#config = #iree_gpu.lowering_config<{workgroup = [128, 128, 0], reduction = [0, 0, 32], promote_operands = [0, 1], + mma_kind = #iree_gpu.mma_layout, + subgroup_m_count = 2, subgroup_n_count = 2}> #pipeline_layout = #hal.pipeline.layout, #hal.pipeline.binding, @@ -186,7 +185,6 @@ hal.executable public @main_0_dispatch_0 { // OPT-OUT-NEXT: scf.for func.func @main_0_dispatch_0_matmul_transpose_b_2048x10240x1280_f16xf16xf32() attributes {translation_info = #iree_codegen.translation_info, subgroup_m_count = 2, subgroup_n_count = 2>, gpu_pipeline_options = #iree_gpu.pipeline_options> // Disable the 'reorderWorkgroups' pass. }>} { %cst = arith.constant 0.000000e+00 : f16 diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/config_vector_distribute_gfx1100.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/config_vector_distribute_gfx1100.mlir index 9d45ea04490a..6028b4790552 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/config_vector_distribute_gfx1100.mlir +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/config_vector_distribute_gfx1100.mlir @@ -6,9 +6,8 @@ // located here. 
// WMMA: #iree_codegen.translation_info -// WMMA-SAME: subgroup_m_count = 2, subgroup_n_count = 2 +// WMMA-SAME: workgroup_size = [128, 1, 1] +// WMMA-SAME: subgroup_size = 32 #pipeline_layout = #hal.pipeline.layout, @@ -32,5 +31,8 @@ func.func @wmma_matmul_1024x1024x1024() { // WMMA-LABEL: func.func @wmma_matmul_1024x1024x1024() // WMMA: linalg.matmul {{.*}}lowering_config = #iree_gpu.lowering_config +// WMMA-SAME: mma_kind = #iree_gpu.mma_layout // WMMA-SAME: reduction = [0, 0, 64] +// WMMA-SAME: subgroup_m_count = 2 +// WMMA-SAME: subgroup_n_count = 2 // WMMA-SAME: workgroup = [64, 128, 0] diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/config_vector_distribute_gfx940.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/config_vector_distribute_gfx940.mlir index da6a563c676d..46b82923fe88 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/config_vector_distribute_gfx940.mlir +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/config_vector_distribute_gfx940.mlir @@ -7,9 +7,6 @@ // located here. 
// CHECK: #iree_codegen.translation_info -// CHECK-SAME: subgroup_m_count = 1, subgroup_n_count = 4 #pipeline_layout = #hal.pipeline.layout, @@ -41,15 +38,15 @@ func.func @expanded_matmul_transpose_b() { // CHECK-LABEL: func.func @expanded_matmul_transpose_b() // CHECK: linalg.generic {{.*}}lowering_config = #iree_gpu.lowering_config +// CHECK-SAME: mma_kind = #iree_gpu.mma_layout // CHECK-SAME: reduction = [0, 0, 0, 0, 128] +// CHECK-SAME: subgroup_m_count = 1 +// CHECK-SAME: subgroup_n_count = 4 // CHECK-SAME: workgroup = [1, 1, 64, 64, 0] // ----- // CHECK: #iree_codegen.translation_info -// CHECK-SAME: subgroup_m_count = 2, subgroup_n_count = 2 #pipeline_layout = #hal.pipeline.layout, @@ -73,7 +70,10 @@ func.func @conv_nhwc() { // CHECK-LABEL: func.func @conv_nhwc() // CHECK: linalg.conv_2d_nhwc_hwcf {{.*}} lowering_config = #iree_gpu.lowering_config +// CHECK-SAME: mma_kind = #iree_gpu.mma_layout // CHECK-SAME: reduction = [0, 0, 0, 0, 1, 1, 32] +// CHECK-SAME: subgroup_m_count = 2 +// CHECK-SAME: subgroup_n_count = 2 // CHECK-SAME: workgroup = [1, 1, 64, 128, 0, 0, 0] // ----- @@ -113,9 +113,6 @@ func.func @matmul_256x256x256() attributes {hal.executable.target = #executable_ // ----- // CHECK: #iree_codegen.translation_info -// CHECK-SAME: subgroup_m_count = 2, subgroup_n_count = 2 #pipeline_layout = #hal.pipeline.layout, @@ -139,15 +136,15 @@ func.func @mfma_matmul_1024x1024x1024() { // CHECK-LABEL: func.func @mfma_matmul_1024x1024x1024() // CHECK: linalg.matmul {{.*}}lowering_config = #iree_gpu.lowering_config +// CHECK-SAME: mma_kind = #iree_gpu.mma_layout // CHECK-SAME: reduction = [0, 0, 64] +// CHECK-SAME: subgroup_m_count = 2 +// CHECK-SAME: subgroup_n_count = 2 // CHECK-SAME: workgroup = [64, 128, 0] // ----- // CHECK: #iree_codegen.translation_info -// CHECK-SAME: subgroup_m_count = 2, subgroup_n_count = 2 #pipeline_layout = #hal.pipeline.layout, @@ -190,15 +187,15 @@ func.func @conv_nchwc() { // CHECK-LABEL: func.func @conv_nchwc() // CHECK: 
linalg.generic {{.*}}lowering_config = #iree_gpu.lowering_config +// CHECK-SAME: mma_kind = #iree_gpu.mma_layout // CHECK-SAME: reduction = [0, 0, 0, 0, 0, 1, 1, 1, 32] +// CHECK-SAME: subgroup_m_count = 2 +// CHECK-SAME: subgroup_n_count = 2 // CHECK-SAME: workgroup = [1, 1, 1, 32, 32, 0, 0, 0, 0] // ----- // CHECK: #iree_codegen.translation_info -// CHECK-SAME: subgroup_m_count = 1, subgroup_n_count = 1 #pipeline_layout = #hal.pipeline.layout, @@ -222,15 +219,15 @@ func.func @unaligned_mk_batch_matmul() { // CHECK-LABEL: func.func @unaligned_mk_batch_matmul() // CHECK: linalg.batch_matmul // CHECK-SAME: lowering_config = #iree_gpu.lowering_config +// CHECK-SAME: mma_kind = #iree_gpu.mma_layout // CHECK-SAME: reduction = [0, 0, 0, 16] +// CHECK-SAME: subgroup_m_count = 1 +// CHECK-SAME: subgroup_n_count = 1 // CHECK-SAME: workgroup = [1, 16, 16, 0] // ----- // CHECK: #iree_codegen.translation_info -// CHECK-SAME: subgroup_m_count = 1, subgroup_n_count = 4 #pipeline_layout = #hal.pipeline.layout, @@ -254,7 +251,10 @@ func.func @unaligned_m_batch_matmul_64x72x1280x1280() { // CHECK-LABEL: func.func @unaligned_m_batch_matmul_64x72x1280x1280() // CHECK: linalg.batch_matmul // CHECK-SAME: lowering_config = #iree_gpu.lowering_config +// CHECK-SAME: mma_kind = #iree_gpu.mma_layout // CHECK-SAME: reduction = [0, 0, 0, 128] +// CHECK-SAME: subgroup_m_count = 1 +// CHECK-SAME: subgroup_n_count = 4 // CHECK-SAME: workgroup = [1, 16, 128, 0] // ----- @@ -318,7 +318,6 @@ func.func @matmul_dynamic_dim() { // ----- // CHECK: #iree_codegen.translation_info -#translation = #iree_codegen.translation_info, mma_schedule = #iree_gpu.mma_schedule, subgroup_m_count = 2, subgroup_n_count = 2>}> +#config = #iree_gpu.lowering_config<{workgroup = [64, 64, 0], reduction = [0, 0, 128], mma_kind = #iree_gpu.mma_layout, subgroup_m_count = 2, subgroup_n_count = 2}> +#translation = #iree_codegen.translation_info}> #pipeline_layout = #hal.pipeline.layout, @@ -50,8 +50,8 @@ hal.executable.variant 
@rocm target(<"rocm", "rocm-hsaco-fb">) { // ----- -#config = #iree_gpu.lowering_config<{workgroup = [64, 64, 0], reduction = [0, 0, 128]}> -#translation = #iree_codegen.translation_info, mma_schedule = #iree_gpu.mma_schedule, subgroup_m_count = 2, subgroup_n_count = 2>}> +#config = #iree_gpu.lowering_config<{workgroup = [64, 64, 0], reduction = [0, 0, 128], mma_kind = #iree_gpu.mma_layout, subgroup_m_count = 2, subgroup_n_count = 2}> +#translation = #iree_codegen.translation_info}> #pipeline_layout = #hal.pipeline.layout, diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_vector_distribute_gfx940.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_vector_distribute_gfx940.mlir index cedec2d21f2f..1722b894657e 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_vector_distribute_gfx940.mlir +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_vector_distribute_gfx940.mlir @@ -8,8 +8,8 @@ // RUN: --pass-pipeline="builtin.module(hal.executable(hal.executable.variant(builtin.module(func.func(iree-llvmgpu-lower-executable-target)))))" \ // RUN: %s | FileCheck %s --check-prefix=MEMORY -#config = #iree_gpu.lowering_config<{workgroup = [64, 64, 0], reduction = [0, 0, 128], promote_operands = [0, 1]}> -#translation = #iree_codegen.translation_info, mma_schedule = #iree_gpu.mma_schedule, subgroup_m_count = 2, subgroup_n_count = 2>}> +#config = #iree_gpu.lowering_config<{workgroup = [64, 64, 0], reduction = [0, 0, 128], promote_operands = [0, 1], mma_kind = #iree_gpu.mma_layout, subgroup_m_count = 2, subgroup_n_count = 2}> +#translation = #iree_codegen.translation_info}> #pipeline_layout = #hal.pipeline.layout, @@ -54,8 +54,8 @@ hal.executable.variant @rocm target(<"rocm", "rocm-hsaco-fb">) { // ----- -#config = #iree_gpu.lowering_config<{workgroup = [64, 64, 0], reduction = [0, 0, 128], promote_operands = [0, 1]}> -#translation = #iree_codegen.translation_info, mma_schedule = 
#iree_gpu.mma_schedule, subgroup_m_count = 2, subgroup_n_count = 2>}> +#config = #iree_gpu.lowering_config<{workgroup = [64, 64, 0], reduction = [0, 0, 128], promote_operands = [0, 1], mma_kind = #iree_gpu.mma_layout, subgroup_m_count = 2, subgroup_n_count = 2}> +#translation = #iree_codegen.translation_info}> #pipeline_layout = #hal.pipeline.layout, @@ -98,8 +98,8 @@ hal.executable.variant @rocm target(<"rocm", "rocm-hsaco-fb">) { // ----- -#config = #iree_gpu.lowering_config<{workgroup = [1, 1, 64, 64, 0], reduction = [0, 0, 0, 0, 128], promote_operands = [0, 1]}> -#translation = #iree_codegen.translation_info, mma_schedule = #iree_gpu.mma_schedule, subgroup_m_count = 1, subgroup_n_count = 4>}> +#config = #iree_gpu.lowering_config<{workgroup = [1, 1, 64, 64, 0], reduction = [0, 0, 0, 0, 128], promote_operands = [0, 1], mma_kind = #iree_gpu.mma_layout, subgroup_m_count = 1, subgroup_n_count = 4}> +#translation = #iree_codegen.translation_info}> #pipeline_layout = #hal.pipeline.layout, @@ -180,7 +180,7 @@ hal.executable @matmul_multiple_k { hal.return %x, %y, %z : index, index, index } builtin.module { - func.func @matmul_multiple_k() attributes {translation_info = #iree_codegen.translation_info, subgroup_m_count = 1, subgroup_n_count = 4>}>} { + func.func @matmul_multiple_k() attributes {translation_info = #iree_codegen.translation_info} { %cst = arith.constant 0.000000e+00 : f16 %c0 = arith.constant 0 : index %0 = hal.interface.binding.subspan layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor> @@ -190,7 +190,7 @@ hal.executable @matmul_multiple_k { %4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [10, 128, 64, 2048], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<10x128x64x2048xf16> %5 = tensor.empty() : tensor<2x10x64x64xf16> %6 = linalg.fill ins(%cst : f16) outs(%5 : tensor<2x10x64x64xf16>) -> tensor<2x10x64x64xf16> - %7 
= linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d4, d2, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d1, d4, d3, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction"]} ins(%3, %4 : tensor<2x128x64x2048xf16>, tensor<10x128x64x2048xf16>) outs(%6 : tensor<2x10x64x64xf16>) attrs = {lowering_config = #iree_gpu.lowering_config<{reduction = [0, 0, 0, 0, 1, 128], workgroup = [1, 1, 64, 64, 0, 0], promote_operands = [0, 1]}>} { + %7 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d4, d2, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d1, d4, d3, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction"]} ins(%3, %4 : tensor<2x128x64x2048xf16>, tensor<10x128x64x2048xf16>) outs(%6 : tensor<2x10x64x64xf16>) attrs = {lowering_config = #iree_gpu.lowering_config<{reduction = [0, 0, 0, 0, 1, 128], workgroup = [1, 1, 64, 64, 0, 0], promote_operands = [0, 1], mma_kind = #iree_gpu.mma_layout, subgroup_m_count = 1, subgroup_n_count = 4}>} { ^bb0(%in: f16, %in_0: f16, %out: f16): %8 = arith.mulf %in, %in_0 : f16 %9 = arith.addf %8, %out : f16 @@ -217,8 +217,8 @@ hal.executable @matmul_multiple_k { // Basic f8, f8 -> f32 matmul. 
-#config = #iree_gpu.lowering_config<{workgroup = [64, 64, 0], reduction = [0, 0, 256], promote_operands = [0, 1]}> -#translation = #iree_codegen.translation_info, mma_schedule = #iree_gpu.mma_schedule, subgroup_m_count = 2, subgroup_n_count = 2>}> +#config = #iree_gpu.lowering_config<{workgroup = [64, 64, 0], reduction = [0, 0, 256], promote_operands = [0, 1], mma_kind = #iree_gpu.mma_layout, subgroup_m_count = 2, subgroup_n_count = 2}> +#translation = #iree_codegen.translation_info}> #pipeline_layout = #hal.pipeline.layout, @@ -263,8 +263,8 @@ hal.executable.variant @rocm target(<"rocm", "rocm-hsaco-fb">) { // Basic i8, i8 -> i32 matmul. -#config = #iree_gpu.lowering_config<{workgroup = [64, 64, 0], reduction = [0, 0, 256], promote_operands = [0, 1]}> -#translation = #iree_codegen.translation_info, mma_schedule = #iree_gpu.mma_schedule, subgroup_m_count = 2, subgroup_n_count = 2>}> +#config = #iree_gpu.lowering_config<{workgroup = [64, 64, 0], reduction = [0, 0, 256], promote_operands = [0, 1], mma_kind = #iree_gpu.mma_layout, subgroup_m_count = 2, subgroup_n_count = 2}> +#translation = #iree_codegen.translation_info}> #pipeline_layout = #hal.pipeline.layout, @@ -309,8 +309,8 @@ hal.executable.variant @rocm target(<"rocm", "rocm-hsaco-fb">) { // Basic i8, i8 -> i32 matmul_transpose_b. 
-#config = #iree_gpu.lowering_config<{workgroup = [64, 64, 0], reduction = [0, 0, 256], promote_operands = [0, 1]}> -#translation = #iree_codegen.translation_info, mma_schedule = #iree_gpu.mma_schedule, subgroup_m_count = 2, subgroup_n_count = 2>}> +#config = #iree_gpu.lowering_config<{workgroup = [64, 64, 0], reduction = [0, 0, 256], promote_operands = [0, 1], mma_kind = #iree_gpu.mma_layout, subgroup_m_count = 2, subgroup_n_count = 2}> +#translation = #iree_codegen.translation_info}> #pipeline_layout = #hal.pipeline.layout, @@ -353,8 +353,8 @@ hal.executable.variant @rocm target(<"rocm", "rocm-hsaco-fb">) { // ----- -#config = #iree_gpu.lowering_config<{workgroup = [1, 1, 64, 128, 0, 0, 0], reduction = [0, 0, 0, 0, 1, 1, 32], promote_operands = [0, 1]}> -#translation = #iree_codegen.translation_info, mma_schedule = #iree_gpu.mma_schedule, subgroup_m_count = 2, subgroup_n_count = 2>}> +#config = #iree_gpu.lowering_config<{workgroup = [1, 1, 64, 128, 0, 0, 0], reduction = [0, 0, 0, 0, 1, 1, 32], promote_operands = [0, 1], mma_kind = #iree_gpu.mma_layout, subgroup_m_count = 2, subgroup_n_count = 2}> +#translation = #iree_codegen.translation_info}> #pipeline_layout = #hal.pipeline.layout, @@ -396,8 +396,8 @@ hal.executable.variant @rocm target(<"rocm", "rocm-hsaco-fb">) { // ----- -#config = #iree_gpu.lowering_config<{workgroup = [1, 64, 1, 64, 0], reduction = [0, 0, 0, 0, 128], promote_operands = [0, 1]}> -#translation = #iree_codegen.translation_info, mma_schedule = #iree_gpu.mma_schedule, subgroup_m_count = 2, subgroup_n_count = 2>}> +#config = #iree_gpu.lowering_config<{workgroup = [1, 64, 1, 64, 0], reduction = [0, 0, 0, 0, 128], promote_operands = [0, 1], mma_kind = #iree_gpu.mma_layout, subgroup_m_count = 2, subgroup_n_count = 2}> +#translation = #iree_codegen.translation_info}> #pipeline_layout = #hal.pipeline.layout, @@ -462,8 +462,8 @@ hal.executable public @main_dispatch_expanded_matmul { // ----- -#config = #iree_gpu.lowering_config<{workgroup = [1, 16, 
16, 0], reduction = [0, 0, 0, 16], promote_operands = [0, 1]}> -#translation = #iree_codegen.translation_info, mma_schedule = #iree_gpu.mma_schedule, subgroup_m_count = 1, subgroup_n_count = 1>}> +#config = #iree_gpu.lowering_config<{workgroup = [1, 16, 16, 0], reduction = [0, 0, 0, 16], promote_operands = [0, 1], mma_kind = #iree_gpu.mma_layout, subgroup_m_count = 1, subgroup_n_count = 1}> +#translation = #iree_codegen.translation_info}> #pipeline_layout = #hal.pipeline.layout, @@ -533,8 +533,8 @@ hal.executable.variant @rocm target(<"rocm", "rocm-hsaco-fb">) { // ----- -#config = #iree_gpu.lowering_config<{workgroup = [1, 16, 32, 0], reduction = [0, 0, 0, 8], promote_operands = [0, 1]}> -#translation = #iree_codegen.translation_info, subgroup_m_count = 1, subgroup_n_count = 2>}> +#config = #iree_gpu.lowering_config<{workgroup = [1, 16, 32, 0], reduction = [0, 0, 0, 8], promote_operands = [0, 1], mma_kind = #iree_gpu.mma_layout, subgroup_m_count = 1, subgroup_n_count = 2}> +#translation = #iree_codegen.translation_info #pipeline_layout = #hal.pipeline.layout, @@ -604,8 +604,8 @@ hal.executable public @pad_batch_matmul { // NOTE: This test is not exhaustive of all possible ways the above condition is breaking, // but rather is an example of a matmul shape from a model that broke our compilation heuristic. 
-#config = #iree_gpu.lowering_config<{workgroup = [1, 16, 128, 0], reduction = [0, 0, 0, 128], promote_operands = [0, 1]}> -#translation = #iree_codegen.translation_info, mma_schedule = #iree_gpu.mma_schedule, subgroup_m_count = 1, subgroup_n_count = 4>}> +#config = #iree_gpu.lowering_config<{workgroup = [1, 16, 128, 0], reduction = [0, 0, 0, 128], promote_operands = [0, 1], mma_kind = #iree_gpu.mma_layout, subgroup_m_count = 1, subgroup_n_count = 4}> +#translation = #iree_codegen.translation_info}> #pipeline_layout = #hal.pipeline.layout, @@ -657,8 +657,8 @@ hal.executable public @contract_schedule_considering_read_layout { // This test ensures that we can generate and decompose the right instructions from V(Virtual) MFMAs. -#config = #iree_gpu.lowering_config<{workgroup = [64, 64, 0], reduction = [0, 0, 128], promote_operands = [0, 1]}> -#translation = #iree_codegen.translation_info, mma_schedule = #iree_gpu.mma_schedule, subgroup_m_count = 2, subgroup_n_count = 2>}> +#config = #iree_gpu.lowering_config<{workgroup = [64, 64, 0], reduction = [0, 0, 128], promote_operands = [0, 1], mma_kind = #iree_gpu.mma_layout, subgroup_m_count = 2, subgroup_n_count = 2}> +#translation = #iree_codegen.translation_info}> #pipeline_layout = #hal.pipeline.layout, @@ -719,7 +719,7 @@ hal.executable.variant @rocm target(<"rocm", "rocm-hsaco-fb">) { // ----- #config = #iree_gpu.lowering_config<{workgroup = [1, 64, 0, 0, 64], reduction = [0, 0, 0, 64, 0], promote_operands = [0, 1, 2]}> -#translation = #iree_codegen.translation_info, subgroup_m_count = 2, subgroup_n_count = 1>}> +#translation = #iree_codegen.translation_info #pipeline_layout = #hal.pipeline.layout, @@ -753,8 +753,8 @@ hal.executable private @attention_20x4096x64x4096x64 { affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d4)>], lowering_config = #config, decomposition_config = { - qk_attrs = {attention_qk_matmul, lowering_config = #iree_gpu.lowering_config<{promote_operands = [0, 1]}>}, - pv_attrs = {attention_pv_matmul, 
lowering_config = #iree_gpu.lowering_config<{promote_operands = [1]}>} + qk_attrs = {attention_qk_matmul, lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout, subgroup_m_count = 2, subgroup_n_count = 1, promote_operands = [0, 1]}>}, + pv_attrs = {attention_pv_matmul, lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout, subgroup_m_count = 2, subgroup_n_count = 1, promote_operands = [1]}>} }} ins(%4, %5, %6, %cst : tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, f16) outs(%7 : tensor<20x4096x64xf16>) { ^bb0(%score: f32): @@ -792,7 +792,7 @@ hal.executable private @attention_20x4096x64x4096x64 { // ----- #config = #iree_gpu.lowering_config<{workgroup = [1, 1, 64, 0, 0, 64], reduction = [0, 0, 0, 0, 64, 0], promote_operands = [0, 1, 2]}> -#translation = #iree_codegen.translation_info, subgroup_m_count = 2, subgroup_n_count = 1>}> +#translation = #iree_codegen.translation_info #pipeline_layout = #hal.pipeline.layout, @@ -827,8 +827,8 @@ hal.executable private @attention_multiple_m_transpose { affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d5)>], lowering_config = #config, decomposition_config = { - qk_attrs = {attention_qk_matmul, lowering_config = #iree_gpu.lowering_config<{promote_operands = [0, 1]}>}, - pv_attrs = {attention_pv_matmul, lowering_config = #iree_gpu.lowering_config<{promote_operands = [1]}>} + qk_attrs = {attention_qk_matmul, lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout, subgroup_m_count = 2, subgroup_n_count = 1, promote_operands = [0, 1]}>}, + pv_attrs = {attention_pv_matmul, lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout, subgroup_m_count = 2, subgroup_n_count = 1, promote_operands = [1]}>} }} ins(%4, %5, %6, %cst : tensor<24x64x4608x128xf16>, tensor<24x4608x128xf16>, tensor<24x4608x128xf16>, f16) outs(%8 : tensor<24x64x4608x128xf16>) { ^bb0(%score: f32): @@ -860,7 +860,7 @@ hal.executable private 
@attention_multiple_m_transpose { // ----- #config = #iree_gpu.lowering_config<{workgroup = [1, 1, 128, 0, 0, 64], reduction = [0, 0, 0, 0, 32, 0], promote_operands = [0, 1, 2]}> -#translation = #iree_codegen.translation_info, subgroup_m_count = 4, subgroup_n_count = 1>}> +#translation = #iree_codegen.translation_info #pipeline_layout = #hal.pipeline.layout, @@ -895,8 +895,8 @@ hal.executable private @attention_mfma_32x32x8 { affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d5)>], lowering_config = #config, decomposition_config = { - qk_attrs = {attention_qk_matmul, lowering_config = #iree_gpu.lowering_config<{promote_operands = [0, 1]}>}, - pv_attrs = {attention_pv_matmul, lowering_config = #iree_gpu.lowering_config<{promote_operands = [1]}>} + qk_attrs = {attention_qk_matmul, lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout, subgroup_m_count = 4, subgroup_n_count = 1, promote_operands = [0, 1]}>}, + pv_attrs = {attention_pv_matmul, lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout, subgroup_m_count = 4, subgroup_n_count = 1, promote_operands = [1]}>} }} ins(%4, %5, %6, %cst : tensor<24x64x4608x128xf16>, tensor<24x4608x128xf16>, tensor<24x4608x128xf16>, f16) outs(%8 : tensor<24x64x4608x128xf16>) { ^bb0(%score: f32): diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/config_custom_op.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/config_custom_op.mlir index 032bd68ce040..2dd57f30da4a 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/config_custom_op.mlir +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/config_custom_op.mlir @@ -34,14 +34,13 @@ func.func @custom_op(%arg0 : tensor<384x512xf32>, %arg1 : tensor<512x128xf32>, } // CHECK: #[[CONFIG:.+]] = #iree_codegen.lowering_config // CHECK: #[[TRANSLATION:.+]] = #iree_codegen.translation_info, subgroup_m_count = 2, subgroup_n_count = 2> // CHECK: func @custom_op // CHECK-SAME: translation_info = #[[TRANSLATION]] // CHECK: 
iree_linalg_ext.custom_op // CHECK-SAME: lowering_config = #[[CONFIG]] // CHECK: ^bb // CHECK: linalg.matmul -// CHECK-SAME: lowering_config = #iree_gpu.lowering_config<{promote_operands = [0, 1], reduction = [0, 0, 32], workgroup = [64, 64, 0]}> +// CHECK-SAME: lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout, promote_operands = [0, 1], reduction = [0, 0, 32], subgroup_m_count = 2 : i64, subgroup_n_count = 2 : i64, workgroup = [64, 64, 0]}> // CHECK: iree_linalg_ext.yield // ----- diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/configure_tensor_layout.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/configure_tensor_layout.mlir index 6c96d1505e36..937174d13fb4 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/configure_tensor_layout.mlir +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/configure_tensor_layout.mlir @@ -2,10 +2,7 @@ #translation = #iree_codegen.translation_info, - subgroup_m_count = 1, - subgroup_n_count = 1>}> + subgroup_size = 64> #maps = [ affine_map<(m, n, k) -> (m, k)>, @@ -15,7 +12,9 @@ #traits = { indexing_maps = #maps, - iterator_types = ["parallel", "parallel", "reduction"] + iterator_types = ["parallel", "parallel", "reduction"], + lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout, + subgroup_m_count = 1, subgroup_n_count = 1}> } func.func @matmul_96x64x16_mfma(%lhs: tensor<96x16xf16>, @@ -53,10 +52,7 @@ func.func @matmul_96x64x16_mfma(%lhs: tensor<96x16xf16>, #translation = #iree_codegen.translation_info, - subgroup_m_count = 1, - subgroup_n_count = 1>}> + subgroup_size = 64> #maps = [ affine_map<(m, n, k) -> (m, k)>, @@ -66,7 +62,9 @@ func.func @matmul_96x64x16_mfma(%lhs: tensor<96x16xf16>, #traits = { indexing_maps = #maps, - iterator_types = ["parallel", "parallel", "reduction"] + iterator_types = ["parallel", "parallel", "reduction"], + lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout, + subgroup_m_count = 1 : i64, 
subgroup_n_count = 1 : i64}> } func.func @matmul_96x64x16_wmma(%lhs: tensor<96x16xf16>, @@ -117,7 +115,9 @@ func.func @matmul_96x64x16_wmma(%lhs: tensor<96x16xf16>, #traits = { indexing_maps = #maps, - iterator_types = ["parallel", "parallel", "reduction"] + iterator_types = ["parallel", "parallel", "reduction"], + lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout, + subgroup_m_count = 4, subgroup_n_count = 1}> } func.func @matmul_128x64x16_multi_subgroup(%lhs: tensor<128x16xf16>, @@ -155,10 +155,7 @@ func.func @matmul_128x64x16_multi_subgroup(%lhs: tensor<128x16xf16>, #translation = #iree_codegen.translation_info, - subgroup_m_count = 2, - subgroup_n_count = 2>}> + subgroup_size = 64> #maps = [ affine_map<(bm, bn, m, n, k) -> (bm, m, k)>, @@ -169,7 +166,9 @@ func.func @matmul_128x64x16_multi_subgroup(%lhs: tensor<128x16xf16>, #traits = { indexing_maps = #maps, iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction"], - lowering_config = #iree_gpu.lowering_config<{promote_operands = [0]}> + lowering_config = #iree_gpu.lowering_config<{promote_operands = [0], + mma_kind = #iree_gpu.mma_layout, + subgroup_m_count = 2, subgroup_n_count = 2}> } func.func @packed_matmul_128x128x128(%lhs: tensor<8x16x16xf16>, @@ -205,13 +204,9 @@ func.func @packed_matmul_128x128x128(%lhs: tensor<8x16x16xf16>, // ----- -// TODO: We shouldn't have to specify mma_schedule here. 
#translation = #iree_codegen.translation_info, - subgroup_m_count = 1, - subgroup_n_count = 1>}> + subgroup_size = 64> func.func @linalg_copy(%in : tensor<16x16x16xf16>) -> tensor<16x16x16xf16> attributes { translation_info = #translation } { @@ -233,10 +228,7 @@ func.func @linalg_copy(%in : tensor<16x16x16xf16>) -> tensor<16x16x16xf16> #translation = #iree_codegen.translation_info, - subgroup_m_count = 1, - subgroup_n_count = 1>}> + subgroup_size = 64> #map = affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2)> #map1 = affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3, d4, d5)> diff --git a/tests/e2e/matmul/generate_e2e_matmul_tests.py b/tests/e2e/matmul/generate_e2e_matmul_tests.py index dd387f31141f..8761f1f3dc8b 100644 --- a/tests/e2e/matmul/generate_e2e_matmul_tests.py +++ b/tests/e2e/matmul/generate_e2e_matmul_tests.py @@ -135,9 +135,6 @@ def get_compilation_info_attr(self) -> str: requested_pipeline = self.dispatch_lowering_pass_pipeline compiler_pipeline = requested_pipeline - mma_schedule = "" - if self.mma_schedule is not None: - mma_schedule = "{}".format(self.mma_schedule) subgroup_size_str = "" if self.subgroup_size is not None: subgroup_size_str = f"subgroup_size = {self.subgroup_size}" @@ -145,11 +142,13 @@ def get_compilation_info_attr(self) -> str: return ( "#iree_codegen.compilation_info<\n" f" lowering_config = #iree_gpu.lowering_config<{{" + f" mma_kind = #iree_gpu.mma_layout<{self.mma_schedule.intrinsic}>, " + f" subgroup_m_count = {self.mma_schedule.m_count}, " + f" subgroup_n_count = {self.mma_schedule.n_count}, " f" workgroup = {self.workgroup_tile}, " f" reduction = {self.reduction_tile} }}>,\n" f" translation_info = <{compiler_pipeline} {self.workgroup_size_str()}\n" - f" {subgroup_size_str},\n" - f" {{ {mma_schedule} }}>>\n" + f" {subgroup_size_str}>>\n" )