From a2a2353e2233921d3b95ab48ed0f6686f4eb3076 Mon Sep 17 00:00:00 2001
From: Jakub Kuderski
Date: Tue, 5 Dec 2023 01:11:30 -0500
Subject: [PATCH 1/4] [LLVMGPU] Add multi-row vector reduction configuration

This is to speed up matvec. The new configuration is experimental and only
applied on ROCm targets.
---
 .../Common/GPU/VectorReductionToGPU.cpp       |  2 +-
 .../GPU/test/vector_reduction_to_gpu.mlir     | 65 +++++++++++++++++++
 .../compiler/Codegen/LLVMGPU/KernelConfig.cpp | 23 +++++++
 third_party/llvm-project                      |  2 +-
 4 files changed, 90 insertions(+), 2 deletions(-)

diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/VectorReductionToGPU.cpp b/compiler/src/iree/compiler/Codegen/Common/GPU/VectorReductionToGPU.cpp
index 723a954fe04b..21e186b702bf 100644
--- a/compiler/src/iree/compiler/Codegen/Common/GPU/VectorReductionToGPU.cpp
+++ b/compiler/src/iree/compiler/Codegen/Common/GPU/VectorReductionToGPU.cpp
@@ -196,7 +196,7 @@ class VectorReductionToGPUPass
                             bool expandSubgroupReduction,
                             std::function getWarpSize)
       : expandSubgroupReduction(expandSubgroupReduction),
-        getWarpSize(getWarpSize) {}
+        getWarpSize(std::move(getWarpSize)) {}
 
   void getDependentDialects(DialectRegistry &registry) const override {
     registry.insert>, vector<1xf32>
 // CHECK: vector.transfer_write {{.*}} : vector<1xf32>, memref<128x32xf32>
 // CHECK: return
+
+
+// -----
+
+// Check that the multi-row matvec gets distributed across subgroup threads.
+
+#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {target_arch = "gfx940"}>
+#pipeline_layout = #hal.pipeline.layout,
+    #hal.descriptor_set.binding<1, storage_buffer>,
+    #hal.descriptor_set.binding<2, storage_buffer>
+  ]>
+]>
+hal.executable private @multirow {
+  hal.executable.variant @rocm target(#executable_target_rocm_hsaco_fb) {
+    hal.executable.export @multirow layout(#pipeline_layout) attributes {
+      workgroup_size = [64 : index, 1 : index, 1 : index]
+    }
+    builtin.module {
+      func.func @multirow() {
+        %cst = arith.constant dense<0.000000e+00> : vector<4x512xf16>
+        %c0 = arith.constant 0 : index
+        %cst_0 = arith.constant dense<0.000000e+00> : vector<1x4xf16>
+        %c4096 = arith.constant 4096 : index
+        %c512 = arith.constant 512 : index
+        %cst_1 = arith.constant 0.000000e+00 : f16
+        %id = gpu.thread_id x
+        %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<1x4096xf16, #hal.descriptor_type>
+        memref.assume_alignment %0, 64 : memref<1x4096xf16, #hal.descriptor_type>
+        %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<32000x4096xf16, #hal.descriptor_type>
+        memref.assume_alignment %1, 64 : memref<32000x4096xf16, #hal.descriptor_type>
+        %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : memref<1x32000xf16, #hal.descriptor_type>
+        memref.assume_alignment %2, 64 : memref<1x32000xf16, #hal.descriptor_type>
+        %workgroup_id_x = hal.interface.workgroup.id[0] : index
+        %3 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_id_x]
+        %4 = scf.for %arg0 = %c0 to %c4096 step %c512 iter_args(%arg1 = %cst) -> (vector<4x512xf16>) {
+          %8 = vector.transfer_read %0[%c0, %arg0], %cst_1 {in_bounds = [true, true], permutation_map = affine_map<(d0, d1) -> (0, d1)>} : memref<1x4096xf16, #hal.descriptor_type>, vector<4x512xf16>
+          %9 = vector.transfer_read %1[%3, %arg0], %cst_1 {in_bounds = [true, true]} : memref<32000x4096xf16, #hal.descriptor_type>, vector<4x512xf16>
+          %10 = arith.mulf %8, %9 : vector<4x512xf16>
+          %11 = arith.addf %arg1, %10 : vector<4x512xf16>
+          scf.yield %11 : vector<4x512xf16>
+        }
+        %5 = vector.broadcast %4 : vector<4x512xf16> to vector<1x4x512xf16>
+        %6 = vector.multi_reduction , %5, %cst_0 [2] : vector<1x4x512xf16> to vector<1x4xf16>
+        %7 = vector.extract %6[0] : vector<4xf16> from vector<1x4xf16>
+        vector.transfer_write %7, %2[%c0, %3] {in_bounds = [true]} : vector<4xf16>, memref<1x32000xf16, #hal.descriptor_type>
+        return
+      }
+    }
+  }
+}
+
+// CHECK-LABEL: func.func @multirow() {
+// CHECK: scf.for {{.*}} -> (vector<4x8xf16>) {
+// CHECK: vector.transfer_read {{.*}} {in_bounds = [true, true]} : memref<32000x4096xf16, #hal.descriptor_type>, vector<4x8xf16>
+// CHECK: vector.transfer_read {{.*}} {in_bounds = [true, true]} : memref<1x4096xf16, #hal.descriptor_type>, vector<4x8xf16>
+// CHECK: arith.mulf %{{.*}}, %{{.*}} : vector<4x8xf16>
+// CHECK: arith.addf %{{.*}}, %{{.*}} : vector<4x8xf16>
+// CHECK: }
+// CHECK: gpu.shuffle xor
+// CHECK: scf.if {{.*}} {
+// CHECK-NEXT: vector.transfer_write {{.*}} : vector<4xf16>, memref<1x32000xf16, #hal.descriptor_type>
+// CHECK-NEXT: }
+// CHECK-NEXT: return
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp
index e2e08ab539d3..26c6b3919f1e 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp
@@ -23,6 +23,7 @@
 #include "mlir/Analysis/SliceAnalysis.h"
 #include "mlir/Dialect/Arith/IR/Arith.h"
 #include "mlir/Dialect/Linalg/Transforms/Transforms.h"
+#include "mlir/IR/BuiltinTypeInterfaces.h"
 #include "mlir/IR/Matchers.h"
 #include "mlir/IR/Types.h"
 #include "mlir/IR/Value.h"
@@ -926,6 +927,28 @@ static LogicalResult setWarpReductionConfig(func::FuncOp entryPoint,
   if ((groupSize / subgroupSize) > subgroupSize)
     return failure();
 
+  // With just one subgroup per workgroup, make each subgroup do more work and
+  // process a few reductions along the last parallel dimension.
+  // TODO: We should also check that this will result in data reuse for at least
+  // one argument.
+  // TODO: This is experimental and rocm-only for now.
+  if (isRocmTarget(entryPoint)) {
+    if (*parallelSize && !parallelDims.empty() && groupSize == subgroupSize) {
+      int reductionsPerWorkgroup = 1;
+      int maxParallelFactor = 4; // Keeping this conservative for now.
+      int64_t lastParallelDim = parallelDims.back();
+      if (!ShapedType::isDynamic(lastParallelDim) &&
+          (lastParallelDim % maxParallelFactor == 0) &&
+          lastParallelDim > maxParallelFactor) {
+        for (int candidate = reductionsPerWorkgroup;
+             candidate <= maxParallelFactor; candidate *= 2) {
+          reductionsPerWorkgroup = candidate;
+        }
+      }
+      workgroupTileSizes.back() = reductionsPerWorkgroup;
+    }
+  }
+
   std::array workgroupSize = {groupSize, 1, 1};
   SmallVector reductionTileSizes(op.getNumLoops(), 0);
   int64_t remainingGroupSize = groupSize;
diff --git a/third_party/llvm-project b/third_party/llvm-project
index 419c919605ed..eac2a61aba05 160000
--- a/third_party/llvm-project
+++ b/third_party/llvm-project
@@ -1 +1 @@
-Subproject commit 419c919605edd543b78c4aebda827db4e7cc22c2
+Subproject commit eac2a61aba0585cafed5f2114c9e250322c01511
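
The heuristic introduced above gives each workgroup a single 64-lane subgroup that owns four rows of the 32000x4096 matrix in the @multirow test and steps over the reduction dimension 512 columns at a time. A minimal, free-standing C++ sketch of that tile arithmetic follows; the constants come from the test above, while the variable names are illustrative and not part of the compiler:

// Tile arithmetic behind the @multirow test: a vector<4x512xf16> accumulator
// per loop iteration, distributed over one 64-lane subgroup.
#include <cassert>
#include <cstdint>

int main() {
  const int64_t rowsPerWorkgroup = 4;   // workgroup tile along the parallel dim
  const int64_t colsPerIteration = 512; // scf.for step along the reduction dim
  const int64_t subgroupSize = 64;      // workgroup_size = [64, 1, 1]

  // Distributing a vector<4x512xf16> tile across 64 lanes leaves every lane
  // with vector<4x8xf16>, which is what the CHECK lines expect after
  // VectorReductionToGPU runs.
  assert(colsPerIteration / subgroupSize == 8);

  // The K = 4096 reduction takes 4096 / 512 = 8 loop iterations, and the
  // 32000 rows are covered by 32000 / 4 = 8000 workgroups.
  assert(4096 / colsPerIteration == 8);
  assert(32000 / rowsPerWorkgroup == 8000);
  return 0;
}
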
From b7b3b7d0f0d4d357db667948f41f36286b7f320a Mon Sep 17 00:00:00 2001
From: Jakub Kuderski
Date: Tue, 5 Dec 2023 17:21:42 -0500
Subject: [PATCH 2/4] Fix heuristic

---
 .../src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp
index 26c6b3919f1e..917ed4023273 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp
@@ -936,10 +936,10 @@ static LogicalResult setWarpReductionConfig(func::FuncOp entryPoint,
     if (*parallelSize && !parallelDims.empty() && groupSize == subgroupSize) {
       int reductionsPerWorkgroup = 1;
       int maxParallelFactor = 4; // Keeping this conservative for now.
-      int64_t lastParallelDim = parallelDims.back();
-      if (!ShapedType::isDynamic(lastParallelDim) &&
-          (lastParallelDim % maxParallelFactor == 0) &&
-          lastParallelDim > maxParallelFactor) {
+      int64_t lastParallelBound = bounds[parallelDims.back()];
+      if (!ShapedType::isDynamic(lastParallelBound) &&
+          (lastParallelBound % maxParallelFactor == 0) &&
+          lastParallelBound > maxParallelFactor) {
         for (int candidate = reductionsPerWorkgroup;
              candidate <= maxParallelFactor; candidate *= 2) {
           reductionsPerWorkgroup = candidate;
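
The fix above changes which value feeds the divisibility check: parallelDims holds the indices of the parallel loops in the iteration domain, while bounds holds their static trip counts, so the first version of the heuristic tested the index of the last parallel loop (which is 1 for a matvec-shaped generic) instead of its size and could never select more than one row. A small self-contained sketch of the distinction, using plain standard containers as stand-ins for the compiler's data structures and the 1x32000x4096 matvec from the tests in this series:

// Dimension indices vs. loop bounds for the matvec iteration domain
// (parallel, parallel, reduction) with sizes (1, 32000, 4096).
#include <cassert>
#include <cstdint>
#include <vector>

int main() {
  std::vector<int64_t> bounds = {1, 32000, 4096}; // static loop trip counts
  std::vector<int64_t> parallelDims = {0, 1};     // indices of parallel loops

  int64_t lastParallelDim = parallelDims.back();       // index: 1
  int64_t lastParallelBound = bounds[lastParallelDim]; // size: 32000

  // The check that gates the multi-row path only makes sense on the bound.
  assert(lastParallelDim % 4 != 0);                            // never fires
  assert(lastParallelBound % 4 == 0 && lastParallelBound > 4); // fires
  return 0;
}
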
From ee86d58f235ae4f8f736289040277dfd27966b08 Mon Sep 17 00:00:00 2001
From: Jakub Kuderski
Date: Tue, 5 Dec 2023 18:53:32 -0500
Subject: [PATCH 3/4] Fix test

---
 .../Codegen/Common/GPU/test/vector_reduction_to_gpu.mlir | 8 ++++----
 .../src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp   | 7 +++++--
 2 files changed, 9 insertions(+), 6 deletions(-)

diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/test/vector_reduction_to_gpu.mlir b/compiler/src/iree/compiler/Codegen/Common/GPU/test/vector_reduction_to_gpu.mlir
index b287642460d4..6834438cbc27 100644
--- a/compiler/src/iree/compiler/Codegen/Common/GPU/test/vector_reduction_to_gpu.mlir
+++ b/compiler/src/iree/compiler/Codegen/Common/GPU/test/vector_reduction_to_gpu.mlir
@@ -312,13 +312,13 @@ hal.executable private @multirow {
 
 // CHECK-LABEL: func.func @multirow() {
 // CHECK: scf.for {{.*}} -> (vector<4x8xf16>) {
-// CHECK: vector.transfer_read {{.*}} {in_bounds = [true, true]} : memref<32000x4096xf16, #hal.descriptor_type>, vector<4x8xf16>
-// CHECK: vector.transfer_read {{.*}} {in_bounds = [true, true]} : memref<1x4096xf16, #hal.descriptor_type>, vector<4x8xf16>
+// CHECK: vector.transfer_read {{.*}} : memref<32000x4096xf16, #hal.descriptor_type>, vector<4x8xf16>
+// CHECK: vector.transfer_read {{.*}} : memref<1x4096xf16, #hal.descriptor_type>, vector<4x8xf16>
 // CHECK: arith.mulf %{{.*}}, %{{.*}} : vector<4x8xf16>
 // CHECK: arith.addf %{{.*}}, %{{.*}} : vector<4x8xf16>
 // CHECK: }
 // CHECK: gpu.shuffle xor
 // CHECK: scf.if {{.*}} {
-// CHECK-NEXT: vector.transfer_write {{.*}} : vector<4xf16>, memref<1x32000xf16, #hal.descriptor_type>
-// CHECK-NEXT: }
+// CHECK: vector.transfer_write {{.*}} : vector<4xf16>, memref<1x32000xf16, #hal.descriptor_type>
+// CHECK: }
 // CHECK-NEXT: return
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp
index 917ed4023273..12ee9f909a56 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp
@@ -22,6 +22,7 @@
 #include "llvm/Support/Debug.h"
 #include "mlir/Analysis/SliceAnalysis.h"
 #include "mlir/Dialect/Arith/IR/Arith.h"
+#include "mlir/Dialect/Linalg/IR/Linalg.h"
 #include "mlir/Dialect/Linalg/Transforms/Transforms.h"
 #include "mlir/IR/BuiltinTypeInterfaces.h"
 #include "mlir/IR/Matchers.h"
@@ -931,8 +932,10 @@ static LogicalResult setWarpReductionConfig(func::FuncOp entryPoint,
   // process a few reductions along the last parallel dimension.
   // TODO: We should also check that this will result in data reuse for at least
   // one argument.
-  // TODO: This is experimental and rocm-only for now.
-  if (isRocmTarget(entryPoint)) {
+  // TODO: This is experimental for matvec (matmul_transpose_b) on ROCm only
+  // for now.
+  if (numDynamicReductionDims == 0 && numParallelDims == 2 &&
+      isRocmTarget(entryPoint)) {
     if (*parallelSize && !parallelDims.empty() && groupSize == subgroupSize) {
       int reductionsPerWorkgroup = 1;
       int maxParallelFactor = 4; // Keeping this conservative for now.
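
Patch 3 also narrows when the multi-row path may kick in: all reduction dimensions must be statically shaped, the op must look like a matvec (matmul_transpose_b, i.e. exactly two parallel dimensions), the target must be ROCm, and the workgroup must consist of a single subgroup. A hedged sketch of that guard as a free-standing predicate, with plain integers standing in for the values setWarpReductionConfig derives from the op; the function name is made up for illustration:

// Guard for the experimental multi-row reduction path (sketch only).
#include <cstdint>

static bool qualifiesForMultiRowReduction(int64_t numDynamicReductionDims,
                                          int64_t numParallelDims,
                                          bool isRocmTarget,
                                          int64_t groupSize,
                                          int64_t subgroupSize) {
  return numDynamicReductionDims == 0 && numParallelDims == 2 &&
         isRocmTarget && groupSize == subgroupSize;
}

int main() {
  // The @vmt dispatch added in the next patch: static shapes, two parallel
  // dims, ROCm target, and one 64-thread subgroup per workgroup.
  return qualifiesForMultiRowReduction(0, 2, /*isRocmTarget=*/true,
                                       /*groupSize=*/64, /*subgroupSize=*/64)
             ? 0
             : 1;
}
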
From 818d9c4ec184701d1f49af9f7d6bff349a427b56 Mon Sep 17 00:00:00 2001
From: Jakub Kuderski
Date: Tue, 5 Dec 2023 19:11:08 -0500
Subject: [PATCH 4/4] Add kernel config test

---
 .../compiler/Codegen/LLVMGPU/KernelConfig.cpp |  7 +--
 .../Codegen/LLVMGPU/test/config_matvec.mlir   | 47 +++++++++++++++++++
 2 files changed, 48 insertions(+), 6 deletions(-)

diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp
index 12ee9f909a56..139bfc547f42 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp
@@ -937,18 +937,13 @@ static LogicalResult setWarpReductionConfig(func::FuncOp entryPoint,
   if (numDynamicReductionDims == 0 && numParallelDims == 2 &&
       isRocmTarget(entryPoint)) {
     if (*parallelSize && !parallelDims.empty() && groupSize == subgroupSize) {
-      int reductionsPerWorkgroup = 1;
       int maxParallelFactor = 4; // Keeping this conservative for now.
       int64_t lastParallelBound = bounds[parallelDims.back()];
       if (!ShapedType::isDynamic(lastParallelBound) &&
           (lastParallelBound % maxParallelFactor == 0) &&
           lastParallelBound > maxParallelFactor) {
-        for (int candidate = reductionsPerWorkgroup;
-             candidate <= maxParallelFactor; candidate *= 2) {
-          reductionsPerWorkgroup = candidate;
-        }
+        workgroupTileSizes.back() = maxParallelFactor;
       }
-      workgroupTileSizes.back() = reductionsPerWorkgroup;
     }
   }
 
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/config_matvec.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/config_matvec.mlir
index 47b34315160d..2cfa7a8b3aeb 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/config_matvec.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/config_matvec.mlir
@@ -50,3 +50,50 @@ hal.executable.variant @rocm target(<"rocm", "rocm-hsaco-fb", {target_arch = "gf
 // CHECK: func.func @dynamic_batch_matvec()
 // CHECK: linalg.batch_matmul
 // CHECK-SAME: lowering_config = #[[$CONFIG]]
+
+// -----
+
+#pipeline_layout = #hal.pipeline.layout,
+    #hal.descriptor_set.binding<1, storage_buffer>,
+    #hal.descriptor_set.binding<2, storage_buffer>
+  ]>
+]>
+
+hal.executable @vmt {
+hal.executable.variant @rocm target(<"rocm", "rocm-hsaco-fb", {target_arch = "gfx940"}>) {
+  hal.executable.export @vmt layout(#pipeline_layout)
+  builtin.module {
+    func.func @vmt() {
+      %c0 = arith.constant 0 : index
+      %cst = arith.constant 0.000000e+00 : f16
+      %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor>
+      %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor>
+      %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor>
+      %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1, 4096], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<1x4096xf16>
+      %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [32000, 4096], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<32000x4096xf16>
+      %5 = tensor.empty() : tensor<1x32000xf16>
+      %6 = linalg.fill ins(%cst : f16) outs(%5 : tensor<1x32000xf16>) -> tensor<1x32000xf16>
+      %7 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %4 : tensor<1x4096xf16>, tensor<32000x4096xf16>) outs(%6 : tensor<1x32000xf16>) {
+      ^bb0(%in: f16, %in_0: f16, %out: f16):
+        %8 = arith.mulf %in, %in_0 : f16
+        %9 = arith.addf %out, %8 : f16
+        linalg.yield %9 : f16
+      } -> tensor<1x32000xf16>
+      flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [1, 32000], strides = [1, 1] : tensor<1x32000xf16> -> !flow.dispatch.tensor>
+      return
+    }
+  }
+}
+}
+
+// CHECK-DAG: #[[$CONFIG:.+]] = #iree_codegen.lowering_config
+// CHECK-DAG: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info
+// CHECK-LABEL: hal.executable.export public @vmt
+// CHECK-SAME: subgroup_size = 64 : index
+// CHECK-SAME: translation_info = #[[$TRANSLATION]]
+// CHECK-SAME: workgroup_size = [64 : index, 1 : index, 1 : index]
+// CHECK: func.func @vmt()
+// CHECK: linalg.generic
+// CHECK-SAME: lowering_config = #[[$CONFIG]]
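
For reference, the @vmt dispatch that the new kernel-config test exercises is a matmul_transpose_b with M = 1, i.e. a matvec: each output element reduces the 4096-wide input row against one row of the 32000x4096 matrix, and with the multi-row configuration every workgroup produces four consecutive output elements. A plain-C++ reference of that computation, using f32 instead of f16 and parameterized sizes; this is an illustration of the semantics, not compiler code:

// out[j] = sum_k lhs[k] * rhs[j][k], the semantics of the @vmt linalg.generic.
#include <cstdint>
#include <vector>

static std::vector<float> vmtReference(const std::vector<float> &lhs, // 1 x k
                                       const std::vector<float> &rhs, // n x k
                                       int64_t n, int64_t k) {
  std::vector<float> out(n, 0.0f); // 1 x n
  for (int64_t j = 0; j < n; ++j) { // parallel dim; tiled by 4 per workgroup
    float acc = 0.0f;
    for (int64_t r = 0; r < k; ++r) // reduction dim; split across the subgroup
      acc += lhs[r] * rhs[j * k + r];
    out[j] = acc;
  }
  return out;
}

int main() {
  // Tiny smoke test; the real dispatch uses n = 32000, k = 4096 in f16.
  std::vector<float> lhs = {1.0f, 2.0f};             // 1 x 2
  std::vector<float> rhs = {3.0f, 4.0f, 5.0f, 6.0f}; // 2 x 2, row-major
  std::vector<float> out = vmtReference(lhs, rhs, /*n=*/2, /*k=*/2);
  return (out[0] == 11.0f && out[1] == 17.0f) ? 0 : 1;
}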