[LLVMGPU] Convert maximumf/minimumf to max/min for ROCM (iree-org#15069)

AMDGPU does not support the former
nod-ai · Sep 29, 2023 · 113f9d2 · 113f9d2
1 parent 14ce232
commit 113f9d2
Show file tree

Hide file tree

Showing 2 changed files with 47 additions and 1 deletion.
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/ConvertToROCDL.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/ConvertToROCDL.cpp
@@ -5,6 +5,7 @@
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
 #include "iree/compiler/Codegen/Common/GPU/GPUPatterns.h"
+#include "iree/compiler/Codegen/Common/Transforms.h"
 #include "iree/compiler/Codegen/LLVMGPU/ConvertToLLVM.h"
 #include "iree/compiler/Codegen/LLVMGPU/PassDetail.h"
 #include "iree/compiler/Codegen/LLVMGPU/Passes.h"
@@ -97,6 +98,17 @@ struct ConvertToROCDLPass : public ConvertToROCDLBase<ConvertToROCDLPass> {
         return signalPassFailure();
       }
     }
+    {
+      // Convert arith::maximumf/minimumf ops on AMD GPUs since the lowering
+      // is faulty for them.
+      // TODO: Remove this once the lowering in LLVM is fixed
+      // (https://github.com/llvm/llvm-project/issues/67815).
+      RewritePatternSet patterns(&getContext());
+      populateReplaceSlowMinMaxOpsPatterns(patterns);
+      if (failed(applyPatternsAndFoldGreedily(m, std::move(patterns)))) {
+        return signalPassFailure();
+      }
+    }
     {
       RewritePatternSet llvmPatterns(&getContext());
       populateLowerHALInterfaceOp(llvmPatterns);

diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/convert_to_rocdl.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/convert_to_rocdl.mlir
@@ -1,4 +1,4 @@
-// RUN: iree-opt --pass-pipeline="builtin.module(hal.executable(hal.executable.variant(builtin.module(iree-convert-to-rocdl))))" %s | FileCheck %s
+// RUN: iree-opt --split-input-file --pass-pipeline="builtin.module(hal.executable(hal.executable.variant(builtin.module(iree-convert-to-rocdl))))" %s | FileCheck %s
 
 // Test that that standard and GPU ops are converted to LLVM and NVVM.
 #pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
@@ -39,3 +39,37 @@ hal.executable @abs_ex_dispatch_0 {
 //  CHECK-SAME:  %{{[a-zA-Z0-9]*}}: !llvm.ptr {llvm.align = 16 : i32, llvm.noalias})
 //      CHECK:    rocdl.workgroup.dim.x
 //      CHECK:    llvm.fadd
+
+
+// -----
+// Test that maximum and minimum are converted to max and min on ROCm.
+#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
+  #hal.descriptor_set.layout<0, bindings = [
+    #hal.descriptor_set.binding<0, storage_buffer>,
+    #hal.descriptor_set.binding<4, storage_buffer>
+  ]>,
+  #hal.descriptor_set.layout<1, bindings = [
+    #hal.descriptor_set.binding<2, storage_buffer>
+  ]>
+]>
+hal.executable @abs_ex_dispatch_0 { // Fixture: a single function performing a vector<2xf32> maximumf reduction.
+  hal.executable.variant @cuda, target = <"cuda", "cuda-nvptx-fb"> { // NOTE(review): named/targeted "cuda" although the RUN line runs iree-convert-to-rocdl - confirm this is intended.
+    hal.executable.export @abs_ex_dispatch_0 layout(#pipeline_layout)
+    builtin.module {
+      func.func @reduction_maximum() {
+      %c0 = arith.constant 0 : index 
+      %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : 
+            memref<32x64x64xf32, strided<[4096, 64, 1], offset: ?>> 
+      %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<32x64x64xf32, 
+            strided<[4096, 64, 1], offset: ?>> // NOTE(review): binding(1) is not listed in #pipeline_layout set 0 (which declares bindings 0 and 4) - confirm.
+      %2 = vector.load %0[%c0, %c0, %c0] : memref<32x64x64xf32, strided<[4096, 64, 1], offset: ?>>, vector<2xf32> 
+      %3 = vector.reduction <maximumf>, %2 : vector<2xf32> into f32 // Op under test; the CHECK lines below expect llvm.intr.vector.reduce.fmax.
+      %4 = vector.splat %3 : vector<2xf32> 
+      vector.store %4, %1[%c0, %c0, %c0] : memref<32x64x64xf32, strided<[4096, 64, 1], offset: ?>>, vector<2xf32> 
+      return 
+      } 
+    }
+  }
+}
+// CHECK-LABEL: llvm.func @reduction_maximum
+// CHECK:  llvm.intr.vector.reduce.fmax({{.*}})  : (vector<2xf32>) -> f32