From cbb11f220c69e0106dbfd1533a00237c3a74e7e3 Mon Sep 17 00:00:00 2001 From: Benoit Jacob Date: Tue, 3 Dec 2024 15:50:06 -0500 Subject: [PATCH] Load ukernel bitcode as `executable_object` at the time of lowering to ukernels. (#19323) 1. Moves the time of loading ukernel bitcode from `serializeExecutable` to the `GPULowerToUKernels` pass. 2. The determination of whether an op can lower to a ukernel is now based on whether the expected bitcode file is found. This allows removing several utility functions that implemented similar logic in different places. 3. The `GPULowerToUKernels` pass searches for existing bitcode in a `hal.executable.objects` attribute, and only loads the embedded ukernel bitcode if that wasn't found, and in either case ensures that the resulting ukernel op has a `hal.executable.objects` attribute containing the necessary IR. This has several nice implications: - The IR becomes completely self-contained: a ukernel op is no longer an opaque interface to some bitcode at-a-distance. - This solves the problem of letting users contribute their own bitcode from the outside. Users can write their own `hal.executable.objects`. - De-duplication of bitcode is handled by the HoistExecutableObjects pass. - Linking bitcode is handled by generic linker code linking executable objects. - The only useful custom handling of ukernel symbols was adding `AlwaysInline` function attributes. This PR moves these attributes to the ukernel source code: `[[clang::always_inline]]`. I verified that these result in the expected `alwaysinline` in the bitcode. 4. The ukernel bitcode is part of the ROCM plugin. The `serializeExecutable` implementation, which was the consumer of that data, is also in the ROCM plugin. But the `GPULowerToUKernels` pass, which is the new consumer, is outside of that plugin. So this required creating a mechanism to export such embedded data files from the ROCM plugin to the outside. That is solved by the new `EmbeddedDataDirectory` utility.
--------- Signed-off-by: Benoit Jacob --- .../bazel_to_cmake_converter.py | 2 +- compiler/plugins/target/ROCM/BUILD.bazel | 5 +- compiler/plugins/target/ROCM/CMakeLists.txt | 5 +- compiler/plugins/target/ROCM/ROCMTarget.cpp | 36 ++-- .../plugins/target/ROCM/ROCMTargetUtils.cpp | 41 ----- .../target/ROCM/builtins/ukernel/BUILD.bazel | 21 +-- .../ROCM/builtins/ukernel/CMakeLists.txt | 116 ++++++------- .../ukernel/iree_uk_amdgpu_argmax_f16i32.c | 8 +- .../ukernel/iree_uk_amdgpu_argmax_f16i64.c | 8 +- .../ukernel/iree_uk_amdgpu_argmax_f32i32.c | 8 +- .../ukernel/iree_uk_amdgpu_argmax_f32i64.c | 8 +- compiler/plugins/target/ROCM/test/BUILD.bazel | 26 +++ .../plugins/target/ROCM/test/CMakeLists.txt | 31 +++- .../ROCM}/test/gpu_lower_to_ukernels.mlir | 53 +++++- .../test/ukernel_pipeline_transform.mlir | 24 +-- .../Codegen/Common/GPU/GPULowerToUKernels.cpp | 164 ++++++++++++------ .../compiler/Codegen/Common/GPU/Passes.td | 2 +- .../Codegen/Common/GPU/test/BUILD.bazel | 3 - .../Codegen/Common/GPU/test/CMakeLists.txt | 1 - .../Codegen/Dialect/Codegen/IR/UKernelOps.cpp | 7 +- .../compiler/Codegen/LLVMGPU/KernelConfig.cpp | 15 +- .../compiler/Codegen/LLVMGPU/test/BUILD.bazel | 1 - .../Codegen/LLVMGPU/test/CMakeLists.txt | 1 - .../iree/compiler/Codegen/Utils/GPUUtils.cpp | 36 ---- .../iree/compiler/Codegen/Utils/GPUUtils.h | 10 -- compiler/src/iree/compiler/Utils/BUILD.bazel | 1 + .../src/iree/compiler/Utils/CMakeLists.txt | 1 + .../compiler/Utils/EmbeddedDataDirectory.h | 59 +++++++ .../iree/compiler/Utils/unittests/BUILD.bazel | 1 + .../compiler/Utils/unittests/CMakeLists.txt | 1 + .../compiler/Utils/unittests/UtilsTest.cpp | 47 +++++ 31 files changed, 461 insertions(+), 281 deletions(-) create mode 100644 compiler/plugins/target/ROCM/test/BUILD.bazel rename compiler/{src/iree/compiler/Codegen/Common/GPU => plugins/target/ROCM}/test/gpu_lower_to_ukernels.mlir (84%) rename compiler/{src/iree/compiler/Codegen/LLVMGPU => 
plugins/target/ROCM}/test/ukernel_pipeline_transform.mlir (90%) create mode 100644 compiler/src/iree/compiler/Utils/EmbeddedDataDirectory.h diff --git a/build_tools/bazel_to_cmake/bazel_to_cmake_converter.py b/build_tools/bazel_to_cmake/bazel_to_cmake_converter.py index 0fb0fd85492f..8d2db1050da3 100644 --- a/build_tools/bazel_to_cmake/bazel_to_cmake_converter.py +++ b/build_tools/bazel_to_cmake/bazel_to_cmake_converter.py @@ -616,7 +616,7 @@ def iree_amdgpu_bitcode_library(self, name, gpu_arch, srcs, copts=None, out=None "GPU_ARCH", gpu_arch, quote=False ) srcs_block = self._convert_srcs_block(srcs) - out_block = self._convert_string_arg_block("OUT", out, quote=False) + out_block = self._convert_string_arg_block("OUT", out, quote=True) copts_block = self._convert_string_list_block("COPTS", copts, sort=False) self._converter.body += ( diff --git a/compiler/plugins/target/ROCM/BUILD.bazel b/compiler/plugins/target/ROCM/BUILD.bazel index 6ae9b95c4714..48dfeb3ff401 100644 --- a/compiler/plugins/target/ROCM/BUILD.bazel +++ b/compiler/plugins/target/ROCM/BUILD.bazel @@ -27,10 +27,7 @@ iree_compiler_cc_library( "ROCMTargetUtils.h", ], deps = [ - "//compiler/plugins/target/ROCM/builtins/ukernel:iree_uk_amdgpu_gfx1030", - "//compiler/plugins/target/ROCM/builtins/ukernel:iree_uk_amdgpu_gfx1100", - "//compiler/plugins/target/ROCM/builtins/ukernel:iree_uk_amdgpu_gfx90a", - "//compiler/plugins/target/ROCM/builtins/ukernel:iree_uk_amdgpu_gfx942", + "//compiler/plugins/target/ROCM/builtins/ukernel:iree_uk_amdgpu_bitcode", "//compiler/src/iree/compiler/Codegen/Dialect/Codegen/IR:IREECodegenDialect", "//compiler/src/iree/compiler/Codegen/Dialect/GPU/IR:IREEGPUDialect", "//compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils:KnownTargets", diff --git a/compiler/plugins/target/ROCM/CMakeLists.txt b/compiler/plugins/target/ROCM/CMakeLists.txt index 0efc3df479e6..96c3305d936d 100644 --- a/compiler/plugins/target/ROCM/CMakeLists.txt +++ 
b/compiler/plugins/target/ROCM/CMakeLists.txt @@ -64,10 +64,7 @@ iree_cc_library( iree::compiler::Dialect::HAL::Utils::LLVMLinkerUtils iree::compiler::PluginAPI iree::compiler::Utils - iree::compiler::plugins::target::ROCM::builtins::ukernel::iree_uk_amdgpu_gfx1030 - iree::compiler::plugins::target::ROCM::builtins::ukernel::iree_uk_amdgpu_gfx1100 - iree::compiler::plugins::target::ROCM::builtins::ukernel::iree_uk_amdgpu_gfx90a - iree::compiler::plugins::target::ROCM::builtins::ukernel::iree_uk_amdgpu_gfx942 + iree::compiler::plugins::target::ROCM::builtins::ukernel::iree_uk_amdgpu_bitcode iree::schemas::amdgpu_executable_def_c_fbs iree::schemas::executable_debug_info_c_fbs iree::schemas::hip_executable_def_c_fbs diff --git a/compiler/plugins/target/ROCM/ROCMTarget.cpp b/compiler/plugins/target/ROCM/ROCMTarget.cpp index a49780fbbcf4..48ef62e07220 100644 --- a/compiler/plugins/target/ROCM/ROCMTarget.cpp +++ b/compiler/plugins/target/ROCM/ROCMTarget.cpp @@ -8,6 +8,7 @@ #include +#include "compiler/plugins/target/ROCM/builtins/ukernel/iree_uk_amdgpu_bitcode.h" #include "iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenDialect.h" #include "iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.h" #include "iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUDialect.h" @@ -21,6 +22,7 @@ #include "iree/compiler/Dialect/HAL/Utils/ExecutableDebugInfoUtils.h" #include "iree/compiler/Dialect/HAL/Utils/LLVMLinkerUtils.h" #include "iree/compiler/PluginAPI/Client.h" +#include "iree/compiler/Utils/EmbeddedDataDirectory.h" #include "iree/compiler/Utils/FlatbufferUtils.h" #include "iree/compiler/Utils/ToolUtils.h" #include "iree/schemas/amdgpu_executable_def_builder.h" @@ -206,6 +208,7 @@ static std::string translateModuleToISA(llvm::Module &module, } return targetISA; } + } // namespace class ROCMTargetBackend final : public TargetBackend { @@ -513,20 +516,6 @@ class ROCMTargetBackend final : public TargetBackend { return failure(); } - // Link module to any enabled ukernels. 
- StringRef bitcodeDirectory = options.bitcodeDirectory; - StringRef enabledUkernels; - if (auto attr = getConfigStringAttr(targetAttr, "ukernels")) - enabledUkernels = attr->getValue(); - if (!enabledUkernels.empty() && enabledUkernels != "none") { - if (failed(linkUkernelBitcodeFiles( - variantOp.getLoc(), llvmModule.get(), enabledUkernels, - targetArch, bitcodeDirectory, llvm::Linker::OverrideFromSrc, - *targetMachine))) { - return failure(); - } - } - // Link bitcode (*.bc) object attrs specified by the input program. // Note that this happens after the command-line files so that the command // line ones override the symbols coming from the embedded files. @@ -548,14 +537,15 @@ class ROCMTargetBackend final : public TargetBackend { } // Link module to HIP device library. - if (bitcodeDirectory.empty()) { + if (options.bitcodeDirectory.empty()) { return variantOp.emitError() << "cannot find ROCM bitcode files. Check your installation " "consistency and in the worst case, set " "--iree-hip-bc-dir= to a path on your system."; } if (failed(linkHIPBitcodeIfNeeded(variantOp.getLoc(), llvmModule.get(), - targetArch, bitcodeDirectory))) { + targetArch, + options.bitcodeDirectory))) { return failure(); } @@ -881,6 +871,7 @@ class HIPTargetDevice final : public TargetDevice { }; namespace { + struct ROCMSession final : PluginSession { @@ -910,10 +901,23 @@ struct ROCMSession final } // namespace mlir::iree_compiler::IREE::HAL +// Iterate over ukernel bitcode embedded-data files, and insert them into the +// EmbeddedDataDirectory singleton. 
+static void addAMDGPUUkernelBitcodeToGlobalEmbeddedDataDirectory() { + using mlir::iree_compiler::EmbeddedDataDirectory; + EmbeddedDataDirectory::withGlobal([](EmbeddedDataDirectory &dir) { + const iree_file_toc_t *toc = iree_uk_amdgpu_bitcode_create(); + for (size_t i = 0; i < iree_uk_amdgpu_bitcode_size(); ++i) { + dir.addFile(toc[i].name, llvm::StringRef{toc[i].data, toc[i].size}); + } + }); +} + extern "C" bool iree_register_compiler_plugin_hal_target_rocm( mlir::iree_compiler::PluginRegistrar *registrar) { registrar->registerPlugin( "hal_target_rocm"); + addAMDGPUUkernelBitcodeToGlobalEmbeddedDataDirectory(); return true; } diff --git a/compiler/plugins/target/ROCM/ROCMTargetUtils.cpp b/compiler/plugins/target/ROCM/ROCMTargetUtils.cpp index 792de8e4a4b0..2cf9f20c0de5 100644 --- a/compiler/plugins/target/ROCM/ROCMTargetUtils.cpp +++ b/compiler/plugins/target/ROCM/ROCMTargetUtils.cpp @@ -6,10 +6,6 @@ #include "compiler/plugins/target/ROCM/ROCMTargetUtils.h" -#include "compiler/plugins/target/ROCM/builtins/ukernel/iree_uk_amdgpu_gfx1030.h" -#include "compiler/plugins/target/ROCM/builtins/ukernel/iree_uk_amdgpu_gfx1100.h" -#include "compiler/plugins/target/ROCM/builtins/ukernel/iree_uk_amdgpu_gfx90a.h" -#include "compiler/plugins/target/ROCM/builtins/ukernel/iree_uk_amdgpu_gfx942.h" #include "iree/compiler/Codegen/Utils/GPUUtils.h" #include "iree/compiler/Dialect/HAL/Utils/LLVMLinkerUtils.h" #include "iree/compiler/Utils/ToolUtils.h" @@ -185,43 +181,6 @@ LogicalResult linkHIPBitcodeIfNeeded(Location loc, llvm::Module *module, return linkWithBitcodeFiles(loc, module, bitcodePaths); } -static std::tuple -getUkernelBitcodeTOC(StringRef gpuArch) { - return llvm::StringSwitch>(gpuArch) - .Case("gfx90a", - {iree_uk_amdgpu_gfx90a_create(), iree_uk_amdgpu_gfx90a_size()}) - .Case("gfx942", - {iree_uk_amdgpu_gfx942_create(), iree_uk_amdgpu_gfx942_size()}) - .Case("gfx1030", - {iree_uk_amdgpu_gfx1030_create(), iree_uk_amdgpu_gfx1030_size()}) - .Case("gfx1100", - 
{iree_uk_amdgpu_gfx1100_create(), iree_uk_amdgpu_gfx1100_size()}) - .Default({nullptr, 0}); -} - -// Links optimized Ukernel bitcode into the given module if the module needs it. -LogicalResult linkUkernelBitcodeFiles(Location loc, llvm::Module *module, - StringRef enabledUkernelsStr, - StringRef targetChip, - StringRef bitcodePath, - unsigned linkerFlags, - llvm::TargetMachine &targetMachine) { - auto [toc, toc_size] = getUkernelBitcodeTOC(targetChip); - if (!toc) { - return failure(); - } - - llvm::Linker linker(*module); - for (int i = 0; i < toc_size; ++i) { - if (failed(linkBitcodeFile(loc, linker, linkerFlags, toc[i].name, - llvm::StringRef(toc[i].data, toc[i].size), - targetMachine, module->getContext()))) - return failure(); - } - - return success(); -} - // Link object file using lld lnker to generate code object // Inspiration from this section comes from LLVM-PROJECT-MLIR by // ROCmSoftwarePlatform diff --git a/compiler/plugins/target/ROCM/builtins/ukernel/BUILD.bazel b/compiler/plugins/target/ROCM/builtins/ukernel/BUILD.bazel index 93e6c86bd4a3..aff7b8965b32 100644 --- a/compiler/plugins/target/ROCM/builtins/ukernel/BUILD.bazel +++ b/compiler/plugins/target/ROCM/builtins/ukernel/BUILD.bazel @@ -49,19 +49,20 @@ argmax_types = [ "iree_uk_amdgpu_argmax_%s.c" % type, "common.h", ], + out = "iree_uk_amdgpu_argmax_%s.%s.bc" % (type, gpu_arch), gpu_arch = gpu_arch, ) for type in argmax_types for gpu_arch in gpu_archs] -argmax_bc_files = {gpu_arch: [ - ":iree_uk_amdgpu_argmax_%s.c.%s.bc" % (type, gpu_arch) +argmax_bc_files = [ + ":iree_uk_amdgpu_argmax_%s.%s.bc" % (type, gpu_arch) for type in argmax_types -] for gpu_arch in gpu_archs} + for gpu_arch in gpu_archs +] -[iree_c_embed_data( - name = "iree_uk_amdgpu_%s" % gpu_arch, - srcs = argmax_bc_files[gpu_arch], - c_file_output = "iree_uk_amdgpu_%s.c" % gpu_arch, +iree_c_embed_data( + name = "iree_uk_amdgpu_bitcode", + srcs = argmax_bc_files, + c_file_output = "iree_uk_amdgpu_bitcode.c", flatten = True, - 
h_file_output = "iree_uk_amdgpu_%s.h" % gpu_arch, - identifier = "iree_uk_amdgpu_%s" % gpu_arch, -) for gpu_arch in gpu_archs] + h_file_output = "iree_uk_amdgpu_bitcode.h", +) diff --git a/compiler/plugins/target/ROCM/builtins/ukernel/CMakeLists.txt b/compiler/plugins/target/ROCM/builtins/ukernel/CMakeLists.txt index 6b3014f3bd53..71d4705eed1a 100644 --- a/compiler/plugins/target/ROCM/builtins/ukernel/CMakeLists.txt +++ b/compiler/plugins/target/ROCM/builtins/ukernel/CMakeLists.txt @@ -22,6 +22,8 @@ iree_amdgpu_bitcode_library( SRCS "common.h" "iree_uk_amdgpu_argmax_f16i32.c" + OUT + "iree_uk_amdgpu_argmax_f16i32.gfx90a.bc" ) iree_amdgpu_bitcode_library( @@ -32,6 +34,8 @@ iree_amdgpu_bitcode_library( SRCS "common.h" "iree_uk_amdgpu_argmax_f16i32.c" + OUT + "iree_uk_amdgpu_argmax_f16i32.gfx942.bc" ) iree_amdgpu_bitcode_library( @@ -42,6 +46,8 @@ iree_amdgpu_bitcode_library( SRCS "common.h" "iree_uk_amdgpu_argmax_f16i32.c" + OUT + "iree_uk_amdgpu_argmax_f16i32.gfx1030.bc" ) iree_amdgpu_bitcode_library( @@ -52,6 +58,8 @@ iree_amdgpu_bitcode_library( SRCS "common.h" "iree_uk_amdgpu_argmax_f16i32.c" + OUT + "iree_uk_amdgpu_argmax_f16i32.gfx1100.bc" ) iree_amdgpu_bitcode_library( @@ -62,6 +70,8 @@ iree_amdgpu_bitcode_library( SRCS "common.h" "iree_uk_amdgpu_argmax_f16i64.c" + OUT + "iree_uk_amdgpu_argmax_f16i64.gfx90a.bc" ) iree_amdgpu_bitcode_library( @@ -72,6 +82,8 @@ iree_amdgpu_bitcode_library( SRCS "common.h" "iree_uk_amdgpu_argmax_f16i64.c" + OUT + "iree_uk_amdgpu_argmax_f16i64.gfx942.bc" ) iree_amdgpu_bitcode_library( @@ -82,6 +94,8 @@ iree_amdgpu_bitcode_library( SRCS "common.h" "iree_uk_amdgpu_argmax_f16i64.c" + OUT + "iree_uk_amdgpu_argmax_f16i64.gfx1030.bc" ) iree_amdgpu_bitcode_library( @@ -92,6 +106,8 @@ iree_amdgpu_bitcode_library( SRCS "common.h" "iree_uk_amdgpu_argmax_f16i64.c" + OUT + "iree_uk_amdgpu_argmax_f16i64.gfx1100.bc" ) iree_amdgpu_bitcode_library( @@ -102,6 +118,8 @@ iree_amdgpu_bitcode_library( SRCS "common.h" "iree_uk_amdgpu_argmax_f32i32.c" + 
OUT + "iree_uk_amdgpu_argmax_f32i32.gfx90a.bc" ) iree_amdgpu_bitcode_library( @@ -112,6 +130,8 @@ iree_amdgpu_bitcode_library( SRCS "common.h" "iree_uk_amdgpu_argmax_f32i32.c" + OUT + "iree_uk_amdgpu_argmax_f32i32.gfx942.bc" ) iree_amdgpu_bitcode_library( @@ -122,6 +142,8 @@ iree_amdgpu_bitcode_library( SRCS "common.h" "iree_uk_amdgpu_argmax_f32i32.c" + OUT + "iree_uk_amdgpu_argmax_f32i32.gfx1030.bc" ) iree_amdgpu_bitcode_library( @@ -132,6 +154,8 @@ iree_amdgpu_bitcode_library( SRCS "common.h" "iree_uk_amdgpu_argmax_f32i32.c" + OUT + "iree_uk_amdgpu_argmax_f32i32.gfx1100.bc" ) iree_amdgpu_bitcode_library( @@ -142,6 +166,8 @@ iree_amdgpu_bitcode_library( SRCS "common.h" "iree_uk_amdgpu_argmax_f32i64.c" + OUT + "iree_uk_amdgpu_argmax_f32i64.gfx90a.bc" ) iree_amdgpu_bitcode_library( @@ -152,6 +178,8 @@ iree_amdgpu_bitcode_library( SRCS "common.h" "iree_uk_amdgpu_argmax_f32i64.c" + OUT + "iree_uk_amdgpu_argmax_f32i64.gfx942.bc" ) iree_amdgpu_bitcode_library( @@ -162,6 +190,8 @@ iree_amdgpu_bitcode_library( SRCS "common.h" "iree_uk_amdgpu_argmax_f32i64.c" + OUT + "iree_uk_amdgpu_argmax_f32i64.gfx1030.bc" ) iree_amdgpu_bitcode_library( @@ -172,76 +202,34 @@ iree_amdgpu_bitcode_library( SRCS "common.h" "iree_uk_amdgpu_argmax_f32i64.c" + OUT + "iree_uk_amdgpu_argmax_f32i64.gfx1100.bc" ) iree_c_embed_data( NAME - iree_uk_amdgpu_gfx90a - SRCS - "iree_uk_amdgpu_argmax_f16i32.c.gfx90a.bc" - "iree_uk_amdgpu_argmax_f16i64.c.gfx90a.bc" - "iree_uk_amdgpu_argmax_f32i32.c.gfx90a.bc" - "iree_uk_amdgpu_argmax_f32i64.c.gfx90a.bc" - C_FILE_OUTPUT - "iree_uk_amdgpu_gfx90a.c" - H_FILE_OUTPUT - "iree_uk_amdgpu_gfx90a.h" - IDENTIFIER - "iree_uk_amdgpu_gfx90a" - FLATTEN - PUBLIC -) - -iree_c_embed_data( - NAME - iree_uk_amdgpu_gfx942 - SRCS - "iree_uk_amdgpu_argmax_f16i32.c.gfx942.bc" - "iree_uk_amdgpu_argmax_f16i64.c.gfx942.bc" - "iree_uk_amdgpu_argmax_f32i32.c.gfx942.bc" - "iree_uk_amdgpu_argmax_f32i64.c.gfx942.bc" - C_FILE_OUTPUT - "iree_uk_amdgpu_gfx942.c" - H_FILE_OUTPUT - 
"iree_uk_amdgpu_gfx942.h" - IDENTIFIER - "iree_uk_amdgpu_gfx942" - FLATTEN - PUBLIC -) - -iree_c_embed_data( - NAME - iree_uk_amdgpu_gfx1030 - SRCS - "iree_uk_amdgpu_argmax_f16i32.c.gfx1030.bc" - "iree_uk_amdgpu_argmax_f16i64.c.gfx1030.bc" - "iree_uk_amdgpu_argmax_f32i32.c.gfx1030.bc" - "iree_uk_amdgpu_argmax_f32i64.c.gfx1030.bc" - C_FILE_OUTPUT - "iree_uk_amdgpu_gfx1030.c" - H_FILE_OUTPUT - "iree_uk_amdgpu_gfx1030.h" - IDENTIFIER - "iree_uk_amdgpu_gfx1030" - FLATTEN - PUBLIC -) - -iree_c_embed_data( - NAME - iree_uk_amdgpu_gfx1100 - SRCS - "iree_uk_amdgpu_argmax_f16i32.c.gfx1100.bc" - "iree_uk_amdgpu_argmax_f16i64.c.gfx1100.bc" - "iree_uk_amdgpu_argmax_f32i32.c.gfx1100.bc" - "iree_uk_amdgpu_argmax_f32i64.c.gfx1100.bc" + iree_uk_amdgpu_bitcode + SRCS + "iree_uk_amdgpu_argmax_f16i32.gfx1030.bc" + "iree_uk_amdgpu_argmax_f16i32.gfx1100.bc" + "iree_uk_amdgpu_argmax_f16i32.gfx90a.bc" + "iree_uk_amdgpu_argmax_f16i32.gfx942.bc" + "iree_uk_amdgpu_argmax_f16i64.gfx1030.bc" + "iree_uk_amdgpu_argmax_f16i64.gfx1100.bc" + "iree_uk_amdgpu_argmax_f16i64.gfx90a.bc" + "iree_uk_amdgpu_argmax_f16i64.gfx942.bc" + "iree_uk_amdgpu_argmax_f32i32.gfx1030.bc" + "iree_uk_amdgpu_argmax_f32i32.gfx1100.bc" + "iree_uk_amdgpu_argmax_f32i32.gfx90a.bc" + "iree_uk_amdgpu_argmax_f32i32.gfx942.bc" + "iree_uk_amdgpu_argmax_f32i64.gfx1030.bc" + "iree_uk_amdgpu_argmax_f32i64.gfx1100.bc" + "iree_uk_amdgpu_argmax_f32i64.gfx90a.bc" + "iree_uk_amdgpu_argmax_f32i64.gfx942.bc" C_FILE_OUTPUT - "iree_uk_amdgpu_gfx1100.c" + "iree_uk_amdgpu_bitcode.c" H_FILE_OUTPUT - "iree_uk_amdgpu_gfx1100.h" - IDENTIFIER - "iree_uk_amdgpu_gfx1100" + "iree_uk_amdgpu_bitcode.h" FLATTEN PUBLIC ) diff --git a/compiler/plugins/target/ROCM/builtins/ukernel/iree_uk_amdgpu_argmax_f16i32.c b/compiler/plugins/target/ROCM/builtins/ukernel/iree_uk_amdgpu_argmax_f16i32.c index 41fe50a6528d..4a6beefa9198 100644 --- a/compiler/plugins/target/ROCM/builtins/ukernel/iree_uk_amdgpu_argmax_f16i32.c +++ 
b/compiler/plugins/target/ROCM/builtins/ukernel/iree_uk_amdgpu_argmax_f16i32.c @@ -6,10 +6,10 @@ #include "compiler/plugins/target/ROCM/builtins/ukernel/common.h" -void iree_uk_amdgpu_argmax_f16i32(const _Float16 *inputBuffer, - int64_t input_offset, int32_t *outputBuffer, - int64_t output_offset, - int64_t reductionSize) { +[[clang::always_inline]] void +iree_uk_amdgpu_argmax_f16i32(const _Float16 *inputBuffer, int64_t input_offset, + int32_t *outputBuffer, int64_t output_offset, + int64_t reductionSize) { const int warpSize = __builtin_amdgcn_wavefrontsize(); _Float16 NEG_F16_MAX = (_Float16)(-65504.0f); int32_t laneID = __builtin_amdgcn_workitem_id_x(); diff --git a/compiler/plugins/target/ROCM/builtins/ukernel/iree_uk_amdgpu_argmax_f16i64.c b/compiler/plugins/target/ROCM/builtins/ukernel/iree_uk_amdgpu_argmax_f16i64.c index 823fc3a4f296..33c1522d143d 100644 --- a/compiler/plugins/target/ROCM/builtins/ukernel/iree_uk_amdgpu_argmax_f16i64.c +++ b/compiler/plugins/target/ROCM/builtins/ukernel/iree_uk_amdgpu_argmax_f16i64.c @@ -6,10 +6,10 @@ #include "compiler/plugins/target/ROCM/builtins/ukernel/common.h" -void iree_uk_amdgpu_argmax_f16i64(const _Float16 *inputBuffer, - int64_t input_offset, int64_t *outputBuffer, - int64_t output_offset, - int64_t reductionSize) { +[[clang::always_inline]] void +iree_uk_amdgpu_argmax_f16i64(const _Float16 *inputBuffer, int64_t input_offset, + int64_t *outputBuffer, int64_t output_offset, + int64_t reductionSize) { const int warpSize = __builtin_amdgcn_wavefrontsize(); _Float16 NEG_F16_MAX = (_Float16)(-65504.0f); int32_t laneID = __builtin_amdgcn_workitem_id_x(); diff --git a/compiler/plugins/target/ROCM/builtins/ukernel/iree_uk_amdgpu_argmax_f32i32.c b/compiler/plugins/target/ROCM/builtins/ukernel/iree_uk_amdgpu_argmax_f32i32.c index 41aad8ba05c5..f39d62372799 100644 --- a/compiler/plugins/target/ROCM/builtins/ukernel/iree_uk_amdgpu_argmax_f32i32.c +++ 
b/compiler/plugins/target/ROCM/builtins/ukernel/iree_uk_amdgpu_argmax_f32i32.c @@ -6,10 +6,10 @@ #include "compiler/plugins/target/ROCM/builtins/ukernel/common.h" -void iree_uk_amdgpu_argmax_f32i32(const float *inputBuffer, - int64_t input_offset, int32_t *outputBuffer, - int64_t output_offset, - int64_t reductionSize) { +[[clang::always_inline]] void +iree_uk_amdgpu_argmax_f32i32(const float *inputBuffer, int64_t input_offset, + int32_t *outputBuffer, int64_t output_offset, + int64_t reductionSize) { const int warpSize = __builtin_amdgcn_wavefrontsize(); int32_t laneID = __builtin_amdgcn_workitem_id_x(); // Set identity value to handle problem non divisible by subgroupSize. diff --git a/compiler/plugins/target/ROCM/builtins/ukernel/iree_uk_amdgpu_argmax_f32i64.c b/compiler/plugins/target/ROCM/builtins/ukernel/iree_uk_amdgpu_argmax_f32i64.c index 5899322d7407..d6a9afbcf2d6 100644 --- a/compiler/plugins/target/ROCM/builtins/ukernel/iree_uk_amdgpu_argmax_f32i64.c +++ b/compiler/plugins/target/ROCM/builtins/ukernel/iree_uk_amdgpu_argmax_f32i64.c @@ -6,10 +6,10 @@ #include "compiler/plugins/target/ROCM/builtins/ukernel/common.h" -void iree_uk_amdgpu_argmax_f32i64(const float *inputBuffer, - int64_t input_offset, int64_t *outputBuffer, - int64_t output_offset, - int64_t reductionSize) { +[[clang::always_inline]] void +iree_uk_amdgpu_argmax_f32i64(const float *inputBuffer, int64_t input_offset, + int64_t *outputBuffer, int64_t output_offset, + int64_t reductionSize) { const int warpSize = __builtin_amdgcn_wavefrontsize(); int32_t laneID = __builtin_amdgcn_workitem_id_x(); // Set identity value to handle problem non divisible by subgroupSize. 
diff --git a/compiler/plugins/target/ROCM/test/BUILD.bazel b/compiler/plugins/target/ROCM/test/BUILD.bazel new file mode 100644 index 000000000000..bf9a18d582bd --- /dev/null +++ b/compiler/plugins/target/ROCM/test/BUILD.bazel @@ -0,0 +1,26 @@ +# Copyright 2024 The IREE Authors +# +# Licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +load("//build_tools/bazel:build_defs.oss.bzl", "iree_cmake_extra_content") +load("//build_tools/bazel:iree_lit_test.bzl", "iree_lit_test_suite") + +package( + features = ["layering_check"], + licenses = ["notice"], # Apache 2.0 +) + +iree_lit_test_suite( + name = "lit", + srcs = [ + "gpu_lower_to_ukernels.mlir", + "ukernel_pipeline_transform.mlir", + ], + cfg = "//compiler:lit.cfg.py", + tools = [ + "//tools:iree-opt", + "@llvm-project//llvm:FileCheck", + ], +) diff --git a/compiler/plugins/target/ROCM/test/CMakeLists.txt b/compiler/plugins/target/ROCM/test/CMakeLists.txt index df185a05e72b..6d2199d8c4bb 100644 --- a/compiler/plugins/target/ROCM/test/CMakeLists.txt +++ b/compiler/plugins/target/ROCM/test/CMakeLists.txt @@ -1,10 +1,33 @@ -# NOTE: Bazel testing of this backend is impossible because there is no way -# for Bazel to bundle the AMD bitcode files that the backend depends on. Users -# of the compiler can pass explicit flags, but we prefer that default tests -# exercise default flags, which cannot be supported properly on Bazel builds. +################################################################################ +# Autogenerated by build_tools/bazel_to_cmake/bazel_to_cmake.py from # +# compiler/plugins/target/ROCM/test/BUILD.bazel # +# # +# Use iree_cmake_extra_content from iree/build_defs.oss.bzl to add arbitrary # +# CMake-only content. # +# # +# To disable autogeneration for this file entirely, delete this header. 
# +################################################################################ iree_add_all_subdirs() +iree_lit_test_suite( + NAME + lit + SRCS + "gpu_lower_to_ukernels.mlir" + "ukernel_pipeline_transform.mlir" + TOOLS + FileCheck + iree-opt +) + +### BAZEL_TO_CMAKE_PRESERVES_ALL_CONTENT_BELOW_THIS_LINE ### + +# NOTE: The following tests are CMake-only because they depend on AMD device +# bitcode libraries that are provided by custom CMake code in target/ROCM. +# By contrast, the above tests that only require ukernel bitcode are part of the +# Bazel build because ukernel bitcode is something that we generate ourselves. + iree_lit_test_suite( NAME lit diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_lower_to_ukernels.mlir b/compiler/plugins/target/ROCM/test/gpu_lower_to_ukernels.mlir similarity index 84% rename from compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_lower_to_ukernels.mlir rename to compiler/plugins/target/ROCM/test/gpu_lower_to_ukernels.mlir index cc71c379959f..177bd0b36f7c 100644 --- a/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_lower_to_ukernels.mlir +++ b/compiler/plugins/target/ROCM/test/gpu_lower_to_ukernels.mlir @@ -1,5 +1,4 @@ -// RUN: iree-opt --split-input-file --iree-gpu-test-target=gfx1100 --pass-pipeline="builtin.module(func.func(iree-codegen-gpu-lower-to-ukernels,cse,canonicalize))" %s | FileCheck %s -// RUN: iree-opt --split-input-file --iree-gpu-test-target=gfx90a --pass-pipeline="builtin.module(func.func(iree-codegen-gpu-lower-to-ukernels,cse,canonicalize))" %s | FileCheck %s --check-prefix=CDNA2 +// RUN: iree-opt --split-input-file --iree-gpu-test-target=gfx942 --pass-pipeline="builtin.module(func.func(iree-codegen-gpu-lower-to-ukernels,cse,canonicalize))" %s | FileCheck %s // RUN: iree-opt --split-input-file --iree-gpu-test-target=gfx908 --pass-pipeline="builtin.module(func.func(iree-codegen-gpu-lower-to-ukernels,cse,canonicalize))" %s | FileCheck %s --check-prefix=CDNA1 func.func 
@argmax_2d_f32i64(%arg0 : tensor<1x?xf32>) -> tensor<1xi64> attributes { @@ -28,7 +27,7 @@ func.func @argmax_2d_f32i64(%arg0 : tensor<1x?xf32>) -> tensor<1xi64> attributes // CHECK-DAG: %[[C1_index:.+]] = arith.constant 1 : index // CHECK-DAG: %[[C0_i64:.+]] = arith.constant 0 // CHECK-DAG: %[[FILL:.+]] = linalg.fill ins(%[[C0_i64]] -// CHECK: %[[MICRO_KERNEL:.+]] = iree_codegen.ukernel.generic "iree_uk_amdgpu_argmax_f32i64" +// CHECK: %[[MICRO_KERNEL:.+]] = iree_codegen.ukernel.generic {hal.executable.objects = [{{.*}}]} "iree_uk_amdgpu_argmax_f32i64" // CHECK-SAME: ins(%[[ARG0]] : // CHECK-SAME: outs(%[[FILL]] : // CHECK: return %[[MICRO_KERNEL]] @@ -284,3 +283,51 @@ func.func @argmax_ukernel_unsupported_arch(%arg0 : tensor<1x?xf32>) -> tensor<1x // CDNA1-LABEL: func @argmax_ukernel_unsupported_arch( // CDNA1-NOT: iree_codegen.ukernel.generic // CDNA1: linalg.generic + +// ----- + +// Test user-provided bitcode in the source IR. + +func.func @argmax_2d_f32i64(%arg0 : tensor<1x?xf32>) -> tensor<1xi64> attributes { + hal.executable.target = #hal.executable.target<"rocm", "rocm-hsaco-fb", {ukernels = "all"}>, + // Dummy bitcode with an unusual length of 12. The first 4 bytes are the .bc file format signature. 
+ hal.executable.objects = [ + #hal.executable.object<{ + path = "iree_uk_amdgpu_argmax_f32i64.gfx942.bc", + data = dense<[66, 67, -64, -34, 1, 35, 69, 103, -119, -85, -51, -17]> : tensor<12xi8> + }> + ] +} { + %c0_i64 = arith.constant 0 : i64 + %cst = arith.constant 0xFF800000 : f32 + %0 = tensor.empty() : tensor<1xi64> + %1 = linalg.fill ins(%c0_i64 : i64) outs(%0 : tensor<1xi64>) -> tensor<1xi64> + %2 = tensor.empty() : tensor<1xf32> + %3 = linalg.fill ins(%cst : f32) outs(%2 : tensor<1xf32>) -> tensor<1xf32> + %4:2 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"]} ins(%arg0 : tensor<1x?xf32>) outs(%3, %1 : tensor<1xf32>, tensor<1xi64>) { + ^bb0(%in: f32, %out: f32, %out_0: i64): + %5 = linalg.index 1 : index + %6 = arith.index_cast %5 : index to i64 + %7 = arith.maximumf %in, %out : f32 + %8 = arith.cmpf ogt, %in, %out : f32 + %9 = arith.select %8, %6, %out_0 : i64 + linalg.yield %7, %9 : f32, i64 + } -> (tensor<1xf32>, tensor<1xi64>) + return %4#1 : tensor<1xi64> +} + +//CHECK-LABEL: func @argmax_2d_f32i64( +// CHECK-SAME: %[[ARG0:[a-zA-Z0-9]+]]: tensor<1x?xf32> +// CHECK-DAG: %[[C1_index:.+]] = arith.constant 1 : index +// CHECK-DAG: %[[C0_i64:.+]] = arith.constant 0 +// CHECK-DAG: %[[FILL:.+]] = linalg.fill ins(%[[C0_i64]] +// CHECK: %[[MICRO_KERNEL:.+]] = iree_codegen.ukernel.generic { +// CHECK-SAME: hal.executable.objects = [ +// CHECK-SAME: #hal.executable.object<{ +// CHECK-SAME: path = "iree_uk_amdgpu_argmax_f32i64.gfx942.bc", +// CHECK-SAME: data = dense<[66, 67, -64, -34, 1, 35, 69, 103, -119, -85, -51, -17]> : tensor<12xi8> +// CHECK-SAME: }> +// CHECK-SAME: ]} "iree_uk_amdgpu_argmax_f32i64" +// CHECK-SAME: ins(%[[ARG0]] : +// CHECK-SAME: outs(%[[FILL]] : +// CHECK: return %[[MICRO_KERNEL]] diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ukernel_pipeline_transform.mlir 
b/compiler/plugins/target/ROCM/test/ukernel_pipeline_transform.mlir similarity index 90% rename from compiler/src/iree/compiler/Codegen/LLVMGPU/test/ukernel_pipeline_transform.mlir rename to compiler/plugins/target/ROCM/test/ukernel_pipeline_transform.mlir index af8ecf196115..26ce4c8959f4 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ukernel_pipeline_transform.mlir +++ b/compiler/plugins/target/ROCM/test/ukernel_pipeline_transform.mlir @@ -4,10 +4,11 @@ #hal.pipeline.binding, #hal.pipeline.binding ]> -#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {ukernels = "argmax"}> #map = affine_map<(d0) -> (d0)> #map1 = affine_map<(d0) -> ()> -func.func @argmax_1d_f16i64() attributes {hal.executable.target = #executable_target_rocm_hsaco_fb} { +func.func @argmax_1d_f16i64() attributes { + hal.executable.target = #hal.executable.target<"rocm", "rocm-hsaco-fb", {ukernels = "argmax"}> +} { %c32_i64 = arith.constant 32 : i64 %cst = arith.constant 0xFC00 : f16 %c0_i64 = arith.constant 0 : i64 @@ -43,7 +44,7 @@ func.func @argmax_1d_f16i64() attributes {hal.executable.target = #executable_ta // CHECK: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info // CHECK: func.func @argmax_1d_f16i64() // CHECK-SAME: translation_info = #[[$TRANSLATION]] -// CHECK: iree_codegen.ukernel.generic "iree_uk_amdgpu_argmax_f16i64" +// CHECK: iree_codegen.ukernel.generic {hal.executable.objects = [{{.*}}]} "iree_uk_amdgpu_argmax_f16i64" // ----- @@ -51,10 +52,11 @@ func.func @argmax_1d_f16i64() attributes {hal.executable.target = #executable_ta #hal.pipeline.binding, #hal.pipeline.binding ]> -#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {ukernels = "argmax"}> #map = affine_map<(d0, d1) -> (d0, d1)> #map1 = affine_map<(d0, d1) -> (d0)> -func.func @argmax_2d_f32i64() attributes {hal.executable.target = #executable_target_rocm_hsaco_fb} { +func.func @argmax_2d_f32i64() attributes { + hal.executable.target = 
#hal.executable.target<"rocm", "rocm-hsaco-fb", {ukernels = "argmax"}> +} { %c32_i64 = arith.constant 32 : i64 %cst = arith.constant 0xFF800000 : f32 %c0_i64 = arith.constant 0 : i64 @@ -92,7 +94,7 @@ func.func @argmax_2d_f32i64() attributes {hal.executable.target = #executable_ta // CHECK-SAME: translation_info = #[[$TRANSLATION]] // CHECK: %[[SUBVIEW:.*]] = memref.subview{{.*}} memref<16x?xf32 // CHECK-SAME: to memref<1x?xf32 -// CHECK: iree_codegen.ukernel.generic "iree_uk_amdgpu_argmax_f32i64" ins(%[[SUBVIEW]] +// CHECK: iree_codegen.ukernel.generic {hal.executable.objects = [{{.*}}]} "iree_uk_amdgpu_argmax_f32i64" ins(%[[SUBVIEW]] // ----- @@ -100,10 +102,11 @@ func.func @argmax_2d_f32i64() attributes {hal.executable.target = #executable_ta #hal.pipeline.binding, #hal.pipeline.binding ]> -#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb"> #map = affine_map<(d0) -> (d0)> #map1 = affine_map<(d0) -> ()> -func.func @no_ukernel_argmax_1d_f16i64() attributes {hal.executable.target = #executable_target_rocm_hsaco_fb} { +func.func @no_ukernel_argmax_1d_f16i64() attributes { + hal.executable.target = #hal.executable.target<"rocm", "rocm-hsaco-fb"> +} { %c32_i64 = arith.constant 32 : i64 %cst = arith.constant 0xFC00 : f16 %c0_i64 = arith.constant 0 : i64 @@ -147,10 +150,11 @@ func.func @no_ukernel_argmax_1d_f16i64() attributes {hal.executable.target = #ex #hal.pipeline.binding, #hal.pipeline.binding ]> -#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {ukernels = "argmax"}> #map = affine_map<(d0) -> (d0)> #map1 = affine_map<(d0) -> ()> -func.func @not_neg_inf_init_argmax_1d() attributes {hal.executable.target = #executable_target_rocm_hsaco_fb} { +func.func @not_neg_inf_init_argmax_1d() attributes { + hal.executable.target = #hal.executable.target<"rocm", "rocm-hsaco-fb", {ukernels = "argmax"}> +} { %c32_i64 = arith.constant 32 : i64 %cst = arith.constant 0.000000e+00 : f16 %c0_i64 = arith.constant 0 : 
i64 diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/GPULowerToUKernels.cpp b/compiler/src/iree/compiler/Codegen/Common/GPU/GPULowerToUKernels.cpp index f76cdd1c6ccf..a72b4ff8e180 100644 --- a/compiler/src/iree/compiler/Codegen/Common/GPU/GPULowerToUKernels.cpp +++ b/compiler/src/iree/compiler/Codegen/Common/GPU/GPULowerToUKernels.cpp @@ -9,9 +9,12 @@ #include "iree/compiler/Codegen/Dialect/Codegen/IR/UKernelOps.h" #include "iree/compiler/Codegen/Utils/GPUUtils.h" #include "iree/compiler/Codegen/Utils/Utils.h" +#include "iree/compiler/Utils/EmbeddedDataDirectory.h" +#include "llvm/Support/FormatVariadic.h" #include "mlir/Dialect/Linalg/IR/Linalg.h" #include "mlir/Dialect/Linalg/Utils/Utils.h" #include "mlir/Dialect/Tensor/IR/Tensor.h" +#include "mlir/IR/AsmState.h" #include "mlir/IR/Attributes.h" #include "mlir/IR/BuiltinAttributes.h" #include "mlir/IR/MLIRContext.h" @@ -24,22 +27,101 @@ namespace mlir::iree_compiler { namespace { +// Returns an ExecutableObjectAttr carrying the bitcode for the given ukernel. +// +// First tries finding the bitcode in the input `sourceExecutableObjects`, which +// must be an array of ExecutableObjectAttr's and is typically coming from a +// hal.executable.objects array attribute in the source IR, which is the +// mechanism by which source programs may provide their own ukernel bitcode. +// +// If no matching bitcode was found in `sourceExecutableObjects`, this function +// will then search in bitcode files that we have embedded as static data. 
+static IREE::HAL::ExecutableObjectAttr +getUKernelBitcode(OpBuilder &builder, + IREE::HAL::ExecutableTargetAttr execTarget, + ArrayAttr sourceExecutableObjects, StringRef ukernelName) { + IREE::GPU::TargetAttr gpuTarget = getGPUTargetAttr(execTarget); + if (!gpuTarget) { + return {}; + } + StringRef gpuArch = gpuTarget.getArch(); + std::string bitcodeFilename = + llvm::formatv("{0}.{1}.bc", ukernelName, gpuArch); + + // Early-return if the source executable.objects already contain an object + // with the expected file name. This happens with user-provided bitcode in the + // source IR. + if (sourceExecutableObjects) { + for (Attribute a : sourceExecutableObjects) { + if (auto object = dyn_cast(a)) { + if (object.getPath() == bitcodeFilename) { + return object; + } + } + } + } + + // No user-provided bitcode, so we search our embedded bitcode files in the + // EmbeddedDataDirectory singleton. + std::optional bitcode; + EmbeddedDataDirectory::withGlobal([&](EmbeddedDataDirectory &dir) { + bitcode = dir.getFile(bitcodeFilename); + }); + if (!bitcode) { + return {}; + } + MLIRContext *context = builder.getContext(); + auto blob = HeapAsmResourceBlob::allocateAndCopyInferAlign( + ArrayRef(bitcode->data(), bitcode->size())); + auto bitcodeDenseAttr = DenseI8ResourceElementsAttr::get( + VectorType::get({static_cast(bitcode->size())}, + builder.getI8Type()), + bitcodeFilename, std::move(blob)); + return IREE::HAL::ExecutableObjectAttr::get( + context, StringAttr::get(context, bitcodeFilename), + cast(bitcodeDenseAttr)); +} + +// Walks parent ops from `op` to return the nearest hal.executable.objects +// array attribute. If the parent hal.executable.variant is reached, its objects +// attribute is returned. +// Adapted from ExecutableTargetAttr::lookup. 
+static ArrayAttr lookUpExecutableObjects(Operation *op) { + MLIRContext *context = op->getContext(); + auto attrId = StringAttr::get(context, "hal.executable.objects"); + while (op) { + // Take directly from the enclosing variant. + if (auto variantOp = dyn_cast(op)) { + if (std::optional objects = variantOp.getObjects()) { + return *objects; + } + } + // Take from op attributes. + if (auto attr = op->getAttrOfType(attrId)) { + return attr; + } + // Continue walk. + op = op->getParentOp(); + } + return {}; +} + /// Holds a function name and attributes. struct FnNameAndDefAttrs { std::string name; SmallVector defAttrs; + explicit operator bool() const { return !name.empty(); } }; /// Returns the function name and attributes to use for a ukernel with given -/// `ukernelName` on the target described by `targetAttr`. +/// `name` and `suffix` on the target described by `targetAttr`. static FnNameAndDefAttrs -getFnNameAndDefAttrs(const char *ukernelName, std::string &typeSuffixID, +getFnNameAndDefAttrs(const char *name, std::string &suffix, RewriterBase &rewriter, IREE::HAL::ExecutableTargetAttr targetAttr) { FnNameAndDefAttrs result; if (isROCMBackend(targetAttr)) { - result.name = - std::string("iree_uk_amdgpu_") + ukernelName + "_" + typeSuffixID; + result.name = llvm::formatv("iree_uk_amdgpu_{0}_{1}", name, suffix); result.defAttrs.emplace_back(rewriter.getStringAttr("vm.import.module"), rewriter.getStringAttr("rocm")); } @@ -54,9 +136,21 @@ static FailureOr matchArgmaxDAGForUKernel(RewriterBase &rewriter, linalg::GenericOp op) { auto targetAttr = IREE::HAL::ExecutableTargetAttr::lookup(op); const char ukernelName[] = "argmax"; - if (!hasUkernel(targetAttr, ukernelName) || - !hasUkernelSupportedGpuArch(targetAttr)) { - return failure(); + Value input = op.getDpsInputOperand(0)->get(); + auto inputType = cast(input.getType()); + Value index = op.getDpsInitOperand(1)->get(); + auto indexType = cast(index.getType()); + std::string suffix; + 
llvm::raw_string_ostream(suffix) + << inputType.getElementType() << indexType.getElementType(); + FnNameAndDefAttrs fn = + getFnNameAndDefAttrs(ukernelName, suffix, rewriter, targetAttr); + if (!fn) { + return rewriter.notifyMatchFailure(op, "no ukernels on this backend"); + } + + if (!hasUkernel(targetAttr, ukernelName)) { + return rewriter.notifyMatchFailure(op, "ukernel not enabled"); } // Currently only support argmax where parallel dims are 1. @@ -74,68 +168,40 @@ matchArgmaxDAGForUKernel(RewriterBase &rewriter, linalg::GenericOp op) { } parallelSize *= bounds[dim]; } - if (parallelSize != 1) - return failure(); - - // Get value/input type. - Value input = op.getDpsInputOperand(0)->get(); - auto inputType = llvm::cast(input.getType()); - Type inputElemType = inputType.getElementType(); - // Only support f16 and f32 values. - if (!inputElemType.isF16() && !inputElemType.isF32()) { - return failure(); - } - - // Get index type. - Value index = op.getDpsInitOperand(1)->get(); - auto indexType = llvm::cast(index.getType()); - Type indexElemType = indexType.getElementType(); - // Only support i32 and i64 index. - if (!indexElemType.isInteger(32) && !indexElemType.isInteger(64)) { + if (parallelSize != 1) { return failure(); } - - std::string typeSuffixID; - llvm::raw_string_ostream(typeSuffixID) << inputElemType << indexElemType; - // TODO(bjacob): this check won't be needed one this code will be updated to - // look up the table of contents of embedded bitcode files, one per symbol. 
- if (!(typeSuffixID == "f16i32" || typeSuffixID == "f16i64" || - typeSuffixID == "f32i32" || typeSuffixID == "f32i64")) { - return rewriter.notifyMatchFailure( - op, "unsupported combination of element types"); + auto execTarget = IREE::HAL::ExecutableTargetAttr::lookup(op); + ArrayAttr sourceExecutableObjects = lookUpExecutableObjects(op); + IREE::HAL::ExecutableObjectAttr bitcodeObject = + getUKernelBitcode(rewriter, execTarget, sourceExecutableObjects, fn.name); + if (!bitcodeObject) { + return rewriter.notifyMatchFailure(op, "no ukernel bitcode for this op"); } - Location loc = op.getLoc(); // Currently only support 1D reduction, where reduc is on fastest dim. // Tiling argmax ukernel is also set to enforce this structure. const int kReductionDim = op.getNumLoops() - 1; Value reductionDimSize = rewriter.create(loc, input, kReductionDim); - auto fn = - getFnNameAndDefAttrs(ukernelName, typeSuffixID, rewriter, targetAttr); auto genericMicroKernelOp = rewriter.create( loc, indexType, fn.name, ValueRange{input}, index, ValueRange{reductionDimSize}, /*fn_def_attrs=*/rewriter.getDictionaryAttr(fn.defAttrs), /*strided_outer_dims=*/rewriter.getIndexAttr(0)); + genericMicroKernelOp->setAttr( + "hal.executable.objects", + ArrayAttr::get(rewriter.getContext(), bitcodeObject)); return cast( genericMicroKernelOp.getOperation()); } -using TargetPredicate = std::function; - struct LowerArgmaxToUKernelPattern : OpRewritePattern { - LowerArgmaxToUKernelPattern(MLIRContext *context, - TargetPredicate targetPredicate) - : OpRewritePattern(context), - targetPredicate(targetPredicate) {} + LowerArgmaxToUKernelPattern(MLIRContext *context) + : OpRewritePattern(context) {} LogicalResult matchAndRewrite(linalg::GenericOp op, PatternRewriter &rewriter) const override { - if (targetPredicate && - !targetPredicate(IREE::HAL::ExecutableTargetAttr::lookup(op))) { - return failure(); - } if (failed(isArgmaxOp(op))) { return failure(); } @@ -149,8 +215,6 @@ struct 
LowerArgmaxToUKernelPattern : OpRewritePattern { ukernelOp.value()->getResults()); return success(); } - - TargetPredicate targetPredicate; }; struct GPULowerToUKernelsPass final @@ -170,7 +234,7 @@ struct GPULowerToUKernelsPass final // evidence that it is difficult for codegen to consistently approach // microkernels performance, and that consideration overrides the benefit of // fusions for these ops. - patterns.insert(context, isROCMBackend); + patterns.insert(context); if (failed(applyPatternsAndFoldGreedily(getOperation(), std::move(patterns)))) { return signalPassFailure(); diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/Passes.td b/compiler/src/iree/compiler/Codegen/Common/GPU/Passes.td index e8f1551c477a..15c6e9c23c93 100644 --- a/compiler/src/iree/compiler/Codegen/Common/GPU/Passes.td +++ b/compiler/src/iree/compiler/Codegen/Common/GPU/Passes.td @@ -107,7 +107,7 @@ def GPUInferMemorySpacePass : def GPULowerToUKernelsPass : Pass<"iree-codegen-gpu-lower-to-ukernels", ""> { - let summary = "Separate out parts of the IR that lower to a micro-kernel"; + let summary = "Lower suitable ops to microkernels."; let dependentDialects = [ "::mlir::iree_compiler::IREE::Codegen::IREECodegenDialect", "::mlir::iree_compiler::IREE::GPU::IREEGPUDialect", diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/test/BUILD.bazel b/compiler/src/iree/compiler/Codegen/Common/GPU/test/BUILD.bazel index 7d0e6887d717..7fe33161bca1 100644 --- a/compiler/src/iree/compiler/Codegen/Common/GPU/test/BUILD.bazel +++ b/compiler/src/iree/compiler/Codegen/Common/GPU/test/BUILD.bazel @@ -30,7 +30,6 @@ iree_lit_test_suite( "gpu_fuse_and_hoist_forall.mlir", "gpu_greedily_distribute_to_threads.mlir", "gpu_infer_memory_space.mlir", - "gpu_lower_to_ukernels.mlir", "gpu_combine_value_barriers.mlir", "gpu_materialize_encoding_gfx908.mlir", "gpu_materialize_encoding_gfx90a.mlir", @@ -58,8 +57,6 @@ iree_lit_test_suite( "vector_reduction_to_gpu.mlir", ], include = ["*.mlir"], - exclude = [ 
- ], ), cfg = "//compiler:lit.cfg.py", tools = [ diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/test/CMakeLists.txt b/compiler/src/iree/compiler/Codegen/Common/GPU/test/CMakeLists.txt index a9c584acd96d..4b9853df8213 100644 --- a/compiler/src/iree/compiler/Codegen/Common/GPU/test/CMakeLists.txt +++ b/compiler/src/iree/compiler/Codegen/Common/GPU/test/CMakeLists.txt @@ -26,7 +26,6 @@ iree_lit_test_suite( "gpu_generalize_named_ops.mlir" "gpu_greedily_distribute_to_threads.mlir" "gpu_infer_memory_space.mlir" - "gpu_lower_to_ukernels.mlir" "gpu_materialize_encoding_gfx1100.mlir" "gpu_materialize_encoding_gfx908.mlir" "gpu_materialize_encoding_gfx90a.mlir" diff --git a/compiler/src/iree/compiler/Codegen/Dialect/Codegen/IR/UKernelOps.cpp b/compiler/src/iree/compiler/Codegen/Dialect/Codegen/IR/UKernelOps.cpp index b044244dbc0a..dbf082b5be03 100644 --- a/compiler/src/iree/compiler/Codegen/Dialect/Codegen/IR/UKernelOps.cpp +++ b/compiler/src/iree/compiler/Codegen/Dialect/Codegen/IR/UKernelOps.cpp @@ -61,7 +61,12 @@ createFunctionCall(RewriterBase &rewriter, Operation *op, StringRef fnName, } // Insert the function call. 
- return rewriter.create(loc, fnDecl, callOperands); + auto callOp = rewriter.create(loc, fnDecl, callOperands); + if (op->hasAttr("hal.executable.objects")) { + callOp->setAttr("hal.executable.objects", + op->getAttr("hal.executable.objects")); + } + return callOp; } //===---------------------------------------------------------------------===// diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp index d63fd2d5d258..23e5cbb13e27 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp @@ -1824,18 +1824,25 @@ static LogicalResult setTransposeConfig(mlir::FunctionOpInterface entryPoint, // UKernel Pipeline Configuration //====---------------------------------------------------------------------===// -/// Set the configuration for argmax that can be mapped to argmax uKernel. +/// Set the configuration for argmax when ukernels are enabled. /// Distribute all parallel dim across different workgroups, and only use single /// subgroup per workgroup. +/// +/// TODO(bjacob): This is fragile, as we can't know yet if this argmax will be +/// lowered to a ukernel. We need instead a config that works regardless of +/// ukernels. For now, we use the looser condition that the argmax ukernel is +/// enabled, a necessary but not sufficient condition for this particular op to +/// lower to the ukernel. This is good enough for now for a couple of reasons: +/// 1. Even if an argmax does not actually lower to a ukernel, this config should +/// still work. +/// 2. Ukernels are not enabled by default. static LogicalResult setArgmaxUkernelConfig(IREE::GPU::TargetAttr target, mlir::FunctionOpInterface entryPoint, linalg::GenericOp op) { // Checks if UKernels are enabled. 
if (auto target = IREE::HAL::ExecutableTargetAttr::lookup(entryPoint)) { - const char ukernelName[] = "argmax"; - if (!hasUkernel(target, ukernelName) || - !hasUkernelSupportedGpuArch(target)) { + if (!hasUkernel(target, "argmax")) { return failure(); } } diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/BUILD.bazel b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/BUILD.bazel index 756327ca4475..5d4042975f29 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/BUILD.bazel +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/BUILD.bazel @@ -65,7 +65,6 @@ iree_lit_test_suite( "transform_gpu_pipelining.mlir", "transform_vector_to_mma.mlir", "transpose_pipeline_test.mlir", - "ukernel_pipeline_transform.mlir", "configure_tensor_layout.mlir", "vector_lowering.mlir", "vector_to_gpu.mlir", diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/CMakeLists.txt b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/CMakeLists.txt index 1d0dcc979a56..fb9e495d9535 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/CMakeLists.txt +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/CMakeLists.txt @@ -62,7 +62,6 @@ iree_lit_test_suite( "transform_gpu_pipelining.mlir" "transform_vector_to_mma.mlir" "transpose_pipeline_test.mlir" - "ukernel_pipeline_transform.mlir" "vector_lowering.mlir" "vector_to_gpu.mlir" "winograd_pipeline_test.mlir" diff --git a/compiler/src/iree/compiler/Codegen/Utils/GPUUtils.cpp b/compiler/src/iree/compiler/Codegen/Utils/GPUUtils.cpp index 612183d94eda..8f09f6f932f8 100644 --- a/compiler/src/iree/compiler/Codegen/Utils/GPUUtils.cpp +++ b/compiler/src/iree/compiler/Codegen/Utils/GPUUtils.cpp @@ -936,42 +936,6 @@ bool sharedMemTransposeFilter(AffineMap indexMap) { return false; } -//===----------------------------------------------------------------------===// -// GPU UKernel Utils -//===----------------------------------------------------------------------===// - -// TODO: Add more popular kernels into this list and the 
ukernel cmake. -// No real technical reason to only allow these aside from compile -// time and diskspace. -bool hasUkernelSupportedRocmArch(StringRef targetChip) { - const char *kSupportedTargetChip[] = {"gfx90a", "gfx942", "gfx1030", - "gfx1100"}; - size_t arraySize = - sizeof(kSupportedTargetChip) / sizeof(kSupportedTargetChip[0]); - for (int i = 0; i < arraySize; i++) { - // return true if targetChip is found inside kSupportedTargetChip. - if (targetChip.compare(kSupportedTargetChip[i]) == 0) - return true; - } - return false; -} - -bool hasUkernelSupportedRocmArch(IREE::HAL::ExecutableTargetAttr targetAttr) { - auto targetArch = getGPUTargetAttr(targetAttr).getArch(); - if (targetArch.empty()) - return false; - return hasUkernelSupportedRocmArch(targetArch); -} - -/// Checks if target GPU has UKernel support. -bool hasUkernelSupportedGpuArch(IREE::HAL::ExecutableTargetAttr targetAttr) { - if (isROCMBackend(targetAttr) && hasUkernelSupportedRocmArch(targetAttr)) { - return true; - } - // TODO: Once plumbed, add a CUDA backend and supported cuda arch check. - return false; -} - //===----------------------------------------------------------------------===// // GPU Target Information //===----------------------------------------------------------------------===// diff --git a/compiler/src/iree/compiler/Codegen/Utils/GPUUtils.h b/compiler/src/iree/compiler/Codegen/Utils/GPUUtils.h index 133d7246a5fa..1bd088588af8 100644 --- a/compiler/src/iree/compiler/Codegen/Utils/GPUUtils.h +++ b/compiler/src/iree/compiler/Codegen/Utils/GPUUtils.h @@ -174,16 +174,6 @@ combiningKindToAllReduce(vector::CombiningKind kind); /// using shared memory when CodeGen towards the GPU. bool sharedMemTransposeFilter(AffineMap indexMap); -//===----------------------------------------------------------------------===// -// GPU UKernel Utils -//===----------------------------------------------------------------------===// - -/// Checks if target Chip(StringRef) has UKernel support. 
-bool hasUkernelSupportedRocmArch(StringRef targetChip); - -/// Checks if targetAttr's GPU target has UKernel support. -bool hasUkernelSupportedGpuArch(IREE::HAL::ExecutableTargetAttr targetAttr); - //===----------------------------------------------------------------------===// // GPU Target Information //===----------------------------------------------------------------------===// diff --git a/compiler/src/iree/compiler/Utils/BUILD.bazel b/compiler/src/iree/compiler/Utils/BUILD.bazel index dbcdc0156dae..c7c2acc2a8fd 100644 --- a/compiler/src/iree/compiler/Utils/BUILD.bazel +++ b/compiler/src/iree/compiler/Utils/BUILD.bazel @@ -31,6 +31,7 @@ iree_compiler_cc_library( hdrs = [ "ConversionUtils.h", "ElementPackingUtils.h", + "EmbeddedDataDirectory.h", "EquivalenceUtils.h", "FlatbufferUtils.h", "Folding.h", diff --git a/compiler/src/iree/compiler/Utils/CMakeLists.txt b/compiler/src/iree/compiler/Utils/CMakeLists.txt index c4f20b2ac74f..84be0745bbf6 100644 --- a/compiler/src/iree/compiler/Utils/CMakeLists.txt +++ b/compiler/src/iree/compiler/Utils/CMakeLists.txt @@ -16,6 +16,7 @@ iree_cc_library( HDRS "ConversionUtils.h" "ElementPackingUtils.h" + "EmbeddedDataDirectory.h" "EquivalenceUtils.h" "FlatbufferUtils.h" "Folding.h" diff --git a/compiler/src/iree/compiler/Utils/EmbeddedDataDirectory.h b/compiler/src/iree/compiler/Utils/EmbeddedDataDirectory.h new file mode 100644 index 000000000000..130db1d68bf1 --- /dev/null +++ b/compiler/src/iree/compiler/Utils/EmbeddedDataDirectory.h @@ -0,0 +1,59 @@ +// Copyright 2024 The IREE Authors +// +// Licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#ifndef IREE_COMPILER_UTILS_EMBEDDEDDATADIRECTORY_H_ +#define IREE_COMPILER_UTILS_EMBEDDEDDATADIRECTORY_H_ + +#include +#include "llvm/ADT/StringMap.h" + +namespace mlir::iree_compiler { + +// A string-to-StringRef map that acts as a virtual filesystem: the keys are +// "filenames" and the values are file contents. +class EmbeddedDataDirectory { +public: + // Calls the given `callback` on a global singleton object, guarded by a + // global mutex. + // + // Only use this for use cases that require a global object, such as when + // exporting data between parts of the compiler that can't directly link to + // each other (e.g. from a plugin to outside of the plugin). + static void + withGlobal(llvm::function_ref callback) { + static EmbeddedDataDirectory dir; + static std::mutex mutex; + std::lock_guard lock(mutex); + callback(dir); + } + + // Add a new entry if it didn't already exist. Return `true` if it was added. + bool addFile(llvm::StringRef fileName, llvm::StringRef contents) { + auto [_iter, success] = map.insert({fileName, contents}); + return success; + } + + // Get an existing entry if it exists, otherwise return nullopt. + std::optional getFile(llvm::StringRef fileName) const { + auto iter = map.find(fileName); + if (iter == map.end()) { + return std::nullopt; + } + return iter->getValue(); + } + + // Direct access to the underlying StringMap, for use cases that are not well + // served by convenience methods like addFile and getFile. For example, + // iterating over all entries. 
+ llvm::StringMap &getMap() { return map; } + +private: + llvm::StringMap map; +}; + +} // namespace mlir::iree_compiler + +#endif // IREE_COMPILER_UTILS_EMBEDDEDDATADIRECTORY_H_ diff --git a/compiler/src/iree/compiler/Utils/unittests/BUILD.bazel b/compiler/src/iree/compiler/Utils/unittests/BUILD.bazel index 197037a2e5ba..3581fddea250 100644 --- a/compiler/src/iree/compiler/Utils/unittests/BUILD.bazel +++ b/compiler/src/iree/compiler/Utils/unittests/BUILD.bazel @@ -19,5 +19,6 @@ iree_compiler_cc_test( "//compiler/src/iree/compiler/Utils", "//compiler/src/iree/testing:gtest_main", "@com_google_googletest//:gtest", + "@llvm-project//llvm:Support", ], ) diff --git a/compiler/src/iree/compiler/Utils/unittests/CMakeLists.txt b/compiler/src/iree/compiler/Utils/unittests/CMakeLists.txt index 421262fd5d71..a850b2d7c83c 100644 --- a/compiler/src/iree/compiler/Utils/unittests/CMakeLists.txt +++ b/compiler/src/iree/compiler/Utils/unittests/CMakeLists.txt @@ -16,6 +16,7 @@ iree_cc_test( SRCS "UtilsTest.cpp" DEPS + LLVMSupport gmock gtest iree::compiler::Utils diff --git a/compiler/src/iree/compiler/Utils/unittests/UtilsTest.cpp b/compiler/src/iree/compiler/Utils/unittests/UtilsTest.cpp index d3ad37ca7512..39dc6cd1e712 100644 --- a/compiler/src/iree/compiler/Utils/unittests/UtilsTest.cpp +++ b/compiler/src/iree/compiler/Utils/unittests/UtilsTest.cpp @@ -6,8 +6,11 @@ #include #include +#include +#include "iree/compiler/Utils/EmbeddedDataDirectory.h" #include "iree/compiler/Utils/Permutation.h" +#include "llvm/Support/FormatVariadic.h" using namespace mlir::iree_compiler; using namespace testing; @@ -19,3 +22,47 @@ TEST(Permutation, MakeMovePermutation) { EXPECT_THAT(makeMovePermutation(3, 1, 2), ElementsAre(0, 2, 1)); EXPECT_THAT(makeMovePermutation(3, 2, 0), ElementsAre(2, 0, 1)); } + +TEST(EmbeddedDataDirectory, AddFileGetFile) { + EmbeddedDataDirectory dir; + EXPECT_TRUE(dir.addFile("filename1", "file contents 1")); + EXPECT_TRUE(dir.addFile("filename2", "file contents 2")); 
+ EXPECT_FALSE(dir.addFile("filename1", "file contents 3")); + EXPECT_EQ(dir.getFile("filename1"), "file contents 1"); + EXPECT_EQ(dir.getFile("filename2"), "file contents 2"); + EXPECT_EQ(dir.getFile("filename3"), std::nullopt); +} + +TEST(EmbeddedDataDirectory, WithGlobal) { + std::vector threads; + for (int i = 0; i < 3; ++i) { + threads.emplace_back([i] { + EmbeddedDataDirectory::withGlobal([i](EmbeddedDataDirectory &globalDir) { + EXPECT_TRUE(globalDir.addFile(llvm::formatv("filename{0}", i).str(), + "file contents xxx")); + }); + }); + } + for (std::thread &thread : threads) { + thread.join(); + } + EmbeddedDataDirectory::withGlobal([](EmbeddedDataDirectory &globalDir) { + std::vector keys; + for (auto iter : globalDir.getMap().keys()) { + keys.push_back(iter.str()); + } + EXPECT_THAT(keys, + UnorderedElementsAre("filename0", "filename1", "filename2")); + }); +} + +TEST(EmbeddedDataDirectory, GetMap) { + EmbeddedDataDirectory dir; + EXPECT_TRUE(dir.addFile("filename1", "file contents 1")); + EXPECT_TRUE(dir.addFile("filename2", "file contents 2")); + std::vector keys; + for (auto iter : dir.getMap().keys()) { + keys.push_back(iter.str()); + } + EXPECT_THAT(keys, UnorderedElementsAre("filename1", "filename2")); +}