Skip to content

Commit

Permalink
Load ukernel bitcode as executable_object at the time of lowering t…
Browse files Browse the repository at this point in the history
…o ukernels. (iree-org#19323)

1. Moves the time of loading ukernel bitcode from `serializeExecutable`
to the `GPULowerToUKernels` pass.
2. The determination of whether an op can lower to a ukernel is now
based on whether the expected bitcode file is found. This allows
removing several utility functions that implemented similar logic in
different places.
3. The `GPULowerToUKernels` pass searches for existing bitcode in a
`hal.executable.objects` attribute, and only loads the embedded ukernel
bitcode if that wasn't found, and in either case ensures that the
resulting ukernel op has a `hal.executable.objects` attribute containing
the necessary IR. This has several nice implications:
- The IR becomes completely self-contained: a ukernel op is no longer an
opaque interface to some bitcode at-a-distance.
- This solves the problem of letting users contribute their own bitcode
from the outside. Users can write their own `hal.executable.objects`.
- De-duplication of bitcode is handled by the HoistExecutableObjects
pass.
- Linking bitcode is handled by generic linker code linking executable
objects.
- The only useful custom handling of ukernel symbols was adding
`AlwaysInline` function attributes. This PR moves these attributes to
the ukernel source code: `[[clang::always_inline]]`. I verified that
these result in the expected `alwaysinline` in the bitcode.
4. The ukernel bitcode is part of the ROCM plugin. The
`serializeExecutable` implementation, which was the consumer of that
data, is also in the ROCM plugin. But the `GPULowerToUKernels` pass,
which is the new consumer, is outside of that plugin. So this required
creating a mechanism to export such embedded data files from the ROCM
plugin to the outside. That is solved by the new `EmbeddedDataDirectory`
utility.

---------

Signed-off-by: Benoit Jacob <[email protected]>
  • Loading branch information
bjacob authored Dec 3, 2024
1 parent 263dcf0 commit cbb11f2
Show file tree
Hide file tree
Showing 31 changed files with 461 additions and 281 deletions.
2 changes: 1 addition & 1 deletion build_tools/bazel_to_cmake/bazel_to_cmake_converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -616,7 +616,7 @@ def iree_amdgpu_bitcode_library(self, name, gpu_arch, srcs, copts=None, out=None
"GPU_ARCH", gpu_arch, quote=False
)
srcs_block = self._convert_srcs_block(srcs)
out_block = self._convert_string_arg_block("OUT", out, quote=False)
out_block = self._convert_string_arg_block("OUT", out, quote=True)
copts_block = self._convert_string_list_block("COPTS", copts, sort=False)

self._converter.body += (
Expand Down
5 changes: 1 addition & 4 deletions compiler/plugins/target/ROCM/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -27,10 +27,7 @@ iree_compiler_cc_library(
"ROCMTargetUtils.h",
],
deps = [
"//compiler/plugins/target/ROCM/builtins/ukernel:iree_uk_amdgpu_gfx1030",
"//compiler/plugins/target/ROCM/builtins/ukernel:iree_uk_amdgpu_gfx1100",
"//compiler/plugins/target/ROCM/builtins/ukernel:iree_uk_amdgpu_gfx90a",
"//compiler/plugins/target/ROCM/builtins/ukernel:iree_uk_amdgpu_gfx942",
"//compiler/plugins/target/ROCM/builtins/ukernel:iree_uk_amdgpu_bitcode",
"//compiler/src/iree/compiler/Codegen/Dialect/Codegen/IR:IREECodegenDialect",
"//compiler/src/iree/compiler/Codegen/Dialect/GPU/IR:IREEGPUDialect",
"//compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils:KnownTargets",
Expand Down
5 changes: 1 addition & 4 deletions compiler/plugins/target/ROCM/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -64,10 +64,7 @@ iree_cc_library(
iree::compiler::Dialect::HAL::Utils::LLVMLinkerUtils
iree::compiler::PluginAPI
iree::compiler::Utils
iree::compiler::plugins::target::ROCM::builtins::ukernel::iree_uk_amdgpu_gfx1030
iree::compiler::plugins::target::ROCM::builtins::ukernel::iree_uk_amdgpu_gfx1100
iree::compiler::plugins::target::ROCM::builtins::ukernel::iree_uk_amdgpu_gfx90a
iree::compiler::plugins::target::ROCM::builtins::ukernel::iree_uk_amdgpu_gfx942
iree::compiler::plugins::target::ROCM::builtins::ukernel::iree_uk_amdgpu_bitcode
iree::schemas::amdgpu_executable_def_c_fbs
iree::schemas::executable_debug_info_c_fbs
iree::schemas::hip_executable_def_c_fbs
Expand Down
36 changes: 20 additions & 16 deletions compiler/plugins/target/ROCM/ROCMTarget.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@

#include <cstdint>

#include "compiler/plugins/target/ROCM/builtins/ukernel/iree_uk_amdgpu_bitcode.h"
#include "iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenDialect.h"
#include "iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.h"
#include "iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUDialect.h"
Expand All @@ -21,6 +22,7 @@
#include "iree/compiler/Dialect/HAL/Utils/ExecutableDebugInfoUtils.h"
#include "iree/compiler/Dialect/HAL/Utils/LLVMLinkerUtils.h"
#include "iree/compiler/PluginAPI/Client.h"
#include "iree/compiler/Utils/EmbeddedDataDirectory.h"
#include "iree/compiler/Utils/FlatbufferUtils.h"
#include "iree/compiler/Utils/ToolUtils.h"
#include "iree/schemas/amdgpu_executable_def_builder.h"
Expand Down Expand Up @@ -206,6 +208,7 @@ static std::string translateModuleToISA(llvm::Module &module,
}
return targetISA;
}

} // namespace

class ROCMTargetBackend final : public TargetBackend {
Expand Down Expand Up @@ -513,20 +516,6 @@ class ROCMTargetBackend final : public TargetBackend {
return failure();
}

// Link module to any enabled ukernels.
StringRef bitcodeDirectory = options.bitcodeDirectory;
StringRef enabledUkernels;
if (auto attr = getConfigStringAttr(targetAttr, "ukernels"))
enabledUkernels = attr->getValue();
if (!enabledUkernels.empty() && enabledUkernels != "none") {
if (failed(linkUkernelBitcodeFiles(
variantOp.getLoc(), llvmModule.get(), enabledUkernels,
targetArch, bitcodeDirectory, llvm::Linker::OverrideFromSrc,
*targetMachine))) {
return failure();
}
}

// Link bitcode (*.bc) object attrs specified by the input program.
// Note that this happens after the command-line files so that the command
// line ones override the symbols coming from the embedded files.
Expand All @@ -548,14 +537,15 @@ class ROCMTargetBackend final : public TargetBackend {
}

// Link module to HIP device library.
if (bitcodeDirectory.empty()) {
if (options.bitcodeDirectory.empty()) {
return variantOp.emitError()
<< "cannot find ROCM bitcode files. Check your installation "
"consistency and in the worst case, set "
"--iree-hip-bc-dir= to a path on your system.";
}
if (failed(linkHIPBitcodeIfNeeded(variantOp.getLoc(), llvmModule.get(),
targetArch, bitcodeDirectory))) {
targetArch,
options.bitcodeDirectory))) {
return failure();
}

Expand Down Expand Up @@ -881,6 +871,7 @@ class HIPTargetDevice final : public TargetDevice {
};

namespace {

struct ROCMSession final
: PluginSession<ROCMSession, ROCMOptions,
PluginActivationPolicy::DefaultActivated> {
Expand Down Expand Up @@ -910,10 +901,23 @@ struct ROCMSession final

} // namespace mlir::iree_compiler::IREE::HAL

// Iterate over ukernel bitcode embedded-data files, and insert them into the
// EmbeddedDataDirectory singleton.
static void addAMDGPUUkernelBitcodeToGlobalEmbeddedDataDirectory() {
using mlir::iree_compiler::EmbeddedDataDirectory;
EmbeddedDataDirectory::withGlobal([](EmbeddedDataDirectory &dir) {
const iree_file_toc_t *toc = iree_uk_amdgpu_bitcode_create();
for (size_t i = 0; i < iree_uk_amdgpu_bitcode_size(); ++i) {
dir.addFile(toc[i].name, llvm::StringRef{toc[i].data, toc[i].size});
}
});
}

// C-linkage registration entry point for the ROCM HAL target plugin.
// Registers ROCMSession under the name "hal_target_rocm", then adds the
// embedded AMDGPU ukernel bitcode files to the global EmbeddedDataDirectory.
// Always returns true.
extern "C" bool iree_register_compiler_plugin_hal_target_rocm(
    mlir::iree_compiler::PluginRegistrar *registrar) {
  using mlir::iree_compiler::IREE::HAL::ROCMSession;
  registrar->registerPlugin<ROCMSession>("hal_target_rocm");
  addAMDGPUUkernelBitcodeToGlobalEmbeddedDataDirectory();
  return true;
}

Expand Down
41 changes: 0 additions & 41 deletions compiler/plugins/target/ROCM/ROCMTargetUtils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,6 @@

#include "compiler/plugins/target/ROCM/ROCMTargetUtils.h"

#include "compiler/plugins/target/ROCM/builtins/ukernel/iree_uk_amdgpu_gfx1030.h"
#include "compiler/plugins/target/ROCM/builtins/ukernel/iree_uk_amdgpu_gfx1100.h"
#include "compiler/plugins/target/ROCM/builtins/ukernel/iree_uk_amdgpu_gfx90a.h"
#include "compiler/plugins/target/ROCM/builtins/ukernel/iree_uk_amdgpu_gfx942.h"
#include "iree/compiler/Codegen/Utils/GPUUtils.h"
#include "iree/compiler/Dialect/HAL/Utils/LLVMLinkerUtils.h"
#include "iree/compiler/Utils/ToolUtils.h"
Expand Down Expand Up @@ -185,43 +181,6 @@ LogicalResult linkHIPBitcodeIfNeeded(Location loc, llvm::Module *module,
return linkWithBitcodeFiles(loc, module, bitcodePaths);
}

// Maps a GPU architecture name to the embedded ukernel bitcode table of
// contents for that architecture, paired with its entry count. Recognized
// names are "gfx90a", "gfx942", "gfx1030" and "gfx1100"; any other name
// yields {nullptr, 0}.
static std::tuple<const iree_file_toc_t *, int>
getUkernelBitcodeTOC(StringRef gpuArch) {
  using TocAndSize = std::tuple<const iree_file_toc_t *, int>;
  if (gpuArch == "gfx90a") {
    return TocAndSize(iree_uk_amdgpu_gfx90a_create(),
                      iree_uk_amdgpu_gfx90a_size());
  }
  if (gpuArch == "gfx942") {
    return TocAndSize(iree_uk_amdgpu_gfx942_create(),
                      iree_uk_amdgpu_gfx942_size());
  }
  if (gpuArch == "gfx1030") {
    return TocAndSize(iree_uk_amdgpu_gfx1030_create(),
                      iree_uk_amdgpu_gfx1030_size());
  }
  if (gpuArch == "gfx1100") {
    return TocAndSize(iree_uk_amdgpu_gfx1100_create(),
                      iree_uk_amdgpu_gfx1100_size());
  }
  // Unknown architecture: no embedded ukernel bitcode.
  return TocAndSize(nullptr, 0);
}

// Links every embedded ukernel bitcode file for `targetChip` into `module`.
//
// The table of contents is looked up with getUkernelBitcodeTOC; if none exists
// for `targetChip`, failure is returned without linking anything. Each file is
// then linked in order through a single llvm::Linker using `linkerFlags`, and
// the first file that fails to link aborts the process with failure.
// `enabledUkernelsStr` and `bitcodePath` are accepted but not read here.
LogicalResult linkUkernelBitcodeFiles(Location loc, llvm::Module *module,
                                      StringRef enabledUkernelsStr,
                                      StringRef targetChip,
                                      StringRef bitcodePath,
                                      unsigned linkerFlags,
                                      llvm::TargetMachine &targetMachine) {
  const auto [files, fileCount] = getUkernelBitcodeTOC(targetChip);
  if (files == nullptr) {
    return failure();
  }

  llvm::Linker linker(*module);
  const iree_file_toc_t *const filesEnd = files + fileCount;
  for (const iree_file_toc_t *file = files; file < filesEnd; ++file) {
    const llvm::StringRef contents(file->data, file->size);
    const LogicalResult linked =
        linkBitcodeFile(loc, linker, linkerFlags, file->name, contents,
                        targetMachine, module->getContext());
    if (failed(linked)) {
      return failure();
    }
  }

  return success();
}

// Link object file using lld linker to generate code object
// Inspiration from this section comes from LLVM-PROJECT-MLIR by
// ROCmSoftwarePlatform
Expand Down
21 changes: 11 additions & 10 deletions compiler/plugins/target/ROCM/builtins/ukernel/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -49,19 +49,20 @@ argmax_types = [
"iree_uk_amdgpu_argmax_%s.c" % type,
"common.h",
],
out = "iree_uk_amdgpu_argmax_%s.%s.bc" % (type, gpu_arch),
gpu_arch = gpu_arch,
) for type in argmax_types for gpu_arch in gpu_archs]

argmax_bc_files = {gpu_arch: [
":iree_uk_amdgpu_argmax_%s.c.%s.bc" % (type, gpu_arch)
argmax_bc_files = [
":iree_uk_amdgpu_argmax_%s.%s.bc" % (type, gpu_arch)
for type in argmax_types
] for gpu_arch in gpu_archs}
for gpu_arch in gpu_archs
]

[iree_c_embed_data(
name = "iree_uk_amdgpu_%s" % gpu_arch,
srcs = argmax_bc_files[gpu_arch],
c_file_output = "iree_uk_amdgpu_%s.c" % gpu_arch,
iree_c_embed_data(
name = "iree_uk_amdgpu_bitcode",
srcs = argmax_bc_files,
c_file_output = "iree_uk_amdgpu_bitcode.c",
flatten = True,
h_file_output = "iree_uk_amdgpu_%s.h" % gpu_arch,
identifier = "iree_uk_amdgpu_%s" % gpu_arch,
) for gpu_arch in gpu_archs]
h_file_output = "iree_uk_amdgpu_bitcode.h",
)
Loading

0 comments on commit cbb11f2

Please sign in to comment.