From cbb11f220c69e0106dbfd1533a00237c3a74e7e3 Mon Sep 17 00:00:00 2001
From: Benoit Jacob <jacob.benoit.1@gmail.com>
Date: Tue, 3 Dec 2024 15:50:06 -0500
Subject: [PATCH] Load ukernel bitcode as `executable_object` at the time of
 lowering to ukernels. (#19323)

1. Moves the time of loading ukernel bitcode from `serializeExecutable`
to the `GPULowerToUKernels` pass.
2. The determination of whether an op can lower to a ukernel is now
based on whether the expected bitcode file is found. This allows
removing several utility functions that implemented similar logic in
different places.
3. The `GPULowerToUKernels` pass searches for existing bitcode in a
`hal.executable.objects` attribute, and only loads the embedded ukernel
bitcode if none was found. In either case, it ensures that the
resulting ukernel op has a `hal.executable.objects` attribute containing
the necessary IR. This has several nice implications:
- The IR becomes completely self-contained: a ukernel op is no longer an
opaque interface to some bitcode at-a-distance.
- This solves the problem of letting users contribute their own bitcode
from the outside. Users can write their own `hal.executable.objects`.
- De-duplication of bitcode is handled by the HoistExecutableObjects
pass.
- Linking bitcode is handled by generic linker code linking executable
objects.
- The only useful custom handling of ukernel symbols was adding
`AlwaysInline` function attributes. This PR moves these attributes to
the ukernel source code: `[[clang::always_inline]]`. I verified that
these result in the expected `alwaysinline` in the bitcode.
4. The ukernel bitcode is part of the ROCM plugin. The
`serializeExecutable` implementation, which was the consumer of that
data, is also in the ROCM plugin. But the `GPULowerToUKernels` pass,
which is the new consumer, is outside of that plugin. So this required
creating a mechanism to export such embedded data files from the ROCM
plugin to the outside. That is solved by the new `EmbeddedDataDirectory`
utility.

---------

Signed-off-by: Benoit Jacob <jacob.benoit.1@gmail.com>
---
 .../bazel_to_cmake_converter.py               |   2 +-
 compiler/plugins/target/ROCM/BUILD.bazel      |   5 +-
 compiler/plugins/target/ROCM/CMakeLists.txt   |   5 +-
 compiler/plugins/target/ROCM/ROCMTarget.cpp   |  36 ++--
 .../plugins/target/ROCM/ROCMTargetUtils.cpp   |  41 -----
 .../target/ROCM/builtins/ukernel/BUILD.bazel  |  21 +--
 .../ROCM/builtins/ukernel/CMakeLists.txt      | 116 ++++++-------
 .../ukernel/iree_uk_amdgpu_argmax_f16i32.c    |   8 +-
 .../ukernel/iree_uk_amdgpu_argmax_f16i64.c    |   8 +-
 .../ukernel/iree_uk_amdgpu_argmax_f32i32.c    |   8 +-
 .../ukernel/iree_uk_amdgpu_argmax_f32i64.c    |   8 +-
 compiler/plugins/target/ROCM/test/BUILD.bazel |  26 +++
 .../plugins/target/ROCM/test/CMakeLists.txt   |  31 +++-
 .../ROCM}/test/gpu_lower_to_ukernels.mlir     |  53 +++++-
 .../test/ukernel_pipeline_transform.mlir      |  24 +--
 .../Codegen/Common/GPU/GPULowerToUKernels.cpp | 164 ++++++++++++------
 .../compiler/Codegen/Common/GPU/Passes.td     |   2 +-
 .../Codegen/Common/GPU/test/BUILD.bazel       |   3 -
 .../Codegen/Common/GPU/test/CMakeLists.txt    |   1 -
 .../Codegen/Dialect/Codegen/IR/UKernelOps.cpp |   7 +-
 .../compiler/Codegen/LLVMGPU/KernelConfig.cpp |  15 +-
 .../compiler/Codegen/LLVMGPU/test/BUILD.bazel |   1 -
 .../Codegen/LLVMGPU/test/CMakeLists.txt       |   1 -
 .../iree/compiler/Codegen/Utils/GPUUtils.cpp  |  36 ----
 .../iree/compiler/Codegen/Utils/GPUUtils.h    |  10 --
 compiler/src/iree/compiler/Utils/BUILD.bazel  |   1 +
 .../src/iree/compiler/Utils/CMakeLists.txt    |   1 +
 .../compiler/Utils/EmbeddedDataDirectory.h    |  59 +++++++
 .../iree/compiler/Utils/unittests/BUILD.bazel |   1 +
 .../compiler/Utils/unittests/CMakeLists.txt   |   1 +
 .../compiler/Utils/unittests/UtilsTest.cpp    |  47 +++++
 31 files changed, 461 insertions(+), 281 deletions(-)
 create mode 100644 compiler/plugins/target/ROCM/test/BUILD.bazel
 rename compiler/{src/iree/compiler/Codegen/Common/GPU => plugins/target/ROCM}/test/gpu_lower_to_ukernels.mlir (84%)
 rename compiler/{src/iree/compiler/Codegen/LLVMGPU => plugins/target/ROCM}/test/ukernel_pipeline_transform.mlir (90%)
 create mode 100644 compiler/src/iree/compiler/Utils/EmbeddedDataDirectory.h

diff --git a/build_tools/bazel_to_cmake/bazel_to_cmake_converter.py b/build_tools/bazel_to_cmake/bazel_to_cmake_converter.py
index 0fb0fd85492f..8d2db1050da3 100644
--- a/build_tools/bazel_to_cmake/bazel_to_cmake_converter.py
+++ b/build_tools/bazel_to_cmake/bazel_to_cmake_converter.py
@@ -616,7 +616,7 @@ def iree_amdgpu_bitcode_library(self, name, gpu_arch, srcs, copts=None, out=None
             "GPU_ARCH", gpu_arch, quote=False
         )
         srcs_block = self._convert_srcs_block(srcs)
-        out_block = self._convert_string_arg_block("OUT", out, quote=False)
+        out_block = self._convert_string_arg_block("OUT", out, quote=True)
         copts_block = self._convert_string_list_block("COPTS", copts, sort=False)
 
         self._converter.body += (
diff --git a/compiler/plugins/target/ROCM/BUILD.bazel b/compiler/plugins/target/ROCM/BUILD.bazel
index 6ae9b95c4714..48dfeb3ff401 100644
--- a/compiler/plugins/target/ROCM/BUILD.bazel
+++ b/compiler/plugins/target/ROCM/BUILD.bazel
@@ -27,10 +27,7 @@ iree_compiler_cc_library(
         "ROCMTargetUtils.h",
     ],
     deps = [
-        "//compiler/plugins/target/ROCM/builtins/ukernel:iree_uk_amdgpu_gfx1030",
-        "//compiler/plugins/target/ROCM/builtins/ukernel:iree_uk_amdgpu_gfx1100",
-        "//compiler/plugins/target/ROCM/builtins/ukernel:iree_uk_amdgpu_gfx90a",
-        "//compiler/plugins/target/ROCM/builtins/ukernel:iree_uk_amdgpu_gfx942",
+        "//compiler/plugins/target/ROCM/builtins/ukernel:iree_uk_amdgpu_bitcode",
         "//compiler/src/iree/compiler/Codegen/Dialect/Codegen/IR:IREECodegenDialect",
         "//compiler/src/iree/compiler/Codegen/Dialect/GPU/IR:IREEGPUDialect",
         "//compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils:KnownTargets",
diff --git a/compiler/plugins/target/ROCM/CMakeLists.txt b/compiler/plugins/target/ROCM/CMakeLists.txt
index 0efc3df479e6..96c3305d936d 100644
--- a/compiler/plugins/target/ROCM/CMakeLists.txt
+++ b/compiler/plugins/target/ROCM/CMakeLists.txt
@@ -64,10 +64,7 @@ iree_cc_library(
     iree::compiler::Dialect::HAL::Utils::LLVMLinkerUtils
     iree::compiler::PluginAPI
     iree::compiler::Utils
-    iree::compiler::plugins::target::ROCM::builtins::ukernel::iree_uk_amdgpu_gfx1030
-    iree::compiler::plugins::target::ROCM::builtins::ukernel::iree_uk_amdgpu_gfx1100
-    iree::compiler::plugins::target::ROCM::builtins::ukernel::iree_uk_amdgpu_gfx90a
-    iree::compiler::plugins::target::ROCM::builtins::ukernel::iree_uk_amdgpu_gfx942
+    iree::compiler::plugins::target::ROCM::builtins::ukernel::iree_uk_amdgpu_bitcode
     iree::schemas::amdgpu_executable_def_c_fbs
     iree::schemas::executable_debug_info_c_fbs
     iree::schemas::hip_executable_def_c_fbs
diff --git a/compiler/plugins/target/ROCM/ROCMTarget.cpp b/compiler/plugins/target/ROCM/ROCMTarget.cpp
index a49780fbbcf4..48ef62e07220 100644
--- a/compiler/plugins/target/ROCM/ROCMTarget.cpp
+++ b/compiler/plugins/target/ROCM/ROCMTarget.cpp
@@ -8,6 +8,7 @@
 
 #include <cstdint>
 
+#include "compiler/plugins/target/ROCM/builtins/ukernel/iree_uk_amdgpu_bitcode.h"
 #include "iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenDialect.h"
 #include "iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.h"
 #include "iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUDialect.h"
@@ -21,6 +22,7 @@
 #include "iree/compiler/Dialect/HAL/Utils/ExecutableDebugInfoUtils.h"
 #include "iree/compiler/Dialect/HAL/Utils/LLVMLinkerUtils.h"
 #include "iree/compiler/PluginAPI/Client.h"
+#include "iree/compiler/Utils/EmbeddedDataDirectory.h"
 #include "iree/compiler/Utils/FlatbufferUtils.h"
 #include "iree/compiler/Utils/ToolUtils.h"
 #include "iree/schemas/amdgpu_executable_def_builder.h"
@@ -206,6 +208,7 @@ static std::string translateModuleToISA(llvm::Module &module,
   }
   return targetISA;
 }
+
 } // namespace
 
 class ROCMTargetBackend final : public TargetBackend {
@@ -513,20 +516,6 @@ class ROCMTargetBackend final : public TargetBackend {
         return failure();
       }
 
-      // Link module to any enabled ukernels.
-      StringRef bitcodeDirectory = options.bitcodeDirectory;
-      StringRef enabledUkernels;
-      if (auto attr = getConfigStringAttr(targetAttr, "ukernels"))
-        enabledUkernels = attr->getValue();
-      if (!enabledUkernels.empty() && enabledUkernels != "none") {
-        if (failed(linkUkernelBitcodeFiles(
-                variantOp.getLoc(), llvmModule.get(), enabledUkernels,
-                targetArch, bitcodeDirectory, llvm::Linker::OverrideFromSrc,
-                *targetMachine))) {
-          return failure();
-        }
-      }
-
       // Link bitcode (*.bc) object attrs specified by the input program.
       // Note that this happens after the command-line files so that the command
       // line ones override the symbols coming from the embedded files.
@@ -548,14 +537,15 @@ class ROCMTargetBackend final : public TargetBackend {
       }
 
       // Link module to HIP device library.
-      if (bitcodeDirectory.empty()) {
+      if (options.bitcodeDirectory.empty()) {
         return variantOp.emitError()
                << "cannot find ROCM bitcode files. Check your installation "
                   "consistency and in the worst case, set "
                   "--iree-hip-bc-dir= to a path on your system.";
       }
       if (failed(linkHIPBitcodeIfNeeded(variantOp.getLoc(), llvmModule.get(),
-                                        targetArch, bitcodeDirectory))) {
+                                        targetArch,
+                                        options.bitcodeDirectory))) {
         return failure();
       }
 
@@ -881,6 +871,7 @@ class HIPTargetDevice final : public TargetDevice {
 };
 
 namespace {
+
 struct ROCMSession final
     : PluginSession<ROCMSession, ROCMOptions,
                     PluginActivationPolicy::DefaultActivated> {
@@ -910,10 +901,23 @@ struct ROCMSession final
 
 } // namespace mlir::iree_compiler::IREE::HAL
 
+// Adds every ukernel bitcode file embedded in this plugin, by name, to the
+// global EmbeddedDataDirectory so that code outside the plugin can find it.
+static void addAMDGPUUkernelBitcodeToGlobalEmbeddedDataDirectory() {
+  using mlir::iree_compiler::EmbeddedDataDirectory;
+  EmbeddedDataDirectory::withGlobal([](EmbeddedDataDirectory &dir) {
+    const iree_file_toc_t *toc = iree_uk_amdgpu_bitcode_create();
+    for (size_t i = 0; i < iree_uk_amdgpu_bitcode_size(); ++i) {
+      dir.addFile(toc[i].name, llvm::StringRef{toc[i].data, toc[i].size});
+    }
+  });
+}
+
 extern "C" bool iree_register_compiler_plugin_hal_target_rocm(
     mlir::iree_compiler::PluginRegistrar *registrar) {
   registrar->registerPlugin<mlir::iree_compiler::IREE::HAL::ROCMSession>(
       "hal_target_rocm");
+  addAMDGPUUkernelBitcodeToGlobalEmbeddedDataDirectory();
   return true;
 }
 
diff --git a/compiler/plugins/target/ROCM/ROCMTargetUtils.cpp b/compiler/plugins/target/ROCM/ROCMTargetUtils.cpp
index 792de8e4a4b0..2cf9f20c0de5 100644
--- a/compiler/plugins/target/ROCM/ROCMTargetUtils.cpp
+++ b/compiler/plugins/target/ROCM/ROCMTargetUtils.cpp
@@ -6,10 +6,6 @@
 
 #include "compiler/plugins/target/ROCM/ROCMTargetUtils.h"
 
-#include "compiler/plugins/target/ROCM/builtins/ukernel/iree_uk_amdgpu_gfx1030.h"
-#include "compiler/plugins/target/ROCM/builtins/ukernel/iree_uk_amdgpu_gfx1100.h"
-#include "compiler/plugins/target/ROCM/builtins/ukernel/iree_uk_amdgpu_gfx90a.h"
-#include "compiler/plugins/target/ROCM/builtins/ukernel/iree_uk_amdgpu_gfx942.h"
 #include "iree/compiler/Codegen/Utils/GPUUtils.h"
 #include "iree/compiler/Dialect/HAL/Utils/LLVMLinkerUtils.h"
 #include "iree/compiler/Utils/ToolUtils.h"
@@ -185,43 +181,6 @@ LogicalResult linkHIPBitcodeIfNeeded(Location loc, llvm::Module *module,
   return linkWithBitcodeFiles(loc, module, bitcodePaths);
 }
 
-static std::tuple<const iree_file_toc_t *, int>
-getUkernelBitcodeTOC(StringRef gpuArch) {
-  return llvm::StringSwitch<std::tuple<const iree_file_toc_t *, int>>(gpuArch)
-      .Case("gfx90a",
-            {iree_uk_amdgpu_gfx90a_create(), iree_uk_amdgpu_gfx90a_size()})
-      .Case("gfx942",
-            {iree_uk_amdgpu_gfx942_create(), iree_uk_amdgpu_gfx942_size()})
-      .Case("gfx1030",
-            {iree_uk_amdgpu_gfx1030_create(), iree_uk_amdgpu_gfx1030_size()})
-      .Case("gfx1100",
-            {iree_uk_amdgpu_gfx1100_create(), iree_uk_amdgpu_gfx1100_size()})
-      .Default({nullptr, 0});
-}
-
-// Links optimized Ukernel bitcode into the given module if the module needs it.
-LogicalResult linkUkernelBitcodeFiles(Location loc, llvm::Module *module,
-                                      StringRef enabledUkernelsStr,
-                                      StringRef targetChip,
-                                      StringRef bitcodePath,
-                                      unsigned linkerFlags,
-                                      llvm::TargetMachine &targetMachine) {
-  auto [toc, toc_size] = getUkernelBitcodeTOC(targetChip);
-  if (!toc) {
-    return failure();
-  }
-
-  llvm::Linker linker(*module);
-  for (int i = 0; i < toc_size; ++i) {
-    if (failed(linkBitcodeFile(loc, linker, linkerFlags, toc[i].name,
-                               llvm::StringRef(toc[i].data, toc[i].size),
-                               targetMachine, module->getContext())))
-      return failure();
-  }
-
-  return success();
-}
-
 // Link object file using lld lnker to generate code object
 // Inspiration from this section comes from LLVM-PROJECT-MLIR by
 // ROCmSoftwarePlatform
diff --git a/compiler/plugins/target/ROCM/builtins/ukernel/BUILD.bazel b/compiler/plugins/target/ROCM/builtins/ukernel/BUILD.bazel
index 93e6c86bd4a3..aff7b8965b32 100644
--- a/compiler/plugins/target/ROCM/builtins/ukernel/BUILD.bazel
+++ b/compiler/plugins/target/ROCM/builtins/ukernel/BUILD.bazel
@@ -49,19 +49,20 @@ argmax_types = [
         "iree_uk_amdgpu_argmax_%s.c" % type,
         "common.h",
     ],
+    out = "iree_uk_amdgpu_argmax_%s.%s.bc" % (type, gpu_arch),
     gpu_arch = gpu_arch,
 ) for type in argmax_types for gpu_arch in gpu_archs]
 
-argmax_bc_files = {gpu_arch: [
-    ":iree_uk_amdgpu_argmax_%s.c.%s.bc" % (type, gpu_arch)
+argmax_bc_files = [
+    ":iree_uk_amdgpu_argmax_%s.%s.bc" % (type, gpu_arch)
     for type in argmax_types
-] for gpu_arch in gpu_archs}
+    for gpu_arch in gpu_archs
+]
 
-[iree_c_embed_data(
-    name = "iree_uk_amdgpu_%s" % gpu_arch,
-    srcs = argmax_bc_files[gpu_arch],
-    c_file_output = "iree_uk_amdgpu_%s.c" % gpu_arch,
+iree_c_embed_data(
+    name = "iree_uk_amdgpu_bitcode",
+    srcs = argmax_bc_files,
+    c_file_output = "iree_uk_amdgpu_bitcode.c",
     flatten = True,
-    h_file_output = "iree_uk_amdgpu_%s.h" % gpu_arch,
-    identifier = "iree_uk_amdgpu_%s" % gpu_arch,
-) for gpu_arch in gpu_archs]
+    h_file_output = "iree_uk_amdgpu_bitcode.h",
+)
diff --git a/compiler/plugins/target/ROCM/builtins/ukernel/CMakeLists.txt b/compiler/plugins/target/ROCM/builtins/ukernel/CMakeLists.txt
index 6b3014f3bd53..71d4705eed1a 100644
--- a/compiler/plugins/target/ROCM/builtins/ukernel/CMakeLists.txt
+++ b/compiler/plugins/target/ROCM/builtins/ukernel/CMakeLists.txt
@@ -22,6 +22,8 @@ iree_amdgpu_bitcode_library(
   SRCS
     "common.h"
     "iree_uk_amdgpu_argmax_f16i32.c"
+  OUT
+    "iree_uk_amdgpu_argmax_f16i32.gfx90a.bc"
 )
 
 iree_amdgpu_bitcode_library(
@@ -32,6 +34,8 @@ iree_amdgpu_bitcode_library(
   SRCS
     "common.h"
     "iree_uk_amdgpu_argmax_f16i32.c"
+  OUT
+    "iree_uk_amdgpu_argmax_f16i32.gfx942.bc"
 )
 
 iree_amdgpu_bitcode_library(
@@ -42,6 +46,8 @@ iree_amdgpu_bitcode_library(
   SRCS
     "common.h"
     "iree_uk_amdgpu_argmax_f16i32.c"
+  OUT
+    "iree_uk_amdgpu_argmax_f16i32.gfx1030.bc"
 )
 
 iree_amdgpu_bitcode_library(
@@ -52,6 +58,8 @@ iree_amdgpu_bitcode_library(
   SRCS
     "common.h"
     "iree_uk_amdgpu_argmax_f16i32.c"
+  OUT
+    "iree_uk_amdgpu_argmax_f16i32.gfx1100.bc"
 )
 
 iree_amdgpu_bitcode_library(
@@ -62,6 +70,8 @@ iree_amdgpu_bitcode_library(
   SRCS
     "common.h"
     "iree_uk_amdgpu_argmax_f16i64.c"
+  OUT
+    "iree_uk_amdgpu_argmax_f16i64.gfx90a.bc"
 )
 
 iree_amdgpu_bitcode_library(
@@ -72,6 +82,8 @@ iree_amdgpu_bitcode_library(
   SRCS
     "common.h"
     "iree_uk_amdgpu_argmax_f16i64.c"
+  OUT
+    "iree_uk_amdgpu_argmax_f16i64.gfx942.bc"
 )
 
 iree_amdgpu_bitcode_library(
@@ -82,6 +94,8 @@ iree_amdgpu_bitcode_library(
   SRCS
     "common.h"
     "iree_uk_amdgpu_argmax_f16i64.c"
+  OUT
+    "iree_uk_amdgpu_argmax_f16i64.gfx1030.bc"
 )
 
 iree_amdgpu_bitcode_library(
@@ -92,6 +106,8 @@ iree_amdgpu_bitcode_library(
   SRCS
     "common.h"
     "iree_uk_amdgpu_argmax_f16i64.c"
+  OUT
+    "iree_uk_amdgpu_argmax_f16i64.gfx1100.bc"
 )
 
 iree_amdgpu_bitcode_library(
@@ -102,6 +118,8 @@ iree_amdgpu_bitcode_library(
   SRCS
     "common.h"
     "iree_uk_amdgpu_argmax_f32i32.c"
+  OUT
+    "iree_uk_amdgpu_argmax_f32i32.gfx90a.bc"
 )
 
 iree_amdgpu_bitcode_library(
@@ -112,6 +130,8 @@ iree_amdgpu_bitcode_library(
   SRCS
     "common.h"
     "iree_uk_amdgpu_argmax_f32i32.c"
+  OUT
+    "iree_uk_amdgpu_argmax_f32i32.gfx942.bc"
 )
 
 iree_amdgpu_bitcode_library(
@@ -122,6 +142,8 @@ iree_amdgpu_bitcode_library(
   SRCS
     "common.h"
     "iree_uk_amdgpu_argmax_f32i32.c"
+  OUT
+    "iree_uk_amdgpu_argmax_f32i32.gfx1030.bc"
 )
 
 iree_amdgpu_bitcode_library(
@@ -132,6 +154,8 @@ iree_amdgpu_bitcode_library(
   SRCS
     "common.h"
     "iree_uk_amdgpu_argmax_f32i32.c"
+  OUT
+    "iree_uk_amdgpu_argmax_f32i32.gfx1100.bc"
 )
 
 iree_amdgpu_bitcode_library(
@@ -142,6 +166,8 @@ iree_amdgpu_bitcode_library(
   SRCS
     "common.h"
     "iree_uk_amdgpu_argmax_f32i64.c"
+  OUT
+    "iree_uk_amdgpu_argmax_f32i64.gfx90a.bc"
 )
 
 iree_amdgpu_bitcode_library(
@@ -152,6 +178,8 @@ iree_amdgpu_bitcode_library(
   SRCS
     "common.h"
     "iree_uk_amdgpu_argmax_f32i64.c"
+  OUT
+    "iree_uk_amdgpu_argmax_f32i64.gfx942.bc"
 )
 
 iree_amdgpu_bitcode_library(
@@ -162,6 +190,8 @@ iree_amdgpu_bitcode_library(
   SRCS
     "common.h"
     "iree_uk_amdgpu_argmax_f32i64.c"
+  OUT
+    "iree_uk_amdgpu_argmax_f32i64.gfx1030.bc"
 )
 
 iree_amdgpu_bitcode_library(
@@ -172,76 +202,34 @@ iree_amdgpu_bitcode_library(
   SRCS
     "common.h"
     "iree_uk_amdgpu_argmax_f32i64.c"
+  OUT
+    "iree_uk_amdgpu_argmax_f32i64.gfx1100.bc"
 )
 
 iree_c_embed_data(
   NAME
-    iree_uk_amdgpu_gfx90a
-  SRCS
-    "iree_uk_amdgpu_argmax_f16i32.c.gfx90a.bc"
-    "iree_uk_amdgpu_argmax_f16i64.c.gfx90a.bc"
-    "iree_uk_amdgpu_argmax_f32i32.c.gfx90a.bc"
-    "iree_uk_amdgpu_argmax_f32i64.c.gfx90a.bc"
-  C_FILE_OUTPUT
-    "iree_uk_amdgpu_gfx90a.c"
-  H_FILE_OUTPUT
-    "iree_uk_amdgpu_gfx90a.h"
-  IDENTIFIER
-    "iree_uk_amdgpu_gfx90a"
-  FLATTEN
-  PUBLIC
-)
-
-iree_c_embed_data(
-  NAME
-    iree_uk_amdgpu_gfx942
-  SRCS
-    "iree_uk_amdgpu_argmax_f16i32.c.gfx942.bc"
-    "iree_uk_amdgpu_argmax_f16i64.c.gfx942.bc"
-    "iree_uk_amdgpu_argmax_f32i32.c.gfx942.bc"
-    "iree_uk_amdgpu_argmax_f32i64.c.gfx942.bc"
-  C_FILE_OUTPUT
-    "iree_uk_amdgpu_gfx942.c"
-  H_FILE_OUTPUT
-    "iree_uk_amdgpu_gfx942.h"
-  IDENTIFIER
-    "iree_uk_amdgpu_gfx942"
-  FLATTEN
-  PUBLIC
-)
-
-iree_c_embed_data(
-  NAME
-    iree_uk_amdgpu_gfx1030
-  SRCS
-    "iree_uk_amdgpu_argmax_f16i32.c.gfx1030.bc"
-    "iree_uk_amdgpu_argmax_f16i64.c.gfx1030.bc"
-    "iree_uk_amdgpu_argmax_f32i32.c.gfx1030.bc"
-    "iree_uk_amdgpu_argmax_f32i64.c.gfx1030.bc"
-  C_FILE_OUTPUT
-    "iree_uk_amdgpu_gfx1030.c"
-  H_FILE_OUTPUT
-    "iree_uk_amdgpu_gfx1030.h"
-  IDENTIFIER
-    "iree_uk_amdgpu_gfx1030"
-  FLATTEN
-  PUBLIC
-)
-
-iree_c_embed_data(
-  NAME
-    iree_uk_amdgpu_gfx1100
-  SRCS
-    "iree_uk_amdgpu_argmax_f16i32.c.gfx1100.bc"
-    "iree_uk_amdgpu_argmax_f16i64.c.gfx1100.bc"
-    "iree_uk_amdgpu_argmax_f32i32.c.gfx1100.bc"
-    "iree_uk_amdgpu_argmax_f32i64.c.gfx1100.bc"
+    iree_uk_amdgpu_bitcode
+  SRCS
+    "iree_uk_amdgpu_argmax_f16i32.gfx1030.bc"
+    "iree_uk_amdgpu_argmax_f16i32.gfx1100.bc"
+    "iree_uk_amdgpu_argmax_f16i32.gfx90a.bc"
+    "iree_uk_amdgpu_argmax_f16i32.gfx942.bc"
+    "iree_uk_amdgpu_argmax_f16i64.gfx1030.bc"
+    "iree_uk_amdgpu_argmax_f16i64.gfx1100.bc"
+    "iree_uk_amdgpu_argmax_f16i64.gfx90a.bc"
+    "iree_uk_amdgpu_argmax_f16i64.gfx942.bc"
+    "iree_uk_amdgpu_argmax_f32i32.gfx1030.bc"
+    "iree_uk_amdgpu_argmax_f32i32.gfx1100.bc"
+    "iree_uk_amdgpu_argmax_f32i32.gfx90a.bc"
+    "iree_uk_amdgpu_argmax_f32i32.gfx942.bc"
+    "iree_uk_amdgpu_argmax_f32i64.gfx1030.bc"
+    "iree_uk_amdgpu_argmax_f32i64.gfx1100.bc"
+    "iree_uk_amdgpu_argmax_f32i64.gfx90a.bc"
+    "iree_uk_amdgpu_argmax_f32i64.gfx942.bc"
   C_FILE_OUTPUT
-    "iree_uk_amdgpu_gfx1100.c"
+    "iree_uk_amdgpu_bitcode.c"
   H_FILE_OUTPUT
-    "iree_uk_amdgpu_gfx1100.h"
-  IDENTIFIER
-    "iree_uk_amdgpu_gfx1100"
+    "iree_uk_amdgpu_bitcode.h"
   FLATTEN
   PUBLIC
 )
diff --git a/compiler/plugins/target/ROCM/builtins/ukernel/iree_uk_amdgpu_argmax_f16i32.c b/compiler/plugins/target/ROCM/builtins/ukernel/iree_uk_amdgpu_argmax_f16i32.c
index 41fe50a6528d..4a6beefa9198 100644
--- a/compiler/plugins/target/ROCM/builtins/ukernel/iree_uk_amdgpu_argmax_f16i32.c
+++ b/compiler/plugins/target/ROCM/builtins/ukernel/iree_uk_amdgpu_argmax_f16i32.c
@@ -6,10 +6,10 @@
 
 #include "compiler/plugins/target/ROCM/builtins/ukernel/common.h"
 
-void iree_uk_amdgpu_argmax_f16i32(const _Float16 *inputBuffer,
-                                  int64_t input_offset, int32_t *outputBuffer,
-                                  int64_t output_offset,
-                                  int64_t reductionSize) {
+[[clang::always_inline]] void
+iree_uk_amdgpu_argmax_f16i32(const _Float16 *inputBuffer, int64_t input_offset,
+                             int32_t *outputBuffer, int64_t output_offset,
+                             int64_t reductionSize) {
   const int warpSize = __builtin_amdgcn_wavefrontsize();
   _Float16 NEG_F16_MAX = (_Float16)(-65504.0f);
   int32_t laneID = __builtin_amdgcn_workitem_id_x();
diff --git a/compiler/plugins/target/ROCM/builtins/ukernel/iree_uk_amdgpu_argmax_f16i64.c b/compiler/plugins/target/ROCM/builtins/ukernel/iree_uk_amdgpu_argmax_f16i64.c
index 823fc3a4f296..33c1522d143d 100644
--- a/compiler/plugins/target/ROCM/builtins/ukernel/iree_uk_amdgpu_argmax_f16i64.c
+++ b/compiler/plugins/target/ROCM/builtins/ukernel/iree_uk_amdgpu_argmax_f16i64.c
@@ -6,10 +6,10 @@
 
 #include "compiler/plugins/target/ROCM/builtins/ukernel/common.h"
 
-void iree_uk_amdgpu_argmax_f16i64(const _Float16 *inputBuffer,
-                                  int64_t input_offset, int64_t *outputBuffer,
-                                  int64_t output_offset,
-                                  int64_t reductionSize) {
+[[clang::always_inline]] void
+iree_uk_amdgpu_argmax_f16i64(const _Float16 *inputBuffer, int64_t input_offset,
+                             int64_t *outputBuffer, int64_t output_offset,
+                             int64_t reductionSize) {
   const int warpSize = __builtin_amdgcn_wavefrontsize();
   _Float16 NEG_F16_MAX = (_Float16)(-65504.0f);
   int32_t laneID = __builtin_amdgcn_workitem_id_x();
diff --git a/compiler/plugins/target/ROCM/builtins/ukernel/iree_uk_amdgpu_argmax_f32i32.c b/compiler/plugins/target/ROCM/builtins/ukernel/iree_uk_amdgpu_argmax_f32i32.c
index 41aad8ba05c5..f39d62372799 100644
--- a/compiler/plugins/target/ROCM/builtins/ukernel/iree_uk_amdgpu_argmax_f32i32.c
+++ b/compiler/plugins/target/ROCM/builtins/ukernel/iree_uk_amdgpu_argmax_f32i32.c
@@ -6,10 +6,10 @@
 
 #include "compiler/plugins/target/ROCM/builtins/ukernel/common.h"
 
-void iree_uk_amdgpu_argmax_f32i32(const float *inputBuffer,
-                                  int64_t input_offset, int32_t *outputBuffer,
-                                  int64_t output_offset,
-                                  int64_t reductionSize) {
+[[clang::always_inline]] void
+iree_uk_amdgpu_argmax_f32i32(const float *inputBuffer, int64_t input_offset,
+                             int32_t *outputBuffer, int64_t output_offset,
+                             int64_t reductionSize) {
   const int warpSize = __builtin_amdgcn_wavefrontsize();
   int32_t laneID = __builtin_amdgcn_workitem_id_x();
   // Set identity value to handle problem non divisible by subgroupSize.
diff --git a/compiler/plugins/target/ROCM/builtins/ukernel/iree_uk_amdgpu_argmax_f32i64.c b/compiler/plugins/target/ROCM/builtins/ukernel/iree_uk_amdgpu_argmax_f32i64.c
index 5899322d7407..d6a9afbcf2d6 100644
--- a/compiler/plugins/target/ROCM/builtins/ukernel/iree_uk_amdgpu_argmax_f32i64.c
+++ b/compiler/plugins/target/ROCM/builtins/ukernel/iree_uk_amdgpu_argmax_f32i64.c
@@ -6,10 +6,10 @@
 
 #include "compiler/plugins/target/ROCM/builtins/ukernel/common.h"
 
-void iree_uk_amdgpu_argmax_f32i64(const float *inputBuffer,
-                                  int64_t input_offset, int64_t *outputBuffer,
-                                  int64_t output_offset,
-                                  int64_t reductionSize) {
+[[clang::always_inline]] void
+iree_uk_amdgpu_argmax_f32i64(const float *inputBuffer, int64_t input_offset,
+                             int64_t *outputBuffer, int64_t output_offset,
+                             int64_t reductionSize) {
   const int warpSize = __builtin_amdgcn_wavefrontsize();
   int32_t laneID = __builtin_amdgcn_workitem_id_x();
   // Set identity value to handle problem non divisible by subgroupSize.
diff --git a/compiler/plugins/target/ROCM/test/BUILD.bazel b/compiler/plugins/target/ROCM/test/BUILD.bazel
new file mode 100644
index 000000000000..bf9a18d582bd
--- /dev/null
+++ b/compiler/plugins/target/ROCM/test/BUILD.bazel
@@ -0,0 +1,26 @@
+# Copyright 2024 The IREE Authors
+#
+# Licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+load("//build_tools/bazel:build_defs.oss.bzl", "iree_cmake_extra_content")
+load("//build_tools/bazel:iree_lit_test.bzl", "iree_lit_test_suite")
+
+package(
+    features = ["layering_check"],
+    licenses = ["notice"],  # Apache 2.0
+)
+
+iree_lit_test_suite(
+    name = "lit",
+    srcs = [
+        "gpu_lower_to_ukernels.mlir",
+        "ukernel_pipeline_transform.mlir",
+    ],
+    cfg = "//compiler:lit.cfg.py",
+    tools = [
+        "//tools:iree-opt",
+        "@llvm-project//llvm:FileCheck",
+    ],
+)
diff --git a/compiler/plugins/target/ROCM/test/CMakeLists.txt b/compiler/plugins/target/ROCM/test/CMakeLists.txt
index df185a05e72b..6d2199d8c4bb 100644
--- a/compiler/plugins/target/ROCM/test/CMakeLists.txt
+++ b/compiler/plugins/target/ROCM/test/CMakeLists.txt
@@ -1,10 +1,33 @@
-# NOTE: Bazel testing of this backend is impossible because there is no way
-# for Bazel to bundle the AMD bitcode files that the backend depends on. Users
-# of the compiler can pass explicit flags, but we prefer that default tests
-# exercise default flags, which cannot be supported properly on Bazel builds.
+################################################################################
+# Autogenerated by build_tools/bazel_to_cmake/bazel_to_cmake.py from           #
+# compiler/plugins/target/ROCM/test/BUILD.bazel                                #
+#                                                                              #
+# Use iree_cmake_extra_content from iree/build_defs.oss.bzl to add arbitrary   #
+# CMake-only content.                                                          #
+#                                                                              #
+# To disable autogeneration for this file entirely, delete this header.        #
+################################################################################
 
 iree_add_all_subdirs()
 
+iree_lit_test_suite(
+  NAME
+    lit
+  SRCS
+    "gpu_lower_to_ukernels.mlir"
+    "ukernel_pipeline_transform.mlir"
+  TOOLS
+    FileCheck
+    iree-opt
+)
+
+### BAZEL_TO_CMAKE_PRESERVES_ALL_CONTENT_BELOW_THIS_LINE ###
+
+# NOTE: The following tests are CMake-only because they depend on AMD device
+# bitcode libraries that are provided by custom CMake code in target/ROCM.
+# By contrast, the above tests that only require ukernel bitcode are part of the
+# Bazel build because ukernel bitcode is something that we generate ourselves.
+
 iree_lit_test_suite(
   NAME
     lit
diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_lower_to_ukernels.mlir b/compiler/plugins/target/ROCM/test/gpu_lower_to_ukernels.mlir
similarity index 84%
rename from compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_lower_to_ukernels.mlir
rename to compiler/plugins/target/ROCM/test/gpu_lower_to_ukernels.mlir
index cc71c379959f..177bd0b36f7c 100644
--- a/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_lower_to_ukernels.mlir
+++ b/compiler/plugins/target/ROCM/test/gpu_lower_to_ukernels.mlir
@@ -1,5 +1,4 @@
-// RUN: iree-opt --split-input-file --iree-gpu-test-target=gfx1100 --pass-pipeline="builtin.module(func.func(iree-codegen-gpu-lower-to-ukernels,cse,canonicalize))" %s | FileCheck %s
-// RUN: iree-opt --split-input-file --iree-gpu-test-target=gfx90a --pass-pipeline="builtin.module(func.func(iree-codegen-gpu-lower-to-ukernels,cse,canonicalize))" %s | FileCheck %s --check-prefix=CDNA2
+// RUN: iree-opt --split-input-file --iree-gpu-test-target=gfx942 --pass-pipeline="builtin.module(func.func(iree-codegen-gpu-lower-to-ukernels,cse,canonicalize))" %s | FileCheck %s
 // RUN: iree-opt --split-input-file --iree-gpu-test-target=gfx908 --pass-pipeline="builtin.module(func.func(iree-codegen-gpu-lower-to-ukernels,cse,canonicalize))" %s | FileCheck %s --check-prefix=CDNA1
 
 func.func @argmax_2d_f32i64(%arg0 : tensor<1x?xf32>) -> tensor<1xi64> attributes {
@@ -28,7 +27,7 @@ func.func @argmax_2d_f32i64(%arg0 : tensor<1x?xf32>) -> tensor<1xi64> attributes
 //  CHECK-DAG:   %[[C1_index:.+]] = arith.constant 1 : index
 //  CHECK-DAG:   %[[C0_i64:.+]] = arith.constant 0
 //  CHECK-DAG:   %[[FILL:.+]] = linalg.fill ins(%[[C0_i64]]
-//      CHECK:   %[[MICRO_KERNEL:.+]] = iree_codegen.ukernel.generic "iree_uk_amdgpu_argmax_f32i64"
+//      CHECK:   %[[MICRO_KERNEL:.+]] = iree_codegen.ukernel.generic {hal.executable.objects = [{{.*}}]} "iree_uk_amdgpu_argmax_f32i64"
 // CHECK-SAME:       ins(%[[ARG0]] :
 // CHECK-SAME:       outs(%[[FILL]] :
 //      CHECK:   return %[[MICRO_KERNEL]]
@@ -284,3 +283,51 @@ func.func @argmax_ukernel_unsupported_arch(%arg0 : tensor<1x?xf32>) -> tensor<1x
 //      CDNA1-LABEL: func @argmax_ukernel_unsupported_arch(
 //      CDNA1-NOT: iree_codegen.ukernel.generic
 //      CDNA1: linalg.generic
+
+// -----
+
+// Test user-provided bitcode in the source IR.
+
+func.func @argmax_2d_f32i64(%arg0 : tensor<1x?xf32>) -> tensor<1xi64> attributes {
+  hal.executable.target = #hal.executable.target<"rocm", "rocm-hsaco-fb", {ukernels = "all"}>,
+  // Dummy bitcode with an unusual length of 12. The first 4 bytes are the .bc file format signature.
+  hal.executable.objects = [
+    #hal.executable.object<{
+      path = "iree_uk_amdgpu_argmax_f32i64.gfx942.bc",
+      data = dense<[66, 67, -64, -34, 1, 35, 69, 103, -119, -85, -51, -17]> : tensor<12xi8>
+    }>
+  ]
+} {
+  %c0_i64 = arith.constant 0 : i64
+  %cst = arith.constant 0xFF800000 : f32
+  %0 = tensor.empty() : tensor<1xi64>
+  %1 = linalg.fill ins(%c0_i64 : i64) outs(%0 : tensor<1xi64>) -> tensor<1xi64>
+  %2 = tensor.empty() : tensor<1xf32>
+  %3 = linalg.fill ins(%cst : f32) outs(%2 : tensor<1xf32>) -> tensor<1xf32>
+  %4:2 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"]} ins(%arg0 : tensor<1x?xf32>) outs(%3, %1 : tensor<1xf32>, tensor<1xi64>) {
+  ^bb0(%in: f32, %out: f32, %out_0: i64):
+    %5 = linalg.index 1 : index
+    %6 = arith.index_cast %5 : index to i64
+    %7 = arith.maximumf %in, %out : f32
+    %8 = arith.cmpf ogt, %in, %out : f32
+    %9 = arith.select %8, %6, %out_0 : i64
+    linalg.yield %7, %9 : f32, i64
+  } -> (tensor<1xf32>, tensor<1xi64>)
+  return %4#1 : tensor<1xi64>
+}
+
+//CHECK-LABEL: func @argmax_2d_f32i64(
+// CHECK-SAME:     %[[ARG0:[a-zA-Z0-9]+]]: tensor<1x?xf32>
+//  CHECK-DAG:   %[[C1_index:.+]] = arith.constant 1 : index
+//  CHECK-DAG:   %[[C0_i64:.+]] = arith.constant 0
+//  CHECK-DAG:   %[[FILL:.+]] = linalg.fill ins(%[[C0_i64]]
+//      CHECK:   %[[MICRO_KERNEL:.+]] = iree_codegen.ukernel.generic {
+// CHECK-SAME:     hal.executable.objects = [
+// CHECK-SAME:       #hal.executable.object<{
+// CHECK-SAME:         path = "iree_uk_amdgpu_argmax_f32i64.gfx942.bc",
+// CHECK-SAME:         data = dense<[66, 67, -64, -34, 1, 35, 69, 103, -119, -85, -51, -17]> : tensor<12xi8>
+// CHECK-SAME:       }>
+// CHECK-SAME:     ]} "iree_uk_amdgpu_argmax_f32i64"
+// CHECK-SAME:       ins(%[[ARG0]] :
+// CHECK-SAME:       outs(%[[FILL]] :
+//      CHECK:   return %[[MICRO_KERNEL]]
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ukernel_pipeline_transform.mlir b/compiler/plugins/target/ROCM/test/ukernel_pipeline_transform.mlir
similarity index 90%
rename from compiler/src/iree/compiler/Codegen/LLVMGPU/test/ukernel_pipeline_transform.mlir
rename to compiler/plugins/target/ROCM/test/ukernel_pipeline_transform.mlir
index af8ecf196115..26ce4c8959f4 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ukernel_pipeline_transform.mlir
+++ b/compiler/plugins/target/ROCM/test/ukernel_pipeline_transform.mlir
@@ -4,10 +4,11 @@
   #hal.pipeline.binding<storage_buffer>,
   #hal.pipeline.binding<storage_buffer>
 ]>
-#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {ukernels = "argmax"}>
 #map = affine_map<(d0) -> (d0)>
 #map1 = affine_map<(d0) -> ()>
-func.func @argmax_1d_f16i64() attributes {hal.executable.target = #executable_target_rocm_hsaco_fb} {
+func.func @argmax_1d_f16i64() attributes {
+  hal.executable.target = #hal.executable.target<"rocm", "rocm-hsaco-fb", {ukernels = "argmax"}>
+} {
   %c32_i64 = arith.constant 32 : i64
   %cst = arith.constant 0xFC00 : f16
   %c0_i64 = arith.constant 0 : i64
@@ -43,7 +44,7 @@ func.func @argmax_1d_f16i64() attributes {hal.executable.target = #executable_ta
 //       CHECK: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info<pipeline = LLVMGPUDefault workgroup_size = [32, 1, 1]>
 //       CHECK: func.func @argmax_1d_f16i64()
 //  CHECK-SAME:     translation_info = #[[$TRANSLATION]]
-//       CHECK:   iree_codegen.ukernel.generic  "iree_uk_amdgpu_argmax_f16i64"
+//       CHECK:   iree_codegen.ukernel.generic {hal.executable.objects = [{{.*}}]} "iree_uk_amdgpu_argmax_f16i64"
 
 // -----
 
@@ -51,10 +52,11 @@ func.func @argmax_1d_f16i64() attributes {hal.executable.target = #executable_ta
   #hal.pipeline.binding<storage_buffer>,
   #hal.pipeline.binding<storage_buffer>
 ]>
-#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {ukernels = "argmax"}>
 #map = affine_map<(d0, d1) -> (d0, d1)>
 #map1 = affine_map<(d0, d1) -> (d0)>
-func.func @argmax_2d_f32i64() attributes {hal.executable.target = #executable_target_rocm_hsaco_fb} {
+func.func @argmax_2d_f32i64() attributes {
+  hal.executable.target = #hal.executable.target<"rocm", "rocm-hsaco-fb", {ukernels = "argmax"}>
+} {
   %c32_i64 = arith.constant 32 : i64
   %cst = arith.constant 0xFF800000 : f32
   %c0_i64 = arith.constant 0 : i64
@@ -92,7 +94,7 @@ func.func @argmax_2d_f32i64() attributes {hal.executable.target = #executable_ta
 // CHECK-SAME:     translation_info = #[[$TRANSLATION]]
 //      CHECK:   %[[SUBVIEW:.*]] = memref.subview{{.*}} memref<16x?xf32
 // CHECK-SAME:        to memref<1x?xf32
-//      CHECK:   iree_codegen.ukernel.generic  "iree_uk_amdgpu_argmax_f32i64" ins(%[[SUBVIEW]]
+//      CHECK:   iree_codegen.ukernel.generic {hal.executable.objects = [{{.*}}]} "iree_uk_amdgpu_argmax_f32i64" ins(%[[SUBVIEW]]
 
 // -----
 
@@ -100,10 +102,11 @@ func.func @argmax_2d_f32i64() attributes {hal.executable.target = #executable_ta
   #hal.pipeline.binding<storage_buffer>,
   #hal.pipeline.binding<storage_buffer>
 ]>
-#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb">
 #map = affine_map<(d0) -> (d0)>
 #map1 = affine_map<(d0) -> ()>
-func.func @no_ukernel_argmax_1d_f16i64() attributes {hal.executable.target = #executable_target_rocm_hsaco_fb} {
+func.func @no_ukernel_argmax_1d_f16i64() attributes {
+  hal.executable.target = #hal.executable.target<"rocm", "rocm-hsaco-fb">
+} {
   %c32_i64 = arith.constant 32 : i64
   %cst = arith.constant 0xFC00 : f16
   %c0_i64 = arith.constant 0 : i64
@@ -147,10 +150,11 @@ func.func @no_ukernel_argmax_1d_f16i64() attributes {hal.executable.target = #ex
   #hal.pipeline.binding<storage_buffer>,
   #hal.pipeline.binding<storage_buffer>
 ]>
-#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {ukernels = "argmax"}>
 #map = affine_map<(d0) -> (d0)>
 #map1 = affine_map<(d0) -> ()>
-func.func @not_neg_inf_init_argmax_1d() attributes {hal.executable.target = #executable_target_rocm_hsaco_fb} {
+func.func @not_neg_inf_init_argmax_1d() attributes {
+  hal.executable.target = #hal.executable.target<"rocm", "rocm-hsaco-fb", {ukernels = "argmax"}>
+} {
   %c32_i64 = arith.constant 32 : i64
   %cst = arith.constant 0.000000e+00 : f16
   %c0_i64 = arith.constant 0 : i64
diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/GPULowerToUKernels.cpp b/compiler/src/iree/compiler/Codegen/Common/GPU/GPULowerToUKernels.cpp
index f76cdd1c6ccf..a72b4ff8e180 100644
--- a/compiler/src/iree/compiler/Codegen/Common/GPU/GPULowerToUKernels.cpp
+++ b/compiler/src/iree/compiler/Codegen/Common/GPU/GPULowerToUKernels.cpp
@@ -9,9 +9,12 @@
 #include "iree/compiler/Codegen/Dialect/Codegen/IR/UKernelOps.h"
 #include "iree/compiler/Codegen/Utils/GPUUtils.h"
 #include "iree/compiler/Codegen/Utils/Utils.h"
+#include "iree/compiler/Utils/EmbeddedDataDirectory.h"
+#include "llvm/Support/FormatVariadic.h"
 #include "mlir/Dialect/Linalg/IR/Linalg.h"
 #include "mlir/Dialect/Linalg/Utils/Utils.h"
 #include "mlir/Dialect/Tensor/IR/Tensor.h"
+#include "mlir/IR/AsmState.h"
 #include "mlir/IR/Attributes.h"
 #include "mlir/IR/BuiltinAttributes.h"
 #include "mlir/IR/MLIRContext.h"
@@ -24,22 +27,101 @@ namespace mlir::iree_compiler {
 
 namespace {
 
+// Returns an ExecutableObjectAttr carrying the bitcode for the given ukernel.
+//
+// First tries finding the bitcode in the input `sourceExecutableObjects`, which
+// must be an array of ExecutableObjectAttr's and is typically coming from a
+// hal.executable.objects array attribute in the source IR, which is the
+// mechanism by which source programs may provide their own ukernel bitcode.
+//
+// If no matching bitcode was found in `sourceExecutableObjects`, this function
+// will then search in bitcode files that we have embedded as static data.
+static IREE::HAL::ExecutableObjectAttr
+getUKernelBitcode(OpBuilder &builder,
+                  IREE::HAL::ExecutableTargetAttr execTarget,
+                  ArrayAttr sourceExecutableObjects, StringRef ukernelName) {
+  IREE::GPU::TargetAttr gpuTarget = getGPUTargetAttr(execTarget);
+  if (!gpuTarget) {
+    return {};
+  }
+  StringRef gpuArch = gpuTarget.getArch();
+  std::string bitcodeFilename =
+      llvm::formatv("{0}.{1}.bc", ukernelName, gpuArch);
+
+  // Early-return if the source executable.objects already contain an object
+  // with the expected file name. This happens with user-provided bitcode in the
+  // source IR.
+  if (sourceExecutableObjects) {
+    for (Attribute a : sourceExecutableObjects) {
+      if (auto object = dyn_cast<IREE::HAL::ExecutableObjectAttr>(a)) {
+        if (object.getPath() == bitcodeFilename) {
+          return object;
+        }
+      }
+    }
+  }
+
+  // No user-provided bitcode, so we search our embedded bitcode files in the
+  // EmbeddedDataDirectory singleton.
+  std::optional<StringRef> bitcode;
+  EmbeddedDataDirectory::withGlobal([&](EmbeddedDataDirectory &dir) {
+    bitcode = dir.getFile(bitcodeFilename);
+  });
+  if (!bitcode) {
+    return {};
+  }
+  MLIRContext *context = builder.getContext();
+  auto blob = HeapAsmResourceBlob::allocateAndCopyInferAlign(
+      ArrayRef<char>(bitcode->data(), bitcode->size()));
+  auto bitcodeDenseAttr = DenseI8ResourceElementsAttr::get(
+      VectorType::get({static_cast<int64_t>(bitcode->size())},
+                      builder.getI8Type()),
+      bitcodeFilename, std::move(blob));
+  return IREE::HAL::ExecutableObjectAttr::get(
+      context, StringAttr::get(context, bitcodeFilename),
+      cast<IREE::Util::SerializableAttrInterface>(bitcodeDenseAttr));
+}
+
+// Walks parent ops from `op` to return the nearest hal.executable.objects
+// array attribute. If the parent hal.executable.variant is reached, its objects
+// attribute is returned.
+// Adapted from ExecutableTargetAttr::lookup.
+static ArrayAttr lookUpExecutableObjects(Operation *op) {
+  MLIRContext *context = op->getContext();
+  auto attrId = StringAttr::get(context, "hal.executable.objects");
+  while (op) {
+    // Take directly from the enclosing variant.
+    if (auto variantOp = dyn_cast<IREE::HAL::ExecutableVariantOp>(op)) {
+      if (std::optional<ArrayAttr> objects = variantOp.getObjects()) {
+        return *objects;
+      }
+    }
+    // Take from op attributes.
+    if (auto attr = op->getAttrOfType<ArrayAttr>(attrId)) {
+      return attr;
+    }
+    // Continue walk.
+    op = op->getParentOp();
+  }
+  return {};
+}
+
 /// Holds a function name and attributes.
 struct FnNameAndDefAttrs {
   std::string name;
   SmallVector<NamedAttribute> defAttrs;
+  explicit operator bool() const { return !name.empty(); }
 };
 
 /// Returns the function name and attributes to use for a ukernel with given
-/// `ukernelName` on the target described by `targetAttr`.
+/// `name` and `suffix` on the target described by `targetAttr`.
 static FnNameAndDefAttrs
-getFnNameAndDefAttrs(const char *ukernelName, std::string &typeSuffixID,
+getFnNameAndDefAttrs(const char *name, std::string &suffix,
                      RewriterBase &rewriter,
                      IREE::HAL::ExecutableTargetAttr targetAttr) {
   FnNameAndDefAttrs result;
   if (isROCMBackend(targetAttr)) {
-    result.name =
-        std::string("iree_uk_amdgpu_") + ukernelName + "_" + typeSuffixID;
+    result.name = llvm::formatv("iree_uk_amdgpu_{0}_{1}", name, suffix);
     result.defAttrs.emplace_back(rewriter.getStringAttr("vm.import.module"),
                                  rewriter.getStringAttr("rocm"));
   }
@@ -54,9 +136,21 @@ static FailureOr<IREE::Codegen::UKernelOpInterface>
 matchArgmaxDAGForUKernel(RewriterBase &rewriter, linalg::GenericOp op) {
   auto targetAttr = IREE::HAL::ExecutableTargetAttr::lookup(op);
   const char ukernelName[] = "argmax";
-  if (!hasUkernel(targetAttr, ukernelName) ||
-      !hasUkernelSupportedGpuArch(targetAttr)) {
-    return failure();
+  Value input = op.getDpsInputOperand(0)->get();
+  auto inputType = cast<ShapedType>(input.getType());
+  Value index = op.getDpsInitOperand(1)->get();
+  auto indexType = cast<ShapedType>(index.getType());
+  std::string suffix;
+  llvm::raw_string_ostream(suffix)
+      << inputType.getElementType() << indexType.getElementType();
+  FnNameAndDefAttrs fn =
+      getFnNameAndDefAttrs(ukernelName, suffix, rewriter, targetAttr);
+  if (!fn) {
+    return rewriter.notifyMatchFailure(op, "no ukernels on this backend");
+  }
+
+  if (!hasUkernel(targetAttr, ukernelName)) {
+    return rewriter.notifyMatchFailure(op, "ukernel not enabled");
   }
 
   // Currently only support argmax where parallel dims are 1.
@@ -74,68 +168,40 @@ matchArgmaxDAGForUKernel(RewriterBase &rewriter, linalg::GenericOp op) {
     }
     parallelSize *= bounds[dim];
   }
-  if (parallelSize != 1)
-    return failure();
-
-  // Get value/input type.
-  Value input = op.getDpsInputOperand(0)->get();
-  auto inputType = llvm::cast<ShapedType>(input.getType());
-  Type inputElemType = inputType.getElementType();
-  // Only support f16 and f32 values.
-  if (!inputElemType.isF16() && !inputElemType.isF32()) {
-    return failure();
-  }
-
-  // Get index type.
-  Value index = op.getDpsInitOperand(1)->get();
-  auto indexType = llvm::cast<ShapedType>(index.getType());
-  Type indexElemType = indexType.getElementType();
-  // Only support i32 and i64 index.
-  if (!indexElemType.isInteger(32) && !indexElemType.isInteger(64)) {
+  if (parallelSize != 1) {
     return failure();
   }
-
-  std::string typeSuffixID;
-  llvm::raw_string_ostream(typeSuffixID) << inputElemType << indexElemType;
-  // TODO(bjacob): this check won't be needed one this code will be updated to
-  // look up the table of contents of embedded bitcode files, one per symbol.
-  if (!(typeSuffixID == "f16i32" || typeSuffixID == "f16i64" ||
-        typeSuffixID == "f32i32" || typeSuffixID == "f32i64")) {
-    return rewriter.notifyMatchFailure(
-        op, "unsupported combination of element types");
+  auto execTarget = IREE::HAL::ExecutableTargetAttr::lookup(op);
+  ArrayAttr sourceExecutableObjects = lookUpExecutableObjects(op);
+  IREE::HAL::ExecutableObjectAttr bitcodeObject =
+      getUKernelBitcode(rewriter, execTarget, sourceExecutableObjects, fn.name);
+  if (!bitcodeObject) {
+    return rewriter.notifyMatchFailure(op, "no ukernel bitcode for this op");
   }
-
   Location loc = op.getLoc();
   // Currently only support 1D reduction, where reduc is on fastest dim.
   // Tiling argmax ukernel is also set to enforce this structure.
   const int kReductionDim = op.getNumLoops() - 1;
   Value reductionDimSize =
       rewriter.create<tensor::DimOp>(loc, input, kReductionDim);
-  auto fn =
-      getFnNameAndDefAttrs(ukernelName, typeSuffixID, rewriter, targetAttr);
   auto genericMicroKernelOp = rewriter.create<IREE::Codegen::UKernelGenericOp>(
       loc, indexType, fn.name, ValueRange{input}, index,
       ValueRange{reductionDimSize},
       /*fn_def_attrs=*/rewriter.getDictionaryAttr(fn.defAttrs),
       /*strided_outer_dims=*/rewriter.getIndexAttr(0));
+  genericMicroKernelOp->setAttr(
+      "hal.executable.objects",
+      ArrayAttr::get(rewriter.getContext(), bitcodeObject));
   return cast<IREE::Codegen::UKernelOpInterface>(
       genericMicroKernelOp.getOperation());
 }
 
-using TargetPredicate = std::function<bool(IREE::HAL::ExecutableTargetAttr)>;
-
 struct LowerArgmaxToUKernelPattern : OpRewritePattern<linalg::GenericOp> {
-  LowerArgmaxToUKernelPattern(MLIRContext *context,
-                              TargetPredicate targetPredicate)
-      : OpRewritePattern<linalg::GenericOp>(context),
-        targetPredicate(targetPredicate) {}
+  LowerArgmaxToUKernelPattern(MLIRContext *context)
+      : OpRewritePattern<linalg::GenericOp>(context) {}
 
   LogicalResult matchAndRewrite(linalg::GenericOp op,
                                 PatternRewriter &rewriter) const override {
-    if (targetPredicate &&
-        !targetPredicate(IREE::HAL::ExecutableTargetAttr::lookup(op))) {
-      return failure();
-    }
     if (failed(isArgmaxOp(op))) {
       return failure();
     }
@@ -149,8 +215,6 @@ struct LowerArgmaxToUKernelPattern : OpRewritePattern<linalg::GenericOp> {
                                 ukernelOp.value()->getResults());
     return success();
   }
-
-  TargetPredicate targetPredicate;
 };
 
 struct GPULowerToUKernelsPass final
@@ -170,7 +234,7 @@ struct GPULowerToUKernelsPass final
     // evidence that it is difficult for codegen to consistently approach
     // microkernels performance, and that consideration overrides the benefit of
     // fusions for these ops.
-    patterns.insert<LowerArgmaxToUKernelPattern>(context, isROCMBackend);
+    patterns.insert<LowerArgmaxToUKernelPattern>(context);
     if (failed(applyPatternsAndFoldGreedily(getOperation(),
                                             std::move(patterns)))) {
       return signalPassFailure();
diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/Passes.td b/compiler/src/iree/compiler/Codegen/Common/GPU/Passes.td
index e8f1551c477a..15c6e9c23c93 100644
--- a/compiler/src/iree/compiler/Codegen/Common/GPU/Passes.td
+++ b/compiler/src/iree/compiler/Codegen/Common/GPU/Passes.td
@@ -107,7 +107,7 @@ def GPUInferMemorySpacePass :
 
 def GPULowerToUKernelsPass :
     Pass<"iree-codegen-gpu-lower-to-ukernels", ""> {
-  let summary = "Separate out parts of the IR that lower to a micro-kernel";
+  let summary = "Lower suitable ops to microkernels.";
   let dependentDialects = [
     "::mlir::iree_compiler::IREE::Codegen::IREECodegenDialect",
     "::mlir::iree_compiler::IREE::GPU::IREEGPUDialect",
diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/test/BUILD.bazel b/compiler/src/iree/compiler/Codegen/Common/GPU/test/BUILD.bazel
index 7d0e6887d717..7fe33161bca1 100644
--- a/compiler/src/iree/compiler/Codegen/Common/GPU/test/BUILD.bazel
+++ b/compiler/src/iree/compiler/Codegen/Common/GPU/test/BUILD.bazel
@@ -30,7 +30,6 @@ iree_lit_test_suite(
             "gpu_fuse_and_hoist_forall.mlir",
             "gpu_greedily_distribute_to_threads.mlir",
             "gpu_infer_memory_space.mlir",
-            "gpu_lower_to_ukernels.mlir",
             "gpu_combine_value_barriers.mlir",
             "gpu_materialize_encoding_gfx908.mlir",
             "gpu_materialize_encoding_gfx90a.mlir",
@@ -58,8 +57,6 @@ iree_lit_test_suite(
             "vector_reduction_to_gpu.mlir",
         ],
         include = ["*.mlir"],
-        exclude = [
-        ],
     ),
     cfg = "//compiler:lit.cfg.py",
     tools = [
diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/test/CMakeLists.txt b/compiler/src/iree/compiler/Codegen/Common/GPU/test/CMakeLists.txt
index a9c584acd96d..4b9853df8213 100644
--- a/compiler/src/iree/compiler/Codegen/Common/GPU/test/CMakeLists.txt
+++ b/compiler/src/iree/compiler/Codegen/Common/GPU/test/CMakeLists.txt
@@ -26,7 +26,6 @@ iree_lit_test_suite(
     "gpu_generalize_named_ops.mlir"
     "gpu_greedily_distribute_to_threads.mlir"
     "gpu_infer_memory_space.mlir"
-    "gpu_lower_to_ukernels.mlir"
     "gpu_materialize_encoding_gfx1100.mlir"
     "gpu_materialize_encoding_gfx908.mlir"
     "gpu_materialize_encoding_gfx90a.mlir"
diff --git a/compiler/src/iree/compiler/Codegen/Dialect/Codegen/IR/UKernelOps.cpp b/compiler/src/iree/compiler/Codegen/Dialect/Codegen/IR/UKernelOps.cpp
index b044244dbc0a..dbf082b5be03 100644
--- a/compiler/src/iree/compiler/Codegen/Dialect/Codegen/IR/UKernelOps.cpp
+++ b/compiler/src/iree/compiler/Codegen/Dialect/Codegen/IR/UKernelOps.cpp
@@ -61,7 +61,12 @@ createFunctionCall(RewriterBase &rewriter, Operation *op, StringRef fnName,
   }
 
   // Insert the function call.
-  return rewriter.create<func::CallOp>(loc, fnDecl, callOperands);
+  auto callOp = rewriter.create<func::CallOp>(loc, fnDecl, callOperands);
+  if (op->hasAttr("hal.executable.objects")) {
+    callOp->setAttr("hal.executable.objects",
+                    op->getAttr("hal.executable.objects"));
+  }
+  return callOp;
 }
 
 //===---------------------------------------------------------------------===//
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp
index d63fd2d5d258..23e5cbb13e27 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp
@@ -1824,18 +1824,25 @@ static LogicalResult setTransposeConfig(mlir::FunctionOpInterface entryPoint,
 // UKernel Pipeline Configuration
 //====---------------------------------------------------------------------===//
 
-/// Set the configuration for argmax that can be mapped to argmax uKernel.
+/// Set the configuration for argmax when ukernels are enabled.
 /// Distribute all parallel dim across different workgroups, and only use single
 /// subgroup per workgroup.
+///
+/// TODO(bjacob): This is fragile, as we can't know yet if this argmax will be
+/// lowered to a ukernel. We need instead a config that works regardless of
+/// ukernels. For now, we use the looser condition that the argmax ukernel is
+/// enabled, a necessary but not sufficient condition for this particular op to
+/// lower to the ukernel. This is good enough for now for a couple of reasons:
+/// 1. Even if an argmax does not actually lower to a ukernel, this config
+///    should still work.
+/// 2. Ukernels are not enabled by default.
 static LogicalResult
 setArgmaxUkernelConfig(IREE::GPU::TargetAttr target,
                        mlir::FunctionOpInterface entryPoint,
                        linalg::GenericOp op) {
   // Checks if UKernels are enabled.
   if (auto target = IREE::HAL::ExecutableTargetAttr::lookup(entryPoint)) {
-    const char ukernelName[] = "argmax";
-    if (!hasUkernel(target, ukernelName) ||
-        !hasUkernelSupportedGpuArch(target)) {
+    if (!hasUkernel(target, "argmax")) {
       return failure();
     }
   }
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/BUILD.bazel b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/BUILD.bazel
index 756327ca4475..5d4042975f29 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/BUILD.bazel
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/BUILD.bazel
@@ -65,7 +65,6 @@ iree_lit_test_suite(
             "transform_gpu_pipelining.mlir",
             "transform_vector_to_mma.mlir",
             "transpose_pipeline_test.mlir",
-            "ukernel_pipeline_transform.mlir",
             "configure_tensor_layout.mlir",
             "vector_lowering.mlir",
             "vector_to_gpu.mlir",
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/CMakeLists.txt b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/CMakeLists.txt
index 1d0dcc979a56..fb9e495d9535 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/CMakeLists.txt
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/CMakeLists.txt
@@ -62,7 +62,6 @@ iree_lit_test_suite(
     "transform_gpu_pipelining.mlir"
     "transform_vector_to_mma.mlir"
     "transpose_pipeline_test.mlir"
-    "ukernel_pipeline_transform.mlir"
     "vector_lowering.mlir"
     "vector_to_gpu.mlir"
     "winograd_pipeline_test.mlir"
diff --git a/compiler/src/iree/compiler/Codegen/Utils/GPUUtils.cpp b/compiler/src/iree/compiler/Codegen/Utils/GPUUtils.cpp
index 612183d94eda..8f09f6f932f8 100644
--- a/compiler/src/iree/compiler/Codegen/Utils/GPUUtils.cpp
+++ b/compiler/src/iree/compiler/Codegen/Utils/GPUUtils.cpp
@@ -936,42 +936,6 @@ bool sharedMemTransposeFilter(AffineMap indexMap) {
   return false;
 }
 
-//===----------------------------------------------------------------------===//
-// GPU UKernel Utils
-//===----------------------------------------------------------------------===//
-
-// TODO: Add more popular kernels into this list and the ukernel cmake.
-//       No real technical reason to only allow these aside from compile
-//       time and diskspace.
-bool hasUkernelSupportedRocmArch(StringRef targetChip) {
-  const char *kSupportedTargetChip[] = {"gfx90a", "gfx942", "gfx1030",
-                                        "gfx1100"};
-  size_t arraySize =
-      sizeof(kSupportedTargetChip) / sizeof(kSupportedTargetChip[0]);
-  for (int i = 0; i < arraySize; i++) {
-    // return true if targetChip is found inside kSupportedTargetChip.
-    if (targetChip.compare(kSupportedTargetChip[i]) == 0)
-      return true;
-  }
-  return false;
-}
-
-bool hasUkernelSupportedRocmArch(IREE::HAL::ExecutableTargetAttr targetAttr) {
-  auto targetArch = getGPUTargetAttr(targetAttr).getArch();
-  if (targetArch.empty())
-    return false;
-  return hasUkernelSupportedRocmArch(targetArch);
-}
-
-/// Checks if target GPU has UKernel support.
-bool hasUkernelSupportedGpuArch(IREE::HAL::ExecutableTargetAttr targetAttr) {
-  if (isROCMBackend(targetAttr) && hasUkernelSupportedRocmArch(targetAttr)) {
-    return true;
-  }
-  // TODO: Once plumbed, add a CUDA backend and supported cuda arch check.
-  return false;
-}
-
 //===----------------------------------------------------------------------===//
 // GPU Target Information
 //===----------------------------------------------------------------------===//
diff --git a/compiler/src/iree/compiler/Codegen/Utils/GPUUtils.h b/compiler/src/iree/compiler/Codegen/Utils/GPUUtils.h
index 133d7246a5fa..1bd088588af8 100644
--- a/compiler/src/iree/compiler/Codegen/Utils/GPUUtils.h
+++ b/compiler/src/iree/compiler/Codegen/Utils/GPUUtils.h
@@ -174,16 +174,6 @@ combiningKindToAllReduce(vector::CombiningKind kind);
 /// using shared memory when CodeGen towards the GPU.
 bool sharedMemTransposeFilter(AffineMap indexMap);
 
-//===----------------------------------------------------------------------===//
-// GPU UKernel Utils
-//===----------------------------------------------------------------------===//
-
-/// Checks if target Chip(StringRef) has UKernel support.
-bool hasUkernelSupportedRocmArch(StringRef targetChip);
-
-/// Checks if targetAttr's GPU target has UKernel support.
-bool hasUkernelSupportedGpuArch(IREE::HAL::ExecutableTargetAttr targetAttr);
-
 //===----------------------------------------------------------------------===//
 // GPU Target Information
 //===----------------------------------------------------------------------===//
diff --git a/compiler/src/iree/compiler/Utils/BUILD.bazel b/compiler/src/iree/compiler/Utils/BUILD.bazel
index dbcdc0156dae..c7c2acc2a8fd 100644
--- a/compiler/src/iree/compiler/Utils/BUILD.bazel
+++ b/compiler/src/iree/compiler/Utils/BUILD.bazel
@@ -31,6 +31,7 @@ iree_compiler_cc_library(
     hdrs = [
         "ConversionUtils.h",
         "ElementPackingUtils.h",
+        "EmbeddedDataDirectory.h",
         "EquivalenceUtils.h",
         "FlatbufferUtils.h",
         "Folding.h",
diff --git a/compiler/src/iree/compiler/Utils/CMakeLists.txt b/compiler/src/iree/compiler/Utils/CMakeLists.txt
index c4f20b2ac74f..84be0745bbf6 100644
--- a/compiler/src/iree/compiler/Utils/CMakeLists.txt
+++ b/compiler/src/iree/compiler/Utils/CMakeLists.txt
@@ -16,6 +16,7 @@ iree_cc_library(
   HDRS
     "ConversionUtils.h"
     "ElementPackingUtils.h"
+    "EmbeddedDataDirectory.h"
     "EquivalenceUtils.h"
     "FlatbufferUtils.h"
     "Folding.h"
diff --git a/compiler/src/iree/compiler/Utils/EmbeddedDataDirectory.h b/compiler/src/iree/compiler/Utils/EmbeddedDataDirectory.h
new file mode 100644
index 000000000000..130db1d68bf1
--- /dev/null
+++ b/compiler/src/iree/compiler/Utils/EmbeddedDataDirectory.h
@@ -0,0 +1,59 @@
+// Copyright 2024 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef IREE_COMPILER_UTILS_EMBEDDEDDATADIRECTORY_H_
+#define IREE_COMPILER_UTILS_EMBEDDEDDATADIRECTORY_H_
+
+#include <mutex>
+#include "llvm/ADT/StringMap.h"
+
+namespace mlir::iree_compiler {
+
+// A string-to-StringRef map that acts as a virtual filesystem: the keys are
+// "filenames" and the values are file contents.
+class EmbeddedDataDirectory {
+public:
+  // Calls the given `callback` on a global singleton object, guarded by a
+  // global mutex.
+  //
+  // Only use this for use cases that require a global object, such as when
+  // exporting data between parts of the compiler that can't directly link to
+  // each other (e.g. from a plugin to outside of the plugin).
+  static void
+  withGlobal(llvm::function_ref<void(EmbeddedDataDirectory &)> callback) {
+    static EmbeddedDataDirectory dir;
+    static std::mutex mutex;
+    std::lock_guard<std::mutex> lock(mutex);
+    callback(dir);
+  }
+
+  // Add a new entry if it didn't already exist. Return `true` if it was added.
+  bool addFile(llvm::StringRef fileName, llvm::StringRef contents) {
+    auto [_iter, success] = map.insert({fileName, contents});
+    return success;
+  }
+
+  // Get an existing entry if it exists, otherwise return nullopt.
+  std::optional<llvm::StringRef> getFile(llvm::StringRef fileName) const {
+    auto iter = map.find(fileName);
+    if (iter == map.end()) {
+      return std::nullopt;
+    }
+    return iter->getValue();
+  }
+
+  // Direct access to the underlying StringMap, for use cases that are not well
+  // served by convenience methods like addFile and getFile. For example,
+  // iterating over all entries.
+  llvm::StringMap<llvm::StringRef> &getMap() { return map; }
+
+private:
+  llvm::StringMap<llvm::StringRef> map;
+};
+
+} // namespace mlir::iree_compiler
+
+#endif // IREE_COMPILER_UTILS_EMBEDDEDDATADIRECTORY_H_
diff --git a/compiler/src/iree/compiler/Utils/unittests/BUILD.bazel b/compiler/src/iree/compiler/Utils/unittests/BUILD.bazel
index 197037a2e5ba..3581fddea250 100644
--- a/compiler/src/iree/compiler/Utils/unittests/BUILD.bazel
+++ b/compiler/src/iree/compiler/Utils/unittests/BUILD.bazel
@@ -19,5 +19,6 @@ iree_compiler_cc_test(
         "//compiler/src/iree/compiler/Utils",
         "//compiler/src/iree/testing:gtest_main",
         "@com_google_googletest//:gtest",
+        "@llvm-project//llvm:Support",
     ],
 )
diff --git a/compiler/src/iree/compiler/Utils/unittests/CMakeLists.txt b/compiler/src/iree/compiler/Utils/unittests/CMakeLists.txt
index 421262fd5d71..a850b2d7c83c 100644
--- a/compiler/src/iree/compiler/Utils/unittests/CMakeLists.txt
+++ b/compiler/src/iree/compiler/Utils/unittests/CMakeLists.txt
@@ -16,6 +16,7 @@ iree_cc_test(
   SRCS
     "UtilsTest.cpp"
   DEPS
+    LLVMSupport
     gmock
     gtest
     iree::compiler::Utils
diff --git a/compiler/src/iree/compiler/Utils/unittests/UtilsTest.cpp b/compiler/src/iree/compiler/Utils/unittests/UtilsTest.cpp
index d3ad37ca7512..39dc6cd1e712 100644
--- a/compiler/src/iree/compiler/Utils/unittests/UtilsTest.cpp
+++ b/compiler/src/iree/compiler/Utils/unittests/UtilsTest.cpp
@@ -6,8 +6,11 @@
 
 #include <gmock/gmock.h>
 #include <gtest/gtest.h>
+#include <thread>
 
+#include "iree/compiler/Utils/EmbeddedDataDirectory.h"
 #include "iree/compiler/Utils/Permutation.h"
+#include "llvm/Support/FormatVariadic.h"
 
 using namespace mlir::iree_compiler;
 using namespace testing;
@@ -19,3 +22,47 @@ TEST(Permutation, MakeMovePermutation) {
   EXPECT_THAT(makeMovePermutation(3, 1, 2), ElementsAre(0, 2, 1));
   EXPECT_THAT(makeMovePermutation(3, 2, 0), ElementsAre(2, 0, 1));
 }
+
+TEST(EmbeddedDataDirectory, AddFileGetFile) {
+  EmbeddedDataDirectory dir;
+  EXPECT_TRUE(dir.addFile("filename1", "file contents 1"));
+  EXPECT_TRUE(dir.addFile("filename2", "file contents 2"));
+  EXPECT_FALSE(dir.addFile("filename1", "file contents 3"));
+  EXPECT_EQ(dir.getFile("filename1"), "file contents 1");
+  EXPECT_EQ(dir.getFile("filename2"), "file contents 2");
+  EXPECT_EQ(dir.getFile("filename3"), std::nullopt);
+}
+
+TEST(EmbeddedDataDirectory, WithGlobal) {
+  std::vector<std::thread> threads;
+  for (int i = 0; i < 3; ++i) {
+    threads.emplace_back([i] {
+      EmbeddedDataDirectory::withGlobal([i](EmbeddedDataDirectory &globalDir) {
+        EXPECT_TRUE(globalDir.addFile(llvm::formatv("filename{0}", i).str(),
+                                      "file contents xxx"));
+      });
+    });
+  }
+  for (std::thread &thread : threads) {
+    thread.join();
+  }
+  EmbeddedDataDirectory::withGlobal([](EmbeddedDataDirectory &globalDir) {
+    std::vector<std::string> keys;
+    for (auto iter : globalDir.getMap().keys()) {
+      keys.push_back(iter.str());
+    }
+    EXPECT_THAT(keys,
+                UnorderedElementsAre("filename0", "filename1", "filename2"));
+  });
+}
+
+TEST(EmbeddedDataDirectory, GetMap) {
+  EmbeddedDataDirectory dir;
+  EXPECT_TRUE(dir.addFile("filename1", "file contents 1"));
+  EXPECT_TRUE(dir.addFile("filename2", "file contents 2"));
+  std::vector<std::string> keys;
+  for (auto iter : dir.getMap().keys()) {
+    keys.push_back(iter.str());
+  }
+  EXPECT_THAT(keys, UnorderedElementsAre("filename1", "filename2"));
+}