rocr: Generalize AMD::MemoryRegion Allocate and Free
Remove KFD-specific Allocate/Free calls from the AMD::MemoryRegion.
The KFD-driver-specific Allocate/Free calls are now implemented in
the KfdDriver. Future changes will migrate the remaining KFD-specific
calls out of AMD::MemoryRegion.

This allows the MemoryRegion to be used across AMD drivers like the
XDNA driver.

Change-Id: Ib6a2a9e5e1a15e61644d2592beb3a8e6578c3010
atgutier committed Aug 28, 2024
1 parent c42ff44 commit 68669f4
Showing 11 changed files with 448 additions and 326 deletions.
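For orientation, the generalized driver-facing interface implied by this commit is sketched below. The signatures come from the KfdDriver/XdnaDriver overrides in the diff; the base-class declarations live in core/inc/driver.h, which is not part of this excerpt, so the virtual specifiers are assumptions:

// Sketch of the generalized core::Driver allocation interface, reconstructed
// from the KfdDriver/XdnaDriver overrides in this commit:
virtual hsa_status_t
AllocateMemory(const core::MemoryRegion &mem_region,
               core::MemoryRegion::AllocateFlags alloc_flags, void **mem,
               size_t size, uint32_t agent_node_id) = 0;
virtual hsa_status_t FreeMemory(void *mem, size_t size) = 0;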
195 changes: 188 additions & 7 deletions runtime/hsa-runtime/core/driver/kfd/amd_kfd_driver.cpp
@@ -49,6 +49,10 @@

#include "hsakmt/hsakmt.h"

#include "core/inc/amd_cpu_agent.h"
#include "core/inc/amd_gpu_agent.h"
#include "core/inc/amd_memory_region.h"
#include "core/inc/exceptions.h"
#include "core/inc/runtime.h"

namespace rocr {
@@ -70,18 +74,155 @@ hsa_status_t KfdDriver::QueryKernelModeDriver(core::DriverQuery query) {
return HSA_STATUS_SUCCESS;
}

-hsa_status_t KfdDriver::GetMemoryProperties(uint32_t node_id,
-                                            core::MemProperties &mprops) const {
+hsa_status_t
+KfdDriver::GetMemoryProperties(uint32_t node_id,
+                               core::MemoryRegion &mem_region) const {
return HSA_STATUS_SUCCESS;
}

-hsa_status_t KfdDriver::AllocateMemory(void **mem, size_t size,
-                                       uint32_t node_id, core::MemFlags flags) {
-  return HSA_STATUS_SUCCESS;
+hsa_status_t
+KfdDriver::AllocateMemory(const core::MemoryRegion &mem_region,
+                          core::MemoryRegion::AllocateFlags alloc_flags,
+                          void **mem, size_t size, uint32_t agent_node_id) {
const MemoryRegion &m_region(static_cast<const MemoryRegion &>(mem_region));
HsaMemFlags kmt_alloc_flags(m_region.mem_flags());

kmt_alloc_flags.ui32.ExecuteAccess =
(alloc_flags & core::MemoryRegion::AllocateExecutable ? 1 : 0);
kmt_alloc_flags.ui32.AQLQueueMemory =
(alloc_flags & core::MemoryRegion::AllocateDoubleMap ? 1 : 0);

if (m_region.IsSystem() &&
(alloc_flags & core::MemoryRegion::AllocateNonPaged)) {
kmt_alloc_flags.ui32.NonPaged = 1;
}

// Allocating a memory handle for virtual memory
kmt_alloc_flags.ui32.NoAddress =
!!(alloc_flags & core::MemoryRegion::AllocateMemoryOnly);

// Allocate pseudo fine-grain memory
kmt_alloc_flags.ui32.CoarseGrain =
(alloc_flags & core::MemoryRegion::AllocatePCIeRW
? 0
: kmt_alloc_flags.ui32.CoarseGrain);

kmt_alloc_flags.ui32.NoSubstitute =
(alloc_flags & core::MemoryRegion::AllocatePinned
? 1
: kmt_alloc_flags.ui32.NoSubstitute);

kmt_alloc_flags.ui32.GTTAccess =
(alloc_flags & core::MemoryRegion::AllocateGTTAccess
? 1
: kmt_alloc_flags.ui32.GTTAccess);

if (m_region.IsLocalMemory()) {
// Allocate physically contiguous memory. The AllocateKfdMemory call
// will fail if this flag is not supported by KFD.
kmt_alloc_flags.ui32.Contiguous =
(alloc_flags & core::MemoryRegion::AllocateContiguous
? 1
: kmt_alloc_flags.ui32.Contiguous);
}

// Only allow using the suballocator for ordinary VRAM.
if (m_region.IsLocalMemory() && !kmt_alloc_flags.ui32.NoAddress) {
bool subAllocEnabled =
!core::Runtime::runtime_singleton_->flag().disable_fragment_alloc();
// Avoid modifying executable or queue allocations.
bool useSubAlloc = subAllocEnabled;
useSubAlloc &=
((alloc_flags & (~core::MemoryRegion::AllocateRestrict)) == 0);

if (useSubAlloc) {
*mem = m_region.fragment_alloc(size);

if ((alloc_flags & core::MemoryRegion::AllocateAsan) &&
hsaKmtReplaceAsanHeaderPage(*mem) != HSAKMT_STATUS_SUCCESS) {
m_region.fragment_free(*mem);
*mem = nullptr;
return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
}

return HSA_STATUS_SUCCESS;
}
}

const uint32_t node_id =
(alloc_flags & core::MemoryRegion::AllocateGTTAccess)
? agent_node_id
: m_region.owner()->node_id();

// Allocate memory.
// If it fails, attempt to release memory from the block allocator and retry.
*mem = AllocateKfdMemory(kmt_alloc_flags, node_id, size);
if (*mem == nullptr) {
m_region.owner()->Trim();
*mem = AllocateKfdMemory(kmt_alloc_flags, node_id, size);
}

if (*mem != nullptr) {
if (kmt_alloc_flags.ui32.NoAddress)
return HSA_STATUS_SUCCESS;

// Commit the memory.
// For system memory, on non-restricted allocation, map it to all GPUs. On
// restricted allocation, only the CPU is allowed access by default, so
// there is no need to map.
// For local memory, only map it to the owning GPU. Mapping to other GPUs,
// if access is allowed, is performed in AllowAccess.
HsaMemMapFlags map_flag = m_region.map_flags();
size_t map_node_count = 1;
const uint32_t owner_node_id = m_region.owner()->node_id();
const uint32_t *map_node_id = &owner_node_id;

if (m_region.IsSystem()) {
if ((alloc_flags & core::MemoryRegion::AllocateRestrict) == 0) {
// Map to all GPU agents.
map_node_count = core::Runtime::runtime_singleton_->gpu_ids().size();

if (map_node_count == 0) {
// No need to pin since no GPU in the platform.
return HSA_STATUS_SUCCESS;
}

map_node_id = &core::Runtime::runtime_singleton_->gpu_ids()[0];
} else {
// No need to pin it for CPU exclusive access.
return HSA_STATUS_SUCCESS;
}
}

uint64_t alternate_va = 0;
const bool is_resident = MakeKfdMemoryResident(
map_node_count, map_node_id, *mem, size, &alternate_va, map_flag);

const bool require_pinning =
(!m_region.full_profile() || m_region.IsLocalMemory() ||
m_region.IsScratch());

if (require_pinning && !is_resident) {
FreeKfdMemory(*mem, size);
*mem = nullptr;
return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
}

if ((alloc_flags & core::MemoryRegion::AllocateAsan) &&
hsaKmtReplaceAsanHeaderPage(*mem) != HSAKMT_STATUS_SUCCESS) {
FreeKfdMemory(*mem, size);
*mem = nullptr;
return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
}
return HSA_STATUS_SUCCESS;
}

return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
}
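To see the flag translation above end to end, here is a hypothetical call site requesting pinned memory. Only the flag and method names come from this diff; kfd_driver, vram_region, alloc_size, and agent_node_id are assumed stand-ins:

// Hypothetical call site (illustrative; not code from this commit).
// AllocatePinned is translated to kmt_alloc_flags.ui32.NoSubstitute = 1 above.
void *ptr = nullptr;
const core::MemoryRegion::AllocateFlags flags = core::MemoryRegion::AllocatePinned;
hsa_status_t err =
    kfd_driver.AllocateMemory(vram_region, flags, &ptr, alloc_size, agent_node_id);
if (err == HSA_STATUS_SUCCESS) {
  // ... use ptr ...
  kfd_driver.FreeMemory(ptr, alloc_size);
}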

-hsa_status_t KfdDriver::FreeMemory(void *mem, uint32_t node_id) {
-  return HSA_STATUS_SUCCESS;
+hsa_status_t KfdDriver::FreeMemory(void *mem, size_t size) {
+  MakeKfdMemoryUnresident(mem);
+  return FreeKfdMemory(mem, size) ? HSA_STATUS_SUCCESS : HSA_STATUS_ERROR;
}

hsa_status_t KfdDriver::CreateQueue(core::Queue &queue) {
@@ -92,5 +233,45 @@ hsa_status_t KfdDriver::DestroyQueue(core::Queue &queue) const {
return HSA_STATUS_SUCCESS;
}

void *KfdDriver::AllocateKfdMemory(const HsaMemFlags &flags, uint32_t node_id,
size_t size) {
void *mem = nullptr;
const HSAKMT_STATUS status = hsaKmtAllocMemory(node_id, size, flags, &mem);
return (status == HSAKMT_STATUS_SUCCESS) ? mem : nullptr;
}

bool KfdDriver::FreeKfdMemory(void *mem, size_t size) {
if (mem == nullptr || size == 0) {
debug_print("Invalid free ptr:%p size:%lu\n", mem, size);
return true;
}

if (hsaKmtFreeMemory(mem, size) != HSAKMT_STATUS_SUCCESS) {
debug_print("Failed to free ptr:%p size:%lu\n", mem, size);
return false;
}
return true;
}

bool KfdDriver::MakeKfdMemoryResident(size_t num_node, const uint32_t *nodes,
const void *mem, size_t size,
uint64_t *alternate_va,
HsaMemMapFlags map_flag) {
assert(num_node > 0);
assert(nodes);

*alternate_va = 0;

HSAKMT_STATUS kmt_status(hsaKmtMapMemoryToGPUNodes(
const_cast<void *>(mem), size, alternate_va, map_flag, num_node,
const_cast<uint32_t *>(nodes)));

return (kmt_status == HSAKMT_STATUS_SUCCESS);
}

void KfdDriver::MakeKfdMemoryUnresident(const void *mem) {
hsaKmtUnmapMemoryToGPU(const_cast<void *>(mem));
}
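Taken together, these four private helpers wrap the hsaKmt thunk calls that AMD::MemoryRegion previously made directly. A condensed sketch of how they pair up (illustrative only; flags, map_flags, node_id, and size are assumed inputs, and error handling is elided):

// Illustrative pairing of the private helpers (not code from this commit):
void *mem = AllocateKfdMemory(flags, node_id, size);      // hsaKmtAllocMemory
uint64_t alternate_va = 0;
if (mem != nullptr &&
    MakeKfdMemoryResident(1, &node_id, mem, size, &alternate_va, map_flags)) {
  // ... the allocation is mapped to the GPU and usable ...
  MakeKfdMemoryUnresident(mem);                           // hsaKmtUnmapMemoryToGPU
}
FreeKfdMemory(mem, size);                                 // hsaKmtFreeMemory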

} // namespace AMD
} // namespace rocr
12 changes: 7 additions & 5 deletions runtime/hsa-runtime/core/driver/xdna/amd_xdna_driver.cpp
@@ -47,6 +47,7 @@
#include <memory>
#include <string>

#include "core/inc/amd_memory_region.h"
#include "core/inc/runtime.h"
#include "uapi/amdxdna_accel.h"

@@ -89,17 +90,18 @@ hsa_status_t XdnaDriver::QueryKernelModeDriver(core::DriverQuery query) {

hsa_status_t
XdnaDriver::GetMemoryProperties(uint32_t node_id,
-                                core::MemProperties &mprops) const {
+                                core::MemoryRegion &mem_region) const {
return HSA_STATUS_SUCCESS;
}

-hsa_status_t XdnaDriver::AllocateMemory(void **mem, size_t size,
-                                        uint32_t node_id,
-                                        core::MemFlags flags) {
+hsa_status_t
+XdnaDriver::AllocateMemory(const core::MemoryRegion &mem_region,
+                           core::MemoryRegion::AllocateFlags alloc_flags,
+                           void **mem, size_t size, uint32_t node_id) {
return HSA_STATUS_SUCCESS;
}

-hsa_status_t XdnaDriver::FreeMemory(void *mem, uint32_t node_id) {
+hsa_status_t XdnaDriver::FreeMemory(void *mem, size_t size) {
return HSA_STATUS_SUCCESS;
}

22 changes: 12 additions & 10 deletions runtime/hsa-runtime/core/inc/agent.h
@@ -49,11 +49,12 @@
#include <vector>

#include "core/inc/checked.h"
#include "core/inc/driver.h"
#include "core/inc/isa.h"
#include "core/inc/queue.h"
#include "core/inc/memory_region.h"
#include "core/util/utils.h"
#include "core/inc/queue.h"
#include "core/util/locks.h"
#include "core/util/utils.h"

namespace rocr {

@@ -117,19 +118,18 @@ class Agent : public Checked<0xF6BC25EB17E6F917> {
// @brief Agent class constructor.
//
// @param [in] type CPU or GPU or other.
-  explicit Agent(uint32_t node_id, DeviceType type)
-      : node_id_(node_id),
-        device_type_(uint32_t(type)),
-        profiling_enabled_(false),
-        enabled_(false) {
+  explicit Agent(DriverType drv_type, uint32_t node_id, DeviceType type)
+      : driver_type(drv_type), node_id_(node_id), device_type_(uint32_t(type)),
+        profiling_enabled_(false), enabled_(false) {
public_handle_ = Convert(this);
}

// @brief Agent class constructor.
//
// @param [in] type CPU or GPU or other.
-  explicit Agent(uint32_t node_id, uint32_t type)
-      : node_id_(node_id), device_type_(type), profiling_enabled_(false) {
+  explicit Agent(DriverType drv_type, uint32_t node_id, uint32_t type)
+      : driver_type(drv_type), node_id_(node_id), device_type_(type),
+        profiling_enabled_(false) {
public_handle_ = Convert(this);
}
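Derived agents now tag themselves with the driver that owns them. A hypothetical example of a call through the new constructor — the real CpuAgent/GpuAgent changes are in other files of this commit, and the DriverType enumerator spelling is an assumption here:

// Hypothetical derived-agent constructor (illustrative; assumes a
// DriverType::KFD enumerator declared in core/inc/driver.h):
GpuAgent::GpuAgent(uint32_t node_id, const HsaNodeProperties &props)
    : core::Agent(core::DriverType::KFD, node_id, kAmdGpuDevice) {
  // driver_type is now available to route requests, e.g. memory
  // allocation, to the matching driver instance.
}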

@@ -315,7 +315,9 @@ class Agent : public Checked<0xF6BC25EB17E6F917> {
for (auto region : regions()) region->Trim();
}

- protected:
+  const DriverType driver_type;
+
+ protected:
// Intention here is to have a polymorphic update procedure for public_handle_
// which is callable on any Agent* but only from some class derived from
// Agent*. do_set_public_handle should remain protected or private in all
