rocr: Generalize AMD::MemoryRegion Allocate and Free
Remove KFD-specific Allocate/Free calls from the AMD::MemoryRegion.
The KFD-driver-specific Allocate/Free calls are now implemented in
the KfdDriver. Future changes will migrate the remaining KFD-specific
calls out of AMD::MemoryRegion.

This allows the MemoryRegion to be used across AMD drivers like the
XDNA driver.

Change-Id: Ib6a2a9e5e1a15e61644d2592beb3a8e6578c3010
atgutier committed Aug 28, 2024
1 parent c42ff44 commit 68669f4
Showing 11 changed files with 448 additions and 326 deletions.
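For orientation, the generalized driver-facing interface implied by this commit is sketched below. The signatures come from the KfdDriver/XdnaDriver overrides in the diff; the base-class declarations live in core/inc/driver.h, which is not part of this excerpt, so the virtual specifiers are assumptions:

// Sketch of the generalized core::Driver allocation interface, reconstructed
// from the KfdDriver/XdnaDriver overrides in this commit:
virtual hsa_status_t
AllocateMemory(const core::MemoryRegion &mem_region,
               core::MemoryRegion::AllocateFlags alloc_flags, void **mem,
               size_t size, uint32_t agent_node_id) = 0;
virtual hsa_status_t FreeMemory(void *mem, size_t size) = 0;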
195 changes: 188 additions & 7 deletions runtime/hsa-runtime/core/driver/kfd/amd_kfd_driver.cpp
@@ -49,6 +49,10 @@

#include "hsakmt/hsakmt.h"

#include "core/inc/amd_cpu_agent.h"
#include "core/inc/amd_gpu_agent.h"
#include "core/inc/amd_memory_region.h"
#include "core/inc/exceptions.h"
#include "core/inc/runtime.h"

namespace rocr {
@@ -70,18 +74,155 @@ hsa_status_t KfdDriver::QueryKernelModeDriver(core::DriverQuery query) {
return HSA_STATUS_SUCCESS;
}

-hsa_status_t KfdDriver::GetMemoryProperties(uint32_t node_id,
-                                            core::MemProperties &mprops) const {
+hsa_status_t
+KfdDriver::GetMemoryProperties(uint32_t node_id,
+                               core::MemoryRegion &mem_region) const {
return HSA_STATUS_SUCCESS;
}

-hsa_status_t KfdDriver::AllocateMemory(void **mem, size_t size,
-                                       uint32_t node_id, core::MemFlags flags) {
-  return HSA_STATUS_SUCCESS;
+hsa_status_t
+KfdDriver::AllocateMemory(const core::MemoryRegion &mem_region,
+                          core::MemoryRegion::AllocateFlags alloc_flags,
+                          void **mem, size_t size, uint32_t agent_node_id) {
const MemoryRegion &m_region(static_cast<const MemoryRegion &>(mem_region));
HsaMemFlags kmt_alloc_flags(m_region.mem_flags());

kmt_alloc_flags.ui32.ExecuteAccess =
(alloc_flags & core::MemoryRegion::AllocateExecutable ? 1 : 0);
kmt_alloc_flags.ui32.AQLQueueMemory =
(alloc_flags & core::MemoryRegion::AllocateDoubleMap ? 1 : 0);

if (m_region.IsSystem() &&
(alloc_flags & core::MemoryRegion::AllocateNonPaged)) {
kmt_alloc_flags.ui32.NonPaged = 1;
}

// Allocating a memory handle for virtual memory
kmt_alloc_flags.ui32.NoAddress =
!!(alloc_flags & core::MemoryRegion::AllocateMemoryOnly);

// Allocate pseudo fine-grain memory
kmt_alloc_flags.ui32.CoarseGrain =
(alloc_flags & core::MemoryRegion::AllocatePCIeRW
? 0
: kmt_alloc_flags.ui32.CoarseGrain);

kmt_alloc_flags.ui32.NoSubstitute =
(alloc_flags & core::MemoryRegion::AllocatePinned
? 1
: kmt_alloc_flags.ui32.NoSubstitute);

kmt_alloc_flags.ui32.GTTAccess =
(alloc_flags & core::MemoryRegion::AllocateGTTAccess
? 1
: kmt_alloc_flags.ui32.GTTAccess);

if (m_region.IsLocalMemory()) {
// Allocate physically contiguous memory. The AllocateKfdMemory call
// will fail if this flag is not supported by KFD.
kmt_alloc_flags.ui32.Contiguous =
(alloc_flags & core::MemoryRegion::AllocateContiguous
? 1
: kmt_alloc_flags.ui32.Contiguous);
}

// Only allow using the suballocator for ordinary VRAM.
if (m_region.IsLocalMemory() && !kmt_alloc_flags.ui32.NoAddress) {
bool subAllocEnabled =
!core::Runtime::runtime_singleton_->flag().disable_fragment_alloc();
// Avoid modifying executable or queue allocations.
bool useSubAlloc = subAllocEnabled;
useSubAlloc &=
((alloc_flags & (~core::MemoryRegion::AllocateRestrict)) == 0);

if (useSubAlloc) {
*mem = m_region.fragment_alloc(size);

if ((alloc_flags & core::MemoryRegion::AllocateAsan) &&
hsaKmtReplaceAsanHeaderPage(*mem) != HSAKMT_STATUS_SUCCESS) {
m_region.fragment_free(*mem);
*mem = nullptr;
return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
}

return HSA_STATUS_SUCCESS;
}
}

const uint32_t node_id =
(alloc_flags & core::MemoryRegion::AllocateGTTAccess)
? agent_node_id
: m_region.owner()->node_id();

// Allocate memory.
// If it fails, attempt to release memory from the block allocator and retry.
*mem = AllocateKfdMemory(kmt_alloc_flags, node_id, size);
if (*mem == nullptr) {
m_region.owner()->Trim();
*mem = AllocateKfdMemory(kmt_alloc_flags, node_id, size);
}

if (*mem != nullptr) {
if (kmt_alloc_flags.ui32.NoAddress)
return HSA_STATUS_SUCCESS;

// Commit the memory.
// For system memory, on non-restricted allocation, map it to all GPUs. On
// restricted allocation, only the CPU is allowed access by default, so
// there is no need to map.
// For local memory, only map it to the owning GPU. Mapping to other GPUs,
// if access is allowed, is performed in AllowAccess.
HsaMemMapFlags map_flag = m_region.map_flags();
size_t map_node_count = 1;
const uint32_t owner_node_id = m_region.owner()->node_id();
const uint32_t *map_node_id = &owner_node_id;

if (m_region.IsSystem()) {
if ((alloc_flags & core::MemoryRegion::AllocateRestrict) == 0) {
// Map to all GPU agents.
map_node_count = core::Runtime::runtime_singleton_->gpu_ids().size();

if (map_node_count == 0) {
// No need to pin since no GPU in the platform.
return HSA_STATUS_SUCCESS;
}

map_node_id = &core::Runtime::runtime_singleton_->gpu_ids()[0];
} else {
// No need to pin it for CPU exclusive access.
return HSA_STATUS_SUCCESS;
}
}

uint64_t alternate_va = 0;
const bool is_resident = MakeKfdMemoryResident(
map_node_count, map_node_id, *mem, size, &alternate_va, map_flag);

const bool require_pinning =
(!m_region.full_profile() || m_region.IsLocalMemory() ||
m_region.IsScratch());

if (require_pinning && !is_resident) {
FreeKfdMemory(*mem, size);
*mem = nullptr;
return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
}

if ((alloc_flags & core::MemoryRegion::AllocateAsan) &&
hsaKmtReplaceAsanHeaderPage(*mem) != HSAKMT_STATUS_SUCCESS) {
FreeKfdMemory(*mem, size);
*mem = nullptr;
return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
}
return HSA_STATUS_SUCCESS;
}

return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
}
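To see the flag translation above end to end, here is a hypothetical call site requesting pinned memory. Only the flag and method names come from this diff; kfd_driver, vram_region, alloc_size, and agent_node_id are assumed stand-ins:

// Hypothetical call site (illustrative; not code from this commit).
// AllocatePinned is translated to kmt_alloc_flags.ui32.NoSubstitute = 1 above.
void *ptr = nullptr;
const core::MemoryRegion::AllocateFlags flags = core::MemoryRegion::AllocatePinned;
hsa_status_t err =
    kfd_driver.AllocateMemory(vram_region, flags, &ptr, alloc_size, agent_node_id);
if (err == HSA_STATUS_SUCCESS) {
  // ... use ptr ...
  kfd_driver.FreeMemory(ptr, alloc_size);
}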

-hsa_status_t KfdDriver::FreeMemory(void *mem, uint32_t node_id) {
-  return HSA_STATUS_SUCCESS;
+hsa_status_t KfdDriver::FreeMemory(void *mem, size_t size) {
+  MakeKfdMemoryUnresident(mem);
+  return FreeKfdMemory(mem, size) ? HSA_STATUS_SUCCESS : HSA_STATUS_ERROR;
}

hsa_status_t KfdDriver::CreateQueue(core::Queue &queue) {
@@ -92,5 +233,45 @@ hsa_status_t KfdDriver::DestroyQueue(core::Queue &queue) const {
return HSA_STATUS_SUCCESS;
}

void *KfdDriver::AllocateKfdMemory(const HsaMemFlags &flags, uint32_t node_id,
size_t size) {
void *mem = nullptr;
const HSAKMT_STATUS status = hsaKmtAllocMemory(node_id, size, flags, &mem);
return (status == HSAKMT_STATUS_SUCCESS) ? mem : nullptr;
}

bool KfdDriver::FreeKfdMemory(void *mem, size_t size) {
if (mem == nullptr || size == 0) {
debug_print("Invalid free ptr:%p size:%lu\n", mem, size);
return true;
}

if (hsaKmtFreeMemory(mem, size) != HSAKMT_STATUS_SUCCESS) {
debug_print("Failed to free ptr:%p size:%lu\n", mem, size);
return false;
}
return true;
}

bool KfdDriver::MakeKfdMemoryResident(size_t num_node, const uint32_t *nodes,
const void *mem, size_t size,
uint64_t *alternate_va,
HsaMemMapFlags map_flag) {
assert(num_node > 0);
assert(nodes);

*alternate_va = 0;

HSAKMT_STATUS kmt_status(hsaKmtMapMemoryToGPUNodes(
const_cast<void *>(mem), size, alternate_va, map_flag, num_node,
const_cast<uint32_t *>(nodes)));

return (kmt_status == HSAKMT_STATUS_SUCCESS);
}

void KfdDriver::MakeKfdMemoryUnresident(const void *mem) {
hsaKmtUnmapMemoryToGPU(const_cast<void *>(mem));
}
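Taken together, these four private helpers wrap the hsaKmt thunk calls that AMD::MemoryRegion previously made directly. A condensed sketch of how they pair up (illustrative only; flags, map_flags, node_id, and size are assumed inputs, and error handling is elided):

// Illustrative pairing of the private helpers (not code from this commit):
void *mem = AllocateKfdMemory(flags, node_id, size);      // hsaKmtAllocMemory
uint64_t alternate_va = 0;
if (mem != nullptr &&
    MakeKfdMemoryResident(1, &node_id, mem, size, &alternate_va, map_flags)) {
  // ... the allocation is mapped to the GPU and usable ...
  MakeKfdMemoryUnresident(mem);                           // hsaKmtUnmapMemoryToGPU
}
FreeKfdMemory(mem, size);                                 // hsaKmtFreeMemory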

} // namespace AMD
} // namespace rocr
12 changes: 7 additions & 5 deletions runtime/hsa-runtime/core/driver/xdna/amd_xdna_driver.cpp
@@ -47,6 +47,7 @@
#include <memory>
#include <string>

#include "core/inc/amd_memory_region.h"
#include "core/inc/runtime.h"
#include "uapi/amdxdna_accel.h"

@@ -89,17 +90,18 @@ hsa_status_t XdnaDriver::QueryKernelModeDriver(core::DriverQuery query) {

hsa_status_t
XdnaDriver::GetMemoryProperties(uint32_t node_id,
-                                core::MemProperties &mprops) const {
+                                core::MemoryRegion &mem_region) const {
return HSA_STATUS_SUCCESS;
}

-hsa_status_t XdnaDriver::AllocateMemory(void **mem, size_t size,
-                                        uint32_t node_id,
-                                        core::MemFlags flags) {
+hsa_status_t
+XdnaDriver::AllocateMemory(const core::MemoryRegion &mem_region,
+                           core::MemoryRegion::AllocateFlags alloc_flags,
+                           void **mem, size_t size, uint32_t node_id) {
return HSA_STATUS_SUCCESS;
}

-hsa_status_t XdnaDriver::FreeMemory(void *mem, uint32_t node_id) {
+hsa_status_t XdnaDriver::FreeMemory(void *mem, size_t size) {
return HSA_STATUS_SUCCESS;
}

22 changes: 12 additions & 10 deletions runtime/hsa-runtime/core/inc/agent.h
@@ -49,11 +49,12 @@
#include <vector>

#include "core/inc/checked.h"
#include "core/inc/driver.h"
#include "core/inc/isa.h"
#include "core/inc/queue.h"
#include "core/inc/memory_region.h"
#include "core/util/utils.h"
#include "core/inc/queue.h"
#include "core/util/locks.h"
#include "core/util/utils.h"

namespace rocr {

@@ -117,19 +118,18 @@ class Agent : public Checked<0xF6BC25EB17E6F917> {
// @brief Agent class constructor.
//
// @param [in] type CPU or GPU or other.
-  explicit Agent(uint32_t node_id, DeviceType type)
-      : node_id_(node_id),
-        device_type_(uint32_t(type)),
-        profiling_enabled_(false),
-        enabled_(false) {
+  explicit Agent(DriverType drv_type, uint32_t node_id, DeviceType type)
+      : driver_type(drv_type), node_id_(node_id), device_type_(uint32_t(type)),
+        profiling_enabled_(false), enabled_(false) {
public_handle_ = Convert(this);
}

// @brief Agent class constructor.
//
// @param [in] type CPU or GPU or other.
-  explicit Agent(uint32_t node_id, uint32_t type)
-      : node_id_(node_id), device_type_(type), profiling_enabled_(false) {
+  explicit Agent(DriverType drv_type, uint32_t node_id, uint32_t type)
+      : driver_type(drv_type), node_id_(node_id), device_type_(type),
+        profiling_enabled_(false) {
public_handle_ = Convert(this);
}
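Derived agents now tag themselves with the driver that owns them. A hypothetical example of a call through the new constructor — the real CpuAgent/GpuAgent changes are in other files of this commit, and the DriverType enumerator spelling is an assumption here:

// Hypothetical derived-agent constructor (illustrative; assumes a
// DriverType::KFD enumerator declared in core/inc/driver.h):
GpuAgent::GpuAgent(uint32_t node_id, const HsaNodeProperties &props)
    : core::Agent(core::DriverType::KFD, node_id, kAmdGpuDevice) {
  // driver_type is now available to route requests, e.g. memory
  // allocation, to the matching driver instance.
}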

@@ -315,7 +315,9 @@ class Agent : public Checked<0xF6BC25EB17E6F917> {
for (auto region : regions()) region->Trim();
}

- protected:
+  const DriverType driver_type;
+
+ protected:
// Intention here is to have a polymorphic update procedure for public_handle_
// which is callable on any Agent* but only from some class derived from
// Agent*. do_set_public_handle should remain protected or private in all
