[UR][L0] Manage UMF pools through usm::pool_manager #17065

Merged
merged 4 commits on Mar 10, 2025
12 changes: 10 additions & 2 deletions sycl/test-e2e/Adapters/interop-level-zero-buffer-ownership.cpp
@@ -18,8 +18,16 @@
// Keep ownership
// CHECK: zeMemFree

// Account for zeMemFree used to query page sizes by the UMF
// CHECK-COUNT-8: zeMemFree
// Account for zeMemFree used to query page sizes by the UMF (only affects v2 L0
// adapter)
// CHECK-OPT: zeMemFree
// CHECK-OPT: zeMemFree
// CHECK-OPT: zeMemFree
// CHECK-OPT: zeMemFree
// CHECK-OPT: zeMemFree
// CHECK-OPT: zeMemFree
// CHECK-OPT: zeMemFree
// CHECK-OPT: zeMemFree

// Transfer ownership
// CHECK: zeMemFree
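
Replacing the single CHECK-COUNT-8 directive with eight CHECK-OPT lines moves the extra zeMemFree matches onto a separate FileCheck prefix, which is only enforced when that prefix is explicitly enabled. A minimal sketch of how such a prefix is typically switched on (these RUN lines are illustrative, not the test's actual ones):

// Legacy adapter run: only the default CHECK prefix is enabled, so the
// CHECK-OPT lines are ignored.
// RUN: %{run} %t.out 2>&1 | FileCheck %s
//
// v2 adapter run: both prefixes are enabled, so the eight extra zeMemFree
// calls from UMF's page-size queries must also appear.
// RUN: %{run} %t.out 2>&1 | FileCheck --check-prefixes=CHECK,CHECK-OPT %s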
111 changes: 1 addition & 110 deletions unified-runtime/source/adapters/level_zero/context.cpp
@@ -192,115 +192,6 @@ ur_result_t urContextSetExtendedDeleter(
} // namespace ur::level_zero

ur_result_t ur_context_handle_t_::initialize() {

// Helper lambda to create various USM allocators for a device.
// Note that the CCS devices and their respective subdevices share a
// common ze_device_handle and therefore, also share USM allocators.
auto createUSMAllocators = [this](ur_device_handle_t Device) {
auto MemProvider = umf::memoryProviderMakeUnique<L0DeviceMemoryProvider>(
reinterpret_cast<ur_context_handle_t>(this), Device)
.second;
auto UmfDeviceParamsHandle = getUmfParamsHandle(
DisjointPoolConfigInstance.Configs[usm::DisjointPoolMemType::Device]);
DeviceMemPools.emplace(
std::piecewise_construct, std::make_tuple(Device->ZeDevice),
std::make_tuple(umf::poolMakeUniqueFromOps(umfDisjointPoolOps(),
std::move(MemProvider),
UmfDeviceParamsHandle.get())
.second));

MemProvider = umf::memoryProviderMakeUnique<L0SharedMemoryProvider>(
reinterpret_cast<ur_context_handle_t>(this), Device)
.second;

auto UmfSharedParamsHandle = getUmfParamsHandle(
DisjointPoolConfigInstance.Configs[usm::DisjointPoolMemType::Shared]);
SharedMemPools.emplace(
std::piecewise_construct, std::make_tuple(Device->ZeDevice),
std::make_tuple(umf::poolMakeUniqueFromOps(umfDisjointPoolOps(),
std::move(MemProvider),
UmfSharedParamsHandle.get())
.second));

MemProvider = umf::memoryProviderMakeUnique<L0SharedReadOnlyMemoryProvider>(
reinterpret_cast<ur_context_handle_t>(this), Device)
.second;

auto UmfSharedROParamsHandle = getUmfParamsHandle(
DisjointPoolConfigInstance
.Configs[usm::DisjointPoolMemType::SharedReadOnly]);
SharedReadOnlyMemPools.emplace(
std::piecewise_construct, std::make_tuple(Device->ZeDevice),
std::make_tuple(umf::poolMakeUniqueFromOps(
umfDisjointPoolOps(), std::move(MemProvider),
UmfSharedROParamsHandle.get())
.second));

MemProvider = umf::memoryProviderMakeUnique<L0DeviceMemoryProvider>(
reinterpret_cast<ur_context_handle_t>(this), Device)
.second;
DeviceMemProxyPools.emplace(
std::piecewise_construct, std::make_tuple(Device->ZeDevice),
std::make_tuple(
umf::poolMakeUnique<USMProxyPool>(std::move(MemProvider)).second));

MemProvider = umf::memoryProviderMakeUnique<L0SharedMemoryProvider>(
reinterpret_cast<ur_context_handle_t>(this), Device)
.second;
SharedMemProxyPools.emplace(
std::piecewise_construct, std::make_tuple(Device->ZeDevice),
std::make_tuple(
umf::poolMakeUnique<USMProxyPool>(std::move(MemProvider)).second));

MemProvider = umf::memoryProviderMakeUnique<L0SharedReadOnlyMemoryProvider>(
reinterpret_cast<ur_context_handle_t>(this), Device)
.second;
SharedReadOnlyMemProxyPools.emplace(
std::piecewise_construct, std::make_tuple(Device->ZeDevice),
std::make_tuple(
umf::poolMakeUnique<USMProxyPool>(std::move(MemProvider)).second));
};

// Recursive helper to call createUSMAllocators for all sub-devices
std::function<void(ur_device_handle_t)> createUSMAllocatorsRecursive;
createUSMAllocatorsRecursive =
[createUSMAllocators,
&createUSMAllocatorsRecursive](ur_device_handle_t Device) -> void {
createUSMAllocators(Device);
for (auto &SubDevice : Device->SubDevices)
createUSMAllocatorsRecursive(SubDevice);
};

// Create USM pool for each pair (device, context).
//
for (auto &Device : Devices) {
createUSMAllocatorsRecursive(Device);
}
// Create USM pool for host. Device and Shared USM allocations
// are device-specific. Host allocations are not device-dependent therefore
// we don't need a map with device as key.
auto MemProvider = umf::memoryProviderMakeUnique<L0HostMemoryProvider>(
reinterpret_cast<ur_context_handle_t>(this), nullptr)
.second;
auto UmfHostParamsHandle = getUmfParamsHandle(
DisjointPoolConfigInstance.Configs[usm::DisjointPoolMemType::Host]);
HostMemPool =
umf::poolMakeUniqueFromOps(umfDisjointPoolOps(), std::move(MemProvider),
UmfHostParamsHandle.get())
.second;

MemProvider = umf::memoryProviderMakeUnique<L0HostMemoryProvider>(
reinterpret_cast<ur_context_handle_t>(this), nullptr)
.second;
HostMemProxyPool =
umf::poolMakeUnique<USMProxyPool>(std::move(MemProvider)).second;

// We may allocate memory to this root device so create allocators.
if (SingleRootDevice &&
DeviceMemPools.find(SingleRootDevice->ZeDevice) == DeviceMemPools.end()) {
createUSMAllocators(SingleRootDevice);
}

// Create the immediate command list to be used for initializations.
// Created as synchronous so level-zero performs implicit synchronization and
// there is no need to query for completion in the plugin
@@ -311,7 +202,7 @@ ur_result_t ur_context_handle_t_::initialize() {
// D2D migration, if no P2P, is broken since it should use
// immediate command-list for the specific devices, and this single one.
//
ur_device_handle_t Device = SingleRootDevice ? SingleRootDevice : Devices[0];
ur_device_handle_t Device = Devices[0];

// Prefer to use copy engine for initialization copies,
// if available and allowed (main copy engine with index 0).
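
The initialize() block removed above hand-built three per-device DisjointPool maps, three per-device proxy-pool maps, and two host pools. In line with the PR title, that bookkeeping now lives behind a single managed structure. A hand-rolled sketch of the idea, using hypothetical names rather than the actual usm::pool_manager API:

#include <map>
#include <memory>
#include <utility>

// Hypothetical stand-ins; the real code uses umf::pool_unique_handle_t and
// ze_device_handle_t from the headers shown elsewhere in this diff.
struct MemoryPool {};
using DeviceHandle = void *;
enum class MemType { Host, Device, Shared, SharedReadOnly };

// One container keyed by (device, memory type) replaces the separate
// DeviceMemPools / SharedMemPools / SharedReadOnlyMemPools / HostMemPool
// members that this PR removes from context.hpp.
class PoolManagerSketch {
  std::map<std::pair<DeviceHandle, MemType>, std::unique_ptr<MemoryPool>> Pools;

public:
  void addPool(DeviceHandle Device, MemType Type,
               std::unique_ptr<MemoryPool> Pool) {
    Pools.emplace(std::make_pair(Device, Type), std::move(Pool));
  }

  MemoryPool *getPool(DeviceHandle Device, MemType Type) const {
    auto It = Pools.find({Device, Type});
    return It == Pools.end() ? nullptr : It->second.get();
  }
};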
38 changes: 11 additions & 27 deletions unified-runtime/source/adapters/level_zero/context.hpp
Expand Up @@ -24,6 +24,7 @@

#include "common.hpp"
#include "queue.hpp"
#include "usm.hpp"

#include <umf_helpers.hpp>

@@ -51,15 +52,18 @@ typedef struct _ze_intel_event_sync_mode_exp_desc_t {
ze_intel_event_sync_mode_exp_flags_t syncModeFlags;
} ze_intel_event_sync_mode_exp_desc_t;

extern const bool UseUSMAllocator;

struct ur_context_handle_t_ : _ur_object {
ur_context_handle_t_(ze_context_handle_t ZeContext, uint32_t NumDevices,
const ur_device_handle_t *Devs, bool OwnZeContext)
: ZeContext{ZeContext}, Devices{Devs, Devs + NumDevices},
NumDevices{NumDevices} {
NumDevices{NumDevices}, DefaultPool{this, nullptr, !UseUSMAllocator} {
OwnNativeHandle = OwnZeContext;
}

ur_context_handle_t_(ze_context_handle_t ZeContext) : ZeContext{ZeContext} {}
ur_context_handle_t_(ze_context_handle_t ZeContext)
: ZeContext{ZeContext}, DefaultPool{this, nullptr, !UseUSMAllocator} {}

// A L0 context handle is primarily used during creation and management of
// resources that may be used by multiple devices.
@@ -94,13 +98,6 @@ struct ur_context_handle_t_ : _ur_object {
// compute and copy command list caches.
ur_mutex ZeCommandListCacheMutex;

// If context contains one device or sub-devices of the same device, we want
// to save this device.
// This field is only set at ur_context_handle_t creation time, and cannot
// change. Therefore it can be accessed without holding a lock on this
// ur_context_handle_t.
ur_device_handle_t SingleRootDevice = nullptr;

// Cache of all currently available/completed command/copy lists.
// Note that command-list can only be re-used on the same device.
//
@@ -123,24 +120,11 @@

// Store USM pool for USM shared and device allocations. There is one memory
// pool per (context, device) pair for each memory type.
std::unordered_map<ze_device_handle_t, umf::pool_unique_handle_t>
DeviceMemPools;
std::unordered_map<ze_device_handle_t, umf::pool_unique_handle_t>
SharedMemPools;
std::unordered_map<ze_device_handle_t, umf::pool_unique_handle_t>
SharedReadOnlyMemPools;

// Store the host memory pool. It does not depend on any device.
umf::pool_unique_handle_t HostMemPool;

// Allocation-tracking proxy pools for direct allocations. No pooling used.
std::unordered_map<ze_device_handle_t, umf::pool_unique_handle_t>
DeviceMemProxyPools;
std::unordered_map<ze_device_handle_t, umf::pool_unique_handle_t>
SharedMemProxyPools;
std::unordered_map<ze_device_handle_t, umf::pool_unique_handle_t>
SharedReadOnlyMemProxyPools;
umf::pool_unique_handle_t HostMemProxyPool;
// It's either a DisjointPool implementation from UMF or an
// allocation-tracking proxy pool for direct allocations that does not
// internally pool memory. The actual implementation used at runtime is
// decided by the value of the 'UseUSMAllocator' variable.
ur_usm_pool_handle_t_ DefaultPool;

// Map associating pools created with urUsmPoolCreate and internal pools
std::list<ur_usm_pool_handle_t> UsmPoolHandles{};
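
The new DefaultPool member carries the choice that was previously spread across the parallel pooling and proxy-pool maps. A hedged sketch of how that choice could be made from the 'UseUSMAllocator' flag, reusing the umf:: helper calls visible in the removed context.cpp code; the wrapper type names are assumed from umf_helpers.hpp, and the actual ur_usm_pool_handle_t_ constructor may differ:

#include "usm.hpp"          // mirrors the include added to context.hpp above
#include <umf_helpers.hpp>  // umf:: wrapper calls used below, as in context.cpp

// Hypothetical free function; parameter types follow the calls shown in the
// removed context.cpp code, not the new usm::pool_manager implementation.
template <typename DisjointPoolParamsHandle>
umf::pool_unique_handle_t
makeDefaultPool(umf::provider_unique_handle_t MemProvider,
                DisjointPoolParamsHandle DisjointParams,
                bool DisableUSMAllocator) {
  if (DisableUSMAllocator) {
    // Allocation-tracking proxy pool: every request goes straight to the
    // memory provider, nothing is recycled.
    return umf::poolMakeUnique<USMProxyPool>(std::move(MemProvider)).second;
  }
  // UMF DisjointPool: recycles freed allocations according to the disjoint
  // pool configuration passed in via DisjointParams.
  return umf::poolMakeUniqueFromOps(umfDisjointPoolOps(),
                                    std::move(MemProvider), DisjointParams)
      .second;
}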
66 changes: 14 additions & 52 deletions unified-runtime/source/adapters/level_zero/memory.cpp
@@ -1515,9 +1515,7 @@ ur_result_t urMemImageCreate(
// own the image.
// TODO: Implement explicit copying for accessing the image from other devices
// in the context.
ur_device_handle_t Device = Context->SingleRootDevice
? Context->SingleRootDevice
: Context->Devices[0];
ur_device_handle_t Device = Context->Devices[0];
ze_image_handle_t ZeImage;
ZE2UR_CALL(zeImageCreate,
(Context->ZeContext, Device->ZeDevice, &ZeImageDesc, &ZeImage));
@@ -2079,58 +2077,22 @@ ur_result_t _ur_buffer::getBufferZeHandle(char *&ZeHandle,
LastDeviceWithValidAllocation = Device;
return UR_RESULT_SUCCESS;
}
// Reads user setting on how to deal with buffers in contexts where
// all devices have the same root-device. Returns "true" if the
// preference is to allocate on each [sub-]device and migrate
// normally (copy) to other sub-devices as needed. Returns "false"
// if the preference is to have single root-device allocations
// serve the needs of all [sub-]devices, meaning potentially more
// cross-tile traffic.
//
static const bool SingleRootDeviceBufferMigration = [] {
const char *UrRet =
std::getenv("UR_L0_SINGLE_ROOT_DEVICE_BUFFER_MIGRATION");
const char *PiRet =
std::getenv("SYCL_PI_LEVEL_ZERO_SINGLE_ROOT_DEVICE_BUFFER_MIGRATION");
const char *EnvStr = UrRet ? UrRet : (PiRet ? PiRet : nullptr);
if (EnvStr)
return (std::stoi(EnvStr) != 0);
// The default is to migrate normally, which may not always be the
// best option (depends on buffer access patterns), but is an
// overall win on the set of the available benchmarks.
return true;
}();

// Perform actual device allocation as needed.
if (!Allocation.ZeHandle) {
if (!SingleRootDeviceBufferMigration && UrContext->SingleRootDevice &&
UrContext->SingleRootDevice != Device) {
// If all devices in the context are sub-devices of the same device
// then we reuse root-device allocation by all sub-devices in the
// context.
// TODO: we can probably generalize this and share root-device
// allocations by its own sub-devices even if not all other
// devices in the context have the same root.
UR_CALL(getZeHandle(ZeHandle, AccessMode, UrContext->SingleRootDevice,
phWaitEvents, numWaitEvents));
Allocation.ReleaseAction = allocation_t::keep;
Allocation.ZeHandle = ZeHandle;
Allocation.Valid = true;
return UR_RESULT_SUCCESS;
} else { // Create device allocation
if (DisjointPoolConfigInstance.EnableBuffers) {
Allocation.ReleaseAction = allocation_t::free;
ur_usm_desc_t USMDesc{};
USMDesc.align = getAlignment();
ur_usm_pool_handle_t Pool{};
UR_CALL(ur::level_zero::urUSMDeviceAlloc(
UrContext, Device, &USMDesc, Pool, Size,
reinterpret_cast<void **>(&ZeHandle)));
} else {
Allocation.ReleaseAction = allocation_t::free_native;
UR_CALL(ZeDeviceMemAllocHelper(reinterpret_cast<void **>(&ZeHandle),
UrContext, Device, Size));
}
// Create device allocation
if (DisjointPoolConfigInstance.EnableBuffers) {
Allocation.ReleaseAction = allocation_t::free;
ur_usm_desc_t USMDesc{};
USMDesc.align = getAlignment();
ur_usm_pool_handle_t Pool{};
UR_CALL(ur::level_zero::urUSMDeviceAlloc(
UrContext, Device, &USMDesc, Pool, Size,
reinterpret_cast<void **>(&ZeHandle)));
} else {
Allocation.ReleaseAction = allocation_t::free_native;
UR_CALL(ZeDeviceMemAllocHelper(reinterpret_cast<void **>(&ZeHandle),
UrContext, Device, Size));
}
Allocation.ZeHandle = ZeHandle;
} else {
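
The SingleRootDeviceBufferMigration block removed from memory.cpp above follows a recurring pattern: read the UR_* environment variable, fall back to the legacy SYCL_PI_* spelling, then use a default. A standalone sketch of that pattern (the helper name is hypothetical):

#include <cstdlib>
#include <string>

// Hypothetical helper mirroring the env-var fallback in the removed block:
// prefer the UR_* variable, fall back to the legacy SYCL_PI_* spelling,
// otherwise return the provided default.
static bool getBoolEnvWithFallback(const char *UrName, const char *PiName,
                                   bool Default) {
  const char *UrRet = std::getenv(UrName);
  const char *PiRet = std::getenv(PiName);
  const char *EnvStr = UrRet ? UrRet : (PiRet ? PiRet : nullptr);
  if (EnvStr)
    return std::stoi(EnvStr) != 0;
  return Default;
}

// Usage matching the removed code's semantics (default: migrate normally).
static const bool SingleRootDeviceBufferMigration = getBoolEnvWithFallback(
    "UR_L0_SINGLE_ROOT_DEVICE_BUFFER_MIGRATION",
    "SYCL_PI_LEVEL_ZERO_SINGLE_ROOT_DEVICE_BUFFER_MIGRATION", /*Default=*/true);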