
Commit a45e961

kswiecicki authored and adamfidel committed
[UR][L0] Manage UMF pools through usm::pool_manager (intel#17065)
PR moved from: oneapi-src/unified-runtime#2495
1 parent c6bc338 commit a45e961

7 files changed: +253 −507 lines changed

sycl/test-e2e/Adapters/interop-level-zero-buffer-ownership.cpp

+10 −2

@@ -18,8 +18,16 @@
 // Keep ownership
 // CHECK: zeMemFree
 
-// Account for zeMemFree used to query page sizes by the UMF
-// CHECK-COUNT-8: zeMemFree
+// Account for zeMemFree used to query page sizes by the UMF (only affects v2 L0
+// adapter)
+// CHECK-OPT: zeMemFree
+// CHECK-OPT: zeMemFree
+// CHECK-OPT: zeMemFree
+// CHECK-OPT: zeMemFree
+// CHECK-OPT: zeMemFree
+// CHECK-OPT: zeMemFree
+// CHECK-OPT: zeMemFree
+// CHECK-OPT: zeMemFree
 
 // Transfer ownership
 // CHECK: zeMemFree
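
The eight CHECK-OPT: lines replace the single CHECK-COUNT-8 because the page-size-query frees happen only with the v2 L0 adapter, so they must be allowed but not required in every configuration. One way a LIT test can gate such checks is an extra FileCheck prefix; the RUN lines below are a hypothetical illustration of that mechanism, not this test's actual invocation:

// Hypothetical RUN lines, for illustration only (not taken from this test):
// RUN: %{run} %t.out 2>&1 | FileCheck %s
// When the optional frees are expected (e.g. with the v2 L0 adapter), enable
// the extra prefix so the CHECK-OPT lines become required as well:
// RUN: %{run} %t.out 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-OPT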

unified-runtime/source/adapters/level_zero/context.cpp

+1 −110

@@ -177,115 +177,6 @@ ur_result_t urContextSetExtendedDeleter(
 } // namespace ur::level_zero
 
 ur_result_t ur_context_handle_t_::initialize() {
-
-  // Helper lambda to create various USM allocators for a device.
-  // Note that the CCS devices and their respective subdevices share a
-  // common ze_device_handle and therefore, also share USM allocators.
-  auto createUSMAllocators = [this](ur_device_handle_t Device) {
-    auto MemProvider = umf::memoryProviderMakeUnique<L0DeviceMemoryProvider>(
-                           reinterpret_cast<ur_context_handle_t>(this), Device)
-                           .second;
-    auto UmfDeviceParamsHandle = getUmfParamsHandle(
-        DisjointPoolConfigInstance.Configs[usm::DisjointPoolMemType::Device]);
-    DeviceMemPools.emplace(
-        std::piecewise_construct, std::make_tuple(Device->ZeDevice),
-        std::make_tuple(umf::poolMakeUniqueFromOps(umfDisjointPoolOps(),
-                                                   std::move(MemProvider),
-                                                   UmfDeviceParamsHandle.get())
-                            .second));
-
-    MemProvider = umf::memoryProviderMakeUnique<L0SharedMemoryProvider>(
-                      reinterpret_cast<ur_context_handle_t>(this), Device)
-                      .second;
-
-    auto UmfSharedParamsHandle = getUmfParamsHandle(
-        DisjointPoolConfigInstance.Configs[usm::DisjointPoolMemType::Shared]);
-    SharedMemPools.emplace(
-        std::piecewise_construct, std::make_tuple(Device->ZeDevice),
-        std::make_tuple(umf::poolMakeUniqueFromOps(umfDisjointPoolOps(),
-                                                   std::move(MemProvider),
-                                                   UmfSharedParamsHandle.get())
-                            .second));
-
-    MemProvider = umf::memoryProviderMakeUnique<L0SharedReadOnlyMemoryProvider>(
-                      reinterpret_cast<ur_context_handle_t>(this), Device)
-                      .second;
-
-    auto UmfSharedROParamsHandle = getUmfParamsHandle(
-        DisjointPoolConfigInstance
-            .Configs[usm::DisjointPoolMemType::SharedReadOnly]);
-    SharedReadOnlyMemPools.emplace(
-        std::piecewise_construct, std::make_tuple(Device->ZeDevice),
-        std::make_tuple(umf::poolMakeUniqueFromOps(
-                            umfDisjointPoolOps(), std::move(MemProvider),
-                            UmfSharedROParamsHandle.get())
-                            .second));
-
-    MemProvider = umf::memoryProviderMakeUnique<L0DeviceMemoryProvider>(
-                      reinterpret_cast<ur_context_handle_t>(this), Device)
-                      .second;
-    DeviceMemProxyPools.emplace(
-        std::piecewise_construct, std::make_tuple(Device->ZeDevice),
-        std::make_tuple(
-            umf::poolMakeUnique<USMProxyPool>(std::move(MemProvider)).second));
-
-    MemProvider = umf::memoryProviderMakeUnique<L0SharedMemoryProvider>(
-                      reinterpret_cast<ur_context_handle_t>(this), Device)
-                      .second;
-    SharedMemProxyPools.emplace(
-        std::piecewise_construct, std::make_tuple(Device->ZeDevice),
-        std::make_tuple(
-            umf::poolMakeUnique<USMProxyPool>(std::move(MemProvider)).second));
-
-    MemProvider = umf::memoryProviderMakeUnique<L0SharedReadOnlyMemoryProvider>(
-                      reinterpret_cast<ur_context_handle_t>(this), Device)
-                      .second;
-    SharedReadOnlyMemProxyPools.emplace(
-        std::piecewise_construct, std::make_tuple(Device->ZeDevice),
-        std::make_tuple(
-            umf::poolMakeUnique<USMProxyPool>(std::move(MemProvider)).second));
-  };
-
-  // Recursive helper to call createUSMAllocators for all sub-devices
-  std::function<void(ur_device_handle_t)> createUSMAllocatorsRecursive;
-  createUSMAllocatorsRecursive =
-      [createUSMAllocators,
-       &createUSMAllocatorsRecursive](ur_device_handle_t Device) -> void {
-    createUSMAllocators(Device);
-    for (auto &SubDevice : Device->SubDevices)
-      createUSMAllocatorsRecursive(SubDevice);
-  };
-
-  // Create USM pool for each pair (device, context).
-  //
-  for (auto &Device : Devices) {
-    createUSMAllocatorsRecursive(Device);
-  }
-  // Create USM pool for host. Device and Shared USM allocations
-  // are device-specific. Host allocations are not device-dependent therefore
-  // we don't need a map with device as key.
-  auto MemProvider = umf::memoryProviderMakeUnique<L0HostMemoryProvider>(
-                         reinterpret_cast<ur_context_handle_t>(this), nullptr)
-                         .second;
-  auto UmfHostParamsHandle = getUmfParamsHandle(
-      DisjointPoolConfigInstance.Configs[usm::DisjointPoolMemType::Host]);
-  HostMemPool =
-      umf::poolMakeUniqueFromOps(umfDisjointPoolOps(), std::move(MemProvider),
-                                 UmfHostParamsHandle.get())
-          .second;
-
-  MemProvider = umf::memoryProviderMakeUnique<L0HostMemoryProvider>(
-                    reinterpret_cast<ur_context_handle_t>(this), nullptr)
-                    .second;
-  HostMemProxyPool =
-      umf::poolMakeUnique<USMProxyPool>(std::move(MemProvider)).second;
-
-  // We may allocate memory to this root device so create allocators.
-  if (SingleRootDevice &&
-      DeviceMemPools.find(SingleRootDevice->ZeDevice) == DeviceMemPools.end()) {
-    createUSMAllocators(SingleRootDevice);
-  }
-
   // Create the immediate command list to be used for initializations.
   // Created as synchronous so level-zero performs implicit synchronization and
   // there is no need to query for completion in the plugin
@@ -296,7 +187,7 @@ ur_result_t ur_context_handle_t_::initialize() {
   // D2D migartion, if no P2P, is broken since it should use
   // immediate command-list for the specfic devices, and this single one.
   //
-  ur_device_handle_t Device = SingleRootDevice ? SingleRootDevice : Devices[0];
+  ur_device_handle_t Device = Devices[0];
 
   // Prefer to use copy engine for initialization copies,
   // if available and allowed (main copy engine with index 0).
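
The deleted block above built and registered seven per-device pool maps by hand inside ur_context_handle_t_::initialize(); after this commit the context instead carries a single DefaultPool (see context.hpp below) that is managed through usm::pool_manager. A minimal sketch of that idea follows — the names (MemType, Pool, PoolManager) are hypothetical stand-ins, not the actual unified-runtime types:

// Illustrative sketch of the pool-manager idea behind this commit; the names
// below are assumptions for illustration, not the real usm::pool_manager API.
#include <map>
#include <memory>
#include <utility>

enum class MemType { Host, Device, Shared, SharedReadOnly };

struct Pool {}; // stands in for umf::pool_unique_handle_t in the real code

// One object owns every (device, memory-type) pool, replacing the seven
// hand-rolled map / unique_handle members that used to live directly on the
// Level Zero context.
class PoolManager {
  // Key: (device handle, memory type); nullptr device for host allocations.
  std::map<std::pair<void *, MemType>, std::unique_ptr<Pool>> Pools;

public:
  void addPool(void *Device, MemType Type, std::unique_ptr<Pool> P) {
    Pools.emplace(std::make_pair(Device, Type), std::move(P));
  }
  Pool *getPool(void *Device, MemType Type) {
    auto It = Pools.find({Device, Type});
    return It == Pools.end() ? nullptr : It->second.get();
  }
};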

unified-runtime/source/adapters/level_zero/context.hpp

+11 −27

@@ -24,6 +24,7 @@
 
 #include "common.hpp"
 #include "queue.hpp"
+#include "usm.hpp"
 
 #include <umf_helpers.hpp>
 
@@ -51,15 +52,18 @@ typedef struct _ze_intel_event_sync_mode_exp_desc_t {
   ze_intel_event_sync_mode_exp_flags_t syncModeFlags;
 } ze_intel_event_sync_mode_exp_desc_t;
 
+extern const bool UseUSMAllocator;
+
 struct ur_context_handle_t_ : _ur_object {
   ur_context_handle_t_(ze_context_handle_t ZeContext, uint32_t NumDevices,
                        const ur_device_handle_t *Devs, bool OwnZeContext)
       : ZeContext{ZeContext}, Devices{Devs, Devs + NumDevices},
-        NumDevices{NumDevices} {
+        NumDevices{NumDevices}, DefaultPool{this, nullptr, !UseUSMAllocator} {
     OwnNativeHandle = OwnZeContext;
   }
 
-  ur_context_handle_t_(ze_context_handle_t ZeContext) : ZeContext{ZeContext} {}
+  ur_context_handle_t_(ze_context_handle_t ZeContext)
+      : ZeContext{ZeContext}, DefaultPool{this, nullptr, !UseUSMAllocator} {}
 
   // A L0 context handle is primarily used during creation and management of
   // resources that may be used by multiple devices.
@@ -94,13 +98,6 @@ struct ur_context_handle_t_ : _ur_object {
   // compute and copy command list caches.
   ur_mutex ZeCommandListCacheMutex;
 
-  // If context contains one device or sub-devices of the same device, we want
-  // to save this device.
-  // This field is only set at ur_context_handle_t creation time, and cannot
-  // change. Therefore it can be accessed without holding a lock on this
-  // ur_context_handle_t.
-  ur_device_handle_t SingleRootDevice = nullptr;
-
   // Cache of all currently available/completed command/copy lists.
   // Note that command-list can only be re-used on the same device.
   //
@@ -123,24 +120,11 @@ struct ur_context_handle_t_ : _ur_object {
 
   // Store USM pool for USM shared and device allocations. There is 1 memory
   // pool per each pair of (context, device) per each memory type.
-  std::unordered_map<ze_device_handle_t, umf::pool_unique_handle_t>
-      DeviceMemPools;
-  std::unordered_map<ze_device_handle_t, umf::pool_unique_handle_t>
-      SharedMemPools;
-  std::unordered_map<ze_device_handle_t, umf::pool_unique_handle_t>
-      SharedReadOnlyMemPools;
-
-  // Store the host memory pool. It does not depend on any device.
-  umf::pool_unique_handle_t HostMemPool;
-
-  // Allocation-tracking proxy pools for direct allocations. No pooling used.
-  std::unordered_map<ze_device_handle_t, umf::pool_unique_handle_t>
-      DeviceMemProxyPools;
-  std::unordered_map<ze_device_handle_t, umf::pool_unique_handle_t>
-      SharedMemProxyPools;
-  std::unordered_map<ze_device_handle_t, umf::pool_unique_handle_t>
-      SharedReadOnlyMemProxyPools;
-  umf::pool_unique_handle_t HostMemProxyPool;
+  // It's either a DisjointPool implementation from UMF or an
+  // allocation-tracking proxy pool for direct allocations that does not
+  // internally pool memory. Actual implementation during runtime is decided by
+  // the 'UseUSMAllocator' variable value.
+  ur_usm_pool_handle_t_ DefaultPool;
 
   // Map associating pools created with urUsmPoolCreate and internal pools
   std::list<ur_usm_pool_handle_t> UsmPoolHandles{};
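
DefaultPool is initialized with {this, nullptr, !UseUSMAllocator}, so the pool implementation is picked once per context from that last flag. Below is a hedged, self-contained sketch of that selection, assuming the flag simply means "skip pooling and use the tracking proxy" — the real ur_usm_pool_handle_t_ constructor may differ:

// Illustration only: hypothetical types, not the adapter's real ones.
#include <memory>

struct DisjointPool {}; // UMF DisjointPool: caches and reuses allocations
struct ProxyPool {};    // allocation-tracking pass-through, no pooling

struct DefaultPoolSketch {
  std::unique_ptr<DisjointPool> Pooled;
  std::unique_ptr<ProxyPool> Proxy;

  // Mirrors DefaultPool{this, nullptr, !UseUSMAllocator}: when pooling is
  // disabled the proxy pool is used, otherwise the UMF DisjointPool.
  explicit DefaultPoolSketch(bool DisablePooling) {
    if (DisablePooling)
      Proxy = std::make_unique<ProxyPool>();
    else
      Pooled = std::make_unique<DisjointPool>();
  }
};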

unified-runtime/source/adapters/level_zero/memory.cpp

+14 −52

@@ -1515,9 +1515,7 @@ ur_result_t urMemImageCreate(
   // own the image.
   // TODO: Implement explicit copying for acessing the image from other devices
   // in the context.
-  ur_device_handle_t Device = Context->SingleRootDevice
-                                  ? Context->SingleRootDevice
-                                  : Context->Devices[0];
+  ur_device_handle_t Device = Context->Devices[0];
   ze_image_handle_t ZeImage;
   ZE2UR_CALL(zeImageCreate,
              (Context->ZeContext, Device->ZeDevice, &ZeImageDesc, &ZeImage));
@@ -2079,58 +2077,22 @@ ur_result_t _ur_buffer::getBufferZeHandle(char *&ZeHandle,
     LastDeviceWithValidAllocation = Device;
     return UR_RESULT_SUCCESS;
   }
-  // Reads user setting on how to deal with buffers in contexts where
-  // all devices have the same root-device. Returns "true" if the
-  // preference is to have allocate on each [sub-]device and migrate
-  // normally (copy) to other sub-devices as needed. Returns "false"
-  // if the preference is to have single root-device allocations
-  // serve the needs of all [sub-]devices, meaning potentially more
-  // cross-tile traffic.
-  //
-  static const bool SingleRootDeviceBufferMigration = [] {
-    const char *UrRet =
-        std::getenv("UR_L0_SINGLE_ROOT_DEVICE_BUFFER_MIGRATION");
-    const char *PiRet =
-        std::getenv("SYCL_PI_LEVEL_ZERO_SINGLE_ROOT_DEVICE_BUFFER_MIGRATION");
-    const char *EnvStr = UrRet ? UrRet : (PiRet ? PiRet : nullptr);
-    if (EnvStr)
-      return (std::stoi(EnvStr) != 0);
-    // The default is to migrate normally, which may not always be the
-    // best option (depends on buffer access patterns), but is an
-    // overall win on the set of the available benchmarks.
-    return true;
-  }();
 
   // Peform actual device allocation as needed.
   if (!Allocation.ZeHandle) {
-    if (!SingleRootDeviceBufferMigration && UrContext->SingleRootDevice &&
-        UrContext->SingleRootDevice != Device) {
-      // If all devices in the context are sub-devices of the same device
-      // then we reuse root-device allocation by all sub-devices in the
-      // context.
-      // TODO: we can probably generalize this and share root-device
-      // allocations by its own sub-devices even if not all other
-      // devices in the context have the same root.
-      UR_CALL(getZeHandle(ZeHandle, AccessMode, UrContext->SingleRootDevice,
-                          phWaitEvents, numWaitEvents));
-      Allocation.ReleaseAction = allocation_t::keep;
-      Allocation.ZeHandle = ZeHandle;
-      Allocation.Valid = true;
-      return UR_RESULT_SUCCESS;
-    } else { // Create device allocation
-      if (DisjointPoolConfigInstance.EnableBuffers) {
-        Allocation.ReleaseAction = allocation_t::free;
-        ur_usm_desc_t USMDesc{};
-        USMDesc.align = getAlignment();
-        ur_usm_pool_handle_t Pool{};
-        UR_CALL(ur::level_zero::urUSMDeviceAlloc(
-            UrContext, Device, &USMDesc, Pool, Size,
-            reinterpret_cast<void **>(&ZeHandle)));
-      } else {
-        Allocation.ReleaseAction = allocation_t::free_native;
-        UR_CALL(ZeDeviceMemAllocHelper(reinterpret_cast<void **>(&ZeHandle),
-                                       UrContext, Device, Size));
-      }
+    // Create device allocation
+    if (DisjointPoolConfigInstance.EnableBuffers) {
+      Allocation.ReleaseAction = allocation_t::free;
+      ur_usm_desc_t USMDesc{};
+      USMDesc.align = getAlignment();
+      ur_usm_pool_handle_t Pool{};
+      UR_CALL(ur::level_zero::urUSMDeviceAlloc(
+          UrContext, Device, &USMDesc, Pool, Size,
+          reinterpret_cast<void **>(&ZeHandle)));
+    } else {
+      Allocation.ReleaseAction = allocation_t::free_native;
+      UR_CALL(ZeDeviceMemAllocHelper(reinterpret_cast<void **>(&ZeHandle),
                                      UrContext, Device, Size));
     }
     Allocation.ZeHandle = ZeHandle;
   } else {
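
With the SingleRootDevice special case gone, buffer backing memory always takes the plain device-allocation path above: ur::level_zero::urUSMDeviceAlloc with an empty pool handle when DisjointPoolConfigInstance.EnableBuffers is set, or the native helper otherwise. For reference, a hedged caller-side sketch of the same UR entry point (error handling trimmed; the context and device handles are assumed to already exist):

#include <ur_api.h>

// Allocates device USM through the UR API. Passing a null pool handle lets
// the adapter fall back to its internal pool (after this commit, presumably
// the context's DefaultPool).
void *allocDeviceUSM(ur_context_handle_t Context, ur_device_handle_t Device,
                     size_t Size, uint32_t Align) {
  ur_usm_desc_t Desc{};
  Desc.stype = UR_STRUCTURE_TYPE_USM_DESC;
  Desc.align = Align;
  void *Ptr = nullptr;
  if (urUSMDeviceAlloc(Context, Device, &Desc, /*pool=*/nullptr, Size, &Ptr) !=
      UR_RESULT_SUCCESS)
    return nullptr;
  return Ptr;
}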
