From 419a1cb32ebca8fb8cbdf1b4ad9fdabb6c551498 Mon Sep 17 00:00:00 2001
From: Krzysztof Swiecicki
Date: Thu, 5 Dec 2024 17:30:13 +0100
Subject: [PATCH 1/4] [UR][L0] Manage UMF pools through usm::pool_manager

---
 .../source/adapters/level_zero/context.cpp | 102 ----
 .../source/adapters/level_zero/context.hpp |  31 +-
 .../source/adapters/level_zero/usm.cpp     | 496 +++++++-----------
 .../source/adapters/level_zero/usm.hpp     |  25 +-
 .../source/common/ur_pool_manager.hpp      |   7 +
 5 files changed, 225 insertions(+), 436 deletions(-)

diff --git a/unified-runtime/source/adapters/level_zero/context.cpp b/unified-runtime/source/adapters/level_zero/context.cpp
index 6c1d51fbebd64..5d7354f576913 100644
--- a/unified-runtime/source/adapters/level_zero/context.cpp
+++ b/unified-runtime/source/adapters/level_zero/context.cpp
@@ -193,108 +193,6 @@ ur_result_t urContextSetExtendedDeleter(
 ur_result_t ur_context_handle_t_::initialize() {
-
-  // Helper lambda to create various USM allocators for a device.
-  // Note that the CCS devices and their respective subdevices share a
-  // common ze_device_handle and therefore, also share USM allocators.
-  auto createUSMAllocators = [this](ur_device_handle_t Device) {
-    auto MemProvider = umf::memoryProviderMakeUnique<L0DeviceMemoryProvider>(
-                           reinterpret_cast<ur_context_handle_t>(this), Device)
-                           .second;
-    auto UmfDeviceParamsHandle = getUmfParamsHandle(
-        DisjointPoolConfigInstance.Configs[usm::DisjointPoolMemType::Device]);
-    DeviceMemPools.emplace(
-        std::piecewise_construct, std::make_tuple(Device->ZeDevice),
-        std::make_tuple(umf::poolMakeUniqueFromOps(umfDisjointPoolOps(),
-                                                   std::move(MemProvider),
-                                                   UmfDeviceParamsHandle.get())
-                            .second));
-
-    MemProvider = umf::memoryProviderMakeUnique<L0SharedMemoryProvider>(
-                      reinterpret_cast<ur_context_handle_t>(this), Device)
-                      .second;
-
-    auto UmfSharedParamsHandle = getUmfParamsHandle(
-        DisjointPoolConfigInstance.Configs[usm::DisjointPoolMemType::Shared]);
-    SharedMemPools.emplace(
-        std::piecewise_construct, std::make_tuple(Device->ZeDevice),
-        std::make_tuple(umf::poolMakeUniqueFromOps(umfDisjointPoolOps(),
-                                                   std::move(MemProvider),
-                                                   UmfSharedParamsHandle.get())
-                            .second));
-
-    MemProvider =
-        umf::memoryProviderMakeUnique<L0SharedReadOnlyMemoryProvider>(
-            reinterpret_cast<ur_context_handle_t>(this), Device)
-            .second;
-
-    auto UmfSharedROParamsHandle = getUmfParamsHandle(
-        DisjointPoolConfigInstance
-            .Configs[usm::DisjointPoolMemType::SharedReadOnly]);
-    SharedReadOnlyMemPools.emplace(
-        std::piecewise_construct, std::make_tuple(Device->ZeDevice),
-        std::make_tuple(umf::poolMakeUniqueFromOps(
-                            umfDisjointPoolOps(), std::move(MemProvider),
-                            UmfSharedROParamsHandle.get())
-                            .second));
-
-    MemProvider = umf::memoryProviderMakeUnique<L0DeviceMemoryProvider>(
-                      reinterpret_cast<ur_context_handle_t>(this), Device)
-                      .second;
-    DeviceMemProxyPools.emplace(
-        std::piecewise_construct, std::make_tuple(Device->ZeDevice),
-        std::make_tuple(
-            umf::poolMakeUnique<USMProxyPool>(std::move(MemProvider)).second));
-
-    MemProvider = umf::memoryProviderMakeUnique<L0SharedMemoryProvider>(
-                      reinterpret_cast<ur_context_handle_t>(this), Device)
-                      .second;
-    SharedMemProxyPools.emplace(
-        std::piecewise_construct, std::make_tuple(Device->ZeDevice),
-        std::make_tuple(
-            umf::poolMakeUnique<USMProxyPool>(std::move(MemProvider)).second));
-
-    MemProvider =
-        umf::memoryProviderMakeUnique<L0SharedReadOnlyMemoryProvider>(
-            reinterpret_cast<ur_context_handle_t>(this), Device)
-            .second;
-    SharedReadOnlyMemProxyPools.emplace(
-        std::piecewise_construct, std::make_tuple(Device->ZeDevice),
-        std::make_tuple(
-            umf::poolMakeUnique<USMProxyPool>(std::move(MemProvider)).second));
-  };
-
-  // Recursive helper to call createUSMAllocators for all sub-devices
-  std::function<void(ur_device_handle_t)> createUSMAllocatorsRecursive;
-  createUSMAllocatorsRecursive =
-      [createUSMAllocators,
-       &createUSMAllocatorsRecursive](ur_device_handle_t Device) -> void {
-    createUSMAllocators(Device);
-    for (auto &SubDevice : Device->SubDevices)
-      createUSMAllocatorsRecursive(SubDevice);
-  };
-
-  // Create USM pool for each pair (device, context).
-  //
-  for (auto &Device : Devices) {
-    createUSMAllocatorsRecursive(Device);
-  }
-  // Create USM pool for host. Device and Shared USM allocations
-  // are device-specific. Host allocations are not device-dependent therefore
-  // we don't need a map with device as key.
-  auto MemProvider = umf::memoryProviderMakeUnique<L0HostMemoryProvider>(
-                         reinterpret_cast<ur_context_handle_t>(this), nullptr)
-                         .second;
-  auto UmfHostParamsHandle = getUmfParamsHandle(
-      DisjointPoolConfigInstance.Configs[usm::DisjointPoolMemType::Host]);
-  HostMemPool =
-      umf::poolMakeUniqueFromOps(umfDisjointPoolOps(), std::move(MemProvider),
-                                 UmfHostParamsHandle.get())
-          .second;
-
-  MemProvider = umf::memoryProviderMakeUnique<L0HostMemoryProvider>(
-                    reinterpret_cast<ur_context_handle_t>(this), nullptr)
-                    .second;
-  HostMemProxyPool =
-      umf::poolMakeUnique<USMProxyPool>(std::move(MemProvider)).second;
-
   // We may allocate memory to this root device so create allocators.
   if (SingleRootDevice &&
       DeviceMemPools.find(SingleRootDevice->ZeDevice) == DeviceMemPools.end()) {
diff --git a/unified-runtime/source/adapters/level_zero/context.hpp b/unified-runtime/source/adapters/level_zero/context.hpp
index 43608e8bfc65c..15e8bd62eb901 100644
--- a/unified-runtime/source/adapters/level_zero/context.hpp
+++ b/unified-runtime/source/adapters/level_zero/context.hpp
@@ -24,6 +24,7 @@
 #include "common.hpp"
 #include "queue.hpp"
+#include "usm.hpp"
 
 #include
 
@@ -51,15 +52,18 @@ typedef struct _ze_intel_event_sync_mode_exp_desc_t {
   ze_intel_event_sync_mode_exp_flags_t syncModeFlags;
 } ze_intel_event_sync_mode_exp_desc_t;
 
+extern const bool UseUSMAllocator;
+
 struct ur_context_handle_t_ : _ur_object {
   ur_context_handle_t_(ze_context_handle_t ZeContext, uint32_t NumDevices,
                        const ur_device_handle_t *Devs, bool OwnZeContext)
       : ZeContext{ZeContext}, Devices{Devs, Devs + NumDevices},
-        NumDevices{NumDevices} {
+        NumDevices{NumDevices}, DefaultPool{this, nullptr, !UseUSMAllocator} {
     OwnNativeHandle = OwnZeContext;
   }
 
-  ur_context_handle_t_(ze_context_handle_t ZeContext) : ZeContext{ZeContext} {}
+  ur_context_handle_t_(ze_context_handle_t ZeContext)
+      : ZeContext{ZeContext}, DefaultPool{this, nullptr, !UseUSMAllocator} {}
 
   // A L0 context handle is primarily used during creation and management of
   // resources that may be used by multiple devices.
@@ -123,24 +127,11 @@ struct ur_context_handle_t_ : _ur_object {
 
   // Store USM pool for USM shared and device allocations. There is 1 memory
   // pool per each pair of (context, device) per each memory type.
-  std::unordered_map<ze_device_handle_t, umf::pool_unique_handle_t>
-      DeviceMemPools;
-  std::unordered_map<ze_device_handle_t, umf::pool_unique_handle_t>
-      SharedMemPools;
-  std::unordered_map<ze_device_handle_t, umf::pool_unique_handle_t>
-      SharedReadOnlyMemPools;
-
-  // Store the host memory pool. It does not depend on any device.
-  umf::pool_unique_handle_t HostMemPool;
-
-  // Allocation-tracking proxy pools for direct allocations. No pooling used.
-  std::unordered_map<ze_device_handle_t, umf::pool_unique_handle_t>
-      DeviceMemProxyPools;
-  std::unordered_map<ze_device_handle_t, umf::pool_unique_handle_t>
-      SharedMemProxyPools;
-  std::unordered_map<ze_device_handle_t, umf::pool_unique_handle_t>
-      SharedReadOnlyMemProxyPools;
-  umf::pool_unique_handle_t HostMemProxyPool;
+  // It is either a DisjointPool implementation from UMF or an
+  // allocation-tracking proxy pool for direct allocations that does not
+  // pool memory internally. Which implementation is used at runtime is
+  // decided by the 'UseUSMAllocator' variable.
+  ur_usm_pool_handle_t_ DefaultPool;
 
   // Map associating pools created with urUsmPoolCreate and internal pools
   std::list<ur_usm_pool_handle_t> UsmPoolHandles{};
diff --git a/unified-runtime/source/adapters/level_zero/usm.cpp b/unified-runtime/source/adapters/level_zero/usm.cpp
index ae3a693d20981..ace619a12426e 100644
--- a/unified-runtime/source/adapters/level_zero/usm.cpp
+++ b/unified-runtime/source/adapters/level_zero/usm.cpp
@@ -332,66 +332,15 @@ ur_result_t urUSMHostAlloc(
     size_t Size,
     /// [out] pointer to USM host memory object
     void **RetMem) {
-
-  uint32_t Align = USMDesc ? USMDesc->align : 0;
-  // L0 supports alignment up to 64KB and silently ignores higher values.
-  // We flag alignment > 64KB as an invalid value.
-  // L0 spec says that alignment values that are not powers of 2 are invalid.
-  // If alignment == 0, then we are allowing the L0 driver to choose the
-  // alignment so no need to check.
-  if (Align > 0) {
-    if (Align > 65536 || (Align & (Align - 1)) != 0)
-      return UR_RESULT_ERROR_INVALID_VALUE;
-  }
-
-  ur_platform_handle_t Plt = Context->getPlatform();
-  // If indirect access tracking is enabled then lock the mutex which is
-  // guarding contexts container in the platform. This prevents new kernels from
-  // being submitted in any context while we are in the process of allocating a
-  // memory, this is needed to properly capture allocations by kernels with
-  // indirect access. This lock also protects access to the context's data
-  // structures. If indirect access tracking is not enabled then lock context
-  // mutex to protect access to context's data structures.
-  std::shared_lock<ur_shared_mutex> ContextLock(Context->Mutex,
-                                                std::defer_lock);
-  std::unique_lock<ur_shared_mutex> IndirectAccessTrackingLock(
-      Plt->ContextsMutex, std::defer_lock);
-  if (IndirectAccessTrackingEnabled) {
-    IndirectAccessTrackingLock.lock();
-    // We are going to defer memory release if there are kernels with indirect
-    // access, that is why explicitly retain context to be sure that it is
-    // released after all memory allocations in this context are released.
-    UR_CALL(ur::level_zero::urContextRetain(Context));
-  } else {
-    ContextLock.lock();
-  }
-
-  // There is a single allocator for Host USM allocations, so we don't need to
-  // find the allocator depending on context as we do for Shared and Device
-  // allocations.
-  umf_memory_pool_handle_t hPoolInternal = nullptr;
-  if (!UseUSMAllocator) {
-    hPoolInternal = Context->HostMemProxyPool.get();
-  } else if (Pool) {
-    hPoolInternal = Pool->HostMemPool.get();
+  ur_usm_pool_handle_t USMPool = nullptr;
+  if (Pool) {
+    USMPool = Pool;
   } else {
-    hPoolInternal = Context->HostMemPool.get();
-  }
-
-  *RetMem = umfPoolAlignedMalloc(hPoolInternal, Size, Align);
-  if (*RetMem == nullptr) {
-    auto umfRet = umfPoolGetLastAllocationError(hPoolInternal);
-    return umf2urResult(umfRet);
+    USMPool = &Context->DefaultPool;
   }
 
-  if (IndirectAccessTrackingEnabled) {
-    // Keep track of all memory allocations in the context
-    Context->MemAllocs.emplace(std::piecewise_construct,
-                               std::forward_as_tuple(*RetMem),
-                               std::forward_as_tuple(Context));
-  }
-
-  return UR_RESULT_SUCCESS;
+  return USMPool->allocate(Context, nullptr, USMDesc, UR_USM_TYPE_HOST, Size,
+                           RetMem);
 }
 
 ur_result_t urUSMDeviceAlloc(
@@ -408,72 +357,15 @@ ur_result_t urUSMDeviceAlloc(
     size_t Size,
     /// [out] pointer to USM device memory object
     void **RetMem) {
 
-  uint32_t Alignment = USMDesc ? USMDesc->align : 0;
-
-  // L0 supports alignment up to 64KB and silently ignores higher values.
-  // We flag alignment > 64KB as an invalid value.
-  // L0 spec says that alignment values that are not powers of 2 are invalid.
-  // If alignment == 0, then we are allowing the L0 driver to choose the
-  // alignment so no need to check.
-  if (Alignment > 0) {
-    if (Alignment > 65536 || (Alignment & (Alignment - 1)) != 0)
-      return UR_RESULT_ERROR_INVALID_VALUE;
-  }
-
-  ur_platform_handle_t Plt = Device->Platform;
-
-  // If indirect access tracking is enabled then lock the mutex which is
-  // guarding contexts container in the platform. This prevents new kernels from
-  // being submitted in any context while we are in the process of allocating a
-  // memory, this is needed to properly capture allocations by kernels with
-  // indirect access. This lock also protects access to the context's data
-  // structures. If indirect access tracking is not enabled then lock context
-  // mutex to protect access to context's data structures.
-  std::shared_lock<ur_shared_mutex> ContextLock(Context->Mutex,
-                                                std::defer_lock);
-  std::unique_lock<ur_shared_mutex> IndirectAccessTrackingLock(
-      Plt->ContextsMutex, std::defer_lock);
-  if (IndirectAccessTrackingEnabled) {
-    IndirectAccessTrackingLock.lock();
-    // We are going to defer memory release if there are kernels with indirect
-    // access, that is why explicitly retain context to be sure that it is
-    // released after all memory allocations in this context are released.
-    UR_CALL(ur::level_zero::urContextRetain(Context));
+  ur_usm_pool_handle_t USMPool = nullptr;
+  if (Pool) {
+    USMPool = Pool;
   } else {
-    ContextLock.lock();
+    USMPool = &Context->DefaultPool;
   }
 
-  umf_memory_pool_handle_t hPoolInternal = nullptr;
-  if (!UseUSMAllocator) {
-    auto It = Context->DeviceMemProxyPools.find(Device->ZeDevice);
-    if (It == Context->DeviceMemProxyPools.end())
-      return UR_RESULT_ERROR_INVALID_VALUE;
-
-    hPoolInternal = It->second.get();
-  } else if (Pool) {
-    hPoolInternal = Pool->DeviceMemPools[Device].get();
-  } else {
-    auto It = Context->DeviceMemPools.find(Device->ZeDevice);
-    if (It == Context->DeviceMemPools.end())
-      return UR_RESULT_ERROR_INVALID_VALUE;
-
-    hPoolInternal = It->second.get();
-  }
-
-  *RetMem = umfPoolAlignedMalloc(hPoolInternal, Size, Alignment);
-  if (*RetMem == nullptr) {
-    auto umfRet = umfPoolGetLastAllocationError(hPoolInternal);
-    return umf2urResult(umfRet);
-  }
-
-  if (IndirectAccessTrackingEnabled) {
-    // Keep track of all memory allocations in the context
-    Context->MemAllocs.emplace(std::piecewise_construct,
-                               std::forward_as_tuple(*RetMem),
-                               std::forward_as_tuple(Context));
-  }
-
-  return UR_RESULT_SUCCESS;
+  return USMPool->allocate(Context, Device, USMDesc, UR_USM_TYPE_DEVICE, Size,
+                           RetMem);
 }
 
 ur_result_t urUSMSharedAlloc(
@@ -489,100 +381,15 @@ ur_result_t urUSMSharedAlloc(
     size_t Size,
     /// [out] pointer to USM shared memory object
     void **RetMem) {
-
-  uint32_t Alignment = USMDesc ? USMDesc->align : 0;
-
-  ur_usm_host_mem_flags_t UsmHostFlags{};
-
-  // See if the memory is going to be read-only on the device.
-  bool DeviceReadOnly = false;
-  ur_usm_device_mem_flags_t UsmDeviceFlags{};
-
-  void *pNext = USMDesc ? const_cast<void *>(USMDesc->pNext) : nullptr;
-  while (pNext != nullptr) {
-    const ur_base_desc_t *BaseDesc =
-        reinterpret_cast<const ur_base_desc_t *>(pNext);
-    if (BaseDesc->stype == UR_STRUCTURE_TYPE_USM_DEVICE_DESC) {
-      const ur_usm_device_desc_t *UsmDeviceDesc =
-          reinterpret_cast<const ur_usm_device_desc_t *>(pNext);
-      UsmDeviceFlags = UsmDeviceDesc->flags;
-    }
-    if (BaseDesc->stype == UR_STRUCTURE_TYPE_USM_HOST_DESC) {
-      const ur_usm_host_desc_t *UsmHostDesc =
-          reinterpret_cast<const ur_usm_host_desc_t *>(pNext);
-      UsmHostFlags = UsmHostDesc->flags;
-      std::ignore = UsmHostFlags;
-    }
-    pNext = const_cast<void *>(BaseDesc->pNext);
-  }
-  DeviceReadOnly = UsmDeviceFlags & UR_USM_DEVICE_MEM_FLAG_DEVICE_READ_ONLY;
-
-  // L0 supports alignment up to 64KB and silently ignores higher values.
-  // We flag alignment > 64KB as an invalid value.
-  // L0 spec says that alignment values that are not powers of 2 are invalid.
-  // If alignment == 0, then we are allowing the L0 driver to choose the
-  // alignment so no need to check.
-  if (Alignment > 0) {
-    if (Alignment > 65536 || (Alignment & (Alignment - 1)) != 0)
-      return UR_RESULT_ERROR_INVALID_VALUE;
-  }
-
-  ur_platform_handle_t Plt = Device->Platform;
-
-  // If indirect access tracking is enabled then lock the mutex which is
-  // guarding contexts container in the platform. This prevents new kernels from
-  // being submitted in any context while we are in the process of allocating a
-  // memory, this is needed to properly capture allocations by kernels with
-  // indirect access. This lock also protects access to the context's data
-  // structures. If indirect access tracking is not enabled then lock context
-  // mutex to protect access to context's data structures.
-  std::scoped_lock<ur_shared_mutex> Lock(
-      IndirectAccessTrackingEnabled ? Plt->ContextsMutex : Context->Mutex);
-
-  if (IndirectAccessTrackingEnabled) {
-    // We are going to defer memory release if there are kernels with indirect
-    // access, that is why explicitly retain context to be sure that it is
-    // released after all memory allocations in this context are released.
-    UR_CALL(ur::level_zero::urContextRetain(Context));
-  }
-
-  umf_memory_pool_handle_t hPoolInternal = nullptr;
-  if (!UseUSMAllocator) {
-    auto &Allocator = (DeviceReadOnly ? Context->SharedReadOnlyMemProxyPools
-                                      : Context->SharedMemProxyPools);
-    auto It = Allocator.find(Device->ZeDevice);
-    if (It == Allocator.end())
-      return UR_RESULT_ERROR_INVALID_VALUE;
-
-    hPoolInternal = It->second.get();
-  } else if (Pool) {
-    hPoolInternal = (DeviceReadOnly)
-                        ? Pool->SharedReadOnlyMemPools[Device].get()
-                        : Pool->SharedMemPools[Device].get();
+  ur_usm_pool_handle_t USMPool = nullptr;
+  if (Pool) {
+    USMPool = Pool;
   } else {
-    auto &Allocator = (DeviceReadOnly ? Context->SharedReadOnlyMemPools
-                                      : Context->SharedMemPools);
-    auto It = Allocator.find(Device->ZeDevice);
-    if (It == Allocator.end())
-      return UR_RESULT_ERROR_INVALID_VALUE;
-
-    hPoolInternal = It->second.get();
-  }
-
-  *RetMem = umfPoolAlignedMalloc(hPoolInternal, Size, Alignment);
-  if (*RetMem == nullptr) {
-    auto umfRet = umfPoolGetLastAllocationError(hPoolInternal);
-    return umf2urResult(umfRet);
-  }
-
-  if (IndirectAccessTrackingEnabled) {
-    // Keep track of all memory allocations in the context
-    Context->MemAllocs.emplace(std::piecewise_construct,
-                               std::forward_as_tuple(*RetMem),
-                               std::forward_as_tuple(Context));
+    USMPool = &Context->DefaultPool;
   }
 
-  return UR_RESULT_SUCCESS;
+  return USMPool->allocate(Context, Device, USMDesc, UR_USM_TYPE_SHARED, Size,
+                           RetMem);
 }
 
 ur_result_t
@@ -667,26 +474,8 @@ ur_result_t urUSMGetMemAllocInfo(
 
   std::shared_lock<ur_shared_mutex> ContextLock(Context->Mutex);
 
-  auto SearchMatchingPool =
-      [](std::unordered_map<ze_device_handle_t, umf::pool_unique_handle_t>
-             &PoolMap,
-         umf_memory_pool_handle_t UMFPool) {
-        for (auto &PoolPair : PoolMap) {
-          if (PoolPair.second.get() == UMFPool) {
-            return true;
-          }
-        }
-        return false;
-      };
-
   for (auto &Pool : Context->UsmPoolHandles) {
-    if (SearchMatchingPool(Pool->DeviceMemPools, UMFPool)) {
-      return ReturnValue(Pool);
-    }
-    if (SearchMatchingPool(Pool->SharedMemPools, UMFPool)) {
-      return ReturnValue(Pool);
-    }
-    if (Pool->HostMemPool.get() == UMFPool) {
+    if (Pool->hasPool(UMFPool)) {
       return ReturnValue(Pool);
     }
   }
@@ -1082,86 +871,193 @@ ur_result_t L0HostMemoryProvider::allocateImpl(void **ResultPtr, size_t Size,
   return USMHostAllocImpl(ResultPtr, Context, /* flags */ 0, Size, Alignment);
 }
 
-ur_usm_pool_handle_t_::ur_usm_pool_handle_t_(ur_context_handle_t Context,
-                                             ur_usm_pool_desc_t *PoolDesc) {
-
-  this->Context = Context;
-  zeroInit = static_cast<bool>(PoolDesc->flags &
-                               UR_USM_POOL_FLAG_ZERO_INITIALIZE_BLOCK);
-
-  void *pNext = const_cast<void *>(PoolDesc->pNext);
-  while (pNext != nullptr) {
-    const ur_base_desc_t *BaseDesc =
-        reinterpret_cast<const ur_base_desc_t *>(pNext);
-    switch (BaseDesc->stype) {
-    case UR_STRUCTURE_TYPE_USM_POOL_LIMITS_DESC: {
-      const ur_usm_pool_limits_desc_t *Limits =
-          reinterpret_cast<const ur_usm_pool_limits_desc_t *>(BaseDesc);
-      for (auto &config : DisjointPoolConfigs.Configs) {
-        config.MaxPoolableSize = Limits->maxPoolableSize;
-        config.SlabMinSize = Limits->minDriverAllocSize;
+static usm::DisjointPoolMemType
+DescToDisjointPoolMemType(const usm::pool_descriptor &desc) {
+  switch (desc.type) {
+  case UR_USM_TYPE_DEVICE:
+    return usm::DisjointPoolMemType::Device;
+  case UR_USM_TYPE_SHARED:
+    if (desc.deviceReadOnly)
+      return usm::DisjointPoolMemType::SharedReadOnly;
+    else
+      return usm::DisjointPoolMemType::Shared;
+  case UR_USM_TYPE_HOST:
+    return usm::DisjointPoolMemType::Host;
+  default:
+    throw UR_RESULT_ERROR_INVALID_ARGUMENT;
+  }
+}
+
+typedef usm::pool_descriptor l0_memory_provider_params_t;
+
+template <typename ProviderParams>
+static umf::provider_unique_handle_t
+MakeProvider(ProviderParams *Params = nullptr) {
+  if constexpr (std::is_same_v<ProviderParams, l0_memory_provider_params_t>) {
+    umf_result_t Ret = UMF_RESULT_SUCCESS;
+    umf::provider_unique_handle_t &&L0Provider = nullptr;
+
+    switch (Params->type) {
+    case UR_USM_TYPE_HOST:
+      std::tie(Ret, L0Provider) =
+          umf::memoryProviderMakeUnique<L0HostMemoryProvider>(Params->hContext,
+                                                              Params->hDevice);
+      break;
+    case UR_USM_TYPE_DEVICE:
+      std::tie(Ret, L0Provider) =
+          umf::memoryProviderMakeUnique<L0DeviceMemoryProvider>(
+              Params->hContext, Params->hDevice);
+      break;
+    case UR_USM_TYPE_SHARED:
+      if (Params->deviceReadOnly) {
+        std::tie(Ret, L0Provider) =
+            umf::memoryProviderMakeUnique<L0SharedReadOnlyMemoryProvider>(
+                Params->hContext, Params->hDevice);
+      } else {
+        std::tie(Ret, L0Provider) =
+            umf::memoryProviderMakeUnique<L0SharedMemoryProvider>(
+                Params->hContext, Params->hDevice);
       }
       break;
+    default:
+      logger::error("urUSMPoolCreate: invalid USM type found");
+      Ret = UMF_RESULT_ERROR_INVALID_ARGUMENT;
     }
-    default: {
-      logger::error("urUSMPoolCreate: unexpected chained stype");
-      throw UsmAllocationException(UR_RESULT_ERROR_INVALID_ARGUMENT);
+
+    if (Ret != UMF_RESULT_SUCCESS) {
+      logger::error("urUSMPoolCreate: failed to create UMF provider");
+      throw UsmAllocationException(umf::umf2urResult(Ret));
     }
+
+    return std::move(L0Provider);
+  }
+
+  return nullptr;
+}
+
+ur_usm_pool_handle_t_::ur_usm_pool_handle_t_(ur_context_handle_t Context,
+                                             ur_usm_pool_desc_t *PoolDesc,
+                                             bool IsProxy)
+    : Context(Context) {
+  // TODO: handle zero-init flag 'UR_USM_POOL_FLAG_ZERO_INITIALIZE_BLOCK'
+  auto DisjointPoolConfigs = DisjointPoolConfigInstance;
+  if (auto Limits = find_stype_node<ur_usm_pool_limits_desc_t>(PoolDesc)) {
+    for (auto &Config : DisjointPoolConfigs.Configs) {
+      Config.MaxPoolableSize = Limits->maxPoolableSize;
+      Config.SlabMinSize = Limits->minDriverAllocSize;
     }
-    pNext = const_cast<void *>(BaseDesc->pNext);
   }
 
-  auto MemProvider =
-      umf::memoryProviderMakeUnique<L0HostMemoryProvider>(Context, nullptr)
-          .second;
-
-  auto UmfHostParamsHandle = getUmfParamsHandle(
-      DisjointPoolConfigInstance.Configs[usm::DisjointPoolMemType::Host]);
-  HostMemPool =
-      umf::poolMakeUniqueFromOps(umfDisjointPoolOps(), std::move(MemProvider),
-                                 UmfHostParamsHandle.get())
-          .second;
-
-  for (auto device : Context->Devices) {
-    MemProvider =
-        umf::memoryProviderMakeUnique<L0DeviceMemoryProvider>(Context, device)
-            .second;
-    auto UmfDeviceParamsHandle = getUmfParamsHandle(
-        DisjointPoolConfigInstance.Configs[usm::DisjointPoolMemType::Device]);
-    DeviceMemPools.emplace(
-        std::piecewise_construct, std::make_tuple(device),
-        std::make_tuple(umf::poolMakeUniqueFromOps(umfDisjointPoolOps(),
-                                                   std::move(MemProvider),
-                                                   UmfDeviceParamsHandle.get())
-                            .second));
-
-    MemProvider =
-        umf::memoryProviderMakeUnique<L0SharedMemoryProvider>(Context, device)
-            .second;
-    auto UmfSharedParamsHandle = getUmfParamsHandle(
-        DisjointPoolConfigInstance.Configs[usm::DisjointPoolMemType::Shared]);
-    SharedMemPools.emplace(
-        std::piecewise_construct, std::make_tuple(device),
-        std::make_tuple(umf::poolMakeUniqueFromOps(umfDisjointPoolOps(),
-                                                   std::move(MemProvider),
-                                                   UmfSharedParamsHandle.get())
-                            .second));
-
-    MemProvider = umf::memoryProviderMakeUnique<L0SharedReadOnlyMemoryProvider>(
-                      Context, device)
-                      .second;
-    auto UmfSharedROParamsHandle = getUmfParamsHandle(
-        DisjointPoolConfigInstance
-            .Configs[usm::DisjointPoolMemType::SharedReadOnly]);
-    SharedReadOnlyMemPools.emplace(
-        std::piecewise_construct, std::make_tuple(device),
-        std::make_tuple(umf::poolMakeUniqueFromOps(
-                            umfDisjointPoolOps(), std::move(MemProvider),
-                            UmfSharedROParamsHandle.get())
-                            .second));
+  auto [Ret, Descriptors] = usm::pool_descriptor::create(this, Context);
+  if (Ret) {
+    logger::error("urUSMPoolCreate: failed to create pool descriptors");
+    throw UsmAllocationException(Ret);
+  }
+
+  for (auto &Desc : Descriptors) {
+    umf::pool_unique_handle_t Pool = nullptr;
+    if (IsProxy) {
+      Pool = usm::makeProxyPool(MakeProvider(&Desc));
+    } else {
+      auto &PoolConfig =
+          DisjointPoolConfigs.Configs[DescToDisjointPoolMemType(Desc)];
+      Pool = usm::makeDisjointPool(MakeProvider(&Desc), PoolConfig);
+    }
+
+    Ret = PoolManager.addPool(Desc, std::move(Pool));
+    if (Ret) {
+      logger::error("urUSMPoolCreate: failed to store UMF pool");
+      throw UsmAllocationException(Ret);
+    }
   }
 }
 
+umf_memory_pool_handle_t
+ur_usm_pool_handle_t_::getPool(const usm::pool_descriptor &Desc) {
+  auto PoolOpt = PoolManager.getPool(Desc);
+  return PoolOpt.has_value() ? PoolOpt.value() : nullptr;
+}
+
+ur_result_t ur_usm_pool_handle_t_::allocate(ur_context_handle_t Context,
+                                            ur_device_handle_t Device,
+                                            const ur_usm_desc_t *USMDesc,
+                                            ur_usm_type_t Type, size_t Size,
+                                            void **RetMem) {
+  uint32_t Alignment = USMDesc ? USMDesc->align : 0;
+  // L0 supports alignment up to 64KB and silently ignores higher values.
+  // We flag alignment > 64KB as an invalid value.
+  // L0 spec says that alignment values that are not powers of 2 are invalid.
+  // If alignment == 0, then we are allowing the L0 driver to choose the
+  // alignment so no need to check.
+  if (Alignment > 0) {
+    if (Alignment > 65536 || (Alignment & (Alignment - 1)) != 0)
+      return UR_RESULT_ERROR_INVALID_VALUE;
+  }
+
+  // Handle the extension structures for 'ur_usm_desc_t'.
+  if (auto UsmHostDesc = find_stype_node<ur_usm_host_desc_t>(USMDesc)) {
+    std::ignore = UsmHostDesc; // Unused
+  }
+
+  bool DeviceReadOnly = false;
+  if (auto UsmDeviceDesc = find_stype_node<ur_usm_device_desc_t>(USMDesc)) {
+    DeviceReadOnly =
+        (Type == UR_USM_TYPE_SHARED) &&
+        (UsmDeviceDesc->flags & UR_USM_DEVICE_MEM_FLAG_DEVICE_READ_ONLY);
+  }
+
+  ur_platform_handle_t Plt =
+      (Device) ? Device->Platform : Context->getPlatform();
+  // If indirect access tracking is enabled then lock the mutex which is
+  // guarding contexts container in the platform. This prevents new kernels
+  // from being submitted in any context while we are in the process of
+  // allocating memory; this is needed to properly capture allocations by
+  // kernels with indirect access. This lock also protects access to the
+  // context's data structures. If indirect access tracking is not enabled
+  // then lock context mutex to protect access to context's data structures.
+  std::shared_lock<ur_shared_mutex> ContextLock(Context->Mutex,
+                                                std::defer_lock);
+  std::unique_lock<ur_shared_mutex> IndirectAccessTrackingLock(
+      Plt->ContextsMutex, std::defer_lock);
+  if (IndirectAccessTrackingEnabled) {
+    IndirectAccessTrackingLock.lock();
+    // We are going to defer memory release if there are kernels with indirect
+    // access, that is why explicitly retain context to be sure that it is
+    // released after all memory allocations in this context are released.
+    UR_CALL(ur::level_zero::urContextRetain(Context));
+  } else {
+    ContextLock.lock();
+  }
+
+  auto umfPool = getPool(
+      usm::pool_descriptor{this, Context, Device, Type, DeviceReadOnly});
+  if (!umfPool) {
+    return UR_RESULT_ERROR_INVALID_ARGUMENT;
+  }
+
+  *RetMem = umfPoolAlignedMalloc(umfPool, Size, Alignment);
+  if (*RetMem == nullptr) {
+    auto umfRet = umfPoolGetLastAllocationError(umfPool);
+    logger::error(
+        "ur_usm_pool_handle_t_::allocate: allocation from the UMF pool {} "
+        "failed",
+        umfPool);
+    return umf::umf2urResult(umfRet);
+  }
+
+  if (IndirectAccessTrackingEnabled) {
+    // Keep track of all memory allocations in the context
+    Context->MemAllocs.emplace(std::piecewise_construct,
+                               std::forward_as_tuple(*RetMem),
+                               std::forward_as_tuple(Context));
+  }
+
+  return UR_RESULT_SUCCESS;
+}
+
+bool ur_usm_pool_handle_t_::hasPool(const umf_memory_pool_handle_t Pool) {
+  return PoolManager.hasPool(Pool);
+}
+
 // If indirect access tracking is not enabled then this function just performs
 // zeMemFree. If indirect access tracking is enabled then reference counting is
 // performed.
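[Note on the pattern introduced above: all four USM allocation entry points now funnel into a single lookup keyed by a pool descriptor. The following self-contained C++ sketch illustrates that lookup in isolation; every type in it (PoolDescriptor, Pool, PoolManager) is a simplified, hypothetical stand-in for the real usm::pool_descriptor, UMF pool handle, and usm::pool_manager, not the actual implementation.]

    #include <cstddef>
    #include <cstdlib>
    #include <map>
    #include <memory>
    #include <tuple>

    // Hypothetical stand-in for usm::pool_descriptor: the key selecting a pool.
    struct PoolDescriptor {
      int MemType;         // host / device / shared / shared-read-only
      void *Device;        // nullptr for host allocations
      bool DeviceReadOnly;

      bool operator<(const PoolDescriptor &Other) const {
        return std::tie(MemType, Device, DeviceReadOnly) <
               std::tie(Other.MemType, Other.Device, Other.DeviceReadOnly);
      }
    };

    // Hypothetical stand-in for a UMF memory pool handle.
    struct Pool {
      void *alloc(std::size_t Size) { return std::malloc(Size); }
    };

    // Hypothetical stand-in for usm::pool_manager: one descriptor-keyed map
    // replaces the six per-type maps (DeviceMemPools, SharedMemPools, ...)
    // that the patch removes from the context.
    class PoolManager {
      std::map<PoolDescriptor, std::unique_ptr<Pool>> DescToPool;

    public:
      void addPool(const PoolDescriptor &Desc, std::unique_ptr<Pool> P) {
        DescToPool.emplace(Desc, std::move(P));
      }
      // Same contract as getPool above: the matching pool, or nullptr.
      Pool *getPool(const PoolDescriptor &Desc) {
        auto It = DescToPool.find(Desc);
        return It == DescToPool.end() ? nullptr : It->second.get();
      }
      // Linear scan, mirroring the hasPool added to ur_pool_manager.hpp.
      bool hasPool(const Pool *P) const {
        for (const auto &Entry : DescToPool)
          if (Entry.second.get() == P)
            return true;
        return false;
      }
    };

    int main() {
      PoolManager PM;
      PM.addPool({/*MemType=*/0, /*Device=*/nullptr, /*DeviceReadOnly=*/false},
                 std::make_unique<Pool>());
      // Every USM type now goes through the same lookup path.
      if (Pool *P = PM.getPool({0, nullptr, false}))
        std::free(P->alloc(64));
      return 0;
    }

[The real pool_manager returns a std::optional and is populated from usm::pool_descriptor::create(), but the shape of the lookup is the same.]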
diff --git a/unified-runtime/source/adapters/level_zero/usm.hpp b/unified-runtime/source/adapters/level_zero/usm.hpp
index 2fe74a5ecf288..3e4af8a6d32f0 100644
--- a/unified-runtime/source/adapters/level_zero/usm.hpp
+++ b/unified-runtime/source/adapters/level_zero/usm.hpp
@@ -11,28 +11,25 @@
 
 #include "common.hpp"
 
+#include "ur_pool_manager.hpp"
 #include
 
 usm::DisjointPoolAllConfigs InitializeDisjointPoolConfig();
 
 struct ur_usm_pool_handle_t_ : _ur_object {
-  bool zeroInit;
-
-  usm::DisjointPoolAllConfigs DisjointPoolConfigs =
-      InitializeDisjointPoolConfig();
+  ur_usm_pool_handle_t_(ur_context_handle_t Context,
+                        ur_usm_pool_desc_t *PoolDesc, bool IsProxy = false);
 
-  std::unordered_map<ur_device_handle_t, umf::pool_unique_handle_t>
-      DeviceMemPools;
-  std::unordered_map<ur_device_handle_t, umf::pool_unique_handle_t>
-      SharedMemPools;
-  std::unordered_map<ur_device_handle_t, umf::pool_unique_handle_t>
-      SharedReadOnlyMemPools;
-  umf::pool_unique_handle_t HostMemPool;
+  ur_result_t allocate(ur_context_handle_t Context, ur_device_handle_t Device,
+                       const ur_usm_desc_t *USMDesc, ur_usm_type_t Type,
+                       size_t Size, void **RetMem);
+  bool hasPool(const umf_memory_pool_handle_t Pool);
 
-  ur_context_handle_t Context{};
+  ur_context_handle_t Context;
 
-  ur_usm_pool_handle_t_(ur_context_handle_t Context,
-                        ur_usm_pool_desc_t *PoolDesc);
+private:
+  umf_memory_pool_handle_t getPool(const usm::pool_descriptor &Desc);
+  usm::pool_manager<usm::pool_descriptor> PoolManager;
 };
 
 // Exception type to pass allocation errors
diff --git a/unified-runtime/source/common/ur_pool_manager.hpp b/unified-runtime/source/common/ur_pool_manager.hpp
index 5d2beda18d181..45dc7e39fc0b3 100644
--- a/unified-runtime/source/common/ur_pool_manager.hpp
+++ b/unified-runtime/source/common/ur_pool_manager.hpp
@@ -289,6 +289,13 @@ template <typename D> struct pool_manager {
 
     return it->second.get();
   }
+
+  bool hasPool(const umf_memory_pool_handle_t hPool) noexcept {
+    return std::any_of(descToPoolMap.begin(), descToPoolMap.end(),
+                       [hPool](const auto &descPoolPair) {
+                         return descPoolPair.second.get() == hPool;
+                       });
+  }
 };
 
 inline umf::pool_unique_handle_t

From e260aa1610298ddb9ddc97f3276bd41fcc53fcd1 Mon Sep 17 00:00:00 2001
From: Krzysztof Swiecicki
Date: Tue, 17 Dec 2024 13:23:05 +0100
Subject: [PATCH 2/4] [UR][L0] Remove SingleRootDevice from context

This variable is never initialized anywhere, so it always keeps its
default value of nullptr.
---
 .../source/adapters/level_zero/context.cpp |  9 +--
 .../source/adapters/level_zero/context.hpp |  7 --
 .../source/adapters/level_zero/memory.cpp  | 66 ++++-------------
 3 files changed, 15 insertions(+), 67 deletions(-)

diff --git a/unified-runtime/source/adapters/level_zero/context.cpp b/unified-runtime/source/adapters/level_zero/context.cpp
index 5d7354f576913..01320a0bb8acf 100644
--- a/unified-runtime/source/adapters/level_zero/context.cpp
+++ b/unified-runtime/source/adapters/level_zero/context.cpp
@@ -192,13 +192,6 @@ ur_result_t urContextSetExtendedDeleter(
 } // namespace ur::level_zero
 
 ur_result_t ur_context_handle_t_::initialize() {
-
-  // We may allocate memory to this root device so create allocators.
-  if (SingleRootDevice &&
-      DeviceMemPools.find(SingleRootDevice->ZeDevice) == DeviceMemPools.end()) {
-    createUSMAllocators(SingleRootDevice);
-  }
-
   // Create the immediate command list to be used for initializations.
   // Created as synchronous so level-zero performs implicit synchronization and
   // there is no need to query for completion in the plugin
@@ -209,7 +202,7 @@ ur_result_t ur_context_handle_t_::initialize() {
   // D2D migration, if no P2P, is broken since it should use
   // immediate command-list for the specific devices, and this single one.
   //
-  ur_device_handle_t Device = SingleRootDevice ? SingleRootDevice : Devices[0];
+  ur_device_handle_t Device = Devices[0];
 
   // Prefer to use copy engine for initialization copies,
   // if available and allowed (main copy engine with index 0).
diff --git a/unified-runtime/source/adapters/level_zero/context.hpp b/unified-runtime/source/adapters/level_zero/context.hpp
index 15e8bd62eb901..dc6a56e2cb10f 100644
--- a/unified-runtime/source/adapters/level_zero/context.hpp
+++ b/unified-runtime/source/adapters/level_zero/context.hpp
@@ -98,13 +98,6 @@ struct ur_context_handle_t_ : _ur_object {
   // compute and copy command list caches.
   ur_mutex ZeCommandListCacheMutex;
 
-  // If context contains one device or sub-devices of the same device, we want
-  // to save this device.
-  // This field is only set at ur_context_handle_t creation time, and cannot
-  // change. Therefore it can be accessed without holding a lock on this
-  // ur_context_handle_t.
-  ur_device_handle_t SingleRootDevice = nullptr;
-
   // Cache of all currently available/completed command/copy lists.
   // Note that command-list can only be re-used on the same device.
   //
diff --git a/unified-runtime/source/adapters/level_zero/memory.cpp b/unified-runtime/source/adapters/level_zero/memory.cpp
index 4a5cb787dcd11..925dd053ccaaa 100644
--- a/unified-runtime/source/adapters/level_zero/memory.cpp
+++ b/unified-runtime/source/adapters/level_zero/memory.cpp
@@ -1515,9 +1515,7 @@ ur_result_t urMemImageCreate(
   // own the image.
   // TODO: Implement explicit copying for accessing the image from other
   // devices in the context.
-  ur_device_handle_t Device = Context->SingleRootDevice
-                                  ? Context->SingleRootDevice
-                                  : Context->Devices[0];
+  ur_device_handle_t Device = Context->Devices[0];
   ze_image_handle_t ZeImage;
   ZE2UR_CALL(zeImageCreate,
             (Context->ZeContext, Device->ZeDevice, &ZeImageDesc, &ZeImage));
@@ -2079,58 +2077,22 @@ ur_result_t _ur_buffer::getBufferZeHandle(char *&ZeHandle,
     LastDeviceWithValidAllocation = Device;
     return UR_RESULT_SUCCESS;
   }
-  // Reads user setting on how to deal with buffers in contexts where
-  // all devices have the same root-device. Returns "true" if the
-  // preference is to have allocate on each [sub-]device and migrate
-  // normally (copy) to other sub-devices as needed. Returns "false"
-  // if the preference is to have single root-device allocations
-  // serve the needs of all [sub-]devices, meaning potentially more
-  // cross-tile traffic.
-  //
-  static const bool SingleRootDeviceBufferMigration = [] {
-    const char *UrRet =
-        std::getenv("UR_L0_SINGLE_ROOT_DEVICE_BUFFER_MIGRATION");
-    const char *PiRet =
-        std::getenv("SYCL_PI_LEVEL_ZERO_SINGLE_ROOT_DEVICE_BUFFER_MIGRATION");
-    const char *EnvStr = UrRet ? UrRet : (PiRet ? PiRet : nullptr);
-    if (EnvStr)
-      return (std::stoi(EnvStr) != 0);
-    // The default is to migrate normally, which may not always be the
-    // best option (depends on buffer access patterns), but is an
-    // overall win on the set of the available benchmarks.
-    return true;
-  }();
 
   // Perform actual device allocation as needed.
   if (!Allocation.ZeHandle) {
-    if (!SingleRootDeviceBufferMigration && UrContext->SingleRootDevice &&
-        UrContext->SingleRootDevice != Device) {
-      // If all devices in the context are sub-devices of the same device
-      // then we reuse root-device allocation by all sub-devices in the
-      // context.
-      // TODO: we can probably generalize this and share root-device
-      // allocations by its own sub-devices even if not all other
-      // devices in the context have the same root.
-      UR_CALL(getZeHandle(ZeHandle, AccessMode, UrContext->SingleRootDevice,
-                          phWaitEvents, numWaitEvents));
-      Allocation.ReleaseAction = allocation_t::keep;
-      Allocation.ZeHandle = ZeHandle;
-      Allocation.Valid = true;
-      return UR_RESULT_SUCCESS;
-    } else { // Create device allocation
-      if (DisjointPoolConfigInstance.EnableBuffers) {
-        Allocation.ReleaseAction = allocation_t::free;
-        ur_usm_desc_t USMDesc{};
-        USMDesc.align = getAlignment();
-        ur_usm_pool_handle_t Pool{};
-        UR_CALL(ur::level_zero::urUSMDeviceAlloc(
-            UrContext, Device, &USMDesc, Pool, Size,
-            reinterpret_cast<void **>(&ZeHandle)));
-      } else {
-        Allocation.ReleaseAction = allocation_t::free_native;
-        UR_CALL(ZeDeviceMemAllocHelper(reinterpret_cast<void **>(&ZeHandle),
-                                       UrContext, Device, Size));
-      }
+    // Create device allocation
+    if (DisjointPoolConfigInstance.EnableBuffers) {
+      Allocation.ReleaseAction = allocation_t::free;
+      ur_usm_desc_t USMDesc{};
+      USMDesc.align = getAlignment();
+      ur_usm_pool_handle_t Pool{};
+      UR_CALL(ur::level_zero::urUSMDeviceAlloc(
+          UrContext, Device, &USMDesc, Pool, Size,
+          reinterpret_cast<void **>(&ZeHandle)));
+    } else {
+      Allocation.ReleaseAction = allocation_t::free_native;
+      UR_CALL(ZeDeviceMemAllocHelper(reinterpret_cast<void **>(&ZeHandle),
+                                     UrContext, Device, Size));
     }
     Allocation.ZeHandle = ZeHandle;
   } else {

From d0caea2802a032f06c5e90aff4a46fd302128c91 Mon Sep 17 00:00:00 2001
From: Krzysztof Swiecicki
Date: Tue, 7 Jan 2025 14:41:35 +0100
Subject: [PATCH 3/4] [UR][L0] Fix provider native error reporting

The Level Zero provider internally stores native errors of ur_result_t
type.
---
 unified-runtime/source/adapters/level_zero/usm.cpp | 3 ++-
 unified-runtime/source/adapters/level_zero/usm.hpp | 2 +-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/unified-runtime/source/adapters/level_zero/usm.cpp b/unified-runtime/source/adapters/level_zero/usm.cpp
index ace619a12426e..e4de92af67c93 100644
--- a/unified-runtime/source/adapters/level_zero/usm.cpp
+++ b/unified-runtime/source/adapters/level_zero/usm.cpp
@@ -27,7 +27,8 @@ namespace umf {
 ur_result_t getProviderNativeError(const char *providerName,
                                    int32_t nativeError) {
   if (strcmp(providerName, "Level Zero") == 0) {
-    return ze2urResult(static_cast<ze_result_t>(nativeError));
+    // L0 provider stores native errors of ur_result_t type
+    return static_cast<ur_result_t>(nativeError);
   }
 
   return UR_RESULT_ERROR_UNKNOWN;
diff --git a/unified-runtime/source/adapters/level_zero/usm.hpp b/unified-runtime/source/adapters/level_zero/usm.hpp
index 3e4af8a6d32f0..f812d441f1802 100644
--- a/unified-runtime/source/adapters/level_zero/usm.hpp
+++ b/unified-runtime/source/adapters/level_zero/usm.hpp
@@ -122,7 +122,7 @@ class L0MemoryProvider : public USMMemoryProviderBase {
   umf_result_t free(void *Ptr, size_t Size) override;
   umf_result_t get_min_page_size(void *, size_t *) override;
   // TODO: Different name for each provider (Host/Shared/SharedRO/Device)
-  const char *get_name() override { return "L0"; };
+  const char *get_name() override { return "Level Zero"; };
   umf_result_t get_ipc_handle_size(size_t *) override;
   umf_result_t get_ipc_handle(const void *, size_t, void *) override;
   umf_result_t put_ipc_handle(void *) override;

From 610cb0e1d9192a9fe319a9a4afa19f4411fa8723 Mon Sep 17 00:00:00 2001
From: Krzysztof Swiecicki
Date: Thu, 27 Feb 2025 16:05:59 +0000
Subject: [PATCH 4/4] [SYCL][E2E] Make 8 free checks accounting for page query
 optional

When the L0 adapter internal pooling is turned off, e.g.
with SYCL_PI_LEVEL_ZERO_DISABLE_USM_ALLOCATOR=1, the adapter no longer
creates a UMF pool that queries for the page size during its creation.
The L0 buffer ownership test used to account for the allocations made
by that query, which is no longer the case. The query still affects the
v2 L0 adapter.
---
 .../Adapters/interop-level-zero-buffer-ownership.cpp | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/sycl/test-e2e/Adapters/interop-level-zero-buffer-ownership.cpp b/sycl/test-e2e/Adapters/interop-level-zero-buffer-ownership.cpp
index c1e72929ad489..f5055fb4a4455 100644
--- a/sycl/test-e2e/Adapters/interop-level-zero-buffer-ownership.cpp
+++ b/sycl/test-e2e/Adapters/interop-level-zero-buffer-ownership.cpp
@@ -18,8 +18,16 @@
 
 // Keep ownership
 // CHECK: zeMemFree
-// Account for zeMemFree used to query page sizes by the UMF
-// CHECK-COUNT-8: zeMemFree
+// Account for zeMemFree used to query page sizes by the UMF (only affects
+// the v2 L0 adapter)
+// CHECK-OPT: zeMemFree
+// CHECK-OPT: zeMemFree
+// CHECK-OPT: zeMemFree
+// CHECK-OPT: zeMemFree
+// CHECK-OPT: zeMemFree
+// CHECK-OPT: zeMemFree
+// CHECK-OPT: zeMemFree
+// CHECK-OPT: zeMemFree
 
 // Transfer ownership
 // CHECK: zeMemFree
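[Background on the FileCheck mechanics this last patch relies on: a directive with a custom prefix such as CHECK-OPT: is only matched when that prefix is enabled via --check-prefixes, so under a plain "FileCheck %s" invocation the CHECK-OPT lines are inert. Because ordinary CHECK: lines tolerate arbitrary intervening output, the eight extra zeMemFree calls then become optional. The RUN lines below are an illustrative sketch only; the test's real RUN lines sit outside the quoted hunk, and the second invocation here is hypothetical.]

    // Default: only CHECK directives are active, so the eight page-size-query
    // frees may or may not appear between "Keep ownership" and
    // "Transfer ownership".
    // RUN: %{run} %t.out 2>&1 | FileCheck %s
    //
    // Hypothetical strict variant for a configuration (e.g. the v2 L0 adapter)
    // where the page-size query is known to happen: enable both prefixes.
    // RUN: %{run} %t.out 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-OPT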