From da5500c6b267d1492f84e9552946d55817045076 Mon Sep 17 00:00:00 2001 From: Uwe Dolinsky <uwe@codeplay.com> Date: Mon, 26 Aug 2024 20:25:02 +0100 Subject: [PATCH 01/48] [SYCLNATIVECPU] inline native_cpu adapter functions --- source/adapters/native_cpu/enqueue.cpp | 4 ++-- source/adapters/native_cpu/kernel.hpp | 12 ++++++------ source/adapters/native_cpu/nativecpu_state.hpp | 6 +++--- 3 files changed, 11 insertions(+), 11 deletions(-) diff --git a/source/adapters/native_cpu/enqueue.cpp b/source/adapters/native_cpu/enqueue.cpp index b5d4713e2f..ec9ba99389 100644 --- a/source/adapters/native_cpu/enqueue.cpp +++ b/source/adapters/native_cpu/enqueue.cpp @@ -25,7 +25,7 @@ struct NDRDescT { RangeT GlobalOffset; RangeT GlobalSize; RangeT LocalSize; - NDRDescT(uint32_t WorkDim, const size_t *GlobalWorkOffset, + inline NDRDescT(uint32_t WorkDim, const size_t *GlobalWorkOffset, const size_t *GlobalWorkSize, const size_t *LocalWorkSize) : WorkDim(WorkDim) { for (uint32_t I = 0; I < WorkDim; I++) { @@ -52,7 +52,7 @@ struct NDRDescT { } // namespace native_cpu #ifdef NATIVECPU_USE_OCK -static native_cpu::state getResizedState(const native_cpu::NDRDescT &ndr, +static inline native_cpu::state getResizedState(const native_cpu::NDRDescT &ndr, size_t itemsPerThread) { native_cpu::state resized_state( ndr.GlobalSize[0], ndr.GlobalSize[1], ndr.GlobalSize[2], itemsPerThread, diff --git a/source/adapters/native_cpu/kernel.hpp b/source/adapters/native_cpu/kernel.hpp index b5728fa8b2..c71e4f7e75 100644 --- a/source/adapters/native_cpu/kernel.hpp +++ b/source/adapters/native_cpu/kernel.hpp @@ -20,7 +20,7 @@ namespace native_cpu { struct NativeCPUArgDesc { void *MPtr; - NativeCPUArgDesc(void *Ptr) : MPtr(Ptr){}; + inline NativeCPUArgDesc(void *Ptr) : MPtr(Ptr){}; }; } // namespace native_cpu @@ -33,18 +33,18 @@ using nativecpu_task_t = std::function<nativecpu_kernel_t>; struct local_arg_info_t { uint32_t argIndex; size_t argSize; - local_arg_info_t(uint32_t argIndex, size_t argSize) + 
inline local_arg_info_t(uint32_t argIndex, size_t argSize) : argIndex(argIndex), argSize(argSize) {} }; struct ur_kernel_handle_t_ : RefCounted { - ur_kernel_handle_t_(ur_program_handle_t hProgram, const char *name, + inline ur_kernel_handle_t_(ur_program_handle_t hProgram, const char *name, nativecpu_task_t subhandler) : hProgram(hProgram), _name{name}, _subhandler{std::move(subhandler)}, HasReqdWGSize(false) {} - ur_kernel_handle_t_(const ur_kernel_handle_t_ &other) + inline ur_kernel_handle_t_(const ur_kernel_handle_t_ &other) : hProgram(other.hProgram), _name(other._name), _subhandler(other._subhandler), _args(other._args), _localArgInfo(other._localArgInfo), _localMemPool(other._localMemPool), @@ -53,12 +53,12 @@ struct ur_kernel_handle_t_ : RefCounted { incrementReferenceCount(); } - ~ur_kernel_handle_t_() { + inline ~ur_kernel_handle_t_() { if (decrementReferenceCount() == 0) { free(_localMemPool); } } - ur_kernel_handle_t_(ur_program_handle_t hProgram, const char *name, + inline ur_kernel_handle_t_(ur_program_handle_t hProgram, const char *name, nativecpu_task_t subhandler, const native_cpu::ReqdWGSize_t &ReqdWGSize) : hProgram(hProgram), _name{name}, _subhandler{std::move(subhandler)}, diff --git a/source/adapters/native_cpu/nativecpu_state.hpp b/source/adapters/native_cpu/nativecpu_state.hpp index bb798b22e6..b3a14d4137 100755 --- a/source/adapters/native_cpu/nativecpu_state.hpp +++ b/source/adapters/native_cpu/nativecpu_state.hpp @@ -20,7 +20,7 @@ struct state { size_t MNumGroups[3]; size_t MGlobalOffset[3]; uint32_t NumSubGroups, SubGroup_id, SubGroup_local_id, SubGroup_size; - state(size_t globalR0, size_t globalR1, size_t globalR2, size_t localR0, + inline state(size_t globalR0, size_t globalR1, size_t globalR2, size_t localR0, size_t localR1, size_t localR2, size_t globalO0, size_t globalO1, size_t globalO2) : MGlobal_range{globalR0, globalR1, globalR2}, MWorkGroup_size{localR0, @@ -43,7 +43,7 @@ struct state { SubGroup_size = 1; } - void 
update(size_t group0, size_t group1, size_t group2, size_t local0, + inline void update(size_t group0, size_t group1, size_t group2, size_t local0, size_t local1, size_t local2) { MWorkGroup_id[0] = group0; MWorkGroup_id[1] = group1; @@ -59,7 +59,7 @@ struct state { MWorkGroup_size[2] * MWorkGroup_id[2] + MLocal_id[2] + MGlobalOffset[2]; } - void update(size_t group0, size_t group1, size_t group2) { + inline void update(size_t group0, size_t group1, size_t group2) { MWorkGroup_id[0] = group0; MWorkGroup_id[1] = group1; MWorkGroup_id[2] = group2; From 0b8b0f7d9debb791ef18c3ae2e8853345c1c46b5 Mon Sep 17 00:00:00 2001 From: Uwe Dolinsky <uwe@codeplay.com> Date: Wed, 28 Aug 2024 10:55:46 +0100 Subject: [PATCH 02/48] [NATIVECPU] use size_t, reserve vector size --- source/adapters/native_cpu/enqueue.cpp | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/source/adapters/native_cpu/enqueue.cpp b/source/adapters/native_cpu/enqueue.cpp index ec9ba99389..c878d289e4 100644 --- a/source/adapters/native_cpu/enqueue.cpp +++ b/source/adapters/native_cpu/enqueue.cpp @@ -158,7 +158,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( } // Peel the remaining work items. Since the local size is 1, we iterate // over the work groups. 
- for (unsigned g0 = new_num_work_groups_0 * itemsPerThread; g0 < numWG0; + for (size_t g0 = new_num_work_groups_0 * itemsPerThread; g0 < numWG0; g0++) { state.update(g0, g1, g2); hKernel->_subhandler(hKernel->_args.data(), &state); @@ -188,6 +188,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( // Split dimension 0 across the threadpool // Here we try to create groups of workgroups in order to reduce // synchronization overhead + groups.reserve(numWG2 * numWG1 * numWG0); for (unsigned g2 = 0; g2 < numWG2; g2++) { for (unsigned g1 = 0; g1 < numWG1; g1++) { for (unsigned g0 = 0; g0 < numWG0; g0++) { @@ -204,6 +205,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( auto numGroups = groups.size(); auto groupsPerThread = numGroups / numParallelThreads; auto remainder = numGroups % numParallelThreads; + futures.reserve(numParallelThreads + remainder); for (unsigned thread = 0; thread < numParallelThreads; thread++) { futures.emplace_back(tp.schedule_task( [&groups, thread, groupsPerThread, hKernel](size_t threadId) { @@ -400,7 +402,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferFill( // TODO: error checking // TODO: handle async void *startingPtr = hBuffer->_mem + offset; - unsigned steps = size / patternSize; + size_t steps = size / patternSize; for (unsigned i = 0; i < steps; i++) { memcpy(static_cast<int8_t *>(startingPtr) + i * patternSize, pPattern, patternSize); @@ -546,7 +548,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMFill( break; } default: { - for (unsigned int step{0}; step < size; step += patternSize) { + for (size_t step{0}; step < size; step += patternSize) { auto *dest = reinterpret_cast<void *>(reinterpret_cast<uint8_t *>(ptr) + step); memcpy(dest, pPattern, patternSize); From 88db20afc48d24e5ce52dabe99fd8d77998f967d Mon Sep 17 00:00:00 2001 From: Uwe Dolinsky <uwe@codeplay.com> Date: Thu, 17 Oct 2024 16:56:06 +0100 Subject: [PATCH 03/48] [NATIVECPU] use strcpy_s --- source/adapters/native_cpu/common.cpp | 
2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/source/adapters/native_cpu/common.cpp b/source/adapters/native_cpu/common.cpp index b956fc8c7a..47afed3729 100644 --- a/source/adapters/native_cpu/common.cpp +++ b/source/adapters/native_cpu/common.cpp @@ -19,7 +19,7 @@ thread_local char ErrorMessage[MaxMessageSize]; [[maybe_unused]] void setErrorMessage(const char *pMessage, ur_result_t ErrorCode) { assert(strlen(pMessage) <= MaxMessageSize); - strcpy(ErrorMessage, pMessage); + strcpy_s(ErrorMessage, MaxMessageSize, pMessage); ErrorMessageCode = ErrorCode; } From 0b89dee701105aaa1e5eca5077c7b5b11babbccd Mon Sep 17 00:00:00 2001 From: Uwe Dolinsky <uwe@codeplay.com> Date: Mon, 21 Oct 2024 17:02:10 +0100 Subject: [PATCH 04/48] [NATIVECPU] only enqueue when groupsPerThread is >0 --- source/adapters/native_cpu/enqueue.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) mode change 100644 => 100755 source/adapters/native_cpu/enqueue.cpp diff --git a/source/adapters/native_cpu/enqueue.cpp b/source/adapters/native_cpu/enqueue.cpp old mode 100644 new mode 100755 index b36b550647..da49a4d518 --- a/source/adapters/native_cpu/enqueue.cpp +++ b/source/adapters/native_cpu/enqueue.cpp @@ -217,7 +217,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( auto groupsPerThread = numGroups / numParallelThreads; auto remainder = numGroups % numParallelThreads; futures.reserve(numParallelThreads + remainder); - for (unsigned thread = 0; thread < numParallelThreads; thread++) { + for (unsigned thread = 0; groupsPerThread && thread < numParallelThreads; + thread++) { futures.emplace_back(tp.schedule_task( [&groups, thread, groupsPerThread, hKernel](size_t threadId) { for (unsigned i = 0; i < groupsPerThread; i++) { From d11133734455f548cdb9c630849261b37f4a0872 Mon Sep 17 00:00:00 2001 From: Uwe Dolinsky <uwe@codeplay.com> Date: Mon, 21 Oct 2024 17:08:10 +0100 Subject: [PATCH 05/48] [NATIVECPU] changed filemode back --- source/adapters/native_cpu/enqueue.cpp
| 0 1 file changed, 0 insertions(+), 0 deletions(-) mode change 100755 => 100644 source/adapters/native_cpu/enqueue.cpp diff --git a/source/adapters/native_cpu/enqueue.cpp b/source/adapters/native_cpu/enqueue.cpp old mode 100755 new mode 100644 From a1166d803f33d948bad9367dd06a0dda56e0a370 Mon Sep 17 00:00:00 2001 From: Uwe Dolinsky <uwe@codeplay.com> Date: Tue, 22 Oct 2024 15:26:58 +0100 Subject: [PATCH 06/48] [NATIVECPU] added threadpool file to CMakeList --- source/adapters/native_cpu/CMakeLists.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/source/adapters/native_cpu/CMakeLists.txt b/source/adapters/native_cpu/CMakeLists.txt index 56cfc577d8..560172444b 100644 --- a/source/adapters/native_cpu/CMakeLists.txt +++ b/source/adapters/native_cpu/CMakeLists.txt @@ -34,6 +34,7 @@ add_ur_adapter(${TARGET_NAME} ${CMAKE_CURRENT_SOURCE_DIR}/queue.cpp ${CMAKE_CURRENT_SOURCE_DIR}/queue.hpp ${CMAKE_CURRENT_SOURCE_DIR}/sampler.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/threadpool.hpp ${CMAKE_CURRENT_SOURCE_DIR}/ur_interface_loader.cpp ${CMAKE_CURRENT_SOURCE_DIR}/usm_p2p.cpp ${CMAKE_CURRENT_SOURCE_DIR}/virtual_mem.cpp From 47b12a41d968297b56ca6bf962b2c8d5c1888987 Mon Sep 17 00:00:00 2001 From: Uwe Dolinsky <uwe@codeplay.com> Date: Tue, 22 Oct 2024 15:43:32 +0100 Subject: [PATCH 07/48] [SYCLNATIVECPU] threadID now managed by kernel enqueue --- source/adapters/native_cpu/enqueue.cpp | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) mode change 100644 => 100755 source/adapters/native_cpu/enqueue.cpp diff --git a/source/adapters/native_cpu/enqueue.cpp b/source/adapters/native_cpu/enqueue.cpp old mode 100644 new mode 100755 index da49a4d518..8b1ca3445d --- a/source/adapters/native_cpu/enqueue.cpp +++ b/source/adapters/native_cpu/enqueue.cpp @@ -26,7 +26,7 @@ struct NDRDescT { RangeT GlobalSize; RangeT LocalSize; inline NDRDescT(uint32_t WorkDim, const size_t *GlobalWorkOffset, - const size_t *GlobalWorkSize, const size_t *LocalWorkSize) + const size_t 
*GlobalWorkSize, const size_t *LocalWorkSize) : WorkDim(WorkDim) { for (uint32_t I = 0; I < WorkDim; I++) { GlobalOffset[I] = GlobalWorkOffset[I]; @@ -53,7 +53,7 @@ struct NDRDescT { #ifdef NATIVECPU_USE_OCK static inline native_cpu::state getResizedState(const native_cpu::NDRDescT &ndr, - size_t itemsPerThread) { + size_t itemsPerThread) { native_cpu::state resized_state( ndr.GlobalSize[0], ndr.GlobalSize[1], ndr.GlobalSize[2], itemsPerThread, ndr.LocalSize[1], ndr.LocalSize[2], ndr.GlobalOffset[0], @@ -179,20 +179,22 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( } else { // We are running a parallel_for over an nd_range - + size_t threadId = 0; if (numWG1 * numWG2 >= numParallelThreads) { // Dimensions 1 and 2 have enough work, split them across the threadpool for (unsigned g2 = 0; g2 < numWG2; g2++) { for (unsigned g1 = 0; g1 < numWG1; g1++) { - futures.emplace_back( - tp.schedule_task([state, kernel = *hKernel, numWG0, g1, g2, - numParallelThreads](size_t threadId) mutable { + futures.emplace_back(tp.schedule_task( + [state, kernel = *hKernel, numWG0, g1, g2, numParallelThreads, + threadId](size_t /*threadId*/) mutable { for (unsigned g0 = 0; g0 < numWG0; g0++) { kernel.handleLocalArgs(numParallelThreads, threadId); state.update(g0, g1, g2); kernel._subhandler(kernel._args.data(), &state); } })); + if (++threadId == numParallelThreads) + threadId = 0; } } } else { @@ -220,20 +222,21 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( for (unsigned thread = 0; groupsPerThread && thread < numParallelThreads; thread++) { futures.emplace_back(tp.schedule_task( - [&groups, thread, groupsPerThread, hKernel](size_t threadId) { + [&groups, thread, groupsPerThread, hKernel](size_t /*threadId*/) { for (unsigned i = 0; i < groupsPerThread; i++) { auto index = thread * groupsPerThread + i; - groups[index](threadId, *hKernel); + groups[index](thread /*Id*/, *hKernel); } })); } // schedule the remaining tasks if (remainder) { + const size_t threadId 
= futures.size(); futures.emplace_back( - tp.schedule_task([&groups, remainder, + tp.schedule_task([&groups, remainder, threadId, scheduled = numParallelThreads * groupsPerThread, - hKernel](size_t threadId) { + hKernel](size_t /* threadId*/) { for (unsigned i = 0; i < remainder; i++) { auto index = scheduled + i; groups[index](threadId, *hKernel); From 4b04ce6763765d7eb086a5f33a838f38a63a30e5 Mon Sep 17 00:00:00 2001 From: Uwe Dolinsky <uwe@codeplay.com> Date: Tue, 22 Oct 2024 15:44:32 +0100 Subject: [PATCH 08/48] [SYCLNATIVECPU] file mode changed back --- source/adapters/native_cpu/enqueue.cpp | 0 1 file changed, 0 insertions(+), 0 deletions(-) mode change 100755 => 100644 source/adapters/native_cpu/enqueue.cpp diff --git a/source/adapters/native_cpu/enqueue.cpp b/source/adapters/native_cpu/enqueue.cpp old mode 100755 new mode 100644 From 5406b39f26c6b0d9523303f0bc83845edc24a229 Mon Sep 17 00:00:00 2001 From: Uwe Dolinsky <uwe@codeplay.com> Date: Tue, 22 Oct 2024 17:32:26 +0100 Subject: [PATCH 09/48] [NATIVECPU] Simple TBB backend --- source/adapters/native_cpu/CMakeLists.txt | 41 ++++++++++++++++ source/adapters/native_cpu/device.hpp | 4 ++ source/adapters/native_cpu/enqueue.cpp | 58 +++++++++++------------ source/adapters/native_cpu/threadpool.hpp | 49 ++++++++++++++++++- 4 files changed, 120 insertions(+), 32 deletions(-) mode change 100644 => 100755 source/adapters/native_cpu/enqueue.cpp mode change 100644 => 100755 source/adapters/native_cpu/threadpool.hpp diff --git a/source/adapters/native_cpu/CMakeLists.txt b/source/adapters/native_cpu/CMakeLists.txt index 56cfc577d8..d0f332a71a 100644 --- a/source/adapters/native_cpu/CMakeLists.txt +++ b/source/adapters/native_cpu/CMakeLists.txt @@ -48,6 +48,37 @@ set_target_properties(${TARGET_NAME} PROPERTIES SOVERSION "${PROJECT_VERSION_MAJOR}" ) +option(NATIVECPU_WITH_TBB "Use TBB as backend for Native CPU" ON) +if(NATIVECPU_WITH_TBB) + message(STATUS "Building Native CPU adapter with TBB backend.") + + 
include(FetchContent) + FetchContent_Declare( + tbb + GIT_REPOSITORY https://github.com/oneapi-src/oneTBB.git + GIT_TAG 42b833fe806606d05a5cad064b8b87365818d716 + CMAKE_ARGS "-DTBB_TEST:BOOL=OFF -DTBB_EXAMPLES:BOOL=OFF -DTBB_BENCH:BOOL=OFF" + GIT_SHALLOW ON + OVERRIDE_FIND_PACKAGE + ) + set(TBB_TEST OFF CACHE INTERNAL "" FORCE) + set(TBB_EXAMPLES OFF CACHE INTERNAL "" FORCE) + set(TBB_BENCH OFF CACHE INTERNAL "" FORCE) + set(TBB_BUILD ON CACHE INTERNAL "" FORCE) + set(TBB_FIND_PACKAGE OFF CACHE INTERNAL "" FORCE) + set(TBB_FUZZ_TESTING OFF CACHE INTERNAL "" FORCE) + set(TBB_INSTALL ON CACHE INTERNAL "" FORCE) + FetchContent_MakeAvailable(tbb) + + FetchContent_GetProperties(tbb) + + if(NOT tbb_POPULATED) + FetchContent_Populate(tbb) + endif() + set(TBB_SOURCE_DIR_INTERNAL ${tbb_SOURCE_DIR}/include) + set(TBB_BINARY_DIR_INTERNAL ${tbb_BINARY_DIR}) +endif() + find_package(Threads REQUIRED) target_link_libraries(${TARGET_NAME} PRIVATE @@ -60,3 +91,13 @@ target_link_libraries(${TARGET_NAME} PRIVATE target_include_directories(${TARGET_NAME} PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}/../../" ) + +if(NATIVECPU_WITH_TBB) + target_link_libraries(${TARGET_NAME} PRIVATE + TBB::tbb + ) + target_include_directories(${TARGET_NAME} PRIVATE + "${TBB_SOURCE_DIR_INTERNAL}" + ) + target_compile_definitions(${TARGET_NAME} PRIVATE NATIVECPU_USE_TBB) +endif() diff --git a/source/adapters/native_cpu/device.hpp b/source/adapters/native_cpu/device.hpp index 2308c1a7f4..1a6b0d091a 100644 --- a/source/adapters/native_cpu/device.hpp +++ b/source/adapters/native_cpu/device.hpp @@ -14,7 +14,11 @@ #include <ur/ur.hpp> struct ur_device_handle_t_ { +#ifdef NATIVECPU_USE_TBB + native_cpu::TBB_threadpool tp; +#else native_cpu::threadpool_t tp; +#endif ur_device_handle_t_(ur_platform_handle_t ArgPlt); const uint64_t mem_size; diff --git a/source/adapters/native_cpu/enqueue.cpp b/source/adapters/native_cpu/enqueue.cpp old mode 100644 new mode 100755 index 33d8c35c36..1849d24c2c --- 
a/source/adapters/native_cpu/enqueue.cpp +++ b/source/adapters/native_cpu/enqueue.cpp @@ -109,7 +109,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( auto &tp = hQueue->device->tp; const size_t numParallelThreads = tp.num_threads(); hKernel->updateMemPool(numParallelThreads); - std::vector<std::future<void>> futures; + auto Tasks = native_cpu::getScheduler(tp); std::vector<std::function<void(size_t, ur_kernel_handle_t_)>> groups; auto numWG0 = ndr.GlobalSize[0] / ndr.LocalSize[0]; auto numWG1 = ndr.GlobalSize[1] / ndr.LocalSize[1]; @@ -158,14 +158,13 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( for (unsigned g2 = 0; g2 < numWG2; g2++) { for (unsigned g1 = 0; g1 < numWG1; g1++) { for (unsigned g0 = 0; g0 < new_num_work_groups_0; g0 += 1) { - futures.emplace_back( - tp.schedule_task([&ndr = std::as_const(ndr), itemsPerThread, - hKernel, g0, g1, g2](size_t) { - native_cpu::state resized_state = - getResizedState(ndr, itemsPerThread); - resized_state.update(g0, g1, g2); - hKernel->_subhandler(hKernel->_args.data(), &resized_state); - })); + Tasks.schedule([&ndr = std::as_const(ndr), itemsPerThread, hKernel, + g0, g1, g2](size_t) { + native_cpu::state resized_state = + getResizedState(ndr, itemsPerThread); + resized_state.update(g0, g1, g2); + hKernel->_subhandler(hKernel->_args.data(), &resized_state); + }); } // Peel the remaining work items. Since the local size is 1, we iterate // over the work groups. 
@@ -184,15 +183,14 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( // Dimensions 1 and 2 have enough work, split them across the threadpool for (unsigned g2 = 0; g2 < numWG2; g2++) { for (unsigned g1 = 0; g1 < numWG1; g1++) { - futures.emplace_back( - tp.schedule_task([state, kernel = *hKernel, numWG0, g1, g2, - numParallelThreads](size_t threadId) mutable { - for (unsigned g0 = 0; g0 < numWG0; g0++) { - kernel.handleLocalArgs(numParallelThreads, threadId); - state.update(g0, g1, g2); - kernel._subhandler(kernel._args.data(), &state); - } - })); + Tasks.schedule([state, kernel = *hKernel, numWG0, g1, g2, + numParallelThreads](size_t threadId) mutable { + for (unsigned g0 = 0; g0 < numWG0; g0++) { + kernel.handleLocalArgs(numParallelThreads, threadId); + state.update(g0, g1, g2); + kernel._subhandler(kernel._args.data(), &state); + } + }); } } } else { @@ -216,32 +214,30 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( auto groupsPerThread = numGroups / numParallelThreads; auto remainder = numGroups % numParallelThreads; for (unsigned thread = 0; thread < numParallelThreads; thread++) { - futures.emplace_back(tp.schedule_task( + Tasks.schedule( [&groups, thread, groupsPerThread, hKernel](size_t threadId) { for (unsigned i = 0; i < groupsPerThread; i++) { auto index = thread * groupsPerThread + i; groups[index](threadId, *hKernel); } - })); + }); } // schedule the remaining tasks if (remainder) { - futures.emplace_back( - tp.schedule_task([&groups, remainder, - scheduled = numParallelThreads * groupsPerThread, - hKernel](size_t threadId) { - for (unsigned i = 0; i < remainder; i++) { - auto index = scheduled + i; - groups[index](threadId, *hKernel); - } - })); + Tasks.schedule([&groups, remainder, + scheduled = numParallelThreads * groupsPerThread, + hKernel](size_t threadId) { + for (unsigned i = 0; i < remainder; i++) { + auto index = scheduled + i; + groups[index](threadId, *hKernel); + } + }); } } } - for (auto &f : futures) - f.get(); 
+ Tasks.wait(); #endif // NATIVECPU_USE_OCK // TODO: we should avoid calling clear here by avoiding using push_back // in setKernelArgs. diff --git a/source/adapters/native_cpu/threadpool.hpp b/source/adapters/native_cpu/threadpool.hpp old mode 100644 new mode 100755 index 2f2f79cd5a..b402609781 --- a/source/adapters/native_cpu/threadpool.hpp +++ b/source/adapters/native_cpu/threadpool.hpp @@ -209,6 +209,53 @@ template <typename ThreadPoolT> class threadpool_interface { } }; -using threadpool_t = threadpool_interface<detail::simple_thread_pool>; +template <class TP> class Scheduler { + std::vector<std::future<void>> futures; + TP &TPref; + +public: + Scheduler(TP &ref) : TPref(ref) {} + + inline void schedule(worker_task_t &&task) { + futures.emplace_back(TPref.schedule_task(std::move(task))); + } + inline void wait() { + for (auto &f : futures) + f.get(); + } +}; + +using simple_threadpool_t = threadpool_interface<detail::simple_thread_pool>; +inline Scheduler<simple_threadpool_t> getScheduler(simple_threadpool_t &tp) { + return Scheduler(tp); +} + +using threadpool_t = simple_threadpool_t; + +} // namespace native_cpu + +#ifdef NATIVECPU_USE_TBB +// Simple TBB backend +#include "oneapi/tbb.h" +namespace native_cpu { + +struct TBB_threadpool { + inline size_t num_threads() const noexcept { return 32; } +}; +template <> class Scheduler<TBB_threadpool> { + oneapi::tbb::task_group tasks; + +public: + inline void schedule(worker_task_t &&task) { + tasks.run([&]() { task(0); }); + } + inline void wait() { tasks.wait(); } +}; + +inline Scheduler<TBB_threadpool> getScheduler(TBB_threadpool &tp) { + return Scheduler<TBB_threadpool>(); +} } // namespace native_cpu + +#endif From 2e8ae3f5a4ba308cd3054459934b141fe0383212 Mon Sep 17 00:00:00 2001 From: Uwe Dolinsky <uwe@codeplay.com> Date: Tue, 22 Oct 2024 17:33:31 +0100 Subject: [PATCH 10/48] [NATIVECPU] changed back filemode --- source/adapters/native_cpu/enqueue.cpp | 0 source/adapters/native_cpu/threadpool.hpp | 0 2 files 
changed, 0 insertions(+), 0 deletions(-) mode change 100755 => 100644 source/adapters/native_cpu/enqueue.cpp mode change 100755 => 100644 source/adapters/native_cpu/threadpool.hpp diff --git a/source/adapters/native_cpu/enqueue.cpp b/source/adapters/native_cpu/enqueue.cpp old mode 100755 new mode 100644 diff --git a/source/adapters/native_cpu/threadpool.hpp b/source/adapters/native_cpu/threadpool.hpp old mode 100755 new mode 100644 From d5cf2c99a231df4b020959a3ad943d5d28c67584 Mon Sep 17 00:00:00 2001 From: Uwe Dolinsky <uwe@codeplay.com> Date: Wed, 23 Oct 2024 09:54:06 +0100 Subject: [PATCH 11/48] [NATIVECPU] fixed scheduling --- source/adapters/native_cpu/threadpool.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/source/adapters/native_cpu/threadpool.hpp b/source/adapters/native_cpu/threadpool.hpp index b402609781..216436d29a 100644 --- a/source/adapters/native_cpu/threadpool.hpp +++ b/source/adapters/native_cpu/threadpool.hpp @@ -247,7 +247,7 @@ template <> class Scheduler<TBB_threadpool> { public: inline void schedule(worker_task_t &&task) { - tasks.run([&]() { task(0); }); + tasks.run(std::function<void()>([=]() mutable { task(0); })); } inline void wait() { tasks.wait(); } }; From ba9b2c5f5aa716a723a19a704a4300bf75cfeeae Mon Sep 17 00:00:00 2001 From: Uwe Dolinsky <uwe@codeplay.com> Date: Wed, 23 Oct 2024 17:32:29 +0100 Subject: [PATCH 12/48] [NATIVECPU] more shared code --- source/adapters/native_cpu/threadpool.hpp | 35 +++++++++++------------ 1 file changed, 17 insertions(+), 18 deletions(-) mode change 100644 => 100755 source/adapters/native_cpu/threadpool.hpp diff --git a/source/adapters/native_cpu/threadpool.hpp b/source/adapters/native_cpu/threadpool.hpp old mode 100644 new mode 100755 index 216436d29a..9098d52de4 --- a/source/adapters/native_cpu/threadpool.hpp +++ b/source/adapters/native_cpu/threadpool.hpp @@ -209,25 +209,29 @@ template <typename ThreadPoolT> class threadpool_interface { } }; -template <class TP> class Scheduler 
{ - std::vector<std::future<void>> futures; +template <class TP> struct SchedulerBase { TP &TPref; + SchedulerBase(TP &ref) : TPref(ref) {} +}; -public: - Scheduler(TP &ref) : TPref(ref) {} +template <class TP> struct Scheduler : SchedulerBase<TP> { + using SchedulerBase<TP>::SchedulerBase; inline void schedule(worker_task_t &&task) { - futures.emplace_back(TPref.schedule_task(std::move(task))); + futures.emplace_back(this->TPref.schedule_task(std::move(task))); } inline void wait() { for (auto &f : futures) f.get(); } + +private: + std::vector<std::future<void>> futures; }; using simple_threadpool_t = threadpool_interface<detail::simple_thread_pool>; -inline Scheduler<simple_threadpool_t> getScheduler(simple_threadpool_t &tp) { - return Scheduler(tp); +template <class TPType> inline Scheduler<TPType> getScheduler(TPType &tp) { + return Scheduler<TPType>(tp); } using threadpool_t = simple_threadpool_t; @@ -240,22 +244,17 @@ using threadpool_t = simple_threadpool_t; namespace native_cpu { struct TBB_threadpool { + oneapi::tbb::task_group tasks; inline size_t num_threads() const noexcept { return 32; } }; -template <> class Scheduler<TBB_threadpool> { - oneapi::tbb::task_group tasks; - -public: - inline void schedule(worker_task_t &&task) { - tasks.run(std::function<void()>([=]() mutable { task(0); })); +template <> struct Scheduler<TBB_threadpool> : SchedulerBase<TBB_threadpool> { + using SchedulerBase<TBB_threadpool>::SchedulerBase; + template <class T> inline void schedule(T &&task) { + TPref.tasks.run(std::function<void()>([=]() mutable { task(0); })); } - inline void wait() { tasks.wait(); } + inline void wait() { TPref.tasks.wait(); } }; -inline Scheduler<TBB_threadpool> getScheduler(TBB_threadpool &tp) { - return Scheduler<TBB_threadpool>(); -} - } // namespace native_cpu #endif From 5cf59d21dd548bbf30125b7a3f9d7c17e8edcb30 Mon Sep 17 00:00:00 2001 From: Uwe Dolinsky <uwe@codeplay.com> Date: Wed, 23 Oct 2024 17:34:27 +0100 Subject: [PATCH 13/48] [NATIVECPU] 
reversed filemode change --- source/adapters/native_cpu/threadpool.hpp | 0 1 file changed, 0 insertions(+), 0 deletions(-) mode change 100755 => 100644 source/adapters/native_cpu/threadpool.hpp diff --git a/source/adapters/native_cpu/threadpool.hpp b/source/adapters/native_cpu/threadpool.hpp old mode 100755 new mode 100644 From 7077d1a1a6b5f7efdee940c50dc2d092c2605008 Mon Sep 17 00:00:00 2001 From: Uwe Dolinsky <uwe@codeplay.com> Date: Thu, 24 Oct 2024 09:55:56 +0100 Subject: [PATCH 14/48] [NATIVECPU] update oneTBB tag --- source/adapters/native_cpu/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/source/adapters/native_cpu/CMakeLists.txt b/source/adapters/native_cpu/CMakeLists.txt index 57ddf11cc8..59fd1f859c 100644 --- a/source/adapters/native_cpu/CMakeLists.txt +++ b/source/adapters/native_cpu/CMakeLists.txt @@ -57,7 +57,7 @@ if(NATIVECPU_WITH_TBB) FetchContent_Declare( tbb GIT_REPOSITORY https://github.com/oneapi-src/oneTBB.git - GIT_TAG 42b833fe806606d05a5cad064b8b87365818d716 + GIT_TAG 377a91431ec62c5e296dbeca683c5d1e66d69f32 CMAKE_ARGS "-DTBB_TEST:BOOL=OFF -DTBB_EXAMPLES:BOOL=OFF -DTBB_BENCH:BOOL=OFF" GIT_SHALLOW ON OVERRIDE_FIND_PACKAGE From a8e599cb5ac3a1c9ca814c87ed69e5cd5f24a247 Mon Sep 17 00:00:00 2001 From: Uwe Dolinsky <uwe@codeplay.com> Date: Thu, 24 Oct 2024 11:03:31 +0100 Subject: [PATCH 15/48] [NATIVECPU] added required include not needed by Windows --- source/adapters/native_cpu/common.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/source/adapters/native_cpu/common.cpp b/source/adapters/native_cpu/common.cpp index 47afed3729..f4becf23b4 100644 --- a/source/adapters/native_cpu/common.cpp +++ b/source/adapters/native_cpu/common.cpp @@ -9,6 +9,7 @@ //===----------------------------------------------------------------------===// #include "common.hpp" +#include <string.h> // Global variables for UR_RESULT_ADAPTER_SPECIFIC_ERROR // See urGetLastResult From 4c64575ce1ef6a7aa2ac34796d247178a6a123a3 Mon Sep 17 
00:00:00 2001 From: Uwe Dolinsky <uwe@codeplay.com> Date: Thu, 24 Oct 2024 11:33:54 +0100 Subject: [PATCH 16/48] [NATIVECPU] removed strcpy_s because it's not supported by gcc --- source/adapters/native_cpu/common.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/source/adapters/native_cpu/common.cpp b/source/adapters/native_cpu/common.cpp index f4becf23b4..b956fc8c7a 100644 --- a/source/adapters/native_cpu/common.cpp +++ b/source/adapters/native_cpu/common.cpp @@ -9,7 +9,6 @@ //===----------------------------------------------------------------------===// #include "common.hpp" -#include <string.h> // Global variables for UR_RESULT_ADAPTER_SPECIFIC_ERROR // See urGetLastResult @@ -20,7 +19,7 @@ thread_local char ErrorMessage[MaxMessageSize]; [[maybe_unused]] void setErrorMessage(const char *pMessage, ur_result_t ErrorCode) { assert(strlen(pMessage) <= MaxMessageSize); - strcpy_s(ErrorMessage, MaxMessageSize, pMessage); + strcpy(ErrorMessage, pMessage); ErrorMessageCode = ErrorCode; } From 4905c44e8b04ba2630c43622aa7b3970dd3d81cf Mon Sep 17 00:00:00 2001 From: Uwe Dolinsky <uwe@codeplay.com> Date: Thu, 24 Oct 2024 18:30:34 +0100 Subject: [PATCH 17/48] [NATIVECPU] added system headers first --- source/adapters/native_cpu/CMakeLists.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/source/adapters/native_cpu/CMakeLists.txt b/source/adapters/native_cpu/CMakeLists.txt index 59fd1f859c..9ffa00c4b2 100644 --- a/source/adapters/native_cpu/CMakeLists.txt +++ b/source/adapters/native_cpu/CMakeLists.txt @@ -90,6 +90,7 @@ target_link_libraries(${TARGET_NAME} PRIVATE ) target_include_directories(${TARGET_NAME} PRIVATE + "${CMAKE_CXX_IMPLICIT_INCLUDE_DIRECTORIES}" "${CMAKE_CURRENT_SOURCE_DIR}/../../" ) From e426b3fc58d89bb593b2b4f58094bfb3b2de8199 Mon Sep 17 00:00:00 2001 From: Uwe Dolinsky <uwe@codeplay.com> Date: Thu, 24 Oct 2024 18:51:10 +0100 Subject: [PATCH 18/48] [NATIVECPU] cmake fix --- source/adapters/native_cpu/CMakeLists.txt | 7 +++---- 1 
file changed, 3 insertions(+), 4 deletions(-) diff --git a/source/adapters/native_cpu/CMakeLists.txt b/source/adapters/native_cpu/CMakeLists.txt index 9ffa00c4b2..88cd0972b0 100644 --- a/source/adapters/native_cpu/CMakeLists.txt +++ b/source/adapters/native_cpu/CMakeLists.txt @@ -90,7 +90,6 @@ target_link_libraries(${TARGET_NAME} PRIVATE ) target_include_directories(${TARGET_NAME} PRIVATE - "${CMAKE_CXX_IMPLICIT_INCLUDE_DIRECTORIES}" "${CMAKE_CURRENT_SOURCE_DIR}/../../" ) @@ -98,8 +97,8 @@ if(NATIVECPU_WITH_TBB) target_link_libraries(${TARGET_NAME} PRIVATE TBB::tbb ) - target_include_directories(${TARGET_NAME} PRIVATE - "${TBB_SOURCE_DIR_INTERNAL}" - ) +# target_include_directories(${TARGET_NAME} PRIVATE +# "${TBB_SOURCE_DIR_INTERNAL}" +# ) target_compile_definitions(${TARGET_NAME} PRIVATE NATIVECPU_USE_TBB) endif() From 4200f305850ea6977c05798dd7112efb0aa6d80a Mon Sep 17 00:00:00 2001 From: Uwe Dolinsky <uwe@codeplay.com> Date: Fri, 25 Oct 2024 19:17:33 +0100 Subject: [PATCH 19/48] [NATIVECPU] removed GIT_SHALLOW --- source/adapters/native_cpu/CMakeLists.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/source/adapters/native_cpu/CMakeLists.txt b/source/adapters/native_cpu/CMakeLists.txt index 88cd0972b0..fac3cc672a 100644 --- a/source/adapters/native_cpu/CMakeLists.txt +++ b/source/adapters/native_cpu/CMakeLists.txt @@ -59,7 +59,6 @@ if(NATIVECPU_WITH_TBB) GIT_REPOSITORY https://github.com/oneapi-src/oneTBB.git GIT_TAG 377a91431ec62c5e296dbeca683c5d1e66d69f32 CMAKE_ARGS "-DTBB_TEST:BOOL=OFF -DTBB_EXAMPLES:BOOL=OFF -DTBB_BENCH:BOOL=OFF" - GIT_SHALLOW ON OVERRIDE_FIND_PACKAGE ) set(TBB_TEST OFF CACHE INTERNAL "" FORCE) From 53f44944d61b3e793918b9397454046c71749d99 Mon Sep 17 00:00:00 2001 From: Uwe Dolinsky <uwe@codeplay.com> Date: Fri, 1 Nov 2024 15:41:31 +0000 Subject: [PATCH 20/48] [NATIVECPU] turn CMAKE_INCLUDE_CURRENT_DIR off for tbb --- source/adapters/native_cpu/CMakeLists.txt | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git 
a/source/adapters/native_cpu/CMakeLists.txt b/source/adapters/native_cpu/CMakeLists.txt index fac3cc672a..3777dbd3b7 100644 --- a/source/adapters/native_cpu/CMakeLists.txt +++ b/source/adapters/native_cpu/CMakeLists.txt @@ -68,6 +68,7 @@ if(NATIVECPU_WITH_TBB) set(TBB_FIND_PACKAGE OFF CACHE INTERNAL "" FORCE) set(TBB_FUZZ_TESTING OFF CACHE INTERNAL "" FORCE) set(TBB_INSTALL ON CACHE INTERNAL "" FORCE) + set (CMAKE_INCLUDE_CURRENT_DIR OFF) FetchContent_MakeAvailable(tbb) FetchContent_GetProperties(tbb) @@ -96,8 +97,6 @@ if(NATIVECPU_WITH_TBB) target_link_libraries(${TARGET_NAME} PRIVATE TBB::tbb ) -# target_include_directories(${TARGET_NAME} PRIVATE -# "${TBB_SOURCE_DIR_INTERNAL}" -# ) + target_compile_definitions(${TARGET_NAME} PRIVATE NATIVECPU_USE_TBB) endif() From 2ca6a3f7dead3a9597b8b49ab0ceeefdea7a19a0 Mon Sep 17 00:00:00 2001 From: Uwe Dolinsky <uwe@codeplay.com> Date: Fri, 1 Nov 2024 15:50:58 +0000 Subject: [PATCH 21/48] [NATIVECPU] workaround for oneTBB casting away const qualifiers --- source/adapters/native_cpu/CMakeLists.txt | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/source/adapters/native_cpu/CMakeLists.txt b/source/adapters/native_cpu/CMakeLists.txt index 3777dbd3b7..150dafac8c 100644 --- a/source/adapters/native_cpu/CMakeLists.txt +++ b/source/adapters/native_cpu/CMakeLists.txt @@ -97,6 +97,11 @@ if(NATIVECPU_WITH_TBB) target_link_libraries(${TARGET_NAME} PRIVATE TBB::tbb ) + if (MSVC) + else() + # oneTBB currently casts away some const qualifiers + target_compile_options(tbb PRIVATE -Wno-cast-qual) + endif() target_compile_definitions(${TARGET_NAME} PRIVATE NATIVECPU_USE_TBB) endif() From 835ce2f4bebccffba8ca7606b91aa1153f095a5a Mon Sep 17 00:00:00 2001 From: Uwe Dolinsky <uwe@codeplay.com> Date: Fri, 1 Nov 2024 17:25:51 +0000 Subject: [PATCH 22/48] [NATIVECPU] workaround for oneTBB casting away const qualifiers --- source/adapters/native_cpu/CMakeLists.txt | 1 + 1 file changed, 1 insertion(+) diff --git 
a/source/adapters/native_cpu/CMakeLists.txt b/source/adapters/native_cpu/CMakeLists.txt index 150dafac8c..c6b7e4725c 100644 --- a/source/adapters/native_cpu/CMakeLists.txt +++ b/source/adapters/native_cpu/CMakeLists.txt @@ -101,6 +101,7 @@ if(NATIVECPU_WITH_TBB) else() # oneTBB currently casts away some const qualifiers target_compile_options(tbb PRIVATE -Wno-cast-qual) + target_compile_options(tbbmalloc PRIVATE -Wno-cast-qual) endif() target_compile_definitions(${TARGET_NAME} PRIVATE NATIVECPU_USE_TBB) From 42b1e6e09a4b1ac53014d80a09fb6a085c920673 Mon Sep 17 00:00:00 2001 From: Uwe Dolinsky <uwe@codeplay.com> Date: Fri, 1 Nov 2024 19:03:56 +0000 Subject: [PATCH 23/48] [NATIVECPU] remove potentially unneeded cmake --- source/adapters/native_cpu/CMakeLists.txt | 8 -------- 1 file changed, 8 deletions(-) diff --git a/source/adapters/native_cpu/CMakeLists.txt b/source/adapters/native_cpu/CMakeLists.txt index c6b7e4725c..d55ce2544a 100644 --- a/source/adapters/native_cpu/CMakeLists.txt +++ b/source/adapters/native_cpu/CMakeLists.txt @@ -70,14 +70,6 @@ if(NATIVECPU_WITH_TBB) set(TBB_INSTALL ON CACHE INTERNAL "" FORCE) set (CMAKE_INCLUDE_CURRENT_DIR OFF) FetchContent_MakeAvailable(tbb) - - FetchContent_GetProperties(tbb) - - if(NOT tbb_POPULATED) - FetchContent_Populate(tbb) - endif() - set(TBB_SOURCE_DIR_INTERNAL ${tbb_SOURCE_DIR}/include) - set(TBB_BINARY_DIR_INTERNAL ${tbb_BINARY_DIR}) endif() find_package(Threads REQUIRED) From 9c6fb07121ec3c8dad52cf401a158b8c7e9cd2db Mon Sep 17 00:00:00 2001 From: Uwe Dolinsky <uwe@codeplay.com> Date: Mon, 4 Nov 2024 12:51:03 +0000 Subject: [PATCH 24/48] [NATIVECPU] oneTBB disabled by default --- source/adapters/native_cpu/CMakeLists.txt | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/source/adapters/native_cpu/CMakeLists.txt b/source/adapters/native_cpu/CMakeLists.txt index d55ce2544a..781fd36392 100644 --- a/source/adapters/native_cpu/CMakeLists.txt +++ b/source/adapters/native_cpu/CMakeLists.txt @@ -49,9 
+49,10 @@ set_target_properties(${TARGET_NAME} PROPERTIES SOVERSION "${PROJECT_VERSION_MAJOR}" ) -option(NATIVECPU_WITH_TBB "Use TBB as backend for Native CPU" ON) +# oneTBB is an optional NativeCPU backend and disabled by default. +option(NATIVECPU_WITH_TBB "Use oneTBB as backend for Native CPU" OFF) if(NATIVECPU_WITH_TBB) - message(STATUS "Building Native CPU adapter with TBB backend.") + message(STATUS "Configuring Native CPU adapter with TBB backend.") include(FetchContent) FetchContent_Declare( From de98e9b31abfa24257c9ab68b78c6676cf1088f1 Mon Sep 17 00:00:00 2001 From: Uwe Dolinsky <uwe@codeplay.com> Date: Mon, 4 Nov 2024 13:20:13 +0000 Subject: [PATCH 25/48] [NATIVECPU] tbb to oneTBB --- source/adapters/native_cpu/CMakeLists.txt | 6 +++--- source/adapters/native_cpu/device.hpp | 2 +- source/adapters/native_cpu/threadpool.hpp | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/source/adapters/native_cpu/CMakeLists.txt b/source/adapters/native_cpu/CMakeLists.txt index 781fd36392..a57eff8cfc 100644 --- a/source/adapters/native_cpu/CMakeLists.txt +++ b/source/adapters/native_cpu/CMakeLists.txt @@ -50,8 +50,8 @@ set_target_properties(${TARGET_NAME} PROPERTIES ) # oneTBB is an optional NativeCPU backend and disabled by default. 
-option(NATIVECPU_WITH_TBB "Use oneTBB as backend for Native CPU" OFF) -if(NATIVECPU_WITH_TBB) +option(NATIVECPU_WITH_ONETBB "Use oneTBB as backend for Native CPU" OFF) +if(NATIVECPU_WITH_ONETBB) message(STATUS "Configuring Native CPU adapter with TBB backend.") include(FetchContent) @@ -97,5 +97,5 @@ if(NATIVECPU_WITH_TBB) target_compile_options(tbbmalloc PRIVATE -Wno-cast-qual) endif() - target_compile_definitions(${TARGET_NAME} PRIVATE NATIVECPU_USE_TBB) + target_compile_definitions(${TARGET_NAME} PRIVATE NATIVECPU_WITH_ONETBB) endif() diff --git a/source/adapters/native_cpu/device.hpp b/source/adapters/native_cpu/device.hpp index 1a6b0d091a..e9f7602930 100644 --- a/source/adapters/native_cpu/device.hpp +++ b/source/adapters/native_cpu/device.hpp @@ -14,7 +14,7 @@ #include <ur/ur.hpp> struct ur_device_handle_t_ { -#ifdef NATIVECPU_USE_TBB +#ifdef NATIVECPU_WITH_ONETBB native_cpu::TBB_threadpool tp; #else native_cpu::threadpool_t tp; diff --git a/source/adapters/native_cpu/threadpool.hpp b/source/adapters/native_cpu/threadpool.hpp index 9098d52de4..3d1dacb93a 100644 --- a/source/adapters/native_cpu/threadpool.hpp +++ b/source/adapters/native_cpu/threadpool.hpp @@ -238,7 +238,7 @@ using threadpool_t = simple_threadpool_t; } // namespace native_cpu -#ifdef NATIVECPU_USE_TBB +#ifdef NATIVECPU_WITH_ONETBB // Simple TBB backend #include "oneapi/tbb.h" namespace native_cpu { From 51e915adf102488445967b12414ca6a5ef05db71 Mon Sep 17 00:00:00 2001 From: Uwe Dolinsky <uwe@codeplay.com> Date: Mon, 4 Nov 2024 14:52:10 +0000 Subject: [PATCH 26/48] [NATIVECPU] improved comment --- source/adapters/native_cpu/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/source/adapters/native_cpu/CMakeLists.txt b/source/adapters/native_cpu/CMakeLists.txt index a57eff8cfc..4a1aa0b253 100644 --- a/source/adapters/native_cpu/CMakeLists.txt +++ b/source/adapters/native_cpu/CMakeLists.txt @@ -49,7 +49,7 @@ set_target_properties(${TARGET_NAME} PROPERTIES SOVERSION 
"${PROJECT_VERSION_MAJOR}" ) -# oneTBB is an optional NativeCPU backend and disabled by default. +# oneTBB is used as an optional NativeCPU backend and disabled by default. option(NATIVECPU_WITH_ONETBB "Use oneTBB as backend for Native CPU" OFF) if(NATIVECPU_WITH_ONETBB) message(STATUS "Configuring Native CPU adapter with TBB backend.") From dd8b027ee63d417e53ba1031dbf55c0602b4db27 Mon Sep 17 00:00:00 2001 From: Uwe Dolinsky <uwe@codeplay.com> Date: Mon, 4 Nov 2024 15:07:17 +0000 Subject: [PATCH 27/48] [NATIVECPU] tbb to oneTBB --- source/adapters/native_cpu/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/source/adapters/native_cpu/CMakeLists.txt b/source/adapters/native_cpu/CMakeLists.txt index 4a1aa0b253..620e163ed4 100644 --- a/source/adapters/native_cpu/CMakeLists.txt +++ b/source/adapters/native_cpu/CMakeLists.txt @@ -86,7 +86,7 @@ target_include_directories(${TARGET_NAME} PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}/../../" ) -if(NATIVECPU_WITH_TBB) +if(NATIVECPU_WITH_ONETBB) target_link_libraries(${TARGET_NAME} PRIVATE TBB::tbb ) From 4a5238fe61ee65941423b62ef52db84e9224ddc7 Mon Sep 17 00:00:00 2001 From: Uwe Dolinsky <uwe@codeplay.com> Date: Mon, 4 Nov 2024 15:13:06 +0000 Subject: [PATCH 28/48] [NATIVECPU] tbb to oneTBB --- source/adapters/native_cpu/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/source/adapters/native_cpu/CMakeLists.txt b/source/adapters/native_cpu/CMakeLists.txt index 620e163ed4..c9e89cbdb6 100644 --- a/source/adapters/native_cpu/CMakeLists.txt +++ b/source/adapters/native_cpu/CMakeLists.txt @@ -52,7 +52,7 @@ set_target_properties(${TARGET_NAME} PROPERTIES # oneTBB is used as an optional NativeCPU backend and disabled by default. 
option(NATIVECPU_WITH_ONETBB "Use oneTBB as backend for Native CPU" OFF) if(NATIVECPU_WITH_ONETBB) - message(STATUS "Configuring Native CPU adapter with TBB backend.") + message(STATUS "Configuring Native CPU adapter with oneTBB backend.") include(FetchContent) FetchContent_Declare( From 3f697aef0c47d2235b8aaf4f65ccd38430698283 Mon Sep 17 00:00:00 2001 From: Uwe Dolinsky <uwe@codeplay.com> Date: Mon, 4 Nov 2024 15:26:45 +0000 Subject: [PATCH 29/48] [NATIVECPU] num_threads with oneTBB --- source/adapters/native_cpu/threadpool.hpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/source/adapters/native_cpu/threadpool.hpp b/source/adapters/native_cpu/threadpool.hpp index 3d1dacb93a..d6da676e4d 100644 --- a/source/adapters/native_cpu/threadpool.hpp +++ b/source/adapters/native_cpu/threadpool.hpp @@ -245,7 +245,9 @@ namespace native_cpu { struct TBB_threadpool { oneapi::tbb::task_group tasks; - inline size_t num_threads() const noexcept { return 32; } + inline size_t num_threads() const noexcept { + return oneapi::tbb::info::default_concurrency(); + } }; template <> struct Scheduler<TBB_threadpool> : SchedulerBase<TBB_threadpool> { using SchedulerBase<TBB_threadpool>::SchedulerBase; From 5f687cc85bdd49733e207d589776fa00b6c51e1d Mon Sep 17 00:00:00 2001 From: Uwe Dolinsky <uwe@codeplay.com> Date: Wed, 6 Nov 2024 13:34:49 +0000 Subject: [PATCH 30/48] [NATIVECPU] added comment to cmake --- source/adapters/native_cpu/CMakeLists.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/source/adapters/native_cpu/CMakeLists.txt b/source/adapters/native_cpu/CMakeLists.txt index c9e89cbdb6..5b72cbf773 100644 --- a/source/adapters/native_cpu/CMakeLists.txt +++ b/source/adapters/native_cpu/CMakeLists.txt @@ -90,9 +90,9 @@ if(NATIVECPU_WITH_ONETBB) target_link_libraries(${TARGET_NAME} PRIVATE TBB::tbb ) - if (MSVC) - else() + if (NOT MSVC) # oneTBB currently casts away some const qualifiers + # todo: check if compiler actually supports these 
options target_compile_options(tbb PRIVATE -Wno-cast-qual) target_compile_options(tbbmalloc PRIVATE -Wno-cast-qual) endif() From b651d299f24dd3a713ac451fb8cfadf2791605f7 Mon Sep 17 00:00:00 2001 From: Uwe Dolinsky <uwe@codeplay.com> Date: Wed, 6 Nov 2024 13:35:46 +0000 Subject: [PATCH 31/48] [NATIVECPU] waiting for tasks when using local args --- source/adapters/native_cpu/enqueue.cpp | 11 +++++++++-- source/adapters/native_cpu/threadpool.hpp | 1 + 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/source/adapters/native_cpu/enqueue.cpp b/source/adapters/native_cpu/enqueue.cpp index 2e7a210083..5b3f071e01 100644 --- a/source/adapters/native_cpu/enqueue.cpp +++ b/source/adapters/native_cpu/enqueue.cpp @@ -191,8 +191,11 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( kernel._subhandler(kernel._args.data(), &state); } }); - if (++threadId == numParallelThreads) + if (++threadId == numParallelThreads) { threadId = 0; + if (!hKernel->_localArgInfo.empty()) + Tasks.wait(); + } } } } else { @@ -226,9 +229,13 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( } }); } - // schedule the remaining tasks if (remainder) { + if (thread) { + thread = 0; + if (!hKernel->_localArgInfo.empty()) + Tasks.wait(); + } Tasks.schedule([&groups, remainder, thread, scheduled = numParallelThreads * groupsPerThread, hKernel](size_t /*threadId*/) { diff --git a/source/adapters/native_cpu/threadpool.hpp b/source/adapters/native_cpu/threadpool.hpp index d6da676e4d..5a49cbc88e 100644 --- a/source/adapters/native_cpu/threadpool.hpp +++ b/source/adapters/native_cpu/threadpool.hpp @@ -223,6 +223,7 @@ template <class TP> struct Scheduler : SchedulerBase<TP> { inline void wait() { for (auto &f : futures) f.get(); + futures.clear(); } private: From 4f64538d70c8526d85205cb0011ec97a88baea6a Mon Sep 17 00:00:00 2001 From: Uwe Dolinsky <uwe@codeplay.com> Date: Wed, 6 Nov 2024 18:25:27 +0000 Subject: [PATCH 32/48] [NATIVECPU] using old task ids with tbb (WIP) --- 
source/adapters/native_cpu/enqueue.cpp | 28 +++++++---------------- source/adapters/native_cpu/threadpool.hpp | 7 +++++- 2 files changed, 14 insertions(+), 21 deletions(-) mode change 100644 => 100755 source/adapters/native_cpu/threadpool.hpp diff --git a/source/adapters/native_cpu/enqueue.cpp b/source/adapters/native_cpu/enqueue.cpp index 5b3f071e01..86b5d1116f 100644 --- a/source/adapters/native_cpu/enqueue.cpp +++ b/source/adapters/native_cpu/enqueue.cpp @@ -178,24 +178,18 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( } else { // We are running a parallel_for over an nd_range - size_t threadId = 0; if (numWG1 * numWG2 >= numParallelThreads) { // Dimensions 1 and 2 have enough work, split them across the threadpool for (unsigned g2 = 0; g2 < numWG2; g2++) { for (unsigned g1 = 0; g1 < numWG1; g1++) { - Tasks.schedule([state, kernel = *hKernel, numWG0, g1, g2, threadId, - numParallelThreads](size_t /*threadId*/) mutable { + Tasks.schedule([state, kernel = *hKernel, numWG0, g1, g2, + numParallelThreads](size_t threadId) mutable { for (unsigned g0 = 0; g0 < numWG0; g0++) { kernel.handleLocalArgs(numParallelThreads, threadId); state.update(g0, g1, g2); kernel._subhandler(kernel._args.data(), &state); } }); - if (++threadId == numParallelThreads) { - threadId = 0; - if (!hKernel->_localArgInfo.empty()) - Tasks.wait(); - } } } } else { @@ -219,29 +213,23 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( auto numGroups = groups.size(); auto groupsPerThread = numGroups / numParallelThreads; auto remainder = numGroups % numParallelThreads; - unsigned thread = 0; - for (; groupsPerThread && thread < numParallelThreads; thread++) { + for (size_t thread = 0; groupsPerThread && thread < numParallelThreads; thread++) { Tasks.schedule( - [&groups, thread, groupsPerThread, hKernel](size_t /*threadId*/) { + [&groups, thread, groupsPerThread, hKernel](size_t threadId) { for (unsigned i = 0; i < groupsPerThread; i++) { auto index = thread * 
groupsPerThread + i; - groups[index](thread /*Id*/, *hKernel); + groups[index](threadId, *hKernel); } }); } // schedule the remaining tasks if (remainder) { - if (thread) { - thread = 0; - if (!hKernel->_localArgInfo.empty()) - Tasks.wait(); - } - Tasks.schedule([&groups, remainder, thread, + Tasks.schedule([&groups, remainder, scheduled = numParallelThreads * groupsPerThread, - hKernel](size_t /*threadId*/) { + hKernel](size_t threadId) { for (unsigned i = 0; i < remainder; i++) { auto index = scheduled + i; - groups[index](thread /*Id*/, *hKernel); + groups[index](threadId, *hKernel); } }); } diff --git a/source/adapters/native_cpu/threadpool.hpp b/source/adapters/native_cpu/threadpool.hpp old mode 100644 new mode 100755 index 5a49cbc88e..25852bed79 --- a/source/adapters/native_cpu/threadpool.hpp +++ b/source/adapters/native_cpu/threadpool.hpp @@ -253,7 +253,12 @@ struct TBB_threadpool { template <> struct Scheduler<TBB_threadpool> : SchedulerBase<TBB_threadpool> { using SchedulerBase<TBB_threadpool>::SchedulerBase; template <class T> inline void schedule(T &&task) { - TPref.tasks.run(std::function<void()>([=]() mutable { task(0); })); + TPref.tasks.run(std::function<void()>([=]() mutable { + auto thread_id = tbb::this_task_arena::current_thread_index(); + assert(thread_id >= 0 && + thread_id < oneapi::tbb::info::default_concurrency()); + task(thread_id); + })); } inline void wait() { TPref.tasks.wait(); } }; From 6330a292951fdf19510ee7eed50eab9f3af9b005 Mon Sep 17 00:00:00 2001 From: Uwe Dolinsky <uwe@codeplay.com> Date: Wed, 6 Nov 2024 18:27:01 +0000 Subject: [PATCH 33/48] [NATIVECPU] changed back filemode --- source/adapters/native_cpu/threadpool.hpp | 0 1 file changed, 0 insertions(+), 0 deletions(-) mode change 100755 => 100644 source/adapters/native_cpu/threadpool.hpp diff --git a/source/adapters/native_cpu/threadpool.hpp b/source/adapters/native_cpu/threadpool.hpp old mode 100755 new mode 100644 From f566f803d9583364b8262c1d9f2cdd1568cb3e81 Mon Sep 17 
00:00:00 2001 From: Uwe Dolinsky <uwe@codeplay.com> Date: Thu, 7 Nov 2024 10:21:41 +0000 Subject: [PATCH 34/48] [NATIVECPU] removed unneeded clear --- source/adapters/native_cpu/threadpool.hpp | 1 - 1 file changed, 1 deletion(-) diff --git a/source/adapters/native_cpu/threadpool.hpp b/source/adapters/native_cpu/threadpool.hpp index 25852bed79..28d071f29b 100644 --- a/source/adapters/native_cpu/threadpool.hpp +++ b/source/adapters/native_cpu/threadpool.hpp @@ -223,7 +223,6 @@ template <class TP> struct Scheduler : SchedulerBase<TP> { inline void wait() { for (auto &f : futures) f.get(); - futures.clear(); } private: From 73576520a524e9b44d63fe6c24ae737bc9d0b0ca Mon Sep 17 00:00:00 2001 From: Uwe Dolinsky <uwe@codeplay.com> Date: Wed, 13 Nov 2024 18:09:32 +0000 Subject: [PATCH 35/48] [NATIVECPU] removed MS extensions --- source/adapters/native_cpu/event.hpp | 2 +- source/adapters/native_cpu/threadpool.hpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/source/adapters/native_cpu/event.hpp b/source/adapters/native_cpu/event.hpp index de51231d8f..713971b24f 100644 --- a/source/adapters/native_cpu/event.hpp +++ b/source/adapters/native_cpu/event.hpp @@ -42,7 +42,7 @@ struct ur_event_handle_t_ : RefCounted { ur_command_t getCommandType() const { return command_type; } // todo: get rid of this function - void set_futures(native_cpu::TasksInfoType &fs) { + void set_futures(native_cpu::TasksInfoType &&fs) { std::lock_guard<std::mutex> lock(mutex); futures = std::move(fs); } diff --git a/source/adapters/native_cpu/threadpool.hpp b/source/adapters/native_cpu/threadpool.hpp index 1763d353d5..f1bc70bd65 100644 --- a/source/adapters/native_cpu/threadpool.hpp +++ b/source/adapters/native_cpu/threadpool.hpp @@ -231,7 +231,7 @@ template <class TP> struct Scheduler : Scheduler_base<TP, TasksInfo_TP> { using Scheduler_base<TP, TasksInfo_TP>::Scheduler_base; inline void schedule(worker_task_t &&task) { - ti.schedule(this->ref.schedule_task(std::move(task))); + 
this->ti.schedule(this->ref.schedule_task(std::move(task))); } }; From a3e52e62c9b4489d1091b687f31f1294950f9770 Mon Sep 17 00:00:00 2001 From: Uwe Dolinsky <uwe@codeplay.com> Date: Thu, 14 Nov 2024 09:56:49 +0000 Subject: [PATCH 36/48] [NATIVECPU] fix merge with events update --- source/adapters/native_cpu/device.hpp | 2 +- source/adapters/native_cpu/event.cpp | 3 ++- source/adapters/native_cpu/event.hpp | 4 ++-- source/adapters/native_cpu/threadpool.hpp | 17 ++++++++++------- 4 files changed, 15 insertions(+), 11 deletions(-) diff --git a/source/adapters/native_cpu/device.hpp b/source/adapters/native_cpu/device.hpp index e0cf3872b5..358e9a37b6 100644 --- a/source/adapters/native_cpu/device.hpp +++ b/source/adapters/native_cpu/device.hpp @@ -14,7 +14,7 @@ #include <ur/ur.hpp> struct ur_device_handle_t_ { - native_cpu::ThreadPoolType tp; + native_cpu::threadpool_t tp; ur_device_handle_t_(ur_platform_handle_t ArgPlt); diff --git a/source/adapters/native_cpu/event.cpp b/source/adapters/native_cpu/event.cpp index adb560bca9..6a5ff41e28 100644 --- a/source/adapters/native_cpu/event.cpp +++ b/source/adapters/native_cpu/event.cpp @@ -13,6 +13,7 @@ #include "common.hpp" #include "event.hpp" #include "queue.hpp" +#include "device.hpp" #include <cstdint> #include <mutex> @@ -123,7 +124,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueTimestampRecordingExp( ur_event_handle_t_::ur_event_handle_t_(ur_queue_handle_t queue, ur_command_t command_type) : queue(queue), context(queue->getContext()), command_type(command_type), - done(false) { + done(false), futures(queue->getDevice()->tp) { this->queue->addEvent(this); } diff --git a/source/adapters/native_cpu/event.hpp b/source/adapters/native_cpu/event.hpp index 713971b24f..4b51875afd 100644 --- a/source/adapters/native_cpu/event.hpp +++ b/source/adapters/native_cpu/event.hpp @@ -42,7 +42,7 @@ struct ur_event_handle_t_ : RefCounted { ur_command_t getCommandType() const { return command_type; } // todo: get rid of this function - 
void set_futures(native_cpu::TasksInfoType &&fs) { + void set_futures(native_cpu::tasksinfo_t &&fs) { std::lock_guard<std::mutex> lock(mutex); futures = std::move(fs); } @@ -61,7 +61,7 @@ struct ur_event_handle_t_ : RefCounted { ur_command_t command_type; bool done; std::mutex mutex; - native_cpu::TasksInfoType futures; + native_cpu::tasksinfo_t futures; std::function<void()> callback; uint64_t timestamp_start = 0; uint64_t timestamp_end = 0; diff --git a/source/adapters/native_cpu/threadpool.hpp b/source/adapters/native_cpu/threadpool.hpp index f1bc70bd65..f2f907f4b5 100644 --- a/source/adapters/native_cpu/threadpool.hpp +++ b/source/adapters/native_cpu/threadpool.hpp @@ -208,6 +208,7 @@ template <typename ThreadPoolT> class threadpool_interface { return workerTask->get_future(); } }; +using simple_threadpool_t = threadpool_interface<detail::simple_thread_pool>; class TasksInfo_TP { using FType = std::future<void>; @@ -218,12 +219,13 @@ class TasksInfo_TP { for (auto &f : futures) f.wait(); } + TasksInfo_TP(simple_threadpool_t &) {} }; template <class TP, class TaskInfo> struct Scheduler_base { TP &ref; TaskInfo ti; - Scheduler_base(TP &ref_) : ref(ref_) {} + Scheduler_base(TP &ref_) : ref(ref_), ti(ref_) {} TaskInfo getTaskInfo() { return std::move(ti); } }; @@ -235,7 +237,6 @@ template <class TP> struct Scheduler : Scheduler_base<TP, TasksInfo_TP> { } }; -using simple_threadpool_t = threadpool_interface<detail::simple_thread_pool>; template <class TPType> inline Scheduler<TPType> getScheduler(TPType &tp) { return Scheduler<TPType>(tp); } @@ -254,9 +255,11 @@ struct TBB_threadpool { } }; -struct TBB_TasksInfo { +class TBB_TasksInfo { TBB_threadpool *tp; +public: inline void wait() { tp->tasks.wait(); } + TBB_TasksInfo(TBB_threadpool &t) : tp(&t) {} }; template <> struct Scheduler<TBB_threadpool> : Scheduler_base<TBB_threadpool, TBB_TasksInfo> { @@ -271,14 +274,14 @@ template <> struct Scheduler<TBB_threadpool> : Scheduler_base<TBB_threadpool, T } }; -using 
TasksInfoType = TBB_TasksInfo; -using ThreadPoolType = TBB_threadpool; +using tasksinfo_t = TBB_TasksInfo; +using threadpool_t = TBB_threadpool; } // namespace native_cpu #else // The default backend namespace native_cpu { -using TasksInfoType = TasksInfo_TP; -using ThreadPoolType = simple_threadpool_t; +using tasksinfo_t = TasksInfo_TP; +using threadpool_t = simple_threadpool_t; } #endif From 56afb9a7b8c3961764f6c97698a130479dc4fce1 Mon Sep 17 00:00:00 2001 From: Uwe Dolinsky <uwe@codeplay.com> Date: Thu, 14 Nov 2024 12:29:39 +0000 Subject: [PATCH 37/48] [NATIVECPU] revert noise --- source/adapters/native_cpu/device.hpp | 1 - 1 file changed, 1 deletion(-) diff --git a/source/adapters/native_cpu/device.hpp b/source/adapters/native_cpu/device.hpp index 358e9a37b6..2308c1a7f4 100644 --- a/source/adapters/native_cpu/device.hpp +++ b/source/adapters/native_cpu/device.hpp @@ -15,7 +15,6 @@ struct ur_device_handle_t_ { native_cpu::threadpool_t tp; - ur_device_handle_t_(ur_platform_handle_t ArgPlt); const uint64_t mem_size; From 488b641055e420db2f756de17e066b949d2a19e6 Mon Sep 17 00:00:00 2001 From: Uwe Dolinsky <uwe@codeplay.com> Date: Thu, 14 Nov 2024 12:41:46 +0000 Subject: [PATCH 38/48] [NATIVECPU] fix integer size warnings --- source/adapters/native_cpu/context.hpp | 2 +- source/adapters/native_cpu/enqueue.cpp | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/source/adapters/native_cpu/context.hpp b/source/adapters/native_cpu/context.hpp index b9d2d22dd1..8168e0d10e 100644 --- a/source/adapters/native_cpu/context.hpp +++ b/source/adapters/native_cpu/context.hpp @@ -116,7 +116,7 @@ struct ur_context_handle_t_ : RefCounted { // We need to ensure that we align to at least alignof(usm_alloc_info), // otherwise its start address may be unaligned. 
alignment = - std::max<size_t>(alignment, alignof(native_cpu::usm_alloc_info)); + std::max<uint32_t>(alignment, alignof(native_cpu::usm_alloc_info)); void *alloc = native_cpu::malloc_impl(alignment, size); if (!alloc) return nullptr; diff --git a/source/adapters/native_cpu/enqueue.cpp b/source/adapters/native_cpu/enqueue.cpp index c4dcd0a04f..bda547bf36 100644 --- a/source/adapters/native_cpu/enqueue.cpp +++ b/source/adapters/native_cpu/enqueue.cpp @@ -445,7 +445,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferFill( // TODO: error checking // TODO: handle async void *startingPtr = hBuffer->_mem + offset; - unsigned steps = size / patternSize; + size_t steps = size / patternSize; for (unsigned i = 0; i < steps; i++) { memcpy(static_cast<int8_t *>(startingPtr) + i * patternSize, pPattern, patternSize); @@ -586,7 +586,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMFill( break; } default: { - for (unsigned int step{0}; step < size; step += patternSize) { + for (size_t step{0}; step < size; step += patternSize) { auto *dest = reinterpret_cast<void *>( reinterpret_cast<uint8_t *>(ptr) + step); memcpy(dest, pPattern, patternSize); From 75288ce526d11a0b1afe30ae5b2592276b16dd37 Mon Sep 17 00:00:00 2001 From: Uwe Dolinsky <uwe@codeplay.com> Date: Tue, 26 Nov 2024 17:26:27 +0000 Subject: [PATCH 39/48] [NATIVECPU] update oneTBB tag --- source/adapters/native_cpu/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/source/adapters/native_cpu/CMakeLists.txt b/source/adapters/native_cpu/CMakeLists.txt index 5b72cbf773..83dc07af57 100644 --- a/source/adapters/native_cpu/CMakeLists.txt +++ b/source/adapters/native_cpu/CMakeLists.txt @@ -58,7 +58,7 @@ if(NATIVECPU_WITH_ONETBB) FetchContent_Declare( tbb GIT_REPOSITORY https://github.com/oneapi-src/oneTBB.git - GIT_TAG 377a91431ec62c5e296dbeca683c5d1e66d69f32 + GIT_TAG bef1519a4216d77042637c3f48af2c060a5213d1 CMAKE_ARGS "-DTBB_TEST:BOOL=OFF -DTBB_EXAMPLES:BOOL=OFF -DTBB_BENCH:BOOL=OFF" 
OVERRIDE_FIND_PACKAGE ) From fc992e3c2e84156164d32cc01e0736db85a44814 Mon Sep 17 00:00:00 2001 From: Uwe Dolinsky <uwe@codeplay.com> Date: Fri, 29 Nov 2024 10:56:53 +0000 Subject: [PATCH 40/48] [NATIVECPU] use oneTBB UXL github --- source/adapters/native_cpu/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/source/adapters/native_cpu/CMakeLists.txt b/source/adapters/native_cpu/CMakeLists.txt index 83dc07af57..d12843e059 100644 --- a/source/adapters/native_cpu/CMakeLists.txt +++ b/source/adapters/native_cpu/CMakeLists.txt @@ -57,7 +57,7 @@ if(NATIVECPU_WITH_ONETBB) include(FetchContent) FetchContent_Declare( tbb - GIT_REPOSITORY https://github.com/oneapi-src/oneTBB.git + GIT_REPOSITORY https://github.com/uxlfoundation/oneTBB.git GIT_TAG bef1519a4216d77042637c3f48af2c060a5213d1 CMAKE_ARGS "-DTBB_TEST:BOOL=OFF -DTBB_EXAMPLES:BOOL=OFF -DTBB_BENCH:BOOL=OFF" OVERRIDE_FIND_PACKAGE From 469f27f3c173ffa23329cafae54466d6a42277c4 Mon Sep 17 00:00:00 2001 From: Uwe Dolinsky <uwe@codeplay.com> Date: Thu, 12 Dec 2024 19:07:34 +0000 Subject: [PATCH 41/48] [NATIVECPU] undefine _DEBUG in release builds for tbb --- source/adapters/native_cpu/CMakeLists.txt | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/source/adapters/native_cpu/CMakeLists.txt b/source/adapters/native_cpu/CMakeLists.txt index d12843e059..3ac4b9bf87 100644 --- a/source/adapters/native_cpu/CMakeLists.txt +++ b/source/adapters/native_cpu/CMakeLists.txt @@ -97,5 +97,11 @@ if(NATIVECPU_WITH_ONETBB) target_compile_options(tbbmalloc PRIVATE -Wno-cast-qual) endif() + # Undefine _DEBUG option in release builds to find + # release tbbbind + if (NOT uppercase_CMAKE_BUILD_TYPE STREQUAL "DEBUG") + target_compile_options(tbb PRIVATE -U_DEBUG) + endif() + target_compile_definitions(${TARGET_NAME} PRIVATE NATIVECPU_WITH_ONETBB) endif() From 979072f93db951567b032fc3efe2156b575d6b48 Mon Sep 17 00:00:00 2001 From: Uwe Dolinsky <uwe@codeplay.com> Date: Mon, 27 Jan 2025 15:25:39 +0000 Subject: 
[PATCH 42/48] [NATIVECPU] oneTBB bump --- source/adapters/native_cpu/CMakeLists.txt | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/source/adapters/native_cpu/CMakeLists.txt b/source/adapters/native_cpu/CMakeLists.txt index cf0b109e92..dfb9d0a655 100644 --- a/source/adapters/native_cpu/CMakeLists.txt +++ b/source/adapters/native_cpu/CMakeLists.txt @@ -60,7 +60,11 @@ if(NATIVECPU_WITH_ONETBB) FetchContent_Declare( tbb GIT_REPOSITORY https://github.com/uxlfoundation/oneTBB.git - GIT_TAG bef1519a4216d77042637c3f48af2c060a5213d1 +#commit 9d4578723827f31defd79389819a5fbf659577f7 (HEAD -> master, origin/master, origin/HEAD) +#Author: Konstantin Boyarinov <konstantin.boyarinov@intel.com> +#Date: Fri Jan 24 23:23:59 2025 +0200 +# Add explicit deduction guides for blocked_nd_range (#1525) + GIT_TAG 9d4578723827f31defd79389819a5fbf659577f7 CMAKE_ARGS "-DTBB_TEST:BOOL=OFF -DTBB_EXAMPLES:BOOL=OFF -DTBB_BENCH:BOOL=OFF" OVERRIDE_FIND_PACKAGE ) From dac6f01ae0f7b3be02f85c08db30d867aedb8cd3 Mon Sep 17 00:00:00 2001 From: Uwe Dolinsky <uwe@codeplay.com> Date: Tue, 28 Jan 2025 10:09:32 +0000 Subject: [PATCH 43/48] [NATIVECPU] clang-format and removed one inline --- source/adapters/native_cpu/enqueue.cpp | 40 +++++++++---------- source/adapters/native_cpu/event.cpp | 2 +- source/adapters/native_cpu/event.hpp | 2 +- source/adapters/native_cpu/kernel.hpp | 2 +- .../adapters/native_cpu/nativecpu_state.hpp | 6 +-- source/adapters/native_cpu/threadpool.hpp | 6 ++- 6 files changed, 31 insertions(+), 27 deletions(-) diff --git a/source/adapters/native_cpu/enqueue.cpp b/source/adapters/native_cpu/enqueue.cpp index d670626258..96100ac945 100644 --- a/source/adapters/native_cpu/enqueue.cpp +++ b/source/adapters/native_cpu/enqueue.cpp @@ -184,13 +184,13 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( for (unsigned g2 = 0; g2 < numWG2; g2++) { for (unsigned g1 = 0; g1 < numWG1; g1++) { Tasks.schedule([state, kernel = *hKernel, numWG0, g1, g2, - 
numParallelThreads](size_t threadId) mutable { - for (unsigned g0 = 0; g0 < numWG0; g0++) { - kernel.handleLocalArgs(numParallelThreads, threadId); - state.update(g0, g1, g2); - kernel._subhandler(kernel.getArgs().data(), &state); - } - }); + numParallelThreads](size_t threadId) mutable { + for (unsigned g0 = 0; g0 < numWG0; g0++) { + kernel.handleLocalArgs(numParallelThreads, threadId); + state.update(g0, g1, g2); + kernel._subhandler(kernel.getArgs().data(), &state); + } + }); } } } else { @@ -217,23 +217,23 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( for (unsigned thread = 0; groupsPerThread && thread < numParallelThreads; thread++) { Tasks.schedule([groups, thread, groupsPerThread, - kernel = *hKernel](size_t threadId) { - for (unsigned i = 0; i < groupsPerThread; i++) { - auto index = thread * groupsPerThread + i; - groups[index](threadId, kernel); - } - }); + kernel = *hKernel](size_t threadId) { + for (unsigned i = 0; i < groupsPerThread; i++) { + auto index = thread * groupsPerThread + i; + groups[index](threadId, kernel); + } + }); } // schedule the remaining tasks if (remainder) { Tasks.schedule([groups, remainder, - scheduled = numParallelThreads * groupsPerThread, - kernel = *hKernel](size_t threadId) { - for (unsigned i = 0; i < remainder; i++) { - auto index = scheduled + i; - groups[index](threadId, kernel); - } - }); + scheduled = numParallelThreads * groupsPerThread, + kernel = *hKernel](size_t threadId) { + for (unsigned i = 0; i < remainder; i++) { + auto index = scheduled + i; + groups[index](threadId, kernel); + } + }); } } } diff --git a/source/adapters/native_cpu/event.cpp b/source/adapters/native_cpu/event.cpp index 6a5ff41e28..13afc9f66a 100644 --- a/source/adapters/native_cpu/event.cpp +++ b/source/adapters/native_cpu/event.cpp @@ -11,9 +11,9 @@ #include "ur_api.h" #include "common.hpp" +#include "device.hpp" #include "event.hpp" #include "queue.hpp" -#include "device.hpp" #include <cstdint> #include <mutex> diff --git 
a/source/adapters/native_cpu/event.hpp b/source/adapters/native_cpu/event.hpp index 4b51875afd..ac3a322e21 100644 --- a/source/adapters/native_cpu/event.hpp +++ b/source/adapters/native_cpu/event.hpp @@ -9,12 +9,12 @@ //===----------------------------------------------------------------------===// #pragma once #include "common.hpp" +#include "threadpool.hpp" #include "ur_api.h" #include <cstdint> #include <future> #include <mutex> #include <vector> -#include "threadpool.hpp" struct ur_event_handle_t_ : RefCounted { diff --git a/source/adapters/native_cpu/kernel.hpp b/source/adapters/native_cpu/kernel.hpp index 59779e439c..6ca3eae777 100644 --- a/source/adapters/native_cpu/kernel.hpp +++ b/source/adapters/native_cpu/kernel.hpp @@ -28,7 +28,7 @@ struct local_arg_info_t { struct ur_kernel_handle_t_ : RefCounted { - inline ur_kernel_handle_t_(ur_program_handle_t hProgram, const char *name, + ur_kernel_handle_t_(ur_program_handle_t hProgram, const char *name, nativecpu_task_t subhandler) : hProgram(hProgram), _name{name}, _subhandler{std::move(subhandler)} {} diff --git a/source/adapters/native_cpu/nativecpu_state.hpp b/source/adapters/native_cpu/nativecpu_state.hpp index c802229326..b9109f647e 100644 --- a/source/adapters/native_cpu/nativecpu_state.hpp +++ b/source/adapters/native_cpu/nativecpu_state.hpp @@ -20,9 +20,9 @@ struct state { size_t MNumGroups[3]; size_t MGlobalOffset[3]; uint32_t NumSubGroups, SubGroup_id, SubGroup_local_id, SubGroup_size; - inline state(size_t globalR0, size_t globalR1, size_t globalR2, size_t localR0, - size_t localR1, size_t localR2, size_t globalO0, size_t globalO1, - size_t globalO2) + inline state(size_t globalR0, size_t globalR1, size_t globalR2, + size_t localR0, size_t localR1, size_t localR2, size_t globalO0, + size_t globalO1, size_t globalO2) : MGlobal_range{globalR0, globalR1, globalR2}, MWorkGroup_size{localR0, localR1, localR2}, MNumGroups{globalR0 / localR0, globalR1 / localR1, globalR2 / localR2}, diff --git 
a/source/adapters/native_cpu/threadpool.hpp b/source/adapters/native_cpu/threadpool.hpp index f2f907f4b5..8b589e4cd4 100644 --- a/source/adapters/native_cpu/threadpool.hpp +++ b/source/adapters/native_cpu/threadpool.hpp @@ -213,6 +213,7 @@ using simple_threadpool_t = threadpool_interface<detail::simple_thread_pool>; class TasksInfo_TP { using FType = std::future<void>; std::vector<FType> futures; + public: inline void schedule(FType &&f) { futures.emplace_back(std::move(f)); } inline void wait() { @@ -257,12 +258,15 @@ struct TBB_threadpool { class TBB_TasksInfo { TBB_threadpool *tp; + public: inline void wait() { tp->tasks.wait(); } TBB_TasksInfo(TBB_threadpool &t) : tp(&t) {} }; -template <> struct Scheduler<TBB_threadpool> : Scheduler_base<TBB_threadpool, TBB_TasksInfo> { +template <> +struct Scheduler<TBB_threadpool> + : Scheduler_base<TBB_threadpool, TBB_TasksInfo> { using Scheduler_base<TBB_threadpool, TBB_TasksInfo>::Scheduler_base; template <class T> inline void schedule(T &&task) { ref.tasks.run(std::function<void()>([=]() mutable { From 2abe90aa0670fd64b0b5fb8dadab45bb5716b448 Mon Sep 17 00:00:00 2001 From: Uwe Dolinsky <uwe@codeplay.com> Date: Tue, 28 Jan 2025 12:10:04 +0000 Subject: [PATCH 44/48] [NATIVECPU] clang-format --- source/adapters/native_cpu/nativecpu_state.hpp | 2 +- source/adapters/native_cpu/threadpool.hpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/source/adapters/native_cpu/nativecpu_state.hpp b/source/adapters/native_cpu/nativecpu_state.hpp index b9109f647e..68743c33cf 100644 --- a/source/adapters/native_cpu/nativecpu_state.hpp +++ b/source/adapters/native_cpu/nativecpu_state.hpp @@ -43,7 +43,7 @@ struct state { } inline void update(size_t group0, size_t group1, size_t group2, size_t local0, - size_t local1, size_t local2) { + size_t local1, size_t local2) { MWorkGroup_id[0] = group0; MWorkGroup_id[1] = group1; MWorkGroup_id[2] = group2; diff --git a/source/adapters/native_cpu/threadpool.hpp 
b/source/adapters/native_cpu/threadpool.hpp index 8b589e4cd4..9b2abc45dc 100644 --- a/source/adapters/native_cpu/threadpool.hpp +++ b/source/adapters/native_cpu/threadpool.hpp @@ -287,5 +287,5 @@ using threadpool_t = TBB_threadpool; namespace native_cpu { using tasksinfo_t = TasksInfo_TP; using threadpool_t = simple_threadpool_t; -} +} // namespace native_cpu #endif From bddd831c1efdae43e566007182ccfd7fc1273f09 Mon Sep 17 00:00:00 2001 From: Uwe Dolinsky <uwe@codeplay.com> Date: Tue, 28 Jan 2025 12:47:34 +0000 Subject: [PATCH 45/48] [NATIVECPU] removed inline --- source/adapters/native_cpu/enqueue.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/source/adapters/native_cpu/enqueue.cpp b/source/adapters/native_cpu/enqueue.cpp index 96100ac945..7f5241a43c 100644 --- a/source/adapters/native_cpu/enqueue.cpp +++ b/source/adapters/native_cpu/enqueue.cpp @@ -26,8 +26,8 @@ struct NDRDescT { RangeT GlobalOffset; RangeT GlobalSize; RangeT LocalSize; - inline NDRDescT(uint32_t WorkDim, const size_t *GlobalWorkOffset, - const size_t *GlobalWorkSize, const size_t *LocalWorkSize) + NDRDescT(uint32_t WorkDim, const size_t *GlobalWorkOffset, + const size_t *GlobalWorkSize, const size_t *LocalWorkSize) : WorkDim(WorkDim) { for (uint32_t I = 0; I < WorkDim; I++) { GlobalOffset[I] = GlobalWorkOffset[I]; From e7d9ff72dc759a92e2df105e025510c8f6e4eb1d Mon Sep 17 00:00:00 2001 From: Uwe Dolinsky <uwe@codeplay.com> Date: Tue, 28 Jan 2025 12:59:53 +0000 Subject: [PATCH 46/48] [NATIVECPU] added separate if statement for clarity --- source/adapters/native_cpu/enqueue.cpp | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/source/adapters/native_cpu/enqueue.cpp b/source/adapters/native_cpu/enqueue.cpp index 7f5241a43c..22cf26602f 100644 --- a/source/adapters/native_cpu/enqueue.cpp +++ b/source/adapters/native_cpu/enqueue.cpp @@ -213,18 +213,19 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( } auto numGroups = 
groups.size(); auto groupsPerThread = numGroups / numParallelThreads; - auto remainder = numGroups % numParallelThreads; - for (unsigned thread = 0; groupsPerThread && thread < numParallelThreads; - thread++) { - Tasks.schedule([groups, thread, groupsPerThread, - kernel = *hKernel](size_t threadId) { - for (unsigned i = 0; i < groupsPerThread; i++) { - auto index = thread * groupsPerThread + i; - groups[index](threadId, kernel); - } - }); + if (groupsPerThread) { + for (unsigned thread = 0; thread < numParallelThreads; thread++) { + Tasks.schedule([groups, thread, groupsPerThread, + kernel = *hKernel](size_t threadId) { + for (unsigned i = 0; i < groupsPerThread; i++) { + auto index = thread * groupsPerThread + i; + groups[index](threadId, kernel); + } + }); + } } // schedule the remaining tasks + auto remainder = numGroups % numParallelThreads; if (remainder) { Tasks.schedule([groups, remainder, scheduled = numParallelThreads * groupsPerThread, From 4e9bd6780a9303b13ac45dd9684692019adb8bd1 Mon Sep 17 00:00:00 2001 From: Uwe Dolinsky <uwe@codeplay.com> Date: Tue, 28 Jan 2025 13:08:22 +0000 Subject: [PATCH 47/48] [NATIVECPU] renamed wait to wait_all --- source/adapters/native_cpu/event.cpp | 2 +- source/adapters/native_cpu/threadpool.hpp | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/source/adapters/native_cpu/event.cpp b/source/adapters/native_cpu/event.cpp index 13afc9f66a..b03591dc57 100644 --- a/source/adapters/native_cpu/event.cpp +++ b/source/adapters/native_cpu/event.cpp @@ -139,7 +139,7 @@ void ur_event_handle_t_::wait() { if (done) { return; } - this->futures.wait(); + this->futures.wait_all(); queue->removeEvent(this); done = true; // The callback may need to acquire the lock, so we unlock it here diff --git a/source/adapters/native_cpu/threadpool.hpp b/source/adapters/native_cpu/threadpool.hpp index 9b2abc45dc..b38ccad83a 100644 --- a/source/adapters/native_cpu/threadpool.hpp +++ b/source/adapters/native_cpu/threadpool.hpp @@ 
-216,7 +216,7 @@ class TasksInfo_TP { public: inline void schedule(FType &&f) { futures.emplace_back(std::move(f)); } - inline void wait() { + inline void wait_all() { for (auto &f : futures) f.wait(); } @@ -260,7 +260,7 @@ class TBB_TasksInfo { TBB_threadpool *tp; public: - inline void wait() { tp->tasks.wait(); } + inline void wait_all() { tp->tasks.wait(); } TBB_TasksInfo(TBB_threadpool &t) : tp(&t) {} }; From 20668ff36e92b5050d3a8c587523f45568c34d39 Mon Sep 17 00:00:00 2001 From: Uwe Dolinsky <uwe@codeplay.com> Date: Mon, 3 Feb 2025 20:30:51 +0000 Subject: [PATCH 48/48] [NATIVECPU] move --- source/adapters/native_cpu/threadpool.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/source/adapters/native_cpu/threadpool.hpp b/source/adapters/native_cpu/threadpool.hpp index b38ccad83a..a016131f67 100644 --- a/source/adapters/native_cpu/threadpool.hpp +++ b/source/adapters/native_cpu/threadpool.hpp @@ -268,8 +268,8 @@ template <> struct Scheduler<TBB_threadpool> : Scheduler_base<TBB_threadpool, TBB_TasksInfo> { using Scheduler_base<TBB_threadpool, TBB_TasksInfo>::Scheduler_base; - template <class T> inline void schedule(T &&task) { - ref.tasks.run(std::function<void()>([=]() mutable { + template <class T> inline void schedule(T &&task_) { + ref.tasks.run(std::function<void()>([task = std::move(task_)]() mutable { auto thread_id = tbb::this_task_arena::current_thread_index(); assert(thread_id >= 0 && thread_id < oneapi::tbb::info::default_concurrency());