From da5500c6b267d1492f84e9552946d55817045076 Mon Sep 17 00:00:00 2001
From: Uwe Dolinsky <uwe@codeplay.com>
Date: Mon, 26 Aug 2024 20:25:02 +0100
Subject: [PATCH 01/48] [SYCLNATIVECPU] inline native_cpu adapter functions

---
 source/adapters/native_cpu/enqueue.cpp         |  4 ++--
 source/adapters/native_cpu/kernel.hpp          | 12 ++++++------
 source/adapters/native_cpu/nativecpu_state.hpp |  6 +++---
 3 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/source/adapters/native_cpu/enqueue.cpp b/source/adapters/native_cpu/enqueue.cpp
index b5d4713e2f..ec9ba99389 100644
--- a/source/adapters/native_cpu/enqueue.cpp
+++ b/source/adapters/native_cpu/enqueue.cpp
@@ -25,7 +25,7 @@ struct NDRDescT {
   RangeT GlobalOffset;
   RangeT GlobalSize;
   RangeT LocalSize;
-  NDRDescT(uint32_t WorkDim, const size_t *GlobalWorkOffset,
+  inline NDRDescT(uint32_t WorkDim, const size_t *GlobalWorkOffset,
            const size_t *GlobalWorkSize, const size_t *LocalWorkSize)
       : WorkDim(WorkDim) {
     for (uint32_t I = 0; I < WorkDim; I++) {
@@ -52,7 +52,7 @@ struct NDRDescT {
 } // namespace native_cpu
 
 #ifdef NATIVECPU_USE_OCK
-static native_cpu::state getResizedState(const native_cpu::NDRDescT &ndr,
+static inline native_cpu::state getResizedState(const native_cpu::NDRDescT &ndr,
                                          size_t itemsPerThread) {
   native_cpu::state resized_state(
       ndr.GlobalSize[0], ndr.GlobalSize[1], ndr.GlobalSize[2], itemsPerThread,
diff --git a/source/adapters/native_cpu/kernel.hpp b/source/adapters/native_cpu/kernel.hpp
index b5728fa8b2..c71e4f7e75 100644
--- a/source/adapters/native_cpu/kernel.hpp
+++ b/source/adapters/native_cpu/kernel.hpp
@@ -20,7 +20,7 @@ namespace native_cpu {
 struct NativeCPUArgDesc {
   void *MPtr;
 
-  NativeCPUArgDesc(void *Ptr) : MPtr(Ptr){};
+  inline NativeCPUArgDesc(void *Ptr) : MPtr(Ptr){};
 };
 
 } // namespace native_cpu
@@ -33,18 +33,18 @@ using nativecpu_task_t = std::function<nativecpu_kernel_t>;
 struct local_arg_info_t {
   uint32_t argIndex;
   size_t argSize;
-  local_arg_info_t(uint32_t argIndex, size_t argSize)
+  inline local_arg_info_t(uint32_t argIndex, size_t argSize)
       : argIndex(argIndex), argSize(argSize) {}
 };
 
 struct ur_kernel_handle_t_ : RefCounted {
 
-  ur_kernel_handle_t_(ur_program_handle_t hProgram, const char *name,
+  inline ur_kernel_handle_t_(ur_program_handle_t hProgram, const char *name,
                       nativecpu_task_t subhandler)
       : hProgram(hProgram), _name{name}, _subhandler{std::move(subhandler)},
         HasReqdWGSize(false) {}
 
-  ur_kernel_handle_t_(const ur_kernel_handle_t_ &other)
+  inline ur_kernel_handle_t_(const ur_kernel_handle_t_ &other)
       : hProgram(other.hProgram), _name(other._name),
         _subhandler(other._subhandler), _args(other._args),
         _localArgInfo(other._localArgInfo), _localMemPool(other._localMemPool),
@@ -53,12 +53,12 @@ struct ur_kernel_handle_t_ : RefCounted {
     incrementReferenceCount();
   }
 
-  ~ur_kernel_handle_t_() {
+  inline ~ur_kernel_handle_t_() {
     if (decrementReferenceCount() == 0) {
       free(_localMemPool);
     }
   }
-  ur_kernel_handle_t_(ur_program_handle_t hProgram, const char *name,
+  inline ur_kernel_handle_t_(ur_program_handle_t hProgram, const char *name,
                       nativecpu_task_t subhandler,
                       const native_cpu::ReqdWGSize_t &ReqdWGSize)
       : hProgram(hProgram), _name{name}, _subhandler{std::move(subhandler)},
diff --git a/source/adapters/native_cpu/nativecpu_state.hpp b/source/adapters/native_cpu/nativecpu_state.hpp
index bb798b22e6..b3a14d4137 100755
--- a/source/adapters/native_cpu/nativecpu_state.hpp
+++ b/source/adapters/native_cpu/nativecpu_state.hpp
@@ -20,7 +20,7 @@ struct state {
   size_t MNumGroups[3];
   size_t MGlobalOffset[3];
   uint32_t NumSubGroups, SubGroup_id, SubGroup_local_id, SubGroup_size;
-  state(size_t globalR0, size_t globalR1, size_t globalR2, size_t localR0,
+  inline state(size_t globalR0, size_t globalR1, size_t globalR2, size_t localR0,
         size_t localR1, size_t localR2, size_t globalO0, size_t globalO1,
         size_t globalO2)
       : MGlobal_range{globalR0, globalR1, globalR2}, MWorkGroup_size{localR0,
@@ -43,7 +43,7 @@ struct state {
     SubGroup_size = 1;
   }
 
-  void update(size_t group0, size_t group1, size_t group2, size_t local0,
+  inline void update(size_t group0, size_t group1, size_t group2, size_t local0,
               size_t local1, size_t local2) {
     MWorkGroup_id[0] = group0;
     MWorkGroup_id[1] = group1;
@@ -59,7 +59,7 @@ struct state {
         MWorkGroup_size[2] * MWorkGroup_id[2] + MLocal_id[2] + MGlobalOffset[2];
   }
 
-  void update(size_t group0, size_t group1, size_t group2) {
+  inline void update(size_t group0, size_t group1, size_t group2) {
     MWorkGroup_id[0] = group0;
     MWorkGroup_id[1] = group1;
     MWorkGroup_id[2] = group2;

From 0b8b0f7d9debb791ef18c3ae2e8853345c1c46b5 Mon Sep 17 00:00:00 2001
From: Uwe Dolinsky <uwe@codeplay.com>
Date: Wed, 28 Aug 2024 10:55:46 +0100
Subject: [PATCH 02/48] [NATIVECPU] use size_t, reserve vector size

---
 source/adapters/native_cpu/enqueue.cpp | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/source/adapters/native_cpu/enqueue.cpp b/source/adapters/native_cpu/enqueue.cpp
index ec9ba99389..c878d289e4 100644
--- a/source/adapters/native_cpu/enqueue.cpp
+++ b/source/adapters/native_cpu/enqueue.cpp
@@ -158,7 +158,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(
         }
         // Peel the remaining work items. Since the local size is 1, we iterate
         // over the work groups.
-        for (unsigned g0 = new_num_work_groups_0 * itemsPerThread; g0 < numWG0;
+        for (size_t g0 = new_num_work_groups_0 * itemsPerThread; g0 < numWG0;
              g0++) {
           state.update(g0, g1, g2);
           hKernel->_subhandler(hKernel->_args.data(), &state);
@@ -188,6 +188,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(
       // Split dimension 0 across the threadpool
       // Here we try to create groups of workgroups in order to reduce
       // synchronization overhead
+      groups.reserve(numWG2 * numWG1 * numWG0);
       for (unsigned g2 = 0; g2 < numWG2; g2++) {
         for (unsigned g1 = 0; g1 < numWG1; g1++) {
           for (unsigned g0 = 0; g0 < numWG0; g0++) {
@@ -204,6 +205,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(
       auto numGroups = groups.size();
       auto groupsPerThread = numGroups / numParallelThreads;
       auto remainder = numGroups % numParallelThreads;
+      futures.reserve(numParallelThreads + remainder);
       for (unsigned thread = 0; thread < numParallelThreads; thread++) {
         futures.emplace_back(tp.schedule_task(
             [&groups, thread, groupsPerThread, hKernel](size_t threadId) {
@@ -400,7 +402,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferFill(
   // TODO: error checking
   // TODO: handle async
   void *startingPtr = hBuffer->_mem + offset;
-  unsigned steps = size / patternSize;
+  size_t steps = size / patternSize;
   for (unsigned i = 0; i < steps; i++) {
     memcpy(static_cast<int8_t *>(startingPtr) + i * patternSize, pPattern,
            patternSize);
@@ -546,7 +548,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMFill(
     break;
   }
   default: {
-    for (unsigned int step{0}; step < size; step += patternSize) {
+    for (size_t step{0}; step < size; step += patternSize) {
       auto *dest =
           reinterpret_cast<void *>(reinterpret_cast<uint8_t *>(ptr) + step);
       memcpy(dest, pPattern, patternSize);

From 88db20afc48d24e5ce52dabe99fd8d77998f967d Mon Sep 17 00:00:00 2001
From: Uwe Dolinsky <uwe@codeplay.com>
Date: Thu, 17 Oct 2024 16:56:06 +0100
Subject: [PATCH 03/48] [NATIVECPU] use strcpy_s

---
 source/adapters/native_cpu/common.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/source/adapters/native_cpu/common.cpp b/source/adapters/native_cpu/common.cpp
index b956fc8c7a..47afed3729 100644
--- a/source/adapters/native_cpu/common.cpp
+++ b/source/adapters/native_cpu/common.cpp
@@ -19,7 +19,7 @@ thread_local char ErrorMessage[MaxMessageSize];
 [[maybe_unused]] void setErrorMessage(const char *pMessage,
                                       ur_result_t ErrorCode) {
   assert(strlen(pMessage) <= MaxMessageSize);
-  strcpy(ErrorMessage, pMessage);
+  strcpy_s(ErrorMessage, MaxMessageSize, pMessage);
   ErrorMessageCode = ErrorCode;
 }
 

From 0b89dee701105aaa1e5eca5077c7b5b11babbccd Mon Sep 17 00:00:00 2001
From: Uwe Dolinsky <uwe@codeplay.com>
Date: Mon, 21 Oct 2024 17:02:10 +0100
Subject: [PATCH 04/48] [NATIVECPU] only enqueue when groupsPerThread is >0

---
 source/adapters/native_cpu/enqueue.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)
 mode change 100644 => 100755 source/adapters/native_cpu/enqueue.cpp

diff --git a/source/adapters/native_cpu/enqueue.cpp b/source/adapters/native_cpu/enqueue.cpp
old mode 100644
new mode 100755
index b36b550647..da49a4d518
--- a/source/adapters/native_cpu/enqueue.cpp
+++ b/source/adapters/native_cpu/enqueue.cpp
@@ -217,7 +217,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(
       auto groupsPerThread = numGroups / numParallelThreads;
       auto remainder = numGroups % numParallelThreads;
       futures.reserve(numParallelThreads + remainder);
-      for (unsigned thread = 0; thread < numParallelThreads; thread++) {
+      for (unsigned thread = 0; groupsPerThread && thread < numParallelThreads;
+           thread++) {
         futures.emplace_back(tp.schedule_task(
             [&groups, thread, groupsPerThread, hKernel](size_t threadId) {
               for (unsigned i = 0; i < groupsPerThread; i++) {

From d11133734455f548cdb9c630849261b37f4a0872 Mon Sep 17 00:00:00 2001
From: Uwe Dolinsky <uwe@codeplay.com>
Date: Mon, 21 Oct 2024 17:08:10 +0100
Subject: [PATCH 05/48] [NATIVECPU] changed filemode back

---
 source/adapters/native_cpu/enqueue.cpp | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 mode change 100755 => 100644 source/adapters/native_cpu/enqueue.cpp

diff --git a/source/adapters/native_cpu/enqueue.cpp b/source/adapters/native_cpu/enqueue.cpp
old mode 100755
new mode 100644

From a1166d803f33d948bad9367dd06a0dda56e0a370 Mon Sep 17 00:00:00 2001
From: Uwe Dolinsky <uwe@codeplay.com>
Date: Tue, 22 Oct 2024 15:26:58 +0100
Subject: [PATCH 06/48] [NATIVECPU] added threadpool file to CMakeLists.txt

---
 source/adapters/native_cpu/CMakeLists.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/source/adapters/native_cpu/CMakeLists.txt b/source/adapters/native_cpu/CMakeLists.txt
index 56cfc577d8..560172444b 100644
--- a/source/adapters/native_cpu/CMakeLists.txt
+++ b/source/adapters/native_cpu/CMakeLists.txt
@@ -34,6 +34,7 @@ add_ur_adapter(${TARGET_NAME}
         ${CMAKE_CURRENT_SOURCE_DIR}/queue.cpp
         ${CMAKE_CURRENT_SOURCE_DIR}/queue.hpp
         ${CMAKE_CURRENT_SOURCE_DIR}/sampler.cpp
+        ${CMAKE_CURRENT_SOURCE_DIR}/threadpool.hpp
         ${CMAKE_CURRENT_SOURCE_DIR}/ur_interface_loader.cpp
         ${CMAKE_CURRENT_SOURCE_DIR}/usm_p2p.cpp
         ${CMAKE_CURRENT_SOURCE_DIR}/virtual_mem.cpp

From 47b12a41d968297b56ca6bf962b2c8d5c1888987 Mon Sep 17 00:00:00 2001
From: Uwe Dolinsky <uwe@codeplay.com>
Date: Tue, 22 Oct 2024 15:43:32 +0100
Subject: [PATCH 07/48] [SYCLNATIVECPU] threadID now managed by kernel enqueue

---
 source/adapters/native_cpu/enqueue.cpp | 23 +++++++++++++----------
 1 file changed, 13 insertions(+), 10 deletions(-)
 mode change 100644 => 100755 source/adapters/native_cpu/enqueue.cpp

diff --git a/source/adapters/native_cpu/enqueue.cpp b/source/adapters/native_cpu/enqueue.cpp
old mode 100644
new mode 100755
index da49a4d518..8b1ca3445d
--- a/source/adapters/native_cpu/enqueue.cpp
+++ b/source/adapters/native_cpu/enqueue.cpp
@@ -26,7 +26,7 @@ struct NDRDescT {
   RangeT GlobalSize;
   RangeT LocalSize;
   inline NDRDescT(uint32_t WorkDim, const size_t *GlobalWorkOffset,
-           const size_t *GlobalWorkSize, const size_t *LocalWorkSize)
+                  const size_t *GlobalWorkSize, const size_t *LocalWorkSize)
       : WorkDim(WorkDim) {
     for (uint32_t I = 0; I < WorkDim; I++) {
       GlobalOffset[I] = GlobalWorkOffset[I];
@@ -53,7 +53,7 @@ struct NDRDescT {
 
 #ifdef NATIVECPU_USE_OCK
 static inline native_cpu::state getResizedState(const native_cpu::NDRDescT &ndr,
-                                         size_t itemsPerThread) {
+                                                size_t itemsPerThread) {
   native_cpu::state resized_state(
       ndr.GlobalSize[0], ndr.GlobalSize[1], ndr.GlobalSize[2], itemsPerThread,
       ndr.LocalSize[1], ndr.LocalSize[2], ndr.GlobalOffset[0],
@@ -179,20 +179,22 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(
 
   } else {
     // We are running a parallel_for over an nd_range
-
+    size_t threadId = 0;
     if (numWG1 * numWG2 >= numParallelThreads) {
       // Dimensions 1 and 2 have enough work, split them across the threadpool
       for (unsigned g2 = 0; g2 < numWG2; g2++) {
         for (unsigned g1 = 0; g1 < numWG1; g1++) {
-          futures.emplace_back(
-              tp.schedule_task([state, kernel = *hKernel, numWG0, g1, g2,
-                                numParallelThreads](size_t threadId) mutable {
+          futures.emplace_back(tp.schedule_task(
+              [state, kernel = *hKernel, numWG0, g1, g2, numParallelThreads,
+               threadId](size_t /*threadId*/) mutable {
                 for (unsigned g0 = 0; g0 < numWG0; g0++) {
                   kernel.handleLocalArgs(numParallelThreads, threadId);
                   state.update(g0, g1, g2);
                   kernel._subhandler(kernel._args.data(), &state);
                 }
               }));
+          if (++threadId == numParallelThreads)
+            threadId = 0;
         }
       }
     } else {
@@ -220,20 +222,21 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(
       for (unsigned thread = 0; groupsPerThread && thread < numParallelThreads;
            thread++) {
         futures.emplace_back(tp.schedule_task(
-            [&groups, thread, groupsPerThread, hKernel](size_t threadId) {
+            [&groups, thread, groupsPerThread, hKernel](size_t /*threadId*/) {
               for (unsigned i = 0; i < groupsPerThread; i++) {
                 auto index = thread * groupsPerThread + i;
-                groups[index](threadId, *hKernel);
+                groups[index](thread /*Id*/, *hKernel);
               }
             }));
       }
 
       // schedule the remaining tasks
       if (remainder) {
+        const size_t threadId = futures.size();
         futures.emplace_back(
-            tp.schedule_task([&groups, remainder,
+            tp.schedule_task([&groups, remainder, threadId,
                               scheduled = numParallelThreads * groupsPerThread,
-                              hKernel](size_t threadId) {
+                              hKernel](size_t /* threadId*/) {
               for (unsigned i = 0; i < remainder; i++) {
                 auto index = scheduled + i;
                 groups[index](threadId, *hKernel);

From 4b04ce6763765d7eb086a5f33a838f38a63a30e5 Mon Sep 17 00:00:00 2001
From: Uwe Dolinsky <uwe@codeplay.com>
Date: Tue, 22 Oct 2024 15:44:32 +0100
Subject: [PATCH 08/48] [SYCLNATIVECPU] file mode changed back

---
 source/adapters/native_cpu/enqueue.cpp | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 mode change 100755 => 100644 source/adapters/native_cpu/enqueue.cpp

diff --git a/source/adapters/native_cpu/enqueue.cpp b/source/adapters/native_cpu/enqueue.cpp
old mode 100755
new mode 100644

From 5406b39f26c6b0d9523303f0bc83845edc24a229 Mon Sep 17 00:00:00 2001
From: Uwe Dolinsky <uwe@codeplay.com>
Date: Tue, 22 Oct 2024 17:32:26 +0100
Subject: [PATCH 09/48] [NATIVECPU] Simple TBB backend

---
 source/adapters/native_cpu/CMakeLists.txt | 41 ++++++++++++++++
 source/adapters/native_cpu/device.hpp     |  4 ++
 source/adapters/native_cpu/enqueue.cpp    | 58 +++++++++++------------
 source/adapters/native_cpu/threadpool.hpp | 49 ++++++++++++++++++-
 4 files changed, 120 insertions(+), 32 deletions(-)
 mode change 100644 => 100755 source/adapters/native_cpu/enqueue.cpp
 mode change 100644 => 100755 source/adapters/native_cpu/threadpool.hpp

diff --git a/source/adapters/native_cpu/CMakeLists.txt b/source/adapters/native_cpu/CMakeLists.txt
index 56cfc577d8..d0f332a71a 100644
--- a/source/adapters/native_cpu/CMakeLists.txt
+++ b/source/adapters/native_cpu/CMakeLists.txt
@@ -48,6 +48,37 @@ set_target_properties(${TARGET_NAME} PROPERTIES
         SOVERSION "${PROJECT_VERSION_MAJOR}"
 )
 
+option(NATIVECPU_WITH_TBB "Use TBB as backend for Native CPU" ON)
+if(NATIVECPU_WITH_TBB)
+  message(STATUS "Building Native CPU adapter with TBB backend.")
+
+  include(FetchContent)
+  FetchContent_Declare(
+    tbb
+    GIT_REPOSITORY https://github.com/oneapi-src/oneTBB.git
+    GIT_TAG 42b833fe806606d05a5cad064b8b87365818d716
+    CMAKE_ARGS "-DTBB_TEST:BOOL=OFF -DTBB_EXAMPLES:BOOL=OFF -DTBB_BENCH:BOOL=OFF"
+    GIT_SHALLOW ON
+    OVERRIDE_FIND_PACKAGE
+  )
+  set(TBB_TEST OFF CACHE INTERNAL "" FORCE)
+  set(TBB_EXAMPLES OFF CACHE INTERNAL "" FORCE)
+  set(TBB_BENCH OFF CACHE INTERNAL "" FORCE)
+  set(TBB_BUILD ON CACHE INTERNAL "" FORCE)
+  set(TBB_FIND_PACKAGE OFF CACHE INTERNAL "" FORCE)
+  set(TBB_FUZZ_TESTING OFF CACHE INTERNAL "" FORCE)
+  set(TBB_INSTALL ON CACHE INTERNAL "" FORCE)
+  FetchContent_MakeAvailable(tbb)
+
+  FetchContent_GetProperties(tbb)
+
+  if(NOT tbb_POPULATED)
+    FetchContent_Populate(tbb)
+  endif()
+  set(TBB_SOURCE_DIR_INTERNAL ${tbb_SOURCE_DIR}/include)
+  set(TBB_BINARY_DIR_INTERNAL ${tbb_BINARY_DIR})
+endif()
+
 find_package(Threads REQUIRED)
 
 target_link_libraries(${TARGET_NAME} PRIVATE
@@ -60,3 +91,13 @@ target_link_libraries(${TARGET_NAME} PRIVATE
 target_include_directories(${TARGET_NAME} PRIVATE
         "${CMAKE_CURRENT_SOURCE_DIR}/../../"
 )
+
+if(NATIVECPU_WITH_TBB)
+  target_link_libraries(${TARGET_NAME} PRIVATE
+        TBB::tbb
+  )
+  target_include_directories(${TARGET_NAME} PRIVATE
+        "${TBB_SOURCE_DIR_INTERNAL}"
+  )
+  target_compile_definitions(${TARGET_NAME} PRIVATE NATIVECPU_USE_TBB)
+endif()
diff --git a/source/adapters/native_cpu/device.hpp b/source/adapters/native_cpu/device.hpp
index 2308c1a7f4..1a6b0d091a 100644
--- a/source/adapters/native_cpu/device.hpp
+++ b/source/adapters/native_cpu/device.hpp
@@ -14,7 +14,11 @@
 #include <ur/ur.hpp>
 
 struct ur_device_handle_t_ {
+#ifdef NATIVECPU_USE_TBB
+  native_cpu::TBB_threadpool tp;
+#else
   native_cpu::threadpool_t tp;
+#endif
   ur_device_handle_t_(ur_platform_handle_t ArgPlt);
 
   const uint64_t mem_size;
diff --git a/source/adapters/native_cpu/enqueue.cpp b/source/adapters/native_cpu/enqueue.cpp
old mode 100644
new mode 100755
index 33d8c35c36..1849d24c2c
--- a/source/adapters/native_cpu/enqueue.cpp
+++ b/source/adapters/native_cpu/enqueue.cpp
@@ -109,7 +109,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(
   auto &tp = hQueue->device->tp;
   const size_t numParallelThreads = tp.num_threads();
   hKernel->updateMemPool(numParallelThreads);
-  std::vector<std::future<void>> futures;
+  auto Tasks = native_cpu::getScheduler(tp);
   std::vector<std::function<void(size_t, ur_kernel_handle_t_)>> groups;
   auto numWG0 = ndr.GlobalSize[0] / ndr.LocalSize[0];
   auto numWG1 = ndr.GlobalSize[1] / ndr.LocalSize[1];
@@ -158,14 +158,13 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(
     for (unsigned g2 = 0; g2 < numWG2; g2++) {
       for (unsigned g1 = 0; g1 < numWG1; g1++) {
         for (unsigned g0 = 0; g0 < new_num_work_groups_0; g0 += 1) {
-          futures.emplace_back(
-              tp.schedule_task([&ndr = std::as_const(ndr), itemsPerThread,
-                                hKernel, g0, g1, g2](size_t) {
-                native_cpu::state resized_state =
-                    getResizedState(ndr, itemsPerThread);
-                resized_state.update(g0, g1, g2);
-                hKernel->_subhandler(hKernel->_args.data(), &resized_state);
-              }));
+          Tasks.schedule([&ndr = std::as_const(ndr), itemsPerThread, hKernel,
+                          g0, g1, g2](size_t) {
+            native_cpu::state resized_state =
+                getResizedState(ndr, itemsPerThread);
+            resized_state.update(g0, g1, g2);
+            hKernel->_subhandler(hKernel->_args.data(), &resized_state);
+          });
         }
         // Peel the remaining work items. Since the local size is 1, we iterate
         // over the work groups.
@@ -184,15 +183,14 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(
       // Dimensions 1 and 2 have enough work, split them across the threadpool
       for (unsigned g2 = 0; g2 < numWG2; g2++) {
         for (unsigned g1 = 0; g1 < numWG1; g1++) {
-          futures.emplace_back(
-              tp.schedule_task([state, kernel = *hKernel, numWG0, g1, g2,
-                                numParallelThreads](size_t threadId) mutable {
-                for (unsigned g0 = 0; g0 < numWG0; g0++) {
-                  kernel.handleLocalArgs(numParallelThreads, threadId);
-                  state.update(g0, g1, g2);
-                  kernel._subhandler(kernel._args.data(), &state);
-                }
-              }));
+          Tasks.schedule([state, kernel = *hKernel, numWG0, g1, g2,
+                          numParallelThreads](size_t threadId) mutable {
+            for (unsigned g0 = 0; g0 < numWG0; g0++) {
+              kernel.handleLocalArgs(numParallelThreads, threadId);
+              state.update(g0, g1, g2);
+              kernel._subhandler(kernel._args.data(), &state);
+            }
+          });
         }
       }
     } else {
@@ -216,32 +214,30 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(
       auto groupsPerThread = numGroups / numParallelThreads;
       auto remainder = numGroups % numParallelThreads;
       for (unsigned thread = 0; thread < numParallelThreads; thread++) {
-        futures.emplace_back(tp.schedule_task(
+        Tasks.schedule(
             [&groups, thread, groupsPerThread, hKernel](size_t threadId) {
               for (unsigned i = 0; i < groupsPerThread; i++) {
                 auto index = thread * groupsPerThread + i;
                 groups[index](threadId, *hKernel);
               }
-            }));
+            });
       }
 
       // schedule the remaining tasks
       if (remainder) {
-        futures.emplace_back(
-            tp.schedule_task([&groups, remainder,
-                              scheduled = numParallelThreads * groupsPerThread,
-                              hKernel](size_t threadId) {
-              for (unsigned i = 0; i < remainder; i++) {
-                auto index = scheduled + i;
-                groups[index](threadId, *hKernel);
-              }
-            }));
+        Tasks.schedule([&groups, remainder,
+                        scheduled = numParallelThreads * groupsPerThread,
+                        hKernel](size_t threadId) {
+          for (unsigned i = 0; i < remainder; i++) {
+            auto index = scheduled + i;
+            groups[index](threadId, *hKernel);
+          }
+        });
       }
     }
   }
 
-  for (auto &f : futures)
-    f.get();
+  Tasks.wait();
 #endif // NATIVECPU_USE_OCK
   // TODO: we should avoid calling clear here by avoiding using push_back
   // in setKernelArgs.
diff --git a/source/adapters/native_cpu/threadpool.hpp b/source/adapters/native_cpu/threadpool.hpp
old mode 100644
new mode 100755
index 2f2f79cd5a..b402609781
--- a/source/adapters/native_cpu/threadpool.hpp
+++ b/source/adapters/native_cpu/threadpool.hpp
@@ -209,6 +209,53 @@ template <typename ThreadPoolT> class threadpool_interface {
   }
 };
 
-using threadpool_t = threadpool_interface<detail::simple_thread_pool>;
+template <class TP> class Scheduler {
+  std::vector<std::future<void>> futures;
+  TP &TPref;
+
+public:
+  Scheduler(TP &ref) : TPref(ref) {}
+
+  inline void schedule(worker_task_t &&task) {
+    futures.emplace_back(TPref.schedule_task(std::move(task)));
+  }
+  inline void wait() {
+    for (auto &f : futures)
+      f.get();
+  }
+};
+
+using simple_threadpool_t = threadpool_interface<detail::simple_thread_pool>;
+inline Scheduler<simple_threadpool_t> getScheduler(simple_threadpool_t &tp) {
+  return Scheduler(tp);
+}
+
+using threadpool_t = simple_threadpool_t;
+
+} // namespace native_cpu
+
+#ifdef NATIVECPU_USE_TBB
+// Simple TBB backend
+#include "oneapi/tbb.h"
+namespace native_cpu {
+
+struct TBB_threadpool {
+  inline size_t num_threads() const noexcept { return 32; }
+};
+template <> class Scheduler<TBB_threadpool> {
+  oneapi::tbb::task_group tasks;
+
+public:
+  inline void schedule(worker_task_t &&task) {
+    tasks.run([&]() { task(0); });
+  }
+  inline void wait() { tasks.wait(); }
+};
+
+inline Scheduler<TBB_threadpool> getScheduler(TBB_threadpool &tp) {
+  return Scheduler<TBB_threadpool>();
+}
 
 } // namespace native_cpu
+
+#endif

From 2e8ae3f5a4ba308cd3054459934b141fe0383212 Mon Sep 17 00:00:00 2001
From: Uwe Dolinsky <uwe@codeplay.com>
Date: Tue, 22 Oct 2024 17:33:31 +0100
Subject: [PATCH 10/48] [NATIVECPU] changed back filemode

---
 source/adapters/native_cpu/enqueue.cpp    | 0
 source/adapters/native_cpu/threadpool.hpp | 0
 2 files changed, 0 insertions(+), 0 deletions(-)
 mode change 100755 => 100644 source/adapters/native_cpu/enqueue.cpp
 mode change 100755 => 100644 source/adapters/native_cpu/threadpool.hpp

diff --git a/source/adapters/native_cpu/enqueue.cpp b/source/adapters/native_cpu/enqueue.cpp
old mode 100755
new mode 100644
diff --git a/source/adapters/native_cpu/threadpool.hpp b/source/adapters/native_cpu/threadpool.hpp
old mode 100755
new mode 100644

From d5cf2c99a231df4b020959a3ad943d5d28c67584 Mon Sep 17 00:00:00 2001
From: Uwe Dolinsky <uwe@codeplay.com>
Date: Wed, 23 Oct 2024 09:54:06 +0100
Subject: [PATCH 11/48] [NATIVECPU] fixed scheduling

---
 source/adapters/native_cpu/threadpool.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/source/adapters/native_cpu/threadpool.hpp b/source/adapters/native_cpu/threadpool.hpp
index b402609781..216436d29a 100644
--- a/source/adapters/native_cpu/threadpool.hpp
+++ b/source/adapters/native_cpu/threadpool.hpp
@@ -247,7 +247,7 @@ template <> class Scheduler<TBB_threadpool> {
 
 public:
   inline void schedule(worker_task_t &&task) {
-    tasks.run([&]() { task(0); });
+    tasks.run(std::function<void()>([=]() mutable { task(0); }));
   }
   inline void wait() { tasks.wait(); }
 };

From ba9b2c5f5aa716a723a19a704a4300bf75cfeeae Mon Sep 17 00:00:00 2001
From: Uwe Dolinsky <uwe@codeplay.com>
Date: Wed, 23 Oct 2024 17:32:29 +0100
Subject: [PATCH 12/48] [NATIVECPU] more shared code

---
 source/adapters/native_cpu/threadpool.hpp | 35 +++++++++++------------
 1 file changed, 17 insertions(+), 18 deletions(-)
 mode change 100644 => 100755 source/adapters/native_cpu/threadpool.hpp

diff --git a/source/adapters/native_cpu/threadpool.hpp b/source/adapters/native_cpu/threadpool.hpp
old mode 100644
new mode 100755
index 216436d29a..9098d52de4
--- a/source/adapters/native_cpu/threadpool.hpp
+++ b/source/adapters/native_cpu/threadpool.hpp
@@ -209,25 +209,29 @@ template <typename ThreadPoolT> class threadpool_interface {
   }
 };
 
-template <class TP> class Scheduler {
-  std::vector<std::future<void>> futures;
+template <class TP> struct SchedulerBase {
   TP &TPref;
+  SchedulerBase(TP &ref) : TPref(ref) {}
+};
 
-public:
-  Scheduler(TP &ref) : TPref(ref) {}
+template <class TP> struct Scheduler : SchedulerBase<TP> {
+  using SchedulerBase<TP>::SchedulerBase;
 
   inline void schedule(worker_task_t &&task) {
-    futures.emplace_back(TPref.schedule_task(std::move(task)));
+    futures.emplace_back(this->TPref.schedule_task(std::move(task)));
   }
   inline void wait() {
     for (auto &f : futures)
       f.get();
   }
+
+private:
+  std::vector<std::future<void>> futures;
 };
 
 using simple_threadpool_t = threadpool_interface<detail::simple_thread_pool>;
-inline Scheduler<simple_threadpool_t> getScheduler(simple_threadpool_t &tp) {
-  return Scheduler(tp);
+template <class TPType> inline Scheduler<TPType> getScheduler(TPType &tp) {
+  return Scheduler<TPType>(tp);
 }
 
 using threadpool_t = simple_threadpool_t;
@@ -240,22 +244,17 @@ using threadpool_t = simple_threadpool_t;
 namespace native_cpu {
 
 struct TBB_threadpool {
+  oneapi::tbb::task_group tasks;
   inline size_t num_threads() const noexcept { return 32; }
 };
-template <> class Scheduler<TBB_threadpool> {
-  oneapi::tbb::task_group tasks;
-
-public:
-  inline void schedule(worker_task_t &&task) {
-    tasks.run(std::function<void()>([=]() mutable { task(0); }));
+template <> struct Scheduler<TBB_threadpool> : SchedulerBase<TBB_threadpool> {
+  using SchedulerBase<TBB_threadpool>::SchedulerBase;
+  template <class T> inline void schedule(T &&task) {
+    TPref.tasks.run(std::function<void()>([=]() mutable { task(0); }));
   }
-  inline void wait() { tasks.wait(); }
+  inline void wait() { TPref.tasks.wait(); }
 };
 
-inline Scheduler<TBB_threadpool> getScheduler(TBB_threadpool &tp) {
-  return Scheduler<TBB_threadpool>();
-}
-
 } // namespace native_cpu
 
 #endif

From 5cf59d21dd548bbf30125b7a3f9d7c17e8edcb30 Mon Sep 17 00:00:00 2001
From: Uwe Dolinsky <uwe@codeplay.com>
Date: Wed, 23 Oct 2024 17:34:27 +0100
Subject: [PATCH 13/48] [NATIVECPU] reversed filemode change

---
 source/adapters/native_cpu/threadpool.hpp | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 mode change 100755 => 100644 source/adapters/native_cpu/threadpool.hpp

diff --git a/source/adapters/native_cpu/threadpool.hpp b/source/adapters/native_cpu/threadpool.hpp
old mode 100755
new mode 100644

From 7077d1a1a6b5f7efdee940c50dc2d092c2605008 Mon Sep 17 00:00:00 2001
From: Uwe Dolinsky <uwe@codeplay.com>
Date: Thu, 24 Oct 2024 09:55:56 +0100
Subject: [PATCH 14/48] [NATIVECPU] update oneTBB tag

---
 source/adapters/native_cpu/CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/source/adapters/native_cpu/CMakeLists.txt b/source/adapters/native_cpu/CMakeLists.txt
index 57ddf11cc8..59fd1f859c 100644
--- a/source/adapters/native_cpu/CMakeLists.txt
+++ b/source/adapters/native_cpu/CMakeLists.txt
@@ -57,7 +57,7 @@ if(NATIVECPU_WITH_TBB)
   FetchContent_Declare(
     tbb
     GIT_REPOSITORY https://github.com/oneapi-src/oneTBB.git
-    GIT_TAG 42b833fe806606d05a5cad064b8b87365818d716
+    GIT_TAG 377a91431ec62c5e296dbeca683c5d1e66d69f32
     CMAKE_ARGS "-DTBB_TEST:BOOL=OFF -DTBB_EXAMPLES:BOOL=OFF -DTBB_BENCH:BOOL=OFF"
     GIT_SHALLOW ON
     OVERRIDE_FIND_PACKAGE

From a8e599cb5ac3a1c9ca814c87ed69e5cd5f24a247 Mon Sep 17 00:00:00 2001
From: Uwe Dolinsky <uwe@codeplay.com>
Date: Thu, 24 Oct 2024 11:03:31 +0100
Subject: [PATCH 15/48] [NATIVECPU] added required include not needed by
 Windows

---
 source/adapters/native_cpu/common.cpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/source/adapters/native_cpu/common.cpp b/source/adapters/native_cpu/common.cpp
index 47afed3729..f4becf23b4 100644
--- a/source/adapters/native_cpu/common.cpp
+++ b/source/adapters/native_cpu/common.cpp
@@ -9,6 +9,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "common.hpp"
+#include  <string.h>
 
 // Global variables for UR_RESULT_ADAPTER_SPECIFIC_ERROR
 // See urGetLastResult

From 4c64575ce1ef6a7aa2ac34796d247178a6a123a3 Mon Sep 17 00:00:00 2001
From: Uwe Dolinsky <uwe@codeplay.com>
Date: Thu, 24 Oct 2024 11:33:54 +0100
Subject: [PATCH 16/48] [NATIVECPU] removed strcpy_s because it's not supported
 by gcc

---
 source/adapters/native_cpu/common.cpp | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/source/adapters/native_cpu/common.cpp b/source/adapters/native_cpu/common.cpp
index f4becf23b4..b956fc8c7a 100644
--- a/source/adapters/native_cpu/common.cpp
+++ b/source/adapters/native_cpu/common.cpp
@@ -9,7 +9,6 @@
 //===----------------------------------------------------------------------===//
 
 #include "common.hpp"
-#include  <string.h>
 
 // Global variables for UR_RESULT_ADAPTER_SPECIFIC_ERROR
 // See urGetLastResult
@@ -20,7 +19,7 @@ thread_local char ErrorMessage[MaxMessageSize];
 [[maybe_unused]] void setErrorMessage(const char *pMessage,
                                       ur_result_t ErrorCode) {
   assert(strlen(pMessage) <= MaxMessageSize);
-  strcpy_s(ErrorMessage, MaxMessageSize, pMessage);
+  strcpy(ErrorMessage, pMessage);
   ErrorMessageCode = ErrorCode;
 }
 

From 4905c44e8b04ba2630c43622aa7b3970dd3d81cf Mon Sep 17 00:00:00 2001
From: Uwe Dolinsky <uwe@codeplay.com>
Date: Thu, 24 Oct 2024 18:30:34 +0100
Subject: [PATCH 17/48] [NATIVECPU] added system headers first

---
 source/adapters/native_cpu/CMakeLists.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/source/adapters/native_cpu/CMakeLists.txt b/source/adapters/native_cpu/CMakeLists.txt
index 59fd1f859c..9ffa00c4b2 100644
--- a/source/adapters/native_cpu/CMakeLists.txt
+++ b/source/adapters/native_cpu/CMakeLists.txt
@@ -90,6 +90,7 @@ target_link_libraries(${TARGET_NAME} PRIVATE
 )
 
 target_include_directories(${TARGET_NAME} PRIVATE
+        "${CMAKE_CXX_IMPLICIT_INCLUDE_DIRECTORIES}"
         "${CMAKE_CURRENT_SOURCE_DIR}/../../"
 )
 

From e426b3fc58d89bb593b2b4f58094bfb3b2de8199 Mon Sep 17 00:00:00 2001
From: Uwe Dolinsky <uwe@codeplay.com>
Date: Thu, 24 Oct 2024 18:51:10 +0100
Subject: [PATCH 18/48] [NATIVECPU] cmake fix: drop extra include dirs

---
 source/adapters/native_cpu/CMakeLists.txt | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/source/adapters/native_cpu/CMakeLists.txt b/source/adapters/native_cpu/CMakeLists.txt
index 9ffa00c4b2..88cd0972b0 100644
--- a/source/adapters/native_cpu/CMakeLists.txt
+++ b/source/adapters/native_cpu/CMakeLists.txt
@@ -90,7 +90,6 @@ target_link_libraries(${TARGET_NAME} PRIVATE
 )
 
 target_include_directories(${TARGET_NAME} PRIVATE
-        "${CMAKE_CXX_IMPLICIT_INCLUDE_DIRECTORIES}"
         "${CMAKE_CURRENT_SOURCE_DIR}/../../"
 )
 
@@ -98,8 +97,8 @@ if(NATIVECPU_WITH_TBB)
   target_link_libraries(${TARGET_NAME} PRIVATE
         TBB::tbb
   )
-  target_include_directories(${TARGET_NAME} PRIVATE
-        "${TBB_SOURCE_DIR_INTERNAL}"
-  )
+#  target_include_directories(${TARGET_NAME} PRIVATE
+#        "${TBB_SOURCE_DIR_INTERNAL}"
+#  )
   target_compile_definitions(${TARGET_NAME} PRIVATE NATIVECPU_USE_TBB)
 endif()

From 4200f305850ea6977c05798dd7112efb0aa6d80a Mon Sep 17 00:00:00 2001
From: Uwe Dolinsky <uwe@codeplay.com>
Date: Fri, 25 Oct 2024 19:17:33 +0100
Subject: [PATCH 19/48] [NATIVECPU] removed GIT_SHALLOW

---
 source/adapters/native_cpu/CMakeLists.txt | 1 -
 1 file changed, 1 deletion(-)

diff --git a/source/adapters/native_cpu/CMakeLists.txt b/source/adapters/native_cpu/CMakeLists.txt
index 88cd0972b0..fac3cc672a 100644
--- a/source/adapters/native_cpu/CMakeLists.txt
+++ b/source/adapters/native_cpu/CMakeLists.txt
@@ -59,7 +59,6 @@ if(NATIVECPU_WITH_TBB)
     GIT_REPOSITORY https://github.com/oneapi-src/oneTBB.git
     GIT_TAG 377a91431ec62c5e296dbeca683c5d1e66d69f32
     CMAKE_ARGS "-DTBB_TEST:BOOL=OFF -DTBB_EXAMPLES:BOOL=OFF -DTBB_BENCH:BOOL=OFF"
-    GIT_SHALLOW ON
     OVERRIDE_FIND_PACKAGE
   )
   set(TBB_TEST OFF CACHE INTERNAL "" FORCE)

From 53f44944d61b3e793918b9397454046c71749d99 Mon Sep 17 00:00:00 2001
From: Uwe Dolinsky <uwe@codeplay.com>
Date: Fri, 1 Nov 2024 15:41:31 +0000
Subject: [PATCH 20/48] [NATIVECPU] turn CMAKE_INCLUDE_CURRENT_DIR off for tbb

---
 source/adapters/native_cpu/CMakeLists.txt | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/source/adapters/native_cpu/CMakeLists.txt b/source/adapters/native_cpu/CMakeLists.txt
index fac3cc672a..3777dbd3b7 100644
--- a/source/adapters/native_cpu/CMakeLists.txt
+++ b/source/adapters/native_cpu/CMakeLists.txt
@@ -68,6 +68,7 @@ if(NATIVECPU_WITH_TBB)
   set(TBB_FIND_PACKAGE OFF CACHE INTERNAL "" FORCE)
   set(TBB_FUZZ_TESTING OFF CACHE INTERNAL "" FORCE)
   set(TBB_INSTALL ON CACHE INTERNAL "" FORCE)
+  set (CMAKE_INCLUDE_CURRENT_DIR OFF)
   FetchContent_MakeAvailable(tbb)
 
   FetchContent_GetProperties(tbb)
@@ -96,8 +97,6 @@ if(NATIVECPU_WITH_TBB)
   target_link_libraries(${TARGET_NAME} PRIVATE
         TBB::tbb
   )
-#  target_include_directories(${TARGET_NAME} PRIVATE
-#        "${TBB_SOURCE_DIR_INTERNAL}"
-#  )
+
   target_compile_definitions(${TARGET_NAME} PRIVATE NATIVECPU_USE_TBB)
 endif()

From 2ca6a3f7dead3a9597b8b49ab0ceeefdea7a19a0 Mon Sep 17 00:00:00 2001
From: Uwe Dolinsky <uwe@codeplay.com>
Date: Fri, 1 Nov 2024 15:50:58 +0000
Subject: [PATCH 21/48] [NATIVECPU] workaround for oneTBB casting away const
 qualifiers

---
 source/adapters/native_cpu/CMakeLists.txt | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/source/adapters/native_cpu/CMakeLists.txt b/source/adapters/native_cpu/CMakeLists.txt
index 3777dbd3b7..150dafac8c 100644
--- a/source/adapters/native_cpu/CMakeLists.txt
+++ b/source/adapters/native_cpu/CMakeLists.txt
@@ -97,6 +97,11 @@ if(NATIVECPU_WITH_TBB)
   target_link_libraries(${TARGET_NAME} PRIVATE
         TBB::tbb
   )
+  if (MSVC)
+  else()
+    # oneTBB currently casts away some const qualifiers
+    target_compile_options(tbb PRIVATE -Wno-cast-qual)
+  endif()
 
   target_compile_definitions(${TARGET_NAME} PRIVATE NATIVECPU_USE_TBB)
 endif()

From 835ce2f4bebccffba8ca7606b91aa1153f095a5a Mon Sep 17 00:00:00 2001
From: Uwe Dolinsky <uwe@codeplay.com>
Date: Fri, 1 Nov 2024 17:25:51 +0000
Subject: [PATCH 22/48] [NATIVECPU] workaround for oneTBB casting away const
 qualifiers (tbbmalloc)

---
 source/adapters/native_cpu/CMakeLists.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/source/adapters/native_cpu/CMakeLists.txt b/source/adapters/native_cpu/CMakeLists.txt
index 150dafac8c..c6b7e4725c 100644
--- a/source/adapters/native_cpu/CMakeLists.txt
+++ b/source/adapters/native_cpu/CMakeLists.txt
@@ -101,6 +101,7 @@ if(NATIVECPU_WITH_TBB)
   else()
     # oneTBB currently casts away some const qualifiers
     target_compile_options(tbb PRIVATE -Wno-cast-qual)
+    target_compile_options(tbbmalloc PRIVATE -Wno-cast-qual)
   endif()
 
   target_compile_definitions(${TARGET_NAME} PRIVATE NATIVECPU_USE_TBB)

From 42b1e6e09a4b1ac53014d80a09fb6a085c920673 Mon Sep 17 00:00:00 2001
From: Uwe Dolinsky <uwe@codeplay.com>
Date: Fri, 1 Nov 2024 19:03:56 +0000
Subject: [PATCH 23/48] [NATIVECPU] remove potentially unneeded cmake

---
 source/adapters/native_cpu/CMakeLists.txt | 8 --------
 1 file changed, 8 deletions(-)

diff --git a/source/adapters/native_cpu/CMakeLists.txt b/source/adapters/native_cpu/CMakeLists.txt
index c6b7e4725c..d55ce2544a 100644
--- a/source/adapters/native_cpu/CMakeLists.txt
+++ b/source/adapters/native_cpu/CMakeLists.txt
@@ -70,14 +70,6 @@ if(NATIVECPU_WITH_TBB)
   set(TBB_INSTALL ON CACHE INTERNAL "" FORCE)
   set (CMAKE_INCLUDE_CURRENT_DIR OFF)
   FetchContent_MakeAvailable(tbb)
-
-  FetchContent_GetProperties(tbb)
-
-  if(NOT tbb_POPULATED)
-    FetchContent_Populate(tbb)
-  endif()
-  set(TBB_SOURCE_DIR_INTERNAL ${tbb_SOURCE_DIR}/include)
-  set(TBB_BINARY_DIR_INTERNAL ${tbb_BINARY_DIR})
 endif()
 
 find_package(Threads REQUIRED)

From 9c6fb07121ec3c8dad52cf401a158b8c7e9cd2db Mon Sep 17 00:00:00 2001
From: Uwe Dolinsky <uwe@codeplay.com>
Date: Mon, 4 Nov 2024 12:51:03 +0000
Subject: [PATCH 24/48] [NATIVECPU] oneTBB disabled by default

---
 source/adapters/native_cpu/CMakeLists.txt | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/source/adapters/native_cpu/CMakeLists.txt b/source/adapters/native_cpu/CMakeLists.txt
index d55ce2544a..781fd36392 100644
--- a/source/adapters/native_cpu/CMakeLists.txt
+++ b/source/adapters/native_cpu/CMakeLists.txt
@@ -49,9 +49,10 @@ set_target_properties(${TARGET_NAME} PROPERTIES
         SOVERSION "${PROJECT_VERSION_MAJOR}"
 )
 
-option(NATIVECPU_WITH_TBB "Use TBB as backend for Native CPU" ON)
+# oneTBB is an optional NativeCPU backend and disabled by default.
+option(NATIVECPU_WITH_TBB "Use oneTBB as backend for Native CPU" OFF)
 if(NATIVECPU_WITH_TBB)
-  message(STATUS "Building Native CPU adapter with TBB backend.")
+  message(STATUS "Configuring Native CPU adapter with TBB backend.")
 
   include(FetchContent)
   FetchContent_Declare(

From de98e9b31abfa24257c9ab68b78c6676cf1088f1 Mon Sep 17 00:00:00 2001
From: Uwe Dolinsky <uwe@codeplay.com>
Date: Mon, 4 Nov 2024 13:20:13 +0000
Subject: [PATCH 25/48] [NATIVECPU] tbb to oneTBB

---
 source/adapters/native_cpu/CMakeLists.txt | 6 +++---
 source/adapters/native_cpu/device.hpp     | 2 +-
 source/adapters/native_cpu/threadpool.hpp | 2 +-
 3 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/source/adapters/native_cpu/CMakeLists.txt b/source/adapters/native_cpu/CMakeLists.txt
index 781fd36392..a57eff8cfc 100644
--- a/source/adapters/native_cpu/CMakeLists.txt
+++ b/source/adapters/native_cpu/CMakeLists.txt
@@ -50,8 +50,8 @@ set_target_properties(${TARGET_NAME} PROPERTIES
 )
 
 # oneTBB is an optional NativeCPU backend and disabled by default.
-option(NATIVECPU_WITH_TBB "Use oneTBB as backend for Native CPU" OFF)
-if(NATIVECPU_WITH_TBB)
+option(NATIVECPU_WITH_ONETBB "Use oneTBB as backend for Native CPU" OFF)
+if(NATIVECPU_WITH_ONETBB)
   message(STATUS "Configuring Native CPU adapter with TBB backend.")
 
   include(FetchContent)
@@ -97,5 +97,5 @@ if(NATIVECPU_WITH_TBB)
     target_compile_options(tbbmalloc PRIVATE -Wno-cast-qual)
   endif()
 
-  target_compile_definitions(${TARGET_NAME} PRIVATE NATIVECPU_USE_TBB)
+  target_compile_definitions(${TARGET_NAME} PRIVATE NATIVECPU_WITH_ONETBB)
 endif()
diff --git a/source/adapters/native_cpu/device.hpp b/source/adapters/native_cpu/device.hpp
index 1a6b0d091a..e9f7602930 100644
--- a/source/adapters/native_cpu/device.hpp
+++ b/source/adapters/native_cpu/device.hpp
@@ -14,7 +14,7 @@
 #include <ur/ur.hpp>
 
 struct ur_device_handle_t_ {
-#ifdef NATIVECPU_USE_TBB
+#ifdef NATIVECPU_WITH_ONETBB
   native_cpu::TBB_threadpool tp;
 #else
   native_cpu::threadpool_t tp;
diff --git a/source/adapters/native_cpu/threadpool.hpp b/source/adapters/native_cpu/threadpool.hpp
index 9098d52de4..3d1dacb93a 100644
--- a/source/adapters/native_cpu/threadpool.hpp
+++ b/source/adapters/native_cpu/threadpool.hpp
@@ -238,7 +238,7 @@ using threadpool_t = simple_threadpool_t;
 
 } // namespace native_cpu
 
-#ifdef NATIVECPU_USE_TBB
+#ifdef NATIVECPU_WITH_ONETBB
 // Simple TBB backend
 #include "oneapi/tbb.h"
 namespace native_cpu {

From 51e915adf102488445967b12414ca6a5ef05db71 Mon Sep 17 00:00:00 2001
From: Uwe Dolinsky <uwe@codeplay.com>
Date: Mon, 4 Nov 2024 14:52:10 +0000
Subject: [PATCH 26/48] [NATIVECPU] improved comment

---
 source/adapters/native_cpu/CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/source/adapters/native_cpu/CMakeLists.txt b/source/adapters/native_cpu/CMakeLists.txt
index a57eff8cfc..4a1aa0b253 100644
--- a/source/adapters/native_cpu/CMakeLists.txt
+++ b/source/adapters/native_cpu/CMakeLists.txt
@@ -49,7 +49,7 @@ set_target_properties(${TARGET_NAME} PROPERTIES
         SOVERSION "${PROJECT_VERSION_MAJOR}"
 )
 
-# oneTBB is an optional NativeCPU backend and disabled by default.
+# oneTBB is used as an optional NativeCPU backend and disabled by default.
 option(NATIVECPU_WITH_ONETBB "Use oneTBB as backend for Native CPU" OFF)
 if(NATIVECPU_WITH_ONETBB)
   message(STATUS "Configuring Native CPU adapter with TBB backend.")

From dd8b027ee63d417e53ba1031dbf55c0602b4db27 Mon Sep 17 00:00:00 2001
From: Uwe Dolinsky <uwe@codeplay.com>
Date: Mon, 4 Nov 2024 15:07:17 +0000
Subject: [PATCH 27/48] [NATIVECPU] tbb to oneTBB

---
 source/adapters/native_cpu/CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/source/adapters/native_cpu/CMakeLists.txt b/source/adapters/native_cpu/CMakeLists.txt
index 4a1aa0b253..620e163ed4 100644
--- a/source/adapters/native_cpu/CMakeLists.txt
+++ b/source/adapters/native_cpu/CMakeLists.txt
@@ -86,7 +86,7 @@ target_include_directories(${TARGET_NAME} PRIVATE
         "${CMAKE_CURRENT_SOURCE_DIR}/../../"
 )
 
-if(NATIVECPU_WITH_TBB)
+if(NATIVECPU_WITH_ONETBB)
   target_link_libraries(${TARGET_NAME} PRIVATE
         TBB::tbb
   )

From 4a5238fe61ee65941423b62ef52db84e9224ddc7 Mon Sep 17 00:00:00 2001
From: Uwe Dolinsky <uwe@codeplay.com>
Date: Mon, 4 Nov 2024 15:13:06 +0000
Subject: [PATCH 28/48] [NATIVECPU] tbb to oneTBB

---
 source/adapters/native_cpu/CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/source/adapters/native_cpu/CMakeLists.txt b/source/adapters/native_cpu/CMakeLists.txt
index 620e163ed4..c9e89cbdb6 100644
--- a/source/adapters/native_cpu/CMakeLists.txt
+++ b/source/adapters/native_cpu/CMakeLists.txt
@@ -52,7 +52,7 @@ set_target_properties(${TARGET_NAME} PROPERTIES
 # oneTBB is used as an optional NativeCPU backend and disabled by default.
 option(NATIVECPU_WITH_ONETBB "Use oneTBB as backend for Native CPU" OFF)
 if(NATIVECPU_WITH_ONETBB)
-  message(STATUS "Configuring Native CPU adapter with TBB backend.")
+  message(STATUS "Configuring Native CPU adapter with oneTBB backend.")
 
   include(FetchContent)
   FetchContent_Declare(

From 3f697aef0c47d2235b8aaf4f65ccd38430698283 Mon Sep 17 00:00:00 2001
From: Uwe Dolinsky <uwe@codeplay.com>
Date: Mon, 4 Nov 2024 15:26:45 +0000
Subject: [PATCH 29/48] [NATIVECPU] num_threads with oneTBB

---
 source/adapters/native_cpu/threadpool.hpp | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/source/adapters/native_cpu/threadpool.hpp b/source/adapters/native_cpu/threadpool.hpp
index 3d1dacb93a..d6da676e4d 100644
--- a/source/adapters/native_cpu/threadpool.hpp
+++ b/source/adapters/native_cpu/threadpool.hpp
@@ -245,7 +245,9 @@ namespace native_cpu {
 
 struct TBB_threadpool {
   oneapi::tbb::task_group tasks;
-  inline size_t num_threads() const noexcept { return 32; }
+  inline size_t num_threads() const noexcept {
+    return oneapi::tbb::info::default_concurrency();
+  }
 };
 template <> struct Scheduler<TBB_threadpool> : SchedulerBase<TBB_threadpool> {
   using SchedulerBase<TBB_threadpool>::SchedulerBase;

From 5f687cc85bdd49733e207d589776fa00b6c51e1d Mon Sep 17 00:00:00 2001
From: Uwe Dolinsky <uwe@codeplay.com>
Date: Wed, 6 Nov 2024 13:34:49 +0000
Subject: [PATCH 30/48] [NATIVECPU] added comment to cmake

---
 source/adapters/native_cpu/CMakeLists.txt | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/source/adapters/native_cpu/CMakeLists.txt b/source/adapters/native_cpu/CMakeLists.txt
index c9e89cbdb6..5b72cbf773 100644
--- a/source/adapters/native_cpu/CMakeLists.txt
+++ b/source/adapters/native_cpu/CMakeLists.txt
@@ -90,9 +90,9 @@ if(NATIVECPU_WITH_ONETBB)
   target_link_libraries(${TARGET_NAME} PRIVATE
         TBB::tbb
   )
-  if (MSVC)
-  else()
+  if (NOT MSVC)
     # oneTBB currently casts away some const qualifiers
+    # todo: check if compiler actually supports these options
     target_compile_options(tbb PRIVATE -Wno-cast-qual)
     target_compile_options(tbbmalloc PRIVATE -Wno-cast-qual)
   endif()

From b651d299f24dd3a713ac451fb8cfadf2791605f7 Mon Sep 17 00:00:00 2001
From: Uwe Dolinsky <uwe@codeplay.com>
Date: Wed, 6 Nov 2024 13:35:46 +0000
Subject: [PATCH 31/48] [NATIVECPU] waiting for tasks when using local args

---
 source/adapters/native_cpu/enqueue.cpp    | 11 +++++++++--
 source/adapters/native_cpu/threadpool.hpp |  1 +
 2 files changed, 10 insertions(+), 2 deletions(-)

diff --git a/source/adapters/native_cpu/enqueue.cpp b/source/adapters/native_cpu/enqueue.cpp
index 2e7a210083..5b3f071e01 100644
--- a/source/adapters/native_cpu/enqueue.cpp
+++ b/source/adapters/native_cpu/enqueue.cpp
@@ -191,8 +191,11 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(
               kernel._subhandler(kernel._args.data(), &state);
             }
           });
-          if (++threadId == numParallelThreads)
+          if (++threadId == numParallelThreads) {
             threadId = 0;
+            if (!hKernel->_localArgInfo.empty())
+              Tasks.wait();
+          }
         }
       }
     } else {
@@ -226,9 +229,13 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(
               }
             });
       }
-
       // schedule the remaining tasks
       if (remainder) {
+        if (thread) {
+          thread = 0;
+          if (!hKernel->_localArgInfo.empty())
+            Tasks.wait();
+        }
         Tasks.schedule([&groups, remainder, thread,
                         scheduled = numParallelThreads * groupsPerThread,
                         hKernel](size_t /*threadId*/) {
diff --git a/source/adapters/native_cpu/threadpool.hpp b/source/adapters/native_cpu/threadpool.hpp
index d6da676e4d..5a49cbc88e 100644
--- a/source/adapters/native_cpu/threadpool.hpp
+++ b/source/adapters/native_cpu/threadpool.hpp
@@ -223,6 +223,7 @@ template <class TP> struct Scheduler : SchedulerBase<TP> {
   inline void wait() {
     for (auto &f : futures)
       f.get();
+    futures.clear();
   }
 
 private:

From 4f64538d70c8526d85205cb0011ec97a88baea6a Mon Sep 17 00:00:00 2001
From: Uwe Dolinsky <uwe@codeplay.com>
Date: Wed, 6 Nov 2024 18:25:27 +0000
Subject: [PATCH 32/48] [NATIVECPU] using old task ids with tbb (WIP)

---
 source/adapters/native_cpu/enqueue.cpp    | 28 +++++++----------------
 source/adapters/native_cpu/threadpool.hpp |  7 +++++-
 2 files changed, 14 insertions(+), 21 deletions(-)
 mode change 100644 => 100755 source/adapters/native_cpu/threadpool.hpp

diff --git a/source/adapters/native_cpu/enqueue.cpp b/source/adapters/native_cpu/enqueue.cpp
index 5b3f071e01..86b5d1116f 100644
--- a/source/adapters/native_cpu/enqueue.cpp
+++ b/source/adapters/native_cpu/enqueue.cpp
@@ -178,24 +178,18 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(
 
   } else {
     // We are running a parallel_for over an nd_range
-    size_t threadId = 0;
     if (numWG1 * numWG2 >= numParallelThreads) {
       // Dimensions 1 and 2 have enough work, split them across the threadpool
       for (unsigned g2 = 0; g2 < numWG2; g2++) {
         for (unsigned g1 = 0; g1 < numWG1; g1++) {
-          Tasks.schedule([state, kernel = *hKernel, numWG0, g1, g2, threadId,
-                          numParallelThreads](size_t /*threadId*/) mutable {
+          Tasks.schedule([state, kernel = *hKernel, numWG0, g1, g2,
+                          numParallelThreads](size_t threadId) mutable {
             for (unsigned g0 = 0; g0 < numWG0; g0++) {
               kernel.handleLocalArgs(numParallelThreads, threadId);
               state.update(g0, g1, g2);
               kernel._subhandler(kernel._args.data(), &state);
             }
           });
-          if (++threadId == numParallelThreads) {
-            threadId = 0;
-            if (!hKernel->_localArgInfo.empty())
-              Tasks.wait();
-          }
         }
       }
     } else {
@@ -219,29 +213,23 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(
       auto numGroups = groups.size();
       auto groupsPerThread = numGroups / numParallelThreads;
       auto remainder = numGroups % numParallelThreads;
-      unsigned thread = 0;
-      for (; groupsPerThread && thread < numParallelThreads; thread++) {
+      for (size_t thread = 0; groupsPerThread && thread < numParallelThreads; thread++) {
         Tasks.schedule(
-            [&groups, thread, groupsPerThread, hKernel](size_t /*threadId*/) {
+            [&groups, thread, groupsPerThread, hKernel](size_t threadId) {
               for (unsigned i = 0; i < groupsPerThread; i++) {
                 auto index = thread * groupsPerThread + i;
-                groups[index](thread /*Id*/, *hKernel);
+                groups[index](threadId, *hKernel);
               }
             });
       }
       // schedule the remaining tasks
       if (remainder) {
-        if (thread) {
-          thread = 0;
-          if (!hKernel->_localArgInfo.empty())
-            Tasks.wait();
-        }
-        Tasks.schedule([&groups, remainder, thread,
+        Tasks.schedule([&groups, remainder,
                         scheduled = numParallelThreads * groupsPerThread,
-                        hKernel](size_t /*threadId*/) {
+                        hKernel](size_t threadId) {
           for (unsigned i = 0; i < remainder; i++) {
             auto index = scheduled + i;
-            groups[index](thread /*Id*/, *hKernel);
+            groups[index](threadId, *hKernel);
           }
         });
       }
diff --git a/source/adapters/native_cpu/threadpool.hpp b/source/adapters/native_cpu/threadpool.hpp
old mode 100644
new mode 100755
index 5a49cbc88e..25852bed79
--- a/source/adapters/native_cpu/threadpool.hpp
+++ b/source/adapters/native_cpu/threadpool.hpp
@@ -253,7 +253,12 @@ struct TBB_threadpool {
 template <> struct Scheduler<TBB_threadpool> : SchedulerBase<TBB_threadpool> {
   using SchedulerBase<TBB_threadpool>::SchedulerBase;
   template <class T> inline void schedule(T &&task) {
-    TPref.tasks.run(std::function<void()>([=]() mutable { task(0); }));
+    TPref.tasks.run(std::function<void()>([=]() mutable {
+      auto thread_id = tbb::this_task_arena::current_thread_index();
+      assert(thread_id >= 0 &&
+             thread_id < oneapi::tbb::info::default_concurrency());
+      task(thread_id);
+    }));
   }
   inline void wait() { TPref.tasks.wait(); }
 };

From 6330a292951fdf19510ee7eed50eab9f3af9b005 Mon Sep 17 00:00:00 2001
From: Uwe Dolinsky <uwe@codeplay.com>
Date: Wed, 6 Nov 2024 18:27:01 +0000
Subject: [PATCH 33/48] [NATIVECPU] changed back filemode

---
 source/adapters/native_cpu/threadpool.hpp | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 mode change 100755 => 100644 source/adapters/native_cpu/threadpool.hpp

diff --git a/source/adapters/native_cpu/threadpool.hpp b/source/adapters/native_cpu/threadpool.hpp
old mode 100755
new mode 100644

From f566f803d9583364b8262c1d9f2cdd1568cb3e81 Mon Sep 17 00:00:00 2001
From: Uwe Dolinsky <uwe@codeplay.com>
Date: Thu, 7 Nov 2024 10:21:41 +0000
Subject: [PATCH 34/48] [NATIVECPU] removed unneeded clear

---
 source/adapters/native_cpu/threadpool.hpp | 1 -
 1 file changed, 1 deletion(-)

diff --git a/source/adapters/native_cpu/threadpool.hpp b/source/adapters/native_cpu/threadpool.hpp
index 25852bed79..28d071f29b 100644
--- a/source/adapters/native_cpu/threadpool.hpp
+++ b/source/adapters/native_cpu/threadpool.hpp
@@ -223,7 +223,6 @@ template <class TP> struct Scheduler : SchedulerBase<TP> {
   inline void wait() {
     for (auto &f : futures)
       f.get();
-    futures.clear();
   }
 
 private:

From 73576520a524e9b44d63fe6c24ae737bc9d0b0ca Mon Sep 17 00:00:00 2001
From: Uwe Dolinsky <uwe@codeplay.com>
Date: Wed, 13 Nov 2024 18:09:32 +0000
Subject: [PATCH 35/48] [NATIVECPU] removed MS extensions

---
 source/adapters/native_cpu/event.hpp      | 2 +-
 source/adapters/native_cpu/threadpool.hpp | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/source/adapters/native_cpu/event.hpp b/source/adapters/native_cpu/event.hpp
index de51231d8f..713971b24f 100644
--- a/source/adapters/native_cpu/event.hpp
+++ b/source/adapters/native_cpu/event.hpp
@@ -42,7 +42,7 @@ struct ur_event_handle_t_ : RefCounted {
   ur_command_t getCommandType() const { return command_type; }
 
   // todo: get rid of this function
-  void set_futures(native_cpu::TasksInfoType &fs) {
+  void set_futures(native_cpu::TasksInfoType &&fs) {
     std::lock_guard<std::mutex> lock(mutex);
     futures = std::move(fs);
   }
diff --git a/source/adapters/native_cpu/threadpool.hpp b/source/adapters/native_cpu/threadpool.hpp
index 1763d353d5..f1bc70bd65 100644
--- a/source/adapters/native_cpu/threadpool.hpp
+++ b/source/adapters/native_cpu/threadpool.hpp
@@ -231,7 +231,7 @@ template <class TP> struct Scheduler : Scheduler_base<TP, TasksInfo_TP> {
   using Scheduler_base<TP, TasksInfo_TP>::Scheduler_base;
 
   inline void schedule(worker_task_t &&task) {
-    ti.schedule(this->ref.schedule_task(std::move(task)));
+    this->ti.schedule(this->ref.schedule_task(std::move(task)));
   }
 };
 

From a3e52e62c9b4489d1091b687f31f1294950f9770 Mon Sep 17 00:00:00 2001
From: Uwe Dolinsky <uwe@codeplay.com>
Date: Thu, 14 Nov 2024 09:56:49 +0000
Subject: [PATCH 36/48] [NATIVECPU] fix merge with events update

---
 source/adapters/native_cpu/device.hpp     |  2 +-
 source/adapters/native_cpu/event.cpp      |  3 ++-
 source/adapters/native_cpu/event.hpp      |  4 ++--
 source/adapters/native_cpu/threadpool.hpp | 17 ++++++++++-------
 4 files changed, 15 insertions(+), 11 deletions(-)

diff --git a/source/adapters/native_cpu/device.hpp b/source/adapters/native_cpu/device.hpp
index e0cf3872b5..358e9a37b6 100644
--- a/source/adapters/native_cpu/device.hpp
+++ b/source/adapters/native_cpu/device.hpp
@@ -14,7 +14,7 @@
 #include <ur/ur.hpp>
 
 struct ur_device_handle_t_ {
-  native_cpu::ThreadPoolType tp;
+  native_cpu::threadpool_t tp;
 
   ur_device_handle_t_(ur_platform_handle_t ArgPlt);
 
diff --git a/source/adapters/native_cpu/event.cpp b/source/adapters/native_cpu/event.cpp
index adb560bca9..6a5ff41e28 100644
--- a/source/adapters/native_cpu/event.cpp
+++ b/source/adapters/native_cpu/event.cpp
@@ -13,6 +13,7 @@
 #include "common.hpp"
 #include "event.hpp"
 #include "queue.hpp"
+#include "device.hpp"
 #include <cstdint>
 #include <mutex>
 
@@ -123,7 +124,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueTimestampRecordingExp(
 ur_event_handle_t_::ur_event_handle_t_(ur_queue_handle_t queue,
                                        ur_command_t command_type)
     : queue(queue), context(queue->getContext()), command_type(command_type),
-      done(false) {
+      done(false), futures(queue->getDevice()->tp) {
   this->queue->addEvent(this);
 }
 
diff --git a/source/adapters/native_cpu/event.hpp b/source/adapters/native_cpu/event.hpp
index 713971b24f..4b51875afd 100644
--- a/source/adapters/native_cpu/event.hpp
+++ b/source/adapters/native_cpu/event.hpp
@@ -42,7 +42,7 @@ struct ur_event_handle_t_ : RefCounted {
   ur_command_t getCommandType() const { return command_type; }
 
   // todo: get rid of this function
-  void set_futures(native_cpu::TasksInfoType &&fs) {
+  void set_futures(native_cpu::tasksinfo_t &&fs) {
     std::lock_guard<std::mutex> lock(mutex);
     futures = std::move(fs);
   }
@@ -61,7 +61,7 @@ struct ur_event_handle_t_ : RefCounted {
   ur_command_t command_type;
   bool done;
   std::mutex mutex;
-  native_cpu::TasksInfoType futures;
+  native_cpu::tasksinfo_t futures;
   std::function<void()> callback;
   uint64_t timestamp_start = 0;
   uint64_t timestamp_end = 0;
diff --git a/source/adapters/native_cpu/threadpool.hpp b/source/adapters/native_cpu/threadpool.hpp
index f1bc70bd65..f2f907f4b5 100644
--- a/source/adapters/native_cpu/threadpool.hpp
+++ b/source/adapters/native_cpu/threadpool.hpp
@@ -208,6 +208,7 @@ template <typename ThreadPoolT> class threadpool_interface {
     return workerTask->get_future();
   }
 };
+using simple_threadpool_t = threadpool_interface<detail::simple_thread_pool>;
 
 class TasksInfo_TP {
   using FType = std::future<void>;
@@ -218,12 +219,13 @@ class TasksInfo_TP {
     for (auto &f : futures)
       f.wait();
   }
+  TasksInfo_TP(simple_threadpool_t &) {}
 };
 
 template <class TP, class TaskInfo> struct Scheduler_base {
   TP &ref;
   TaskInfo ti;
-  Scheduler_base(TP &ref_) : ref(ref_) {}
+  Scheduler_base(TP &ref_) : ref(ref_), ti(ref_) {}
   TaskInfo getTaskInfo() { return std::move(ti); }
 };
 
@@ -235,7 +237,6 @@ template <class TP> struct Scheduler : Scheduler_base<TP, TasksInfo_TP> {
   }
 };
 
-using simple_threadpool_t = threadpool_interface<detail::simple_thread_pool>;
 template <class TPType> inline Scheduler<TPType> getScheduler(TPType &tp) {
   return Scheduler<TPType>(tp);
 }
@@ -254,9 +255,11 @@ struct TBB_threadpool {
   }
 };
 
-struct TBB_TasksInfo {
+class TBB_TasksInfo {
   TBB_threadpool *tp;
+public:
   inline void wait() { tp->tasks.wait(); }
+  TBB_TasksInfo(TBB_threadpool &t) : tp(&t) {}
 };
 
 template <> struct Scheduler<TBB_threadpool> :  Scheduler_base<TBB_threadpool, TBB_TasksInfo> {
@@ -271,14 +274,14 @@ template <> struct Scheduler<TBB_threadpool> :  Scheduler_base<TBB_threadpool, T
   }
 };
 
-using TasksInfoType = TBB_TasksInfo;
-using ThreadPoolType = TBB_threadpool;
+using tasksinfo_t = TBB_TasksInfo;
+using threadpool_t = TBB_threadpool;
 } // namespace native_cpu
 
 #else
 // The default backend
 namespace native_cpu {
-using TasksInfoType = TasksInfo_TP;
-using ThreadPoolType = simple_threadpool_t;
+using tasksinfo_t = TasksInfo_TP;
+using threadpool_t = simple_threadpool_t;
 }
 #endif

From 56afb9a7b8c3961764f6c97698a130479dc4fce1 Mon Sep 17 00:00:00 2001
From: Uwe Dolinsky <uwe@codeplay.com>
Date: Thu, 14 Nov 2024 12:29:39 +0000
Subject: [PATCH 37/48] [NATIVECPU] revert noise

---
 source/adapters/native_cpu/device.hpp | 1 -
 1 file changed, 1 deletion(-)

diff --git a/source/adapters/native_cpu/device.hpp b/source/adapters/native_cpu/device.hpp
index 358e9a37b6..2308c1a7f4 100644
--- a/source/adapters/native_cpu/device.hpp
+++ b/source/adapters/native_cpu/device.hpp
@@ -15,7 +15,6 @@
 
 struct ur_device_handle_t_ {
   native_cpu::threadpool_t tp;
-
   ur_device_handle_t_(ur_platform_handle_t ArgPlt);
 
   const uint64_t mem_size;

From 488b641055e420db2f756de17e066b949d2a19e6 Mon Sep 17 00:00:00 2001
From: Uwe Dolinsky <uwe@codeplay.com>
Date: Thu, 14 Nov 2024 12:41:46 +0000
Subject: [PATCH 38/48] [NATIVECPU] fix integer size warnings

---
 source/adapters/native_cpu/context.hpp | 2 +-
 source/adapters/native_cpu/enqueue.cpp | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/source/adapters/native_cpu/context.hpp b/source/adapters/native_cpu/context.hpp
index b9d2d22dd1..8168e0d10e 100644
--- a/source/adapters/native_cpu/context.hpp
+++ b/source/adapters/native_cpu/context.hpp
@@ -116,7 +116,7 @@ struct ur_context_handle_t_ : RefCounted {
     // We need to ensure that we align to at least alignof(usm_alloc_info),
     // otherwise its start address may be unaligned.
     alignment =
-        std::max<size_t>(alignment, alignof(native_cpu::usm_alloc_info));
+        std::max<uint32_t>(alignment, alignof(native_cpu::usm_alloc_info));
     void *alloc = native_cpu::malloc_impl(alignment, size);
     if (!alloc)
       return nullptr;
diff --git a/source/adapters/native_cpu/enqueue.cpp b/source/adapters/native_cpu/enqueue.cpp
index c4dcd0a04f..bda547bf36 100644
--- a/source/adapters/native_cpu/enqueue.cpp
+++ b/source/adapters/native_cpu/enqueue.cpp
@@ -445,7 +445,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferFill(
         // TODO: error checking
         // TODO: handle async
         void *startingPtr = hBuffer->_mem + offset;
-        unsigned steps = size / patternSize;
+        size_t steps = size / patternSize;
         for (unsigned i = 0; i < steps; i++) {
           memcpy(static_cast<int8_t *>(startingPtr) + i * patternSize, pPattern,
                  patternSize);
@@ -586,7 +586,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMFill(
           break;
         }
         default: {
-          for (unsigned int step{0}; step < size; step += patternSize) {
+          for (size_t step{0}; step < size; step += patternSize) {
             auto *dest = reinterpret_cast<void *>(
                 reinterpret_cast<uint8_t *>(ptr) + step);
             memcpy(dest, pPattern, patternSize);

From 75288ce526d11a0b1afe30ae5b2592276b16dd37 Mon Sep 17 00:00:00 2001
From: Uwe Dolinsky <uwe@codeplay.com>
Date: Tue, 26 Nov 2024 17:26:27 +0000
Subject: [PATCH 39/48] [NATIVECPU] update oneTBB tag

---
 source/adapters/native_cpu/CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/source/adapters/native_cpu/CMakeLists.txt b/source/adapters/native_cpu/CMakeLists.txt
index 5b72cbf773..83dc07af57 100644
--- a/source/adapters/native_cpu/CMakeLists.txt
+++ b/source/adapters/native_cpu/CMakeLists.txt
@@ -58,7 +58,7 @@ if(NATIVECPU_WITH_ONETBB)
   FetchContent_Declare(
     tbb
     GIT_REPOSITORY https://github.com/oneapi-src/oneTBB.git
-    GIT_TAG 377a91431ec62c5e296dbeca683c5d1e66d69f32
+    GIT_TAG bef1519a4216d77042637c3f48af2c060a5213d1
     CMAKE_ARGS "-DTBB_TEST:BOOL=OFF -DTBB_EXAMPLES:BOOL=OFF -DTBB_BENCH:BOOL=OFF"
     OVERRIDE_FIND_PACKAGE
   )

From fc992e3c2e84156164d32cc01e0736db85a44814 Mon Sep 17 00:00:00 2001
From: Uwe Dolinsky <uwe@codeplay.com>
Date: Fri, 29 Nov 2024 10:56:53 +0000
Subject: [PATCH 40/48] [NATIVECPU] use oneTBB UXL github

---
 source/adapters/native_cpu/CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/source/adapters/native_cpu/CMakeLists.txt b/source/adapters/native_cpu/CMakeLists.txt
index 83dc07af57..d12843e059 100644
--- a/source/adapters/native_cpu/CMakeLists.txt
+++ b/source/adapters/native_cpu/CMakeLists.txt
@@ -57,7 +57,7 @@ if(NATIVECPU_WITH_ONETBB)
   include(FetchContent)
   FetchContent_Declare(
     tbb
-    GIT_REPOSITORY https://github.com/oneapi-src/oneTBB.git
+    GIT_REPOSITORY https://github.com/uxlfoundation/oneTBB.git
     GIT_TAG bef1519a4216d77042637c3f48af2c060a5213d1
     CMAKE_ARGS "-DTBB_TEST:BOOL=OFF -DTBB_EXAMPLES:BOOL=OFF -DTBB_BENCH:BOOL=OFF"
     OVERRIDE_FIND_PACKAGE

From 469f27f3c173ffa23329cafae54466d6a42277c4 Mon Sep 17 00:00:00 2001
From: Uwe Dolinsky <uwe@codeplay.com>
Date: Thu, 12 Dec 2024 19:07:34 +0000
Subject: [PATCH 41/48] [NATIVECPU] undefine _DEBUG in release builds for tbb

---
 source/adapters/native_cpu/CMakeLists.txt | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/source/adapters/native_cpu/CMakeLists.txt b/source/adapters/native_cpu/CMakeLists.txt
index d12843e059..3ac4b9bf87 100644
--- a/source/adapters/native_cpu/CMakeLists.txt
+++ b/source/adapters/native_cpu/CMakeLists.txt
@@ -97,5 +97,11 @@ if(NATIVECPU_WITH_ONETBB)
     target_compile_options(tbbmalloc PRIVATE -Wno-cast-qual)
   endif()
 
+  # Undefine _DEBUG in non-debug builds so that the release
+  # variant of tbbbind is found.
+  if (NOT uppercase_CMAKE_BUILD_TYPE STREQUAL "DEBUG")
+    target_compile_options(tbb PRIVATE -U_DEBUG)
+  endif()
+
   target_compile_definitions(${TARGET_NAME} PRIVATE NATIVECPU_WITH_ONETBB)
 endif()

From 979072f93db951567b032fc3efe2156b575d6b48 Mon Sep 17 00:00:00 2001
From: Uwe Dolinsky <uwe@codeplay.com>
Date: Mon, 27 Jan 2025 15:25:39 +0000
Subject: [PATCH 42/48] [NATIVECPU] oneTBB bump

---
 source/adapters/native_cpu/CMakeLists.txt | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/source/adapters/native_cpu/CMakeLists.txt b/source/adapters/native_cpu/CMakeLists.txt
index cf0b109e92..dfb9d0a655 100644
--- a/source/adapters/native_cpu/CMakeLists.txt
+++ b/source/adapters/native_cpu/CMakeLists.txt
@@ -60,7 +60,11 @@ if(NATIVECPU_WITH_ONETBB)
   FetchContent_Declare(
     tbb
     GIT_REPOSITORY https://github.com/uxlfoundation/oneTBB.git
-    GIT_TAG bef1519a4216d77042637c3f48af2c060a5213d1
+#commit 9d4578723827f31defd79389819a5fbf659577f7 (HEAD -> master, origin/master, origin/HEAD)
+#Author: Konstantin Boyarinov <konstantin.boyarinov@intel.com>
+#Date:   Fri Jan 24 23:23:59 2025 +0200
+#    Add explicit deduction guides for blocked_nd_range (#1525)
+    GIT_TAG 9d4578723827f31defd79389819a5fbf659577f7
     CMAKE_ARGS "-DTBB_TEST:BOOL=OFF -DTBB_EXAMPLES:BOOL=OFF -DTBB_BENCH:BOOL=OFF"
     OVERRIDE_FIND_PACKAGE
   )

From dac6f01ae0f7b3be02f85c08db30d867aedb8cd3 Mon Sep 17 00:00:00 2001
From: Uwe Dolinsky <uwe@codeplay.com>
Date: Tue, 28 Jan 2025 10:09:32 +0000
Subject: [PATCH 43/48] [NATIVECPU] clang-format and removed one inline

---
 source/adapters/native_cpu/enqueue.cpp        | 40 +++++++++----------
 source/adapters/native_cpu/event.cpp          |  2 +-
 source/adapters/native_cpu/event.hpp          |  2 +-
 source/adapters/native_cpu/kernel.hpp         |  2 +-
 .../adapters/native_cpu/nativecpu_state.hpp   |  6 +--
 source/adapters/native_cpu/threadpool.hpp     |  6 ++-
 6 files changed, 31 insertions(+), 27 deletions(-)

diff --git a/source/adapters/native_cpu/enqueue.cpp b/source/adapters/native_cpu/enqueue.cpp
index d670626258..96100ac945 100644
--- a/source/adapters/native_cpu/enqueue.cpp
+++ b/source/adapters/native_cpu/enqueue.cpp
@@ -184,13 +184,13 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(
       for (unsigned g2 = 0; g2 < numWG2; g2++) {
         for (unsigned g1 = 0; g1 < numWG1; g1++) {
           Tasks.schedule([state, kernel = *hKernel, numWG0, g1, g2,
-                                numParallelThreads](size_t threadId) mutable {
-                for (unsigned g0 = 0; g0 < numWG0; g0++) {
-                  kernel.handleLocalArgs(numParallelThreads, threadId);
-                  state.update(g0, g1, g2);
-                  kernel._subhandler(kernel.getArgs().data(), &state);
-                }
-              });
+                          numParallelThreads](size_t threadId) mutable {
+            for (unsigned g0 = 0; g0 < numWG0; g0++) {
+              kernel.handleLocalArgs(numParallelThreads, threadId);
+              state.update(g0, g1, g2);
+              kernel._subhandler(kernel.getArgs().data(), &state);
+            }
+          });
         }
       }
     } else {
@@ -217,23 +217,23 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(
       for (unsigned thread = 0; groupsPerThread && thread < numParallelThreads;
            thread++) {
         Tasks.schedule([groups, thread, groupsPerThread,
-                              kernel = *hKernel](size_t threadId) {
-              for (unsigned i = 0; i < groupsPerThread; i++) {
-                auto index = thread * groupsPerThread + i;
-                groups[index](threadId, kernel);
-              }
-            });
+                        kernel = *hKernel](size_t threadId) {
+          for (unsigned i = 0; i < groupsPerThread; i++) {
+            auto index = thread * groupsPerThread + i;
+            groups[index](threadId, kernel);
+          }
+        });
       }
       // schedule the remaining tasks
       if (remainder) {
         Tasks.schedule([groups, remainder,
-                              scheduled = numParallelThreads * groupsPerThread,
-                              kernel = *hKernel](size_t threadId) {
-              for (unsigned i = 0; i < remainder; i++) {
-                auto index = scheduled + i;
-                groups[index](threadId, kernel);
-              }
-            });
+                        scheduled = numParallelThreads * groupsPerThread,
+                        kernel = *hKernel](size_t threadId) {
+          for (unsigned i = 0; i < remainder; i++) {
+            auto index = scheduled + i;
+            groups[index](threadId, kernel);
+          }
+        });
       }
     }
   }
diff --git a/source/adapters/native_cpu/event.cpp b/source/adapters/native_cpu/event.cpp
index 6a5ff41e28..13afc9f66a 100644
--- a/source/adapters/native_cpu/event.cpp
+++ b/source/adapters/native_cpu/event.cpp
@@ -11,9 +11,9 @@
 #include "ur_api.h"
 
 #include "common.hpp"
+#include "device.hpp"
 #include "event.hpp"
 #include "queue.hpp"
-#include "device.hpp"
 #include <cstdint>
 #include <mutex>
 
diff --git a/source/adapters/native_cpu/event.hpp b/source/adapters/native_cpu/event.hpp
index 4b51875afd..ac3a322e21 100644
--- a/source/adapters/native_cpu/event.hpp
+++ b/source/adapters/native_cpu/event.hpp
@@ -9,12 +9,12 @@
 //===----------------------------------------------------------------------===//
 #pragma once
 #include "common.hpp"
+#include "threadpool.hpp"
 #include "ur_api.h"
 #include <cstdint>
 #include <future>
 #include <mutex>
 #include <vector>
-#include "threadpool.hpp"
 
 struct ur_event_handle_t_ : RefCounted {
 
diff --git a/source/adapters/native_cpu/kernel.hpp b/source/adapters/native_cpu/kernel.hpp
index 59779e439c..6ca3eae777 100644
--- a/source/adapters/native_cpu/kernel.hpp
+++ b/source/adapters/native_cpu/kernel.hpp
@@ -28,7 +28,7 @@ struct local_arg_info_t {
 
 struct ur_kernel_handle_t_ : RefCounted {
 
-  inline ur_kernel_handle_t_(ur_program_handle_t hProgram, const char *name,
+  ur_kernel_handle_t_(ur_program_handle_t hProgram, const char *name,
                       nativecpu_task_t subhandler)
       : hProgram(hProgram), _name{name}, _subhandler{std::move(subhandler)} {}
 
diff --git a/source/adapters/native_cpu/nativecpu_state.hpp b/source/adapters/native_cpu/nativecpu_state.hpp
index c802229326..b9109f647e 100644
--- a/source/adapters/native_cpu/nativecpu_state.hpp
+++ b/source/adapters/native_cpu/nativecpu_state.hpp
@@ -20,9 +20,9 @@ struct state {
   size_t MNumGroups[3];
   size_t MGlobalOffset[3];
   uint32_t NumSubGroups, SubGroup_id, SubGroup_local_id, SubGroup_size;
-  inline state(size_t globalR0, size_t globalR1, size_t globalR2, size_t localR0,
-        size_t localR1, size_t localR2, size_t globalO0, size_t globalO1,
-        size_t globalO2)
+  inline state(size_t globalR0, size_t globalR1, size_t globalR2,
+               size_t localR0, size_t localR1, size_t localR2, size_t globalO0,
+               size_t globalO1, size_t globalO2)
       : MGlobal_range{globalR0, globalR1, globalR2},
         MWorkGroup_size{localR0, localR1, localR2},
         MNumGroups{globalR0 / localR0, globalR1 / localR1, globalR2 / localR2},
diff --git a/source/adapters/native_cpu/threadpool.hpp b/source/adapters/native_cpu/threadpool.hpp
index f2f907f4b5..8b589e4cd4 100644
--- a/source/adapters/native_cpu/threadpool.hpp
+++ b/source/adapters/native_cpu/threadpool.hpp
@@ -213,6 +213,7 @@ using simple_threadpool_t = threadpool_interface<detail::simple_thread_pool>;
 class TasksInfo_TP {
   using FType = std::future<void>;
   std::vector<FType> futures;
+
 public:
   inline void schedule(FType &&f) { futures.emplace_back(std::move(f)); }
   inline void wait() {
@@ -257,12 +258,15 @@ struct TBB_threadpool {
 
 class TBB_TasksInfo {
   TBB_threadpool *tp;
+
 public:
   inline void wait() { tp->tasks.wait(); }
   TBB_TasksInfo(TBB_threadpool &t) : tp(&t) {}
 };
 
-template <> struct Scheduler<TBB_threadpool> :  Scheduler_base<TBB_threadpool, TBB_TasksInfo> {
+template <>
+struct Scheduler<TBB_threadpool>
+    : Scheduler_base<TBB_threadpool, TBB_TasksInfo> {
   using Scheduler_base<TBB_threadpool, TBB_TasksInfo>::Scheduler_base;
   template <class T> inline void schedule(T &&task) {
     ref.tasks.run(std::function<void()>([=]() mutable {

From 2abe90aa0670fd64b0b5fb8dadab45bb5716b448 Mon Sep 17 00:00:00 2001
From: Uwe Dolinsky <uwe@codeplay.com>
Date: Tue, 28 Jan 2025 12:10:04 +0000
Subject: [PATCH 44/48] [NATIVECPU] clang-format

---
 source/adapters/native_cpu/nativecpu_state.hpp | 2 +-
 source/adapters/native_cpu/threadpool.hpp      | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/source/adapters/native_cpu/nativecpu_state.hpp b/source/adapters/native_cpu/nativecpu_state.hpp
index b9109f647e..68743c33cf 100644
--- a/source/adapters/native_cpu/nativecpu_state.hpp
+++ b/source/adapters/native_cpu/nativecpu_state.hpp
@@ -43,7 +43,7 @@ struct state {
   }
 
   inline void update(size_t group0, size_t group1, size_t group2, size_t local0,
-              size_t local1, size_t local2) {
+                     size_t local1, size_t local2) {
     MWorkGroup_id[0] = group0;
     MWorkGroup_id[1] = group1;
     MWorkGroup_id[2] = group2;
diff --git a/source/adapters/native_cpu/threadpool.hpp b/source/adapters/native_cpu/threadpool.hpp
index 8b589e4cd4..9b2abc45dc 100644
--- a/source/adapters/native_cpu/threadpool.hpp
+++ b/source/adapters/native_cpu/threadpool.hpp
@@ -287,5 +287,5 @@ using threadpool_t = TBB_threadpool;
 namespace native_cpu {
 using tasksinfo_t = TasksInfo_TP;
 using threadpool_t = simple_threadpool_t;
-}
+} // namespace native_cpu
 #endif

From bddd831c1efdae43e566007182ccfd7fc1273f09 Mon Sep 17 00:00:00 2001
From: Uwe Dolinsky <uwe@codeplay.com>
Date: Tue, 28 Jan 2025 12:47:34 +0000
Subject: [PATCH 45/48] [NATIVECPU] removed inline

---
 source/adapters/native_cpu/enqueue.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/source/adapters/native_cpu/enqueue.cpp b/source/adapters/native_cpu/enqueue.cpp
index 96100ac945..7f5241a43c 100644
--- a/source/adapters/native_cpu/enqueue.cpp
+++ b/source/adapters/native_cpu/enqueue.cpp
@@ -26,8 +26,8 @@ struct NDRDescT {
   RangeT GlobalOffset;
   RangeT GlobalSize;
   RangeT LocalSize;
-  inline NDRDescT(uint32_t WorkDim, const size_t *GlobalWorkOffset,
-                  const size_t *GlobalWorkSize, const size_t *LocalWorkSize)
+  NDRDescT(uint32_t WorkDim, const size_t *GlobalWorkOffset,
+           const size_t *GlobalWorkSize, const size_t *LocalWorkSize)
       : WorkDim(WorkDim) {
     for (uint32_t I = 0; I < WorkDim; I++) {
       GlobalOffset[I] = GlobalWorkOffset[I];

From e7d9ff72dc759a92e2df105e025510c8f6e4eb1d Mon Sep 17 00:00:00 2001
From: Uwe Dolinsky <uwe@codeplay.com>
Date: Tue, 28 Jan 2025 12:59:53 +0000
Subject: [PATCH 46/48] [NATIVECPU] added separate if statement for clarity

---
 source/adapters/native_cpu/enqueue.cpp | 21 +++++++++++----------
 1 file changed, 11 insertions(+), 10 deletions(-)

diff --git a/source/adapters/native_cpu/enqueue.cpp b/source/adapters/native_cpu/enqueue.cpp
index 7f5241a43c..22cf26602f 100644
--- a/source/adapters/native_cpu/enqueue.cpp
+++ b/source/adapters/native_cpu/enqueue.cpp
@@ -213,18 +213,19 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(
       }
       auto numGroups = groups.size();
       auto groupsPerThread = numGroups / numParallelThreads;
-      auto remainder = numGroups % numParallelThreads;
-      for (unsigned thread = 0; groupsPerThread && thread < numParallelThreads;
-           thread++) {
-        Tasks.schedule([groups, thread, groupsPerThread,
-                        kernel = *hKernel](size_t threadId) {
-          for (unsigned i = 0; i < groupsPerThread; i++) {
-            auto index = thread * groupsPerThread + i;
-            groups[index](threadId, kernel);
-          }
-        });
+      if (groupsPerThread) {
+        for (unsigned thread = 0; thread < numParallelThreads; thread++) {
+          Tasks.schedule([groups, thread, groupsPerThread,
+                          kernel = *hKernel](size_t threadId) {
+            for (unsigned i = 0; i < groupsPerThread; i++) {
+              auto index = thread * groupsPerThread + i;
+              groups[index](threadId, kernel);
+            }
+          });
+        }
       }
       // schedule the remaining tasks
+      auto remainder = numGroups % numParallelThreads;
       if (remainder) {
         Tasks.schedule([groups, remainder,
                         scheduled = numParallelThreads * groupsPerThread,

From 4e9bd6780a9303b13ac45dd9684692019adb8bd1 Mon Sep 17 00:00:00 2001
From: Uwe Dolinsky <uwe@codeplay.com>
Date: Tue, 28 Jan 2025 13:08:22 +0000
Subject: [PATCH 47/48] [NATIVECPU] renamed wait to wait_all

---
 source/adapters/native_cpu/event.cpp      | 2 +-
 source/adapters/native_cpu/threadpool.hpp | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/source/adapters/native_cpu/event.cpp b/source/adapters/native_cpu/event.cpp
index 13afc9f66a..b03591dc57 100644
--- a/source/adapters/native_cpu/event.cpp
+++ b/source/adapters/native_cpu/event.cpp
@@ -139,7 +139,7 @@ void ur_event_handle_t_::wait() {
   if (done) {
     return;
   }
-  this->futures.wait();
+  this->futures.wait_all();
   queue->removeEvent(this);
   done = true;
   // The callback may need to acquire the lock, so we unlock it here
diff --git a/source/adapters/native_cpu/threadpool.hpp b/source/adapters/native_cpu/threadpool.hpp
index 9b2abc45dc..b38ccad83a 100644
--- a/source/adapters/native_cpu/threadpool.hpp
+++ b/source/adapters/native_cpu/threadpool.hpp
@@ -216,7 +216,7 @@ class TasksInfo_TP {
 
 public:
   inline void schedule(FType &&f) { futures.emplace_back(std::move(f)); }
-  inline void wait() {
+  inline void wait_all() {
     for (auto &f : futures)
       f.wait();
   }
@@ -260,7 +260,7 @@ class TBB_TasksInfo {
   TBB_threadpool *tp;
 
 public:
-  inline void wait() { tp->tasks.wait(); }
+  inline void wait_all() { tp->tasks.wait(); }
   TBB_TasksInfo(TBB_threadpool &t) : tp(&t) {}
 };
 

From 20668ff36e92b5050d3a8c587523f45568c34d39 Mon Sep 17 00:00:00 2001
From: Uwe Dolinsky <uwe@codeplay.com>
Date: Mon, 3 Feb 2025 20:30:51 +0000
Subject: [PATCH 48/48] [NATIVECPU] move task into TBB lambda capture

---
 source/adapters/native_cpu/threadpool.hpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/source/adapters/native_cpu/threadpool.hpp b/source/adapters/native_cpu/threadpool.hpp
index b38ccad83a..a016131f67 100644
--- a/source/adapters/native_cpu/threadpool.hpp
+++ b/source/adapters/native_cpu/threadpool.hpp
@@ -268,8 +268,8 @@ template <>
 struct Scheduler<TBB_threadpool>
     : Scheduler_base<TBB_threadpool, TBB_TasksInfo> {
   using Scheduler_base<TBB_threadpool, TBB_TasksInfo>::Scheduler_base;
-  template <class T> inline void schedule(T &&task) {
-    ref.tasks.run(std::function<void()>([=]() mutable {
+  template <class T> inline void schedule(T &&task_) {
+    ref.tasks.run(std::function<void()>([task = std::move(task_)]() mutable {
       auto thread_id = tbb::this_task_arena::current_thread_index();
       assert(thread_id >= 0 &&
              thread_id < oneapi::tbb::info::default_concurrency());