[NATIVECPU] NativeCPU with optional oneTBB backend #2627

Open
wants to merge 66 commits into base: main

Changes from 9 commits

Commits (66)
da5500c
[SYCLNATIVECPU] inline native_cpu adapter functions
uwedolinsky Aug 26, 2024
0b8b0f7
[NATIVECPU] use size_t, reserve vector size
uwedolinsky Aug 28, 2024
88db20a
[NATIVECPU] use strcpy_s
uwedolinsky Oct 17, 2024
f64fc08
Merge remote-tracking branch 'origin/main' into uwe/inline
uwedolinsky Oct 21, 2024
0b89dee
[NATIVECPU] only enqueue when groupsPerThread is >0
uwedolinsky Oct 21, 2024
d111337
[NATIVECPU] changed filemode back
uwedolinsky Oct 21, 2024
a1166d8
[NATIVECPU] added threadpool file to CMakeList
uwedolinsky Oct 22, 2024
47b12a4
[SYCLNATIVECPU] threadID now managed by kernel enqueue
uwedolinsky Oct 22, 2024
4b04ce6
[SYCLNATIVECPU] file mode changed back
uwedolinsky Oct 22, 2024
5406b39
[NATIVECPU] Simple TBB backend
uwedolinsky Oct 22, 2024
2e8ae3f
[NATIVECPU] changed back filemode
uwedolinsky Oct 22, 2024
3c64917
[NATIVECPU] fixed merge with optimisation branch
uwedolinsky Oct 22, 2024
eb1b1cc
Merge remote-tracking branch 'origin/main' into uwe/tbb_integration
uwedolinsky Oct 22, 2024
d5cf2c9
[NATIVECPU] fixed scheduling
uwedolinsky Oct 23, 2024
ba9b2c5
[NATIVECPU] more shared code
uwedolinsky Oct 23, 2024
5cf59d2
[NATIVECPU] reversed filemode change
uwedolinsky Oct 23, 2024
7077d1a
[NATIVECPU] update oneTBB tag
uwedolinsky Oct 24, 2024
a8e599c
[NATIVECPU] added required include not needed by Windows
uwedolinsky Oct 24, 2024
4c64575
[NATIVECPU] removed strcpy_s because it's not supported by gcc
uwedolinsky Oct 24, 2024
4905c44
[NATIVECPU] added system headers first
uwedolinsky Oct 24, 2024
e426b3f
[NATIVECPU] cmake fix
uwedolinsky Oct 24, 2024
4200f30
[NATIVECPU] removed GIT_SHALLOW
uwedolinsky Oct 25, 2024
53f4494
[NATIVECPU] turn CMAKE_INCLUDE_CURRENT_DIR off for tbb
uwedolinsky Nov 1, 2024
2ca6a3f
[NATIVECPU] workaround for oneTBB casting away const qualifiers
uwedolinsky Nov 1, 2024
835ce2f
[NATIVECPU] workaround for oneTBB casting away const qualifiers
uwedolinsky Nov 1, 2024
42b1e6e
[NATIVECPU] remove potentially unneeded cmake
uwedolinsky Nov 1, 2024
ceee902
Merge commit 'fa8cc8ec16c1a2cf0926cc64026edc6a254ff0c2' into uwe/tbb_…
uwedolinsky Nov 1, 2024
9c6fb07
[NATIVECPU] oneTBB disabled by default
uwedolinsky Nov 4, 2024
de98e9b
[NATIVECPU] tbb to oneTBB
uwedolinsky Nov 4, 2024
8dbe123
Merge commit 'b0a9e2be61ad42d3447f1f246120ab25119a03e0' into uwe/tbb_…
uwedolinsky Nov 4, 2024
51e915a
[NATIVECPU] improved comment
uwedolinsky Nov 4, 2024
dd8b027
[NATIVECPU] tbb to oneTBB
uwedolinsky Nov 4, 2024
4a5238f
[NATIVECPU] tbb to oneTBB
uwedolinsky Nov 4, 2024
3f697ae
[NATIVECPU] num_threads with oneTBB
uwedolinsky Nov 4, 2024
5f687cc
[NATIVECPU] added comment to cmake
uwedolinsky Nov 6, 2024
b651d29
[NATIVECPU] waiting for tasks when using local args
uwedolinsky Nov 6, 2024
4f64538
[NATIVECPU] using old task ids with tbb (WIP)
uwedolinsky Nov 6, 2024
6330a29
[NATIVECPU] changed back filemode
uwedolinsky Nov 6, 2024
f566f80
[NATIVECPU] removed unneeded clear
uwedolinsky Nov 7, 2024
e06b72b
Merge commit '2858a8a28d0b6524a3b2b0e25a597d1c8295ce9d' into uwe/tbb_…
uwedolinsky Nov 7, 2024
2584cd4
Merge commit '09ae26af4e4e4301177db704b3b109ecd388c846' into uwe/tbb_…
uwedolinsky Nov 8, 2024
c1cd18a
Merge commit 'ed9fe09f9987bbe86715f191e6dbe5695ebf0306' into uwe/tbb_…
uwedolinsky Nov 8, 2024
b78eb83
Merge commit 'c94dbc8fa25f62b812a3db707b78f1c217b33bc3' into uwe/tbb_…
uwedolinsky Nov 11, 2024
f121eb6
[NATIVECPU] fixed merge from main
uwedolinsky Nov 13, 2024
7357652
[NATIVECPU] removed MS extensions
uwedolinsky Nov 13, 2024
a3e52e6
[NATIVECPU] fix merge with events update
uwedolinsky Nov 14, 2024
56afb9a
[NATIVECPU] revert noise
uwedolinsky Nov 14, 2024
488b641
[NATIVECPU] fix integer size warnings
uwedolinsky Nov 14, 2024
fefafcb
Merge commit '38ee6ce2a0400573c0c7c5da782bc32ff578fcc4' into uwe/tbb_…
uwedolinsky Nov 25, 2024
75288ce
[NATIVECPU] update oneTBB tag
uwedolinsky Nov 26, 2024
fc992e3
[NATIVECPU] use oneTBB UXL github
uwedolinsky Nov 29, 2024
ea8a19c
Merge commit '5f4a5a27e8192ee41ce21c8fb140d770c779af78' into uwe/tbb_…
uwedolinsky Dec 5, 2024
469f27f
[NATIVECPU] undefine _DEBUG in release builds for tbb
uwedolinsky Dec 12, 2024
a67282b
[NATIVECPU] merge
uwedolinsky Jan 6, 2025
7dc9416
Merge commit '029a977bc76d1216783c69bfdb18d0db465ea399' into uwe/tbb_…
uwedolinsky Jan 20, 2025
7ed8432
Merge commit 'f058cb230c65fe8094f74043d0c9afd5ba0e8325' into uwe/tbb_…
uwedolinsky Jan 22, 2025
cea6883
Merge commit 'b841691699393dd2375e987c3d38d5f59c3e35cf' into uwe/tbb_…
uwedolinsky Jan 27, 2025
979072f
[NATIVECPU] oneTBB bump
uwedolinsky Jan 27, 2025
f035142
Merge commit '0bb6789f0113ea937d861fd67fd677b91ecdeb8b' into uwe/tbb_…
uwedolinsky Jan 27, 2025
dac6f01
[NATIVECPU] clang-format and removed one inline
uwedolinsky Jan 28, 2025
2abe90a
[NATIVECPU] clang-format
uwedolinsky Jan 28, 2025
bddd831
[NATIVECPU] removed inline
uwedolinsky Jan 28, 2025
e7d9ff7
[NATIVECPU] added separate if statement for clarity
uwedolinsky Jan 28, 2025
4e9bd67
[NATIVECPU] renamed wait to wait_all
uwedolinsky Jan 28, 2025
20668ff
[NATIVECPU] move
uwedolinsky Feb 3, 2025
6226347
[NATIVECPU] merge with main
uwedolinsky Feb 5, 2025
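Note on the threadpool interface: the commit history above introduces a threadpool.hpp header with a schedule/wait_all interface (see "renamed wait to wait_all" and "threadID now managed by kernel enqueue") plus an optional oneTBB-backed implementation that is disabled by default. The header itself is not among the hunks shown below, so the following is only a minimal sketch of what a oneTBB-backed pool with such an interface might look like; the class name tbb_threadpool and the reliance on oneapi::tbb::task_group and this_task_arena are assumptions, not the PR's actual code.

// Hypothetical sketch only: a schedule/wait_all style pool backed by oneTBB.
// The real threadpool.hpp added by this PR is not shown in the diff below.
#include <cstddef>
#include <functional>
#include <oneapi/tbb/task_arena.h>
#include <oneapi/tbb/task_group.h>

namespace native_cpu {

class tbb_threadpool {
public:
  using task_t = std::function<void(size_t /*threadId*/)>;

  // Hand the task to oneTBB; forward the worker's arena index as its thread id.
  void schedule(task_t Task) {
    MGroup.run([Task = std::move(Task)]() {
      const int Idx = oneapi::tbb::this_task_arena::current_thread_index();
      // Fall back to 0 if the worker is not bound to an arena.
      Task(Idx < 0 ? 0 : static_cast<size_t>(Idx));
    });
  }

  // Block until every task scheduled so far has finished.
  void wait_all() { MGroup.wait(); }

  // Degree of parallelism oneTBB exposes for the current arena.
  size_t num_threads() const {
    return static_cast<size_t>(
        oneapi::tbb::this_task_arena::max_concurrency());
  }

private:
  oneapi::tbb::task_group MGroup;
};

} // namespace native_cpu

Keeping the interface identical across backends would allow the backend to be selected at CMake configure time, which is consistent with the "oneTBB disabled by default" commit.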
1 change: 1 addition & 0 deletions source/adapters/native_cpu/CMakeLists.txt
@@ -34,6 +34,7 @@ add_ur_adapter(${TARGET_NAME}
${CMAKE_CURRENT_SOURCE_DIR}/queue.cpp
${CMAKE_CURRENT_SOURCE_DIR}/queue.hpp
${CMAKE_CURRENT_SOURCE_DIR}/sampler.cpp
${CMAKE_CURRENT_SOURCE_DIR}/threadpool.hpp
${CMAKE_CURRENT_SOURCE_DIR}/ur_interface_loader.cpp
${CMAKE_CURRENT_SOURCE_DIR}/usm_p2p.cpp
${CMAKE_CURRENT_SOURCE_DIR}/virtual_mem.cpp
36 changes: 20 additions & 16 deletions source/adapters/native_cpu/enqueue.cpp
@@ -25,8 +25,8 @@ struct NDRDescT {
RangeT GlobalOffset;
RangeT GlobalSize;
RangeT LocalSize;
NDRDescT(uint32_t WorkDim, const size_t *GlobalWorkOffset,
const size_t *GlobalWorkSize, const size_t *LocalWorkSize)
inline NDRDescT(uint32_t WorkDim, const size_t *GlobalWorkOffset,
const size_t *GlobalWorkSize, const size_t *LocalWorkSize)
: WorkDim(WorkDim) {
for (uint32_t I = 0; I < WorkDim; I++) {
GlobalOffset[I] = GlobalWorkOffset[I];
@@ -52,8 +52,8 @@ struct NDRDescT {
} // namespace native_cpu

#ifdef NATIVECPU_USE_OCK
static native_cpu::state getResizedState(const native_cpu::NDRDescT &ndr,
size_t itemsPerThread) {
static inline native_cpu::state getResizedState(const native_cpu::NDRDescT &ndr,
size_t itemsPerThread) {
native_cpu::state resized_state(
ndr.GlobalSize[0], ndr.GlobalSize[1], ndr.GlobalSize[2], itemsPerThread,
ndr.LocalSize[1], ndr.LocalSize[2], ndr.GlobalOffset[0],
@@ -168,7 +168,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(
}
// Peel the remaining work items. Since the local size is 1, we iterate
// over the work groups.
for (unsigned g0 = new_num_work_groups_0 * itemsPerThread; g0 < numWG0;
for (size_t g0 = new_num_work_groups_0 * itemsPerThread; g0 < numWG0;
g0++) {
state.update(g0, g1, g2);
hKernel->_subhandler(hKernel->_args.data(), &state);
@@ -178,25 +178,28 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(

} else {
// We are running a parallel_for over an nd_range

size_t threadId = 0;
if (numWG1 * numWG2 >= numParallelThreads) {
// Dimensions 1 and 2 have enough work, split them across the threadpool
for (unsigned g2 = 0; g2 < numWG2; g2++) {
for (unsigned g1 = 0; g1 < numWG1; g1++) {
Tasks.schedule([state, kernel = *hKernel, numWG0, g1, g2,
numParallelThreads](size_t threadId) mutable {
Tasks.schedule([state, kernel = *hKernel, numWG0, g1, g2, threadId,
numParallelThreads](size_t /*threadId*/) mutable {
for (unsigned g0 = 0; g0 < numWG0; g0++) {
kernel.handleLocalArgs(numParallelThreads, threadId);
state.update(g0, g1, g2);
kernel._subhandler(kernel._args.data(), &state);
}
});
if (++threadId == numParallelThreads)
threadId = 0;
}
}
} else {
// Split dimension 0 across the threadpool
// Here we try to create groups of workgroups in order to reduce
// synchronization overhead
groups.reserve(numWG2 * numWG1 * numWG0);
for (unsigned g2 = 0; g2 < numWG2; g2++) {
for (unsigned g1 = 0; g1 < numWG1; g1++) {
for (unsigned g0 = 0; g0 < numWG0; g0++) {
@@ -213,24 +216,25 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(
auto numGroups = groups.size();
auto groupsPerThread = numGroups / numParallelThreads;
auto remainder = numGroups % numParallelThreads;
for (unsigned thread = 0; thread < numParallelThreads; thread++) {
unsigned thread = 0;
for (; groupsPerThread && thread < numParallelThreads; thread++) {
Tasks.schedule(
[&groups, thread, groupsPerThread, hKernel](size_t threadId) {
[&groups, thread, groupsPerThread, hKernel](size_t /*threadId*/) {
for (unsigned i = 0; i < groupsPerThread; i++) {
auto index = thread * groupsPerThread + i;
groups[index](threadId, *hKernel);
groups[index](thread /*Id*/, *hKernel);
}
});
}

// schedule the remaining tasks
if (remainder) {
Tasks.schedule([&groups, remainder,
Tasks.schedule([&groups, remainder, thread,
scheduled = numParallelThreads * groupsPerThread,
hKernel](size_t threadId) {
hKernel](size_t /*threadId*/) {
for (unsigned i = 0; i < remainder; i++) {
auto index = scheduled + i;
groups[index](threadId, *hKernel);
groups[index](thread /*Id*/, *hKernel);
}
});
}
@@ -407,7 +411,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferFill(
// TODO: error checking
// TODO: handle async
void *startingPtr = hBuffer->_mem + offset;
unsigned steps = size / patternSize;
size_t steps = size / patternSize;
for (unsigned i = 0; i < steps; i++) {
memcpy(static_cast<int8_t *>(startingPtr) + i * patternSize, pPattern,
patternSize);
@@ -553,7 +557,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMFill(
break;
}
default: {
for (unsigned int step{0}; step < size; step += patternSize) {
for (size_t step{0}; step < size; step += patternSize) {
auto *dest =
reinterpret_cast<void *>(reinterpret_cast<uint8_t *>(ptr) + step);
memcpy(dest, pPattern, patternSize);
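The second scheduling branch above flattens all work groups into the groups vector and then carves it into per-thread chunks: groupsPerThread groups for each of the numParallelThreads tasks, plus one trailing task for the remainder, with the chunk loop skipped entirely when there are fewer groups than threads. A standalone sketch of that partitioning, with schedule standing in for Tasks.schedule and runGroup for invoking one work group (both names are placeholders, not from the PR):

// Sketch of the groupsPerThread / remainder partitioning used in
// urEnqueueKernelLaunch above; names other than groupsPerThread,
// remainder and numParallelThreads are illustrative only.
#include <cstddef>
#include <functional>

void scheduleGroups(size_t numGroups, size_t numParallelThreads,
                    const std::function<void(std::function<void()>)> &schedule,
                    const std::function<void(size_t, size_t)> &runGroup) {
  const size_t groupsPerThread = numGroups / numParallelThreads;
  const size_t remainder = numGroups % numParallelThreads;

  size_t thread = 0;
  // Give each thread a contiguous chunk; skipped when groupsPerThread == 0,
  // i.e. when there are fewer work groups than threads.
  for (; groupsPerThread && thread < numParallelThreads; thread++) {
    schedule([=] {
      for (size_t i = 0; i < groupsPerThread; i++)
        runGroup(thread * groupsPerThread + i, thread);
    });
  }

  // Groups that do not divide evenly go into a single trailing task.
  if (remainder) {
    const size_t scheduled = numParallelThreads * groupsPerThread;
    schedule([=] {
      for (size_t i = 0; i < remainder; i++)
        runGroup(scheduled + i, thread);
    });
  }
}

Every index in [0, numGroups) is covered exactly once: the chunk tasks handle [0, numParallelThreads * groupsPerThread) and the remainder task handles the rest, matching the scheduled = numParallelThreads * groupsPerThread capture in the diff.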
12 changes: 6 additions & 6 deletions source/adapters/native_cpu/kernel.hpp
@@ -20,7 +20,7 @@ namespace native_cpu {
struct NativeCPUArgDesc {
void *MPtr;

NativeCPUArgDesc(void *Ptr) : MPtr(Ptr){};
inline NativeCPUArgDesc(void *Ptr) : MPtr(Ptr){};
};

} // namespace native_cpu
@@ -33,17 +33,17 @@ using nativecpu_task_t = std::function<nativecpu_kernel_t>;
struct local_arg_info_t {
uint32_t argIndex;
size_t argSize;
local_arg_info_t(uint32_t argIndex, size_t argSize)
inline local_arg_info_t(uint32_t argIndex, size_t argSize)
: argIndex(argIndex), argSize(argSize) {}
};

struct ur_kernel_handle_t_ : RefCounted {

ur_kernel_handle_t_(ur_program_handle_t hProgram, const char *name,
inline ur_kernel_handle_t_(ur_program_handle_t hProgram, const char *name,
nativecpu_task_t subhandler)
: hProgram(hProgram), _name{name}, _subhandler{std::move(subhandler)} {}

ur_kernel_handle_t_(const ur_kernel_handle_t_ &other)
inline ur_kernel_handle_t_(const ur_kernel_handle_t_ &other)
: hProgram(other.hProgram), _name(other._name),
_subhandler(other._subhandler), _args(other._args),
_localArgInfo(other._localArgInfo), _localMemPool(other._localMemPool),
@@ -52,12 +52,12 @@ struct ur_kernel_handle_t_ : RefCounted {
incrementReferenceCount();
}

~ur_kernel_handle_t_() {
inline ~ur_kernel_handle_t_() {
if (decrementReferenceCount() == 0) {
free(_localMemPool);
}
}
ur_kernel_handle_t_(ur_program_handle_t hProgram, const char *name,
inline ur_kernel_handle_t_(ur_program_handle_t hProgram, const char *name,
nativecpu_task_t subhandler,
std::optional<native_cpu::WGSize_t> ReqdWGSize,
std::optional<native_cpu::WGSize_t> MaxWGSize,
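The kernel.hpp hunks above keep the copy constructor incrementing the reference count and the destructor freeing _localMemPool only when the count drops to zero. That matters because urEnqueueKernelLaunch captures the kernel by value in the tasks it schedules (kernel = *hKernel), so several copies end up sharing one local-memory pool. A minimal sketch of that ownership pattern, with hypothetical names and without the thread-safe counter a real implementation would need:

// Illustrative only: copies share one pool, and only the last copy frees it.
#include <cstdlib>

struct SharedPoolHandle {
  unsigned *RefCount;  // shared counter, analogous to RefCounted
  void *LocalMemPool;  // shared allocation, analogous to _localMemPool

  SharedPoolHandle() : RefCount(new unsigned(1)), LocalMemPool(nullptr) {}

  // Copying (e.g. capturing the kernel by value in Tasks.schedule) bumps the count.
  SharedPoolHandle(const SharedPoolHandle &Other)
      : RefCount(Other.RefCount), LocalMemPool(Other.LocalMemPool) {
    ++*RefCount;
  }
  SharedPoolHandle &operator=(const SharedPoolHandle &) = delete;

  // Only destroying the last copy releases the shared pool.
  ~SharedPoolHandle() {
    if (--*RefCount == 0) {
      free(LocalMemPool);
      delete RefCount;
    }
  }
};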
6 changes: 3 additions & 3 deletions source/adapters/native_cpu/nativecpu_state.hpp
@@ -20,7 +20,7 @@ struct state {
size_t MNumGroups[3];
size_t MGlobalOffset[3];
uint32_t NumSubGroups, SubGroup_id, SubGroup_local_id, SubGroup_size;
state(size_t globalR0, size_t globalR1, size_t globalR2, size_t localR0,
inline state(size_t globalR0, size_t globalR1, size_t globalR2, size_t localR0,
size_t localR1, size_t localR2, size_t globalO0, size_t globalO1,
size_t globalO2)
: MGlobal_range{globalR0, globalR1, globalR2}, MWorkGroup_size{localR0,
@@ -43,7 +43,7 @@ struct state {
SubGroup_size = 1;
}

void update(size_t group0, size_t group1, size_t group2, size_t local0,
inline void update(size_t group0, size_t group1, size_t group2, size_t local0,
size_t local1, size_t local2) {
MWorkGroup_id[0] = group0;
MWorkGroup_id[1] = group1;
@@ -59,7 +59,7 @@ struct state {
MWorkGroup_size[2] * MWorkGroup_id[2] + MLocal_id[2] + MGlobalOffset[2];
}

void update(size_t group0, size_t group1, size_t group2) {
inline void update(size_t group0, size_t group1, size_t group2) {
MWorkGroup_id[0] = group0;
MWorkGroup_id[1] = group1;
MWorkGroup_id[2] = group2;
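For reference, the update overloads above implement the standard ND-range index mapping: in each dimension the global id is the work-group size times the work-group id, plus the local id, plus the global offset. A tiny worked example of that arithmetic:

// global_id = MWorkGroup_size * MWorkGroup_id + MLocal_id + MGlobalOffset
#include <cassert>
#include <cstddef>

int main() {
  const size_t WorkGroupSize = 8, WorkGroupId = 3, LocalId = 5, GlobalOffset = 2;
  const size_t GlobalId = WorkGroupSize * WorkGroupId + LocalId + GlobalOffset;
  assert(GlobalId == 31); // 8 * 3 + 5 + 2
  return 0;
}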