Add support for cooperative groups and grid synchronization #2307

Draft
wants to merge 42 commits into base: develop
Changes from 1 commit
Commits
42 commits
cadb1af
Add CreateTaskCooperativeKernel, grid sync and HelloWorldGridSyncExam…
MichaelVarvarin Jul 1, 2024
0453b9f
Add comment about issue with grid sync on CUDA Clang
MichaelVarvarin Jul 26, 2024
df5d4fd
Add cooperative kernel launch and grid sync support for HIP
MichaelVarvarin Jul 26, 2024
978f195
Add m_cooperativeLaunch device prop and runtime check for CG support …
MichaelVarvarin Jul 29, 2024
b7aee7a
Clean errors in previous commit
MichaelVarvarin Aug 1, 2024
3acaf11
Clean formatting
MichaelVarvarin Aug 2, 2024
5e9c5ce
Add getMaxActiveBlocks to get the maximum allowed block count for lau…
MichaelVarvarin Aug 7, 2024
a7a9b03
Rename maxActiveBlocks trait
MichaelVarvarin Aug 10, 2024
08e18cb
Fix issues from bad rebase
MichaelVarvarin Aug 12, 2024
c6dc462
Add cooperative kernel launch, grid sync and getMaxActiveBlocks for A…
MichaelVarvarin Aug 12, 2024
e88e0c1
Clean formatting
MichaelVarvarin Aug 13, 2024
bf9ddf0
Correct the comment
MichaelVarvarin Aug 13, 2024
148c521
Add cooperative kernel launch, grid sync and getMaxActiveBlocks for O…
MichaelVarvarin Aug 20, 2024
db43ec9
Clean formatting
MichaelVarvarin Aug 20, 2024
5f0bfa8
Update comments
MichaelVarvarin Aug 20, 2024
9247156
Add include gridSync OMP to alpaka.hpp
MichaelVarvarin Aug 27, 2024
907c0b9
Add cooperative kernel launch, grid sync and getMaxActiveBlocks for s…
MichaelVarvarin Aug 27, 2024
0455158
Clean warnings for CPU accelerators
MichaelVarvarin Sep 9, 2024
f7da2fe
Clean warnings for the HIP accelerator
MichaelVarvarin Sep 9, 2024
b20ddf2
Merge SYCL changes (#2)
MichaelVarvarin Nov 11, 2024
7cf652e
Revert "Merge SYCL changes (#2)" (#3)
MichaelVarvarin Nov 11, 2024
aaed855
Add cooperative groups and grid sync functionality to SYCL
MichaelVarvarin Nov 11, 2024
fab1f24
Rewrite example to use executeForEachAccTag
MichaelVarvarin Nov 12, 2024
1222309
Change from using concepts to interface due to rebase
MichaelVarvarin Nov 12, 2024
f2af59a
Implement grid sync and cooperative kernel functionality for Intel TBB
MichaelVarvarin Nov 16, 2024
215a292
BUGFIX: Change m_cooperativeLaunch prop for SYCL to true m_cooperativ…
MichaelVarvarin Nov 16, 2024
c95b136
Change AccCpuTbbBlocks object back to being local for each thread
MichaelVarvarin Nov 17, 2024
53e51c2
BUGFIX: add includes to achieve compilation
MichaelVarvarin Nov 17, 2024
0ff0779
Add gridSyncTest to test syncGridThreads()
MichaelVarvarin Nov 17, 2024
1070851
Clearer code and comments for helloWorldGridSync
MichaelVarvarin Nov 18, 2024
d83dfed
Add check if grid sync was successful
MichaelVarvarin Nov 18, 2024
0115af2
Remove template from example kernel
MichaelVarvarin Nov 18, 2024
07aed98
Try different parameter expansion
MichaelVarvarin Nov 18, 2024
62a58cc
Revert "Try different parameter expansion"
MichaelVarvarin Nov 18, 2024
4d914bd
Formatting changes
MichaelVarvarin Nov 18, 2024
ceac5d3
Formatting changes
MichaelVarvarin Nov 18, 2024
0a990d4
BUGFIX: Add explicit conversions
MichaelVarvarin Nov 19, 2024
dfcfb28
BUGFIX: add missing argument in GridSyncTest
MichaelVarvarin Nov 25, 2024
1203c09
BUGFIX: fix SYCL compilation error by passing additional template arg…
MichaelVarvarin Nov 25, 2024
310ad88
BUGFIX: add runtime check for cooperative launch support
MichaelVarvarin Nov 26, 2024
05effa5
Suppress GCC warning about pointer to object conversion
MichaelVarvarin Nov 26, 2024
9568256
Change format specifiers
MichaelVarvarin Nov 26, 2024
Implement grid sync and cooperative kernel functionality for Intel TBB
MichaelVarvarin committed Nov 16, 2024
commit f2af59ae8f26b6b93d32c19607e376efce1e0f84
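Before the per-file diff, a minimal kernel-side sketch of what this feature enables may help orientation. It is not part of the diff: the kernel body is modelled on the HelloWorldGridSync example and the syncGridThreads() test referenced in the commit messages, and the free-function form alpaka::syncGridThreads(acc) is assumed here rather than taken from this commit.

    #include <alpaka/alpaka.hpp>

    #include <cstdint>
    #include <cstdio>

    // Hypothetical kernel: every block writes its index, the whole grid synchronizes,
    // then every block reads the value written by its neighbouring block.
    struct HelloWorldGridSyncKernel
    {
        template<typename TAcc>
        ALPAKA_FN_ACC auto operator()(TAcc const& acc, std::uint32_t* data) const -> void
        {
            auto const gridBlockIdx = alpaka::getIdx<alpaka::Grid, alpaka::Blocks>(acc)[0];
            auto const gridBlockExtent = alpaka::getWorkDiv<alpaka::Grid, alpaka::Blocks>(acc)[0];

            data[gridBlockIdx] = static_cast<std::uint32_t>(gridBlockIdx);

            // Wait until every block of the grid has written its slot.
            alpaka::syncGridThreads(acc);

            auto const neighbour = data[(gridBlockIdx + 1u) % gridBlockExtent];
            printf("Block %u sees %u\n", static_cast<unsigned>(gridBlockIdx), static_cast<unsigned>(neighbour));
        }
    };

Reading across the grid like this is only safe because the barrier guarantees all writes have happened, which is exactly what the TBB backend below has to provide.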
38 changes: 37 additions & 1 deletion include/alpaka/acc/AccCpuTbbBlocks.hpp
@@ -13,6 +13,7 @@
#include "alpaka/block/shared/st/BlockSharedMemStMember.hpp"
#include "alpaka/block/sync/BlockSyncNoOp.hpp"
#include "alpaka/core/DemangleTypeNames.hpp"
#include "alpaka/grid/GridSyncCpuTbbBlocks.hpp"
#include "alpaka/idx/bt/IdxBtZero.hpp"
#include "alpaka/idx/gb/IdxGbRef.hpp"
#include "alpaka/intrinsic/IntrinsicCpu.hpp"
@@ -62,6 +63,7 @@ namespace alpaka
, public BlockSharedMemDynMember<>
, public BlockSharedMemStMember<>
, public BlockSyncNoOp
, public GridSyncBarrierTbb<TIdx>
, public IntrinsicCpu
, public MemFenceCpu
# ifdef ALPAKA_DISABLE_VENDOR_RNG
@@ -94,6 +96,7 @@ namespace alpaka
, BlockSharedMemDynMember<>(blockSharedMemDynSizeBytes)
, BlockSharedMemStMember<>(staticMemBegin(), staticMemCapacity())
, m_gridBlockIdx(Vec<TDim, TIdx>::zeros())
, GridSyncBarrierTbb<TIdx>(getWorkDiv<Grid, Threads>(workDiv).prod())
{
}

@@ -148,7 +151,7 @@ namespace alpaka
// m_globalMemSizeBytes
getMemBytes(dev),
// m_cooperativeLaunch
false};
true};
}
};

@@ -199,6 +202,39 @@ namespace alpaka
}
};

//! The CPU TBB block accelerator execution cooperative task type trait specialization.
template<typename TDim, typename TIdx, typename TWorkDiv, typename TKernelFnObj, typename... TArgs>
struct CreateTaskCooperativeKernel<AccCpuTbbBlocks<TDim, TIdx>, TWorkDiv, TKernelFnObj, TArgs...>
{
ALPAKA_FN_HOST static auto createTaskCooperativeKernel(
TWorkDiv const& workDiv,
TKernelFnObj const& kernelFnObj,
TArgs&&... args)
{
if(workDiv.m_blockThreadExtent.prod() != static_cast<TIdx>(1u))
{
throw std::runtime_error(
"The given work division is not valid for a single thread Acc: "
+ getAccName<AccCpuTbbBlocks<TDim, TIdx>>() + ". Threads per block should be 1!");
}
auto const gridBlockExtent = getWorkDiv<Grid, Blocks>(workDiv);
auto const maxBlocks = tbb::this_task_arena::max_concurrency();
if(gridBlockExtent.prod() > static_cast<TIdx>(maxBlocks))
{
throw std::runtime_error(
"The number of requested blocks is larger than maximuma of the device for TBB "
"accelerator. Requested: "
+ std::to_string(gridBlockExtent.prod()) + ", maximum allowed: " + std::to_string(maxBlocks)
+ ". Use getMaxActiveBlocks().");
}

return TaskKernelCpuTbbBlocks<TDim, TIdx, TKernelFnObj, TArgs...>(
workDiv,
kernelFnObj,
std::forward<TArgs>(args)...);
}
};

//! The CPU TBB block execution task platform type trait specialization.
template<typename TDim, typename TIdx>
struct PlatformType<AccCpuTbbBlocks<TDim, TIdx>>
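The trait above enforces the two launch-time constraints of this backend: exactly one thread per block, and no more grid blocks than tbb::this_task_arena::max_concurrency(). A hedged host-side sketch of a conforming launch follows; the free-function wrapper alpaka::createTaskCooperativeKernel and the HelloWorldGridSyncKernel from the sketch further up are assumptions, not part of this diff.

    #include <alpaka/alpaka.hpp>

    #include <oneapi/tbb/task_arena.h>

    #include <algorithm>
    #include <cstddef>
    #include <cstdint>

    // Hypothetical host-side launch that satisfies the checks in CreateTaskCooperativeKernel.
    void launchCooperatively(std::uint32_t* devData)
    {
        using Dim = alpaka::DimInt<1u>;
        using Idx = std::size_t;
        using Acc = alpaka::AccCpuTbbBlocks<Dim, Idx>;
        using Vec = alpaka::Vec<Dim, Idx>;

        auto const platform = alpaka::Platform<Acc>{};
        auto const dev = alpaka::getDevByIdx(platform, 0);
        auto queue = alpaka::Queue<Acc, alpaka::Blocking>{dev};

        // At most as many blocks as TBB can run concurrently, and exactly one thread per block.
        auto const blocks = std::min<Idx>(Idx{8}, static_cast<Idx>(tbb::this_task_arena::max_concurrency()));
        auto const workDiv = alpaka::WorkDivMembers<Dim, Idx>{Vec{blocks}, Vec{Idx{1}}, Vec{Idx{1}}};

        auto task = alpaka::createTaskCooperativeKernel<Acc>(workDiv, HelloWorldGridSyncKernel{}, devData);
        alpaka::enqueue(queue, task);
        alpaka::wait(queue);
    }

Oversubscribing the grid is not merely a performance issue here: a block that is never scheduled would never reach the barrier, so the whole grid would deadlock, which is why the trait throws instead of silently clamping.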
2 changes: 2 additions & 0 deletions include/alpaka/alpaka.hpp
@@ -57,6 +57,7 @@
#include "alpaka/core/Align.hpp"
#include "alpaka/core/AlignedAlloc.hpp"
#include "alpaka/core/Assert.hpp"
#include "alpaka/core/BarrierTbb.h"
#include "alpaka/core/BarrierThread.hpp"
#include "alpaka/core/BoostPredef.hpp"
#include "alpaka/core/ClipCast.hpp"
@@ -108,6 +109,7 @@
// grid
#include "alpaka/grid/GridSyncBarrierCpuOmp.hpp"
#include "alpaka/grid/GridSyncBarrierCpuThread.hpp"
#include "alpaka/grid/GridSyncCpuTbbBlocks.hpp"
#include "alpaka/grid/GridSyncGpuCudaHip.hpp"
#include "alpaka/grid/GridSyncNoOp.hpp"
#include "alpaka/grid/Traits.hpp"
92 changes: 92 additions & 0 deletions include/alpaka/core/BarrierTbb.h
@@ -0,0 +1,92 @@
/* Copyright 2024 Mykhailo Varvarin
* SPDX-License-Identifier: MPL-2.0
*/

#pragma once

// Comment this out to switch to the tbb::task::suspend implementation, which polls with a short sleep instead of properly waiting on a condition variable.
#define ALPAKA_TBB_BARRIER_USE_MUTEX

#ifdef ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLED

#include "alpaka/grid/Traits.hpp"
#include "alpaka/core/Common.hpp"

#ifdef ALPAKA_TBB_BARRIER_USE_MUTEX
# include <condition_variable>
# include <mutex>
#else
# include <oneapi/tbb/task.h>
# include <atomic>
# include <unistd.h> // usleep is used in the polling fallback below
#endif

namespace alpaka::core
{
namespace tbb
{
//! A self-resetting barrier.
template<typename TIdx>
class BarrierThread final
{
public:
explicit BarrierThread(TIdx const& threadCount)
: m_threadCount(threadCount)
, m_curThreadCount(threadCount)
, m_generation(0)
{
}

//! Waits for all the other threads to reach the barrier.
auto wait() -> void
{
TIdx const generationWhenEnteredTheWait = m_generation;
#ifdef ALPAKA_TBB_BARRIER_USE_MUTEX
std::unique_lock<std::mutex> lock(m_mtxBarrier);
#endif
if(--m_curThreadCount == 0)
{
m_curThreadCount = m_threadCount;
++m_generation;
#ifdef ALPAKA_TBB_BARRIER_USE_MUTEX
m_cvAllThreadsReachedBarrier.notify_all();
#endif
}
else
{
#ifdef ALPAKA_TBB_BARRIER_USE_MUTEX
m_cvAllThreadsReachedBarrier.wait(
lock,
[this, generationWhenEnteredTheWait] { return generationWhenEnteredTheWait != m_generation; });
#else
oneapi::tbb::task::suspend([&generationWhenEnteredTheWait, this] (oneapi::tbb::task::suspend_point tag)
{
while(generationWhenEnteredTheWait == this->m_generation)
{
// sleep for 100 microseconds
usleep(100);
}
oneapi::tbb::task::resume(tag);
});
#endif
}
}

private:
#ifdef ALPAKA_TBB_BARRIER_USE_MUTEX
std::mutex m_mtxBarrier;
std::condition_variable m_cvAllThreadsReachedBarrier;
#endif
const TIdx m_threadCount;
#ifdef ALPAKA_TBB_BARRIER_USE_MUTEX
TIdx m_curThreadCount;
TIdx m_generation;
#else
std::atomic<TIdx> m_curThreadCount;
std::atomic<TIdx> m_generation;
oneapi::tbb::task::suspend_point m_tag;
#endif
};
} // namespace tbb
} // namespace alpaka::core

#endif
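To make the self-resetting behaviour concrete, here is a small host-only demonstration of the default (mutex/condition-variable) barrier. It assumes alpaka is built with the TBB backend so that ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLED is defined; the generation counter is what lets the same barrier object be reused for consecutive grid synchronizations without races.

    #include <alpaka/core/BarrierTbb.h>

    #include <cstddef>
    #include <cstdio>
    #include <thread>
    #include <vector>

    int main()
    {
        constexpr std::size_t threadCount = 4;
        alpaka::core::tbb::BarrierThread<std::size_t> barrier(threadCount);

        std::vector<std::thread> threads;
        for(std::size_t i = 0; i < threadCount; ++i)
            threads.emplace_back(
                [&barrier, i]
                {
                    std::printf("thread %zu reached the first barrier\n", i);
                    barrier.wait(); // nobody proceeds until all four threads have arrived
                    barrier.wait(); // the barrier has reset itself, so it can be reused immediately
                    std::printf("thread %zu passed both barriers\n", i);
                });

        for(auto& t : threads)
            t.join();
        return 0;
    }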
43 changes: 43 additions & 0 deletions include/alpaka/grid/GridSyncCpuTbbBlocks.hpp
@@ -0,0 +1,43 @@
/* Copyright 2024 Mykhailo Varvarin
* SPDX-License-Identifier: MPL-2.0
*/

#pragma once

#include "alpaka/core/Common.hpp"
#include "alpaka/grid/Traits.hpp"

#ifdef ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLED
# include "alpaka/core/BarrierTbb.h"

namespace alpaka
{
//! The thread id map barrier grid synchronization for TBB.
template<typename TIdx>
class GridSyncBarrierTbb : public interface::Implements<ConceptGridSync, GridSyncBarrierTbb<TIdx>>
{
public:
using Barrier = core::tbb::BarrierThread<TIdx>;

ALPAKA_FN_HOST explicit GridSyncBarrierTbb(TIdx const& gridThreadCount) : m_barrier(gridThreadCount)
{
}

Barrier mutable m_barrier;
};

namespace trait
{
template<typename TIdx>
struct SyncGridThreads<GridSyncBarrierTbb<TIdx>>
{
ALPAKA_FN_HOST static auto syncGridThreads(GridSyncBarrierTbb<TIdx> const& gridSync) -> void
{
gridSync.m_barrier.wait();
}
};

} // namespace trait
} // namespace alpaka

#endif
24 changes: 20 additions & 4 deletions include/alpaka/kernel/TaskKernelCpuTbbBlocks.hpp
@@ -86,15 +86,15 @@ namespace alpaka
tbb::this_task_arena::isolate(
[&]
{
AccCpuTbbBlocks<TDim, TIdx> acc(
*static_cast<WorkDivMembers<TDim, TIdx> const*>(this),
blockSharedMemDynSizeBytes);

tbb::parallel_for(
static_cast<TIdx>(0),
static_cast<TIdx>(numBlocksInGrid),
[&](TIdx i)
{
AccCpuTbbBlocks<TDim, TIdx> acc(
*static_cast<WorkDivMembers<TDim, TIdx> const*>(this),
blockSharedMemDynSizeBytes);

acc.m_gridBlockIdx
= mapIdx<TDim::value>(Vec<DimInt<1u>, TIdx>(static_cast<TIdx>(i)), gridBlockExtent);

@@ -177,6 +177,22 @@ namespace alpaka
return kernelFunctionAttributes;
}
};

//! The CPU TBB blocks get max active blocks for cooperative kernel specialization.
template<typename TDev, typename TKernelFnObj, typename TDim, typename TIdx, typename... TArgs>
struct MaxActiveBlocks<AccCpuTbbBlocks<TDim, TIdx>, TDev, TKernelFnObj, TDim, TIdx, TArgs...>
{
ALPAKA_FN_HOST static auto getMaxActiveBlocks(
TKernelFnObj const& /*kernelFnObj*/,
TDev const& device,
alpaka::Vec<TDim, TIdx> const& /*blockThreadExtent*/,
alpaka::Vec<TDim, TIdx> const& /*threadElemExtent*/,
TArgs const&... /*args*/) -> int
{
return static_cast<int>(
trait::GetAccDevProps<AccCpuTbbBlocks<TDim, TIdx>>::getAccDevProps(device).m_multiProcessorCount);
}
};
} // namespace trait
} // namespace alpaka

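For this backend the MaxActiveBlocks specialization above simply reports m_multiProcessorCount, i.e. the number of TBB worker threads. A short sketch of how a caller might size the grid with it follows; the free-function spelling alpaka::getMaxActiveBlocks<Acc>(...) and its argument order are inferred from the trait and from the "Use getMaxActiveBlocks()" hint in the error message above, so treat them as assumptions.

    #include <alpaka/alpaka.hpp>

    #include <cstddef>
    #include <cstdint>

    // Hypothetical: size the grid from the reported block limit (reuses the kernel sketched earlier).
    auto buildCooperativeWorkDiv()
    {
        using Dim = alpaka::DimInt<1u>;
        using Idx = std::size_t;
        using Acc = alpaka::AccCpuTbbBlocks<Dim, Idx>;
        using Vec = alpaka::Vec<Dim, Idx>;

        auto const platform = alpaka::Platform<Acc>{};
        auto const dev = alpaka::getDevByIdx(platform, 0);

        Vec const blockThreadExtent{Idx{1}}; // this accelerator requires one thread per block
        Vec const threadElemExtent{Idx{1}};

        // Kernel arguments do not influence the result for the TBB backend, but are part of the assumed signature.
        int const maxBlocks = alpaka::getMaxActiveBlocks<Acc>(
            HelloWorldGridSyncKernel{},
            dev,
            blockThreadExtent,
            threadElemExtent,
            static_cast<std::uint32_t*>(nullptr));

        return alpaka::WorkDivMembers<Dim, Idx>{Vec{static_cast<Idx>(maxBlocks)}, blockThreadExtent, threadElemExtent};
    }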