Add support for cooperative groups and grid synchronization #2307

Draft
wants to merge 42 commits into base: develop
Changes from 1 commit
Commits
42 commits
cadb1af
Add CreateTaskCooperativeKernel, grid sync and HelloWorldGridSyncExam…
MichaelVarvarin Jul 1, 2024
0453b9f
Add comment about issue with grid sync on CUDA Clang
MichaelVarvarin Jul 26, 2024
df5d4fd
Add cooperative kernel launch and grid sync support for HIP
MichaelVarvarin Jul 26, 2024
978f195
Add m_cooperativeLaunch device prop and runtime check for CG support …
MichaelVarvarin Jul 29, 2024
b7aee7a
Clean errors in previous commit
MichaelVarvarin Aug 1, 2024
3acaf11
Clean formatting
MichaelVarvarin Aug 2, 2024
5e9c5ce
Add getMaxActiveBlocks to get the maximum allowed block count for lau…
MichaelVarvarin Aug 7, 2024
a7a9b03
Rename maxActiveBlocks trait
MichaelVarvarin Aug 10, 2024
08e18cb
Fix issues from bad rebase
MichaelVarvarin Aug 12, 2024
c6dc462
Add cooperative kernel launch, grid sync and getMaxActiveBlocks for A…
MichaelVarvarin Aug 12, 2024
e88e0c1
Clean formatting
MichaelVarvarin Aug 13, 2024
bf9ddf0
Correct the comment
MichaelVarvarin Aug 13, 2024
148c521
Add cooperative kernel launch, grid sync and getMaxActiveBlocks for O…
MichaelVarvarin Aug 20, 2024
db43ec9
Clean formatting
MichaelVarvarin Aug 20, 2024
5f0bfa8
Update comments
MichaelVarvarin Aug 20, 2024
9247156
Add include gridSync OMP to alpaka.hpp
MichaelVarvarin Aug 27, 2024
907c0b9
Add cooperative kernel launch, grid sync and getMaxActiveBlocks for s…
MichaelVarvarin Aug 27, 2024
0455158
Clean warnings for CPU accelerators
MichaelVarvarin Sep 9, 2024
f7da2fe
Clean warnings for the HIP accelerator
MichaelVarvarin Sep 9, 2024
b20ddf2
Merge SYCL changes (#2)
MichaelVarvarin Nov 11, 2024
7cf652e
Revert "Merge SYCL changes (#2)" (#3)
MichaelVarvarin Nov 11, 2024
aaed855
Add cooperative groups and grid sync functionality to SYCL
MichaelVarvarin Nov 11, 2024
fab1f24
Rewrite example to use executeForEachAccTag
MichaelVarvarin Nov 12, 2024
1222309
Change from using concepts to interface due to rebase
MichaelVarvarin Nov 12, 2024
f2af59a
Implement grid sync and cooperative kernel functionality for Intel TBB
MichaelVarvarin Nov 16, 2024
215a292
BUGFIX: Change m_cooperativeLaunch prop for SYCL to true m_cooperativ…
MichaelVarvarin Nov 16, 2024
c95b136
Change AccCpuTbbBlocks object back to being local for each thread
MichaelVarvarin Nov 17, 2024
53e51c2
BUGFIX: add includes to achieve compilation
MichaelVarvarin Nov 17, 2024
0ff0779
Add gridSyncTest to test syncGridThreads()
MichaelVarvarin Nov 17, 2024
1070851
Clearer code and comments for helloWorldGridSync
MichaelVarvarin Nov 18, 2024
d83dfed
Add check if grid sync was successful
MichaelVarvarin Nov 18, 2024
0115af2
Remove template from example kernel
MichaelVarvarin Nov 18, 2024
07aed98
Try different parameter expansion
MichaelVarvarin Nov 18, 2024
62a58cc
Revert "Try different parameter expansion"
MichaelVarvarin Nov 18, 2024
4d914bd
Formatting changes
MichaelVarvarin Nov 18, 2024
ceac5d3
Formatting changes
MichaelVarvarin Nov 18, 2024
0a990d4
BUGFIX: Add explicit conversions
MichaelVarvarin Nov 19, 2024
dfcfb28
BUGFIX: add missing argument in GridSyncTest
MichaelVarvarin Nov 25, 2024
1203c09
BUGFIX: fix SYCL compilation error by passing additional template arg…
MichaelVarvarin Nov 25, 2024
310ad88
BUGFIX: add runtime check for cooperative launch support
MichaelVarvarin Nov 26, 2024
05effa5
Suppress GCC warning about pointer to object conversion
MichaelVarvarin Nov 26, 2024
9568256
Change format specifiers
MichaelVarvarin Nov 26, 2024
Implement grid sync and cooperative kernel functionality for Intel TBB
MichaelVarvarin committed Nov 16, 2024
commit f2af59ae8f26b6b93d32c19607e376efce1e0f84
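Before the per-file diff, a minimal kernel-side sketch of what this feature enables may help orientation. It is not part of the diff: the kernel body is modelled on the HelloWorldGridSync example and the syncGridThreads() test referenced in the commit messages, and the free-function form alpaka::syncGridThreads(acc) is assumed here rather than taken from this commit.

    #include <alpaka/alpaka.hpp>

    #include <cstdint>
    #include <cstdio>

    // Hypothetical kernel: every block writes its index, the whole grid synchronizes,
    // then every block reads the value written by its neighbouring block.
    struct HelloWorldGridSyncKernel
    {
        template<typename TAcc>
        ALPAKA_FN_ACC auto operator()(TAcc const& acc, std::uint32_t* data) const -> void
        {
            auto const gridBlockIdx = alpaka::getIdx<alpaka::Grid, alpaka::Blocks>(acc)[0];
            auto const gridBlockExtent = alpaka::getWorkDiv<alpaka::Grid, alpaka::Blocks>(acc)[0];

            data[gridBlockIdx] = static_cast<std::uint32_t>(gridBlockIdx);

            // Wait until every block of the grid has written its slot.
            alpaka::syncGridThreads(acc);

            auto const neighbour = data[(gridBlockIdx + 1u) % gridBlockExtent];
            printf("Block %u sees %u\n", static_cast<unsigned>(gridBlockIdx), static_cast<unsigned>(neighbour));
        }
    };

Reading across the grid like this is only safe because the barrier guarantees all writes have happened, which is exactly what the TBB backend below has to provide.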
38 changes: 37 additions & 1 deletion include/alpaka/acc/AccCpuTbbBlocks.hpp
@@ -13,6 +13,7 @@
#include "alpaka/block/shared/st/BlockSharedMemStMember.hpp"
#include "alpaka/block/sync/BlockSyncNoOp.hpp"
#include "alpaka/core/DemangleTypeNames.hpp"
#include "alpaka/grid/GridSyncCpuTbbBlocks.hpp"
#include "alpaka/idx/bt/IdxBtZero.hpp"
#include "alpaka/idx/gb/IdxGbRef.hpp"
#include "alpaka/intrinsic/IntrinsicCpu.hpp"
@@ -62,6 +63,7 @@ namespace alpaka
, public BlockSharedMemDynMember<>
, public BlockSharedMemStMember<>
, public BlockSyncNoOp
, public GridSyncBarrierTbb<TIdx>
, public IntrinsicCpu
, public MemFenceCpu
# ifdef ALPAKA_DISABLE_VENDOR_RNG
@@ -94,6 +96,7 @@ namespace alpaka
, BlockSharedMemDynMember<>(blockSharedMemDynSizeBytes)
, BlockSharedMemStMember<>(staticMemBegin(), staticMemCapacity())
, m_gridBlockIdx(Vec<TDim, TIdx>::zeros())
, GridSyncBarrierTbb<TIdx>(getWorkDiv<Grid, Threads>(workDiv).prod())
{
}

@@ -148,7 +151,7 @@ namespace alpaka
// m_globalMemSizeBytes
getMemBytes(dev),
// m_cooperativeLaunch
false};
true};
}
};

@@ -199,6 +202,39 @@ namespace alpaka
}
};

//! The CPU TBB block accelerator execution cooperative task type trait specialization.
template<typename TDim, typename TIdx, typename TWorkDiv, typename TKernelFnObj, typename... TArgs>
struct CreateTaskCooperativeKernel<AccCpuTbbBlocks<TDim, TIdx>, TWorkDiv, TKernelFnObj, TArgs...>
{
ALPAKA_FN_HOST static auto createTaskCooperativeKernel(
TWorkDiv const& workDiv,
TKernelFnObj const& kernelFnObj,
TArgs&&... args)
{
if(workDiv.m_blockThreadExtent.prod() != static_cast<TIdx>(1u))
{
throw std::runtime_error(
"The given work division is not valid for a single thread Acc: "
+ getAccName<AccCpuTbbBlocks<TDim, TIdx>>() + ". Threads per block should be 1!");
}
auto const gridBlockExtent = getWorkDiv<Grid, Blocks>(workDiv);
auto const maxBlocks = tbb::this_task_arena::max_concurrency();
if(gridBlockExtent.prod() > static_cast<TIdx>(maxBlocks))
{
throw std::runtime_error(
"The number of requested blocks is larger than maximuma of the device for TBB "
"accelerator. Requested: "
+ std::to_string(gridBlockExtent.prod()) + ", maximum allowed: " + std::to_string(maxBlocks)
+ ". Use getMaxActiveBlocks().");
}

return TaskKernelCpuTbbBlocks<TDim, TIdx, TKernelFnObj, TArgs...>(
workDiv,
kernelFnObj,
std::forward<TArgs>(args)...);
}
};

//! The CPU TBB block execution task platform type trait specialization.
template<typename TDim, typename TIdx>
struct PlatformType<AccCpuTbbBlocks<TDim, TIdx>>
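The trait above enforces the two launch-time constraints of this backend: exactly one thread per block, and no more grid blocks than tbb::this_task_arena::max_concurrency(). A hedged host-side sketch of a conforming launch follows; the free-function wrapper alpaka::createTaskCooperativeKernel and the HelloWorldGridSyncKernel from the sketch further up are assumptions, not part of this diff.

    #include <alpaka/alpaka.hpp>

    #include <oneapi/tbb/task_arena.h>

    #include <algorithm>
    #include <cstddef>
    #include <cstdint>

    // Hypothetical host-side launch that satisfies the checks in CreateTaskCooperativeKernel.
    void launchCooperatively(std::uint32_t* devData)
    {
        using Dim = alpaka::DimInt<1u>;
        using Idx = std::size_t;
        using Acc = alpaka::AccCpuTbbBlocks<Dim, Idx>;
        using Vec = alpaka::Vec<Dim, Idx>;

        auto const platform = alpaka::Platform<Acc>{};
        auto const dev = alpaka::getDevByIdx(platform, 0);
        auto queue = alpaka::Queue<Acc, alpaka::Blocking>{dev};

        // At most as many blocks as TBB can run concurrently, and exactly one thread per block.
        auto const blocks = std::min<Idx>(Idx{8}, static_cast<Idx>(tbb::this_task_arena::max_concurrency()));
        auto const workDiv = alpaka::WorkDivMembers<Dim, Idx>{Vec{blocks}, Vec{Idx{1}}, Vec{Idx{1}}};

        auto task = alpaka::createTaskCooperativeKernel<Acc>(workDiv, HelloWorldGridSyncKernel{}, devData);
        alpaka::enqueue(queue, task);
        alpaka::wait(queue);
    }

Oversubscribing the grid is not merely a performance issue here: a block that is never scheduled would never reach the barrier, so the whole grid would deadlock, which is why the trait throws instead of silently clamping.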
2 changes: 2 additions & 0 deletions include/alpaka/alpaka.hpp
@@ -57,6 +57,7 @@
#include "alpaka/core/Align.hpp"
#include "alpaka/core/AlignedAlloc.hpp"
#include "alpaka/core/Assert.hpp"
#include "alpaka/core/BarrierTbb.h"
#include "alpaka/core/BarrierThread.hpp"
#include "alpaka/core/BoostPredef.hpp"
#include "alpaka/core/ClipCast.hpp"
@@ -108,6 +109,7 @@
// grid
#include "alpaka/grid/GridSyncBarrierCpuOmp.hpp"
#include "alpaka/grid/GridSyncBarrierCpuThread.hpp"
#include "alpaka/grid/GridSyncCpuTbbBlocks.hpp"
#include "alpaka/grid/GridSyncGpuCudaHip.hpp"
#include "alpaka/grid/GridSyncNoOp.hpp"
#include "alpaka/grid/Traits.hpp"
92 changes: 92 additions & 0 deletions include/alpaka/core/BarrierTbb.h
@@ -0,0 +1,92 @@
/* Copyright 2024 Mykhailo Varvarin
* SPDX-License-Identifier: MPL-2.0
*/

#pragma once

// Comment this out to switch to the tbb::task::suspend implementation, which polls with a short sleep instead of properly waiting on a condition variable.
#define ALPAKA_TBB_BARRIER_USE_MUTEX

#ifdef ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLED

#include "alpaka/grid/Traits.hpp"
#include "alpaka/core/Common.hpp"

#ifdef ALPAKA_TBB_BARRIER_USE_MUTEX
# include <condition_variable>
# include <mutex>
#else
# include <oneapi/tbb/task.h>
# include <atomic>
# include <unistd.h> // usleep is used in the polling fallback below
#endif

namespace alpaka::core
{
namespace tbb
{
//! A self-resetting barrier.
template<typename TIdx>
class BarrierThread final
{
public:
explicit BarrierThread(TIdx const& threadCount)
: m_threadCount(threadCount)
, m_curThreadCount(threadCount)
, m_generation(0)
{
}

//! Waits for all the other threads to reach the barrier.
auto wait() -> void
{
TIdx const generationWhenEnteredTheWait = m_generation;
#ifdef ALPAKA_TBB_BARRIER_USE_MUTEX
std::unique_lock<std::mutex> lock(m_mtxBarrier);
#endif
if(--m_curThreadCount == 0)
{
m_curThreadCount = m_threadCount;
++m_generation;
#ifdef ALPAKA_TBB_BARRIER_USE_MUTEX
m_cvAllThreadsReachedBarrier.notify_all();
#endif
}
else
{
#ifdef ALPAKA_TBB_BARRIER_USE_MUTEX
m_cvAllThreadsReachedBarrier.wait(
lock,
[this, generationWhenEnteredTheWait] { return generationWhenEnteredTheWait != m_generation; });
#else
oneapi::tbb::task::suspend([&generationWhenEnteredTheWait, this] (oneapi::tbb::task::suspend_point tag)
{
while(generationWhenEnteredTheWait == this->m_generation)
{
// sleep for 100 microseconds
usleep(100);
}
oneapi::tbb::task::resume(tag);
});
#endif
}
}

private:
#ifdef ALPAKA_TBB_BARRIER_USE_MUTEX
std::mutex m_mtxBarrier;
std::condition_variable m_cvAllThreadsReachedBarrier;
#endif
const TIdx m_threadCount;
#ifdef ALPAKA_TBB_BARRIER_USE_MUTEX
TIdx m_curThreadCount;
TIdx m_generation;
#else
std::atomic<TIdx> m_curThreadCount;
std::atomic<TIdx> m_generation;
oneapi::tbb::task::suspend_point m_tag;
#endif
};
} // namespace tbb
} // namespace alpaka::core

#endif
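To make the self-resetting behaviour concrete, here is a small host-only demonstration of the default (mutex/condition-variable) barrier. It assumes alpaka is built with the TBB backend so that ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLED is defined; the generation counter is what lets the same barrier object be reused for consecutive grid synchronizations without races.

    #include <alpaka/core/BarrierTbb.h>

    #include <cstddef>
    #include <cstdio>
    #include <thread>
    #include <vector>

    int main()
    {
        constexpr std::size_t threadCount = 4;
        alpaka::core::tbb::BarrierThread<std::size_t> barrier(threadCount);

        std::vector<std::thread> threads;
        for(std::size_t i = 0; i < threadCount; ++i)
            threads.emplace_back(
                [&barrier, i]
                {
                    std::printf("thread %zu reached the first barrier\n", i);
                    barrier.wait(); // nobody proceeds until all four threads have arrived
                    barrier.wait(); // the barrier has reset itself, so it can be reused immediately
                    std::printf("thread %zu passed both barriers\n", i);
                });

        for(auto& t : threads)
            t.join();
        return 0;
    }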
43 changes: 43 additions & 0 deletions include/alpaka/grid/GridSyncCpuTbbBlocks.hpp
@@ -0,0 +1,43 @@
/* Copyright 2024 Mykhailo Varvarin
* SPDX-License-Identifier: MPL-2.0
*/

#pragma once

#include "alpaka/core/Common.hpp"
#include "alpaka/grid/Traits.hpp"

#ifdef ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLED
# include "alpaka/core/BarrierTbb.h"

namespace alpaka
{
//! The thread id map barrier grid synchronization for TBB.
template<typename TIdx>
class GridSyncBarrierTbb : public interface::Implements<ConceptGridSync, GridSyncBarrierTbb<TIdx>>
{
public:
using Barrier = core::tbb::BarrierThread<TIdx>;

ALPAKA_FN_HOST explicit GridSyncBarrierTbb(TIdx const& gridThreadCount) : m_barrier(gridThreadCount)
{
}

Barrier mutable m_barrier;
};

namespace trait
{
template<typename TIdx>
struct SyncGridThreads<GridSyncBarrierTbb<TIdx>>
{
ALPAKA_FN_HOST static auto syncGridThreads(GridSyncBarrierTbb<TIdx> const& gridSync) -> void
{
gridSync.m_barrier.wait();
}
};

} // namespace trait
} // namespace alpaka

#endif
24 changes: 20 additions & 4 deletions include/alpaka/kernel/TaskKernelCpuTbbBlocks.hpp
@@ -86,15 +86,15 @@ namespace alpaka
tbb::this_task_arena::isolate(
[&]
{
AccCpuTbbBlocks<TDim, TIdx> acc(
*static_cast<WorkDivMembers<TDim, TIdx> const*>(this),
blockSharedMemDynSizeBytes);

tbb::parallel_for(
static_cast<TIdx>(0),
static_cast<TIdx>(numBlocksInGrid),
[&](TIdx i)
{
AccCpuTbbBlocks<TDim, TIdx> acc(
*static_cast<WorkDivMembers<TDim, TIdx> const*>(this),
blockSharedMemDynSizeBytes);

acc.m_gridBlockIdx
= mapIdx<TDim::value>(Vec<DimInt<1u>, TIdx>(static_cast<TIdx>(i)), gridBlockExtent);

@@ -177,6 +177,22 @@ namespace alpaka
return kernelFunctionAttributes;
}
};

//! The CPU TBB blocks get max active blocks for cooperative kernel specialization.
template<typename TDev, typename TKernelFnObj, typename TDim, typename TIdx, typename... TArgs>
struct MaxActiveBlocks<AccCpuTbbBlocks<TDim, TIdx>, TDev, TKernelFnObj, TDim, TIdx, TArgs...>
{
ALPAKA_FN_HOST static auto getMaxActiveBlocks(
TKernelFnObj const& /*kernelFnObj*/,
TDev const& device,
alpaka::Vec<TDim, TIdx> const& /*blockThreadExtent*/,
alpaka::Vec<TDim, TIdx> const& /*threadElemExtent*/,
TArgs const&... /*args*/) -> int
{
return static_cast<int>(
trait::GetAccDevProps<AccCpuTbbBlocks<TDim, TIdx>>::getAccDevProps(device).m_multiProcessorCount);
}
};
} // namespace trait
} // namespace alpaka

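For this backend the MaxActiveBlocks specialization above simply reports m_multiProcessorCount, i.e. the number of TBB worker threads. A short sketch of how a caller might size the grid with it follows; the free-function spelling alpaka::getMaxActiveBlocks<Acc>(...) and its argument order are inferred from the trait and from the "Use getMaxActiveBlocks()" hint in the error message above, so treat them as assumptions.

    #include <alpaka/alpaka.hpp>

    #include <cstddef>
    #include <cstdint>

    // Hypothetical: size the grid from the reported block limit (reuses the kernel sketched earlier).
    auto buildCooperativeWorkDiv()
    {
        using Dim = alpaka::DimInt<1u>;
        using Idx = std::size_t;
        using Acc = alpaka::AccCpuTbbBlocks<Dim, Idx>;
        using Vec = alpaka::Vec<Dim, Idx>;

        auto const platform = alpaka::Platform<Acc>{};
        auto const dev = alpaka::getDevByIdx(platform, 0);

        Vec const blockThreadExtent{Idx{1}}; // this accelerator requires one thread per block
        Vec const threadElemExtent{Idx{1}};

        // Kernel arguments do not influence the result for the TBB backend, but are part of the assumed signature.
        int const maxBlocks = alpaka::getMaxActiveBlocks<Acc>(
            HelloWorldGridSyncKernel{},
            dev,
            blockThreadExtent,
            threadElemExtent,
            static_cast<std::uint32_t*>(nullptr));

        return alpaka::WorkDivMembers<Dim, Idx>{Vec{static_cast<Idx>(maxBlocks)}, blockThreadExtent, threadElemExtent};
    }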