[NATIVECPU] NativeCPU with optional oneTBB backend #2627

Open
wants to merge 66 commits into base: main

Changes from 9 commits

Commits (66)
da5500c
[SYCLNATIVECPU] inline native_cpu adapter functions
uwedolinsky Aug 26, 2024
0b8b0f7
[NATIVECPU] use size_t, reserve vector size
uwedolinsky Aug 28, 2024
88db20a
[NATIVECPU] use strcpy_s
uwedolinsky Oct 17, 2024
f64fc08
Merge remote-tracking branch 'origin/main' into uwe/inline
uwedolinsky Oct 21, 2024
0b89dee
[NATIVECPU] only enqueue when groupsPerThread is >0
uwedolinsky Oct 21, 2024
d111337
[NATIVECPU] changed filemode back
uwedolinsky Oct 21, 2024
a1166d8
[NATIVECPU] added threadpool file to CMakeList
uwedolinsky Oct 22, 2024
47b12a4
[SYCLNATIVECPU] threadID now managed by kernel enqueue
uwedolinsky Oct 22, 2024
4b04ce6
[SYCLNATIVECPU] file mode changed back
uwedolinsky Oct 22, 2024
5406b39
[NATIVECPU] Simple TBB backend
uwedolinsky Oct 22, 2024
2e8ae3f
[NATIVECPU] changed back filemode
uwedolinsky Oct 22, 2024
3c64917
[NATIVECPU] fixed merge with optimisation branch
uwedolinsky Oct 22, 2024
eb1b1cc
Merge remote-tracking branch 'origin/main' into uwe/tbb_integration
uwedolinsky Oct 22, 2024
d5cf2c9
[NATIVECPU] fixed scheduling
uwedolinsky Oct 23, 2024
ba9b2c5
[NATIVECPU] more shared code
uwedolinsky Oct 23, 2024
5cf59d2
[NATIVECPU] reversed filemode change
uwedolinsky Oct 23, 2024
7077d1a
[NATIVECPU] update oneTBB tag
uwedolinsky Oct 24, 2024
a8e599c
[NATIVECPU] added required include not needed by Windows
uwedolinsky Oct 24, 2024
4c64575
[NATIVECPU] removed strcpy_s because it's not supported by gcc
uwedolinsky Oct 24, 2024
4905c44
[NATIVECPU] added system headers first
uwedolinsky Oct 24, 2024
e426b3f
[NATIVECPU] cmake fix
uwedolinsky Oct 24, 2024
4200f30
[NATIVECPU] removed GIT_SHALLOW
uwedolinsky Oct 25, 2024
53f4494
[NATIVECPU] turn CMAKE_INCLUDE_CURRENT_DIR off for tbb
uwedolinsky Nov 1, 2024
2ca6a3f
[NATIVECPU] workaround for oneTBB casting away const qualifiers
uwedolinsky Nov 1, 2024
835ce2f
[NATIVECPU] workaround for oneTBB casting away const qualifiers
uwedolinsky Nov 1, 2024
42b1e6e
[NATIVECPU] remove potentially unneeded cmake
uwedolinsky Nov 1, 2024
ceee902
Merge commit 'fa8cc8ec16c1a2cf0926cc64026edc6a254ff0c2' into uwe/tbb_…
uwedolinsky Nov 1, 2024
9c6fb07
[NATIVECPU] oneTBB disabled by default
uwedolinsky Nov 4, 2024
de98e9b
[NATIVECPU] tbb to oneTBB
uwedolinsky Nov 4, 2024
8dbe123
Merge commit 'b0a9e2be61ad42d3447f1f246120ab25119a03e0' into uwe/tbb_…
uwedolinsky Nov 4, 2024
51e915a
[NATIVECPU] improved comment
uwedolinsky Nov 4, 2024
dd8b027
[NATIVECPU] tbb to oneTBB
uwedolinsky Nov 4, 2024
4a5238f
[NATIVECPU] tbb to oneTBB
uwedolinsky Nov 4, 2024
3f697ae
[NATIVECPU] num_threads with oneTBB
uwedolinsky Nov 4, 2024
5f687cc
[NATIVECPU] added comment to cmake
uwedolinsky Nov 6, 2024
b651d29
[NATIVECPU] waiting for tasks when using local args
uwedolinsky Nov 6, 2024
4f64538
[NATIVECPU] using old task ids with tbb (WIP)
uwedolinsky Nov 6, 2024
6330a29
[NATIVECPU] changed back filemode
uwedolinsky Nov 6, 2024
f566f80
[NATIVECPU] removed unneeded clear
uwedolinsky Nov 7, 2024
e06b72b
Merge commit '2858a8a28d0b6524a3b2b0e25a597d1c8295ce9d' into uwe/tbb_…
uwedolinsky Nov 7, 2024
2584cd4
Merge commit '09ae26af4e4e4301177db704b3b109ecd388c846' into uwe/tbb_…
uwedolinsky Nov 8, 2024
c1cd18a
Merge commit 'ed9fe09f9987bbe86715f191e6dbe5695ebf0306' into uwe/tbb_…
uwedolinsky Nov 8, 2024
b78eb83
Merge commit 'c94dbc8fa25f62b812a3db707b78f1c217b33bc3' into uwe/tbb_…
uwedolinsky Nov 11, 2024
f121eb6
[NATIVECPU] fixed merge from main
uwedolinsky Nov 13, 2024
7357652
[NATIVECPU] removed MS extensions
uwedolinsky Nov 13, 2024
a3e52e6
[NATIVECPU] fix merge with events update
uwedolinsky Nov 14, 2024
56afb9a
[NATIVECPU] revert noise
uwedolinsky Nov 14, 2024
488b641
[NATIVECPU] fix integer size warnings
uwedolinsky Nov 14, 2024
fefafcb
Merge commit '38ee6ce2a0400573c0c7c5da782bc32ff578fcc4' into uwe/tbb_…
uwedolinsky Nov 25, 2024
75288ce
[NATIVECPU] update oneTBB tag
uwedolinsky Nov 26, 2024
fc992e3
[NATIVECPU] use oneTBB UXL github
uwedolinsky Nov 29, 2024
ea8a19c
Merge commit '5f4a5a27e8192ee41ce21c8fb140d770c779af78' into uwe/tbb_…
uwedolinsky Dec 5, 2024
469f27f
[NATIVECPU] undefine _DEBUG in release builds for tbb
uwedolinsky Dec 12, 2024
a67282b
[NATIVECPU] merge
uwedolinsky Jan 6, 2025
7dc9416
Merge commit '029a977bc76d1216783c69bfdb18d0db465ea399' into uwe/tbb_…
uwedolinsky Jan 20, 2025
7ed8432
Merge commit 'f058cb230c65fe8094f74043d0c9afd5ba0e8325' into uwe/tbb_…
uwedolinsky Jan 22, 2025
cea6883
Merge commit 'b841691699393dd2375e987c3d38d5f59c3e35cf' into uwe/tbb_…
uwedolinsky Jan 27, 2025
979072f
[NATIVECPU] oneTBB bump
uwedolinsky Jan 27, 2025
f035142
Merge commit '0bb6789f0113ea937d861fd67fd677b91ecdeb8b' into uwe/tbb_…
uwedolinsky Jan 27, 2025
dac6f01
[NATIVECPU] clang-format and removed one inline
uwedolinsky Jan 28, 2025
2abe90a
[NATIVECPU] clang-format
uwedolinsky Jan 28, 2025
bddd831
[NATIVECPU] removed inline
uwedolinsky Jan 28, 2025
e7d9ff7
[NATIVECPU] added separate if statement for clarity
uwedolinsky Jan 28, 2025
4e9bd67
[NATIVECPU] renamed wait to wait_all
uwedolinsky Jan 28, 2025
20668ff
[NATIVECPU] move
uwedolinsky Feb 3, 2025
6226347
[NATIVECPU] merge with main
uwedolinsky Feb 5, 2025
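Note on the threadpool interface: the commit history above introduces a threadpool.hpp header with a schedule/wait_all interface (see "renamed wait to wait_all" and "threadID now managed by kernel enqueue") plus an optional oneTBB-backed implementation that is disabled by default. The header itself is not among the hunks shown below, so the following is only a minimal sketch of what a oneTBB-backed pool with such an interface might look like; the class name tbb_threadpool and the reliance on oneapi::tbb::task_group and this_task_arena are assumptions, not the PR's actual code.

// Hypothetical sketch only: a schedule/wait_all style pool backed by oneTBB.
// The real threadpool.hpp added by this PR is not shown in the diff below.
#include <cstddef>
#include <functional>
#include <oneapi/tbb/task_arena.h>
#include <oneapi/tbb/task_group.h>

namespace native_cpu {

class tbb_threadpool {
public:
  using task_t = std::function<void(size_t /*threadId*/)>;

  // Hand the task to oneTBB; forward the worker's arena index as its thread id.
  void schedule(task_t Task) {
    MGroup.run([Task = std::move(Task)]() {
      const int Idx = oneapi::tbb::this_task_arena::current_thread_index();
      // Fall back to 0 if the worker is not bound to an arena.
      Task(Idx < 0 ? 0 : static_cast<size_t>(Idx));
    });
  }

  // Block until every task scheduled so far has finished.
  void wait_all() { MGroup.wait(); }

  // Degree of parallelism oneTBB exposes for the current arena.
  size_t num_threads() const {
    return static_cast<size_t>(
        oneapi::tbb::this_task_arena::max_concurrency());
  }

private:
  oneapi::tbb::task_group MGroup;
};

} // namespace native_cpu

Keeping the interface identical across backends would allow the backend to be selected at CMake configure time, which is consistent with the "oneTBB disabled by default" commit.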
1 change: 1 addition & 0 deletions source/adapters/native_cpu/CMakeLists.txt
@@ -34,6 +34,7 @@ add_ur_adapter(${TARGET_NAME}
${CMAKE_CURRENT_SOURCE_DIR}/queue.cpp
${CMAKE_CURRENT_SOURCE_DIR}/queue.hpp
${CMAKE_CURRENT_SOURCE_DIR}/sampler.cpp
${CMAKE_CURRENT_SOURCE_DIR}/threadpool.hpp
${CMAKE_CURRENT_SOURCE_DIR}/ur_interface_loader.cpp
${CMAKE_CURRENT_SOURCE_DIR}/usm_p2p.cpp
${CMAKE_CURRENT_SOURCE_DIR}/virtual_mem.cpp
36 changes: 20 additions & 16 deletions source/adapters/native_cpu/enqueue.cpp
@@ -25,8 +25,8 @@ struct NDRDescT {
RangeT GlobalOffset;
RangeT GlobalSize;
RangeT LocalSize;
NDRDescT(uint32_t WorkDim, const size_t *GlobalWorkOffset,
const size_t *GlobalWorkSize, const size_t *LocalWorkSize)
inline NDRDescT(uint32_t WorkDim, const size_t *GlobalWorkOffset,
const size_t *GlobalWorkSize, const size_t *LocalWorkSize)
: WorkDim(WorkDim) {
for (uint32_t I = 0; I < WorkDim; I++) {
GlobalOffset[I] = GlobalWorkOffset[I];
@@ -52,8 +52,8 @@ struct NDRDescT {
} // namespace native_cpu

#ifdef NATIVECPU_USE_OCK
static native_cpu::state getResizedState(const native_cpu::NDRDescT &ndr,
size_t itemsPerThread) {
static inline native_cpu::state getResizedState(const native_cpu::NDRDescT &ndr,
size_t itemsPerThread) {
native_cpu::state resized_state(
ndr.GlobalSize[0], ndr.GlobalSize[1], ndr.GlobalSize[2], itemsPerThread,
ndr.LocalSize[1], ndr.LocalSize[2], ndr.GlobalOffset[0],
@@ -168,7 +168,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(
}
// Peel the remaining work items. Since the local size is 1, we iterate
// over the work groups.
for (unsigned g0 = new_num_work_groups_0 * itemsPerThread; g0 < numWG0;
for (size_t g0 = new_num_work_groups_0 * itemsPerThread; g0 < numWG0;
g0++) {
state.update(g0, g1, g2);
hKernel->_subhandler(hKernel->_args.data(), &state);
@@ -178,25 +178,28 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(

} else {
// We are running a parallel_for over an nd_range

size_t threadId = 0;
if (numWG1 * numWG2 >= numParallelThreads) {
// Dimensions 1 and 2 have enough work, split them across the threadpool
for (unsigned g2 = 0; g2 < numWG2; g2++) {
for (unsigned g1 = 0; g1 < numWG1; g1++) {
Tasks.schedule([state, kernel = *hKernel, numWG0, g1, g2,
numParallelThreads](size_t threadId) mutable {
Tasks.schedule([state, kernel = *hKernel, numWG0, g1, g2, threadId,
numParallelThreads](size_t /*threadId*/) mutable {
for (unsigned g0 = 0; g0 < numWG0; g0++) {
kernel.handleLocalArgs(numParallelThreads, threadId);
state.update(g0, g1, g2);
kernel._subhandler(kernel._args.data(), &state);
}
});
if (++threadId == numParallelThreads)
threadId = 0;
}
}
} else {
// Split dimension 0 across the threadpool
// Here we try to create groups of workgroups in order to reduce
// synchronization overhead
groups.reserve(numWG2 * numWG1 * numWG0);
for (unsigned g2 = 0; g2 < numWG2; g2++) {
for (unsigned g1 = 0; g1 < numWG1; g1++) {
for (unsigned g0 = 0; g0 < numWG0; g0++) {
@@ -213,24 +216,25 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(
auto numGroups = groups.size();
auto groupsPerThread = numGroups / numParallelThreads;
auto remainder = numGroups % numParallelThreads;
for (unsigned thread = 0; thread < numParallelThreads; thread++) {
unsigned thread = 0;
for (; groupsPerThread && thread < numParallelThreads; thread++) {
Tasks.schedule(
[&groups, thread, groupsPerThread, hKernel](size_t threadId) {
[&groups, thread, groupsPerThread, hKernel](size_t /*threadId*/) {
for (unsigned i = 0; i < groupsPerThread; i++) {
auto index = thread * groupsPerThread + i;
groups[index](threadId, *hKernel);
groups[index](thread /*Id*/, *hKernel);
}
});
}

// schedule the remaining tasks
if (remainder) {
Tasks.schedule([&groups, remainder,
Tasks.schedule([&groups, remainder, thread,
scheduled = numParallelThreads * groupsPerThread,
hKernel](size_t threadId) {
hKernel](size_t /*threadId*/) {
for (unsigned i = 0; i < remainder; i++) {
auto index = scheduled + i;
groups[index](threadId, *hKernel);
groups[index](thread /*Id*/, *hKernel);
}
});
}
@@ -407,7 +411,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferFill(
// TODO: error checking
// TODO: handle async
void *startingPtr = hBuffer->_mem + offset;
unsigned steps = size / patternSize;
size_t steps = size / patternSize;
for (unsigned i = 0; i < steps; i++) {
memcpy(static_cast<int8_t *>(startingPtr) + i * patternSize, pPattern,
patternSize);
@@ -553,7 +557,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMFill(
break;
}
default: {
for (unsigned int step{0}; step < size; step += patternSize) {
for (size_t step{0}; step < size; step += patternSize) {
auto *dest =
reinterpret_cast<void *>(reinterpret_cast<uint8_t *>(ptr) + step);
memcpy(dest, pPattern, patternSize);
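The second scheduling branch above flattens all work groups into the groups vector and then carves it into per-thread chunks: groupsPerThread groups for each of the numParallelThreads tasks, plus one trailing task for the remainder, with the chunk loop skipped entirely when there are fewer groups than threads. A standalone sketch of that partitioning, with schedule standing in for Tasks.schedule and runGroup for invoking one work group (both names are placeholders, not from the PR):

// Sketch of the groupsPerThread / remainder partitioning used in
// urEnqueueKernelLaunch above; names other than groupsPerThread,
// remainder and numParallelThreads are illustrative only.
#include <cstddef>
#include <functional>

void scheduleGroups(size_t numGroups, size_t numParallelThreads,
                    const std::function<void(std::function<void()>)> &schedule,
                    const std::function<void(size_t, size_t)> &runGroup) {
  const size_t groupsPerThread = numGroups / numParallelThreads;
  const size_t remainder = numGroups % numParallelThreads;

  size_t thread = 0;
  // Give each thread a contiguous chunk; skipped when groupsPerThread == 0,
  // i.e. when there are fewer work groups than threads.
  for (; groupsPerThread && thread < numParallelThreads; thread++) {
    schedule([=] {
      for (size_t i = 0; i < groupsPerThread; i++)
        runGroup(thread * groupsPerThread + i, thread);
    });
  }

  // Groups that do not divide evenly go into a single trailing task.
  if (remainder) {
    const size_t scheduled = numParallelThreads * groupsPerThread;
    schedule([=] {
      for (size_t i = 0; i < remainder; i++)
        runGroup(scheduled + i, thread);
    });
  }
}

Every index in [0, numGroups) is covered exactly once: the chunk tasks handle [0, numParallelThreads * groupsPerThread) and the remainder task handles the rest, matching the scheduled = numParallelThreads * groupsPerThread capture in the diff.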
12 changes: 6 additions & 6 deletions source/adapters/native_cpu/kernel.hpp
@@ -20,7 +20,7 @@ namespace native_cpu {
struct NativeCPUArgDesc {
void *MPtr;

NativeCPUArgDesc(void *Ptr) : MPtr(Ptr){};
inline NativeCPUArgDesc(void *Ptr) : MPtr(Ptr){};
};

} // namespace native_cpu
@@ -33,17 +33,17 @@ using nativecpu_task_t = std::function<nativecpu_kernel_t>;
struct local_arg_info_t {
uint32_t argIndex;
size_t argSize;
local_arg_info_t(uint32_t argIndex, size_t argSize)
inline local_arg_info_t(uint32_t argIndex, size_t argSize)
: argIndex(argIndex), argSize(argSize) {}
};

struct ur_kernel_handle_t_ : RefCounted {

ur_kernel_handle_t_(ur_program_handle_t hProgram, const char *name,
inline ur_kernel_handle_t_(ur_program_handle_t hProgram, const char *name,
nativecpu_task_t subhandler)
: hProgram(hProgram), _name{name}, _subhandler{std::move(subhandler)} {}

ur_kernel_handle_t_(const ur_kernel_handle_t_ &other)
inline ur_kernel_handle_t_(const ur_kernel_handle_t_ &other)
: hProgram(other.hProgram), _name(other._name),
_subhandler(other._subhandler), _args(other._args),
_localArgInfo(other._localArgInfo), _localMemPool(other._localMemPool),
@@ -52,12 +52,12 @@ struct ur_kernel_handle_t_ : RefCounted {
incrementReferenceCount();
}

~ur_kernel_handle_t_() {
inline ~ur_kernel_handle_t_() {
if (decrementReferenceCount() == 0) {
free(_localMemPool);
}
}
ur_kernel_handle_t_(ur_program_handle_t hProgram, const char *name,
inline ur_kernel_handle_t_(ur_program_handle_t hProgram, const char *name,
nativecpu_task_t subhandler,
std::optional<native_cpu::WGSize_t> ReqdWGSize,
std::optional<native_cpu::WGSize_t> MaxWGSize,
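The kernel.hpp hunks above keep the copy constructor incrementing the reference count and the destructor freeing _localMemPool only when the count drops to zero. That matters because urEnqueueKernelLaunch captures the kernel by value in the tasks it schedules (kernel = *hKernel), so several copies end up sharing one local-memory pool. A minimal sketch of that ownership pattern, with hypothetical names and without the thread-safe counter a real implementation would need:

// Illustrative only: copies share one pool, and only the last copy frees it.
#include <cstdlib>

struct SharedPoolHandle {
  unsigned *RefCount;  // shared counter, analogous to RefCounted
  void *LocalMemPool;  // shared allocation, analogous to _localMemPool

  SharedPoolHandle() : RefCount(new unsigned(1)), LocalMemPool(nullptr) {}

  // Copying (e.g. capturing the kernel by value in Tasks.schedule) bumps the count.
  SharedPoolHandle(const SharedPoolHandle &Other)
      : RefCount(Other.RefCount), LocalMemPool(Other.LocalMemPool) {
    ++*RefCount;
  }
  SharedPoolHandle &operator=(const SharedPoolHandle &) = delete;

  // Only destroying the last copy releases the shared pool.
  ~SharedPoolHandle() {
    if (--*RefCount == 0) {
      free(LocalMemPool);
      delete RefCount;
    }
  }
};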
6 changes: 3 additions & 3 deletions source/adapters/native_cpu/nativecpu_state.hpp
@@ -20,7 +20,7 @@ struct state {
size_t MNumGroups[3];
size_t MGlobalOffset[3];
uint32_t NumSubGroups, SubGroup_id, SubGroup_local_id, SubGroup_size;
state(size_t globalR0, size_t globalR1, size_t globalR2, size_t localR0,
inline state(size_t globalR0, size_t globalR1, size_t globalR2, size_t localR0,
size_t localR1, size_t localR2, size_t globalO0, size_t globalO1,
size_t globalO2)
: MGlobal_range{globalR0, globalR1, globalR2}, MWorkGroup_size{localR0,
@@ -43,7 +43,7 @@ struct state {
SubGroup_size = 1;
}

void update(size_t group0, size_t group1, size_t group2, size_t local0,
inline void update(size_t group0, size_t group1, size_t group2, size_t local0,
size_t local1, size_t local2) {
MWorkGroup_id[0] = group0;
MWorkGroup_id[1] = group1;
@@ -59,7 +59,7 @@ struct state {
MWorkGroup_size[2] * MWorkGroup_id[2] + MLocal_id[2] + MGlobalOffset[2];
}

void update(size_t group0, size_t group1, size_t group2) {
inline void update(size_t group0, size_t group1, size_t group2) {
MWorkGroup_id[0] = group0;
MWorkGroup_id[1] = group1;
MWorkGroup_id[2] = group2;
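For reference, the update overloads above implement the standard ND-range index mapping: in each dimension the global id is the work-group size times the work-group id, plus the local id, plus the global offset. A tiny worked example of that arithmetic:

// global_id = MWorkGroup_size * MWorkGroup_id + MLocal_id + MGlobalOffset
#include <cassert>
#include <cstddef>

int main() {
  const size_t WorkGroupSize = 8, WorkGroupId = 3, LocalId = 5, GlobalOffset = 2;
  const size_t GlobalId = WorkGroupSize * WorkGroupId + LocalId + GlobalOffset;
  assert(GlobalId == 31); // 8 * 3 + 5 + 2
  return 0;
}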