[NATIVECPU] NativeCPU with optional oneTBB backend #2627

Status: Open. Wants to merge 66 commits into base: main.

Commits (66):
da5500c
[SYCLNATIVECPU] inline native_cpu adapter functions
uwedolinsky Aug 26, 2024
0b8b0f7
[NATIVECPU] use size_t, reserve vector size
uwedolinsky Aug 28, 2024
88db20a
[NATIVECPU] use strcpy_s
uwedolinsky Oct 17, 2024
f64fc08
Merge remote-tracking branch 'origin/main' into uwe/inline
uwedolinsky Oct 21, 2024
0b89dee
[NATIVECPU] only enqeue when groupsPerThread is >0
uwedolinsky Oct 21, 2024
d111337
[NATIVECPU] changed filemode back
uwedolinsky Oct 21, 2024
a1166d8
[NATIVECPU] added threadpool file to CMakeList
uwedolinsky Oct 22, 2024
47b12a4
[SYCLNATIVECPU] threadID now managed by kernel enqueue
uwedolinsky Oct 22, 2024
4b04ce6
[SYCLNATIVECPU] file mode changed back
uwedolinsky Oct 22, 2024
5406b39
[NATIVECPU] Simple TBB backend
uwedolinsky Oct 22, 2024
2e8ae3f
[NATIVECPU] changed back filemode
uwedolinsky Oct 22, 2024
3c64917
[NATIVECPU] fixed merge with optimisation branch
uwedolinsky Oct 22, 2024
eb1b1cc
Merge remote-tracking branch 'origin/main' into uwe/tbb_integration
uwedolinsky Oct 22, 2024
d5cf2c9
[NATIVECPU] fixed scheduling
uwedolinsky Oct 23, 2024
ba9b2c5
[NATIVECPU] more shared code
uwedolinsky Oct 23, 2024
5cf59d2
[NATIVECPU] reversed filemode change
uwedolinsky Oct 23, 2024
7077d1a
[NATIVECPU] update oneTBB tag
uwedolinsky Oct 24, 2024
a8e599c
[NATIVECPU] added required include not needed by Windows
uwedolinsky Oct 24, 2024
4c64575
[NATIVECPU] removed strcpy_s because it's not supported by gcc
uwedolinsky Oct 24, 2024
4905c44
[NATIVECPU] added system headers first
uwedolinsky Oct 24, 2024
e426b3f
[NATIVECPU] cmake fix
uwedolinsky Oct 24, 2024
4200f30
[NATIVECPU] removed GIT_SHALLOW
uwedolinsky Oct 25, 2024
53f4494
[NATIVECPU] turn CMAKE_INCLUDE_CURRENT_DIR off for tbb
uwedolinsky Nov 1, 2024
2ca6a3f
[NATIVECPU] workaround for oneTBB casting away const qualifiers
uwedolinsky Nov 1, 2024
835ce2f
[NATIVECPU] workaround for oneTBB casting away const qualifiers
uwedolinsky Nov 1, 2024
42b1e6e
[NATIVECPU] remove potentially unneeded cmake
uwedolinsky Nov 1, 2024
ceee902
Merge commit 'fa8cc8ec16c1a2cf0926cc64026edc6a254ff0c2' into uwe/tbb_…
uwedolinsky Nov 1, 2024
9c6fb07
[NATIVECPU] oneTBB disabled by default
uwedolinsky Nov 4, 2024
de98e9b
[NATIVECPU] tbb to oneTBB
uwedolinsky Nov 4, 2024
8dbe123
Merge commit 'b0a9e2be61ad42d3447f1f246120ab25119a03e0' into uwe/tbb_…
uwedolinsky Nov 4, 2024
51e915a
[NATIVECPU] improved comment
uwedolinsky Nov 4, 2024
dd8b027
[NATIVECPU] tbb to oneTBB
uwedolinsky Nov 4, 2024
4a5238f
[NATIVECPU] tbb to oneTBB
uwedolinsky Nov 4, 2024
3f697ae
[NATIVECPU] num_threads with oneTBB
uwedolinsky Nov 4, 2024
5f687cc
[NATIVECPU] added comment to cmake
uwedolinsky Nov 6, 2024
b651d29
[NATIVECPU] waiting for tasks when using local args
uwedolinsky Nov 6, 2024
4f64538
[NATIVECPU] using old task ids with tbb (WIP)
uwedolinsky Nov 6, 2024
6330a29
[NATIVECPU] changed back filemode
uwedolinsky Nov 6, 2024
f566f80
[NATIVECPU] removed unneeded clear
uwedolinsky Nov 7, 2024
e06b72b
Merge commit '2858a8a28d0b6524a3b2b0e25a597d1c8295ce9d' into uwe/tbb_…
uwedolinsky Nov 7, 2024
2584cd4
Merge commit '09ae26af4e4e4301177db704b3b109ecd388c846' into uwe/tbb_…
uwedolinsky Nov 8, 2024
c1cd18a
Merge commit 'ed9fe09f9987bbe86715f191e6dbe5695ebf0306' into uwe/tbb_…
uwedolinsky Nov 8, 2024
b78eb83
Merge commit 'c94dbc8fa25f62b812a3db707b78f1c217b33bc3' into uwe/tbb_…
uwedolinsky Nov 11, 2024
f121eb6
[NATIVECPU] fixed merge from main
uwedolinsky Nov 13, 2024
7357652
[NATIVECPU] removed MS extensions
uwedolinsky Nov 13, 2024
a3e52e6
[NATIVECPU] fix merge with events update
uwedolinsky Nov 14, 2024
56afb9a
[NATIVECPU] revert noise
uwedolinsky Nov 14, 2024
488b641
[NATIVECPU] fix integer size warnings
uwedolinsky Nov 14, 2024
fefafcb
Merge commit '38ee6ce2a0400573c0c7c5da782bc32ff578fcc4' into uwe/tbb_…
uwedolinsky Nov 25, 2024
75288ce
[NATIVECPU] update oneTBB tag
uwedolinsky Nov 26, 2024
fc992e3
[NATIVECPU] use oneTBB UXL github
uwedolinsky Nov 29, 2024
ea8a19c
Merge commit '5f4a5a27e8192ee41ce21c8fb140d770c779af78' into uwe/tbb_…
uwedolinsky Dec 5, 2024
469f27f
[NATIVECPU] undefine _DEBUG in release builds for tbb
uwedolinsky Dec 12, 2024
a67282b
[NATIVECPU] merge
uwedolinsky Jan 6, 2025
7dc9416
Merge commit '029a977bc76d1216783c69bfdb18d0db465ea399' into uwe/tbb_…
uwedolinsky Jan 20, 2025
7ed8432
Merge commit 'f058cb230c65fe8094f74043d0c9afd5ba0e8325' into uwe/tbb_…
uwedolinsky Jan 22, 2025
cea6883
Merge commit 'b841691699393dd2375e987c3d38d5f59c3e35cf' into uwe/tbb_…
uwedolinsky Jan 27, 2025
979072f
[NATIVECPU] oneTBB bump
uwedolinsky Jan 27, 2025
f035142
Merge commit '0bb6789f0113ea937d861fd67fd677b91ecdeb8b' into uwe/tbb_…
uwedolinsky Jan 27, 2025
dac6f01
[NATIVECPU] clang-format and removed one inline
uwedolinsky Jan 28, 2025
2abe90a
[NATIVECPU] clang-format
uwedolinsky Jan 28, 2025
bddd831
[NATIVECPU] removed inline
uwedolinsky Jan 28, 2025
e7d9ff7
[NATIVECPU] added separate if statement for clarity
uwedolinsky Jan 28, 2025
4e9bd67
[NATIVECPU] renamed wait to wait_all
uwedolinsky Jan 28, 2025
20668ff
[NATIVECPU] move
uwedolinsky Feb 3, 2025
6226347
[NATIVECPU] merge with main
uwedolinsky Feb 5, 2025
Files changed:
49 changes: 49 additions & 0 deletions source/adapters/native_cpu/CMakeLists.txt
@@ -35,6 +35,7 @@ add_ur_adapter(${TARGET_NAME}
${CMAKE_CURRENT_SOURCE_DIR}/queue.cpp
${CMAKE_CURRENT_SOURCE_DIR}/queue.hpp
${CMAKE_CURRENT_SOURCE_DIR}/sampler.cpp
${CMAKE_CURRENT_SOURCE_DIR}/threadpool.hpp
${CMAKE_CURRENT_SOURCE_DIR}/ur_interface_loader.cpp
${CMAKE_CURRENT_SOURCE_DIR}/usm_p2p.cpp
${CMAKE_CURRENT_SOURCE_DIR}/virtual_mem.cpp
@@ -49,6 +50,34 @@ set_target_properties(${TARGET_NAME} PROPERTIES
SOVERSION "${PROJECT_VERSION_MAJOR}"
)

# oneTBB is used as an optional NativeCPU backend and disabled by default.
option(NATIVECPU_WITH_ONETBB "Use oneTBB as backend for Native CPU" OFF)
if(NATIVECPU_WITH_ONETBB)
message(STATUS "Configuring Native CPU adapter with oneTBB backend.")

include(FetchContent)
FetchContent_Declare(
tbb
GIT_REPOSITORY https://github.com/uxlfoundation/oneTBB.git
#commit 9d4578723827f31defd79389819a5fbf659577f7 (HEAD -> master, origin/master, origin/HEAD)
#Author: Konstantin Boyarinov <konstantin.boyarinov@intel.com>
#Date: Fri Jan 24 23:23:59 2025 +0200
# Add explicit deduction guides for blocked_nd_range (#1525)
GIT_TAG 9d4578723827f31defd79389819a5fbf659577f7
CMAKE_ARGS "-DTBB_TEST:BOOL=OFF -DTBB_EXAMPLES:BOOL=OFF -DTBB_BENCH:BOOL=OFF"
Contributor:

If we keep FetchContent: this line has no effect as far as I can see. There is no separate CMake invocation for oneTBB, so nothing ever parses these CMAKE_ARGS. This is why you needed to set TBB_TEST etc. again below despite including them here.

This is harmless, though, so I'm okay with leaving it as is unless other changes are needed as well.

OVERRIDE_FIND_PACKAGE
)
set(TBB_TEST OFF CACHE INTERNAL "" FORCE)
set(TBB_EXAMPLES OFF CACHE INTERNAL "" FORCE)
set(TBB_BENCH OFF CACHE INTERNAL "" FORCE)
set(TBB_BUILD ON CACHE INTERNAL "" FORCE)
set(TBB_FIND_PACKAGE OFF CACHE INTERNAL "" FORCE)
set(TBB_FUZZ_TESTING OFF CACHE INTERNAL "" FORCE)
set(TBB_INSTALL ON CACHE INTERNAL "" FORCE)
set (CMAKE_INCLUDE_CURRENT_DIR OFF)
FetchContent_MakeAvailable(tbb)
endif()

find_package(Threads REQUIRED)

target_link_libraries(${TARGET_NAME} PRIVATE
@@ -61,3 +90,23 @@ target_link_libraries(${TARGET_NAME} PRIVATE
target_include_directories(${TARGET_NAME} PRIVATE
"${CMAKE_CURRENT_SOURCE_DIR}/../../"
)

if(NATIVECPU_WITH_ONETBB)
target_link_libraries(${TARGET_NAME} PRIVATE
TBB::tbb
)
if (NOT MSVC)
# oneTBB currently casts away some const qualifiers
# todo: check if compiler actually supports these options
target_compile_options(tbb PRIVATE -Wno-cast-qual)
target_compile_options(tbbmalloc PRIVATE -Wno-cast-qual)
endif()

# Undefine _DEBUG option in release builds to find
# release tbbbind
if (NOT uppercase_CMAKE_BUILD_TYPE STREQUAL "DEBUG")
target_compile_options(tbb PRIVATE -U_DEBUG)
endif()
Contributor:

The root cause of the complication here and above is including oneTBB with FetchContent, which processes it under the current project's CMake rules and then requires extra work wherever unified-runtime's CMake rules are not suitable for oneTBB. Is it possible to include it using ExternalProject_Add instead?

Contributor (author):

Probably, but that may require some more rework and retesting, and the current FetchContent integration is working and passing the DPCPP CI, so I'd recommend revisiting this in a follow-up PR if need be. Also, the latest CMake "Using Dependencies Guide" doesn't seem to mention ExternalProject, so it's not clear how recommended it is.

Contributor:

True, but the "Using Dependencies Guide" only covers either using pre-built packages or including an external project's sources as this project's sources. Neither fits here, and the result is that we need workarounds to include oneTBB's sources as this project's sources while making them behave mostly, but not entirely, like a different project's sources, because our project's configuration is not what we want for oneTBB.

But if we do not have an easy alternative, and using ExternalProject_Add or something similar would be too much work, then sure, we can delay that until later.
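
For illustration only (not part of this PR): a rough, untested sketch of the ExternalProject_Add route being discussed. The repository URL and tag are copied from the FetchContent block above; the target name, install handling, and option list are assumptions.

include(ExternalProject)
ExternalProject_Add(onetbb_external
  GIT_REPOSITORY https://github.com/uxlfoundation/oneTBB.git
  GIT_TAG        9d4578723827f31defd79389819a5fbf659577f7
  CMAKE_ARGS
    -DCMAKE_BUILD_TYPE=Release          # oneTBB no longer inherits our build type
    -DCMAKE_INSTALL_PREFIX=<INSTALL_DIR>
    -DTBB_TEST=OFF
    -DTBB_EXAMPLES=OFF
    -DTBB_BENCH=OFF
)
# Because an external project is configured and built at build time, the
# adapter would then have to locate the installed TBBConfig.cmake itself (or
# add explicit include/link paths), which is part of the extra rework
# mentioned above.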

Contributor:

The comment does not seem to match what the code does: this does not build oneTBB in release mode, it only builds oneTBB without assertions. In debug builds with no (or minimal?) optimizations, this would build oneTBB with no or minimal optimizations as well, whereas release mode would enable aggressive optimizations. Is that intended? If it is, that is fine, but can you update the comment to match? If it is not intended, some more work would be needed to really build in release mode. From what I can find, setting CMAKE_BUILD_TYPE should be the most reliable way of doing that with single-config generators (e.g. Ninja). For multi-config generators (e.g. MSVC), I think we should still have access to CMAKE_CXX_FLAGS_RELEASE to add?
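
As an illustration of the per-target alternative hinted at above (assumed and untested, GCC/Clang flags only): instead of changing the global build type, optimization could be forced onto the oneTBB targets in Debug configurations with generator expressions.

if (NOT MSVC)
  foreach(tbb_target tbb tbbmalloc)
    # Only in Debug configs: optimize oneTBB and drop the _DEBUG define.
    target_compile_options(${tbb_target} PRIVATE
      $<$<CONFIG:Debug>:-O2 -U_DEBUG>)
  endforeach()
endif()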


target_compile_definitions(${TARGET_NAME} PRIVATE NATIVECPU_WITH_ONETBB)
endif()
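
Usage note (not part of the diff): the oneTBB backend is opt-in at configure time. A minimal configure and build invocation might look like the following, where NATIVECPU_WITH_ONETBB is the option defined above and the remaining flags are ordinary CMake usage:

cmake -S . -B build -DCMAKE_BUILD_TYPE=Release -DNATIVECPU_WITH_ONETBB=ON
cmake --build build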
2 changes: 1 addition & 1 deletion source/adapters/native_cpu/context.hpp
@@ -116,7 +116,7 @@ struct ur_context_handle_t_ : RefCounted {
// We need to ensure that we align to at least alignof(usm_alloc_info),
// otherwise its start address may be unaligned.
alignment =
std::max<size_t>(alignment, alignof(native_cpu::usm_alloc_info));
std::max<uint32_t>(alignment, alignof(native_cpu::usm_alloc_info));
void *alloc = native_cpu::malloc_impl(alignment, size);
if (!alloc)
return nullptr;
76 changes: 37 additions & 39 deletions source/adapters/native_cpu/enqueue.cpp
@@ -53,8 +53,8 @@ struct NDRDescT {
} // namespace native_cpu

#ifdef NATIVECPU_USE_OCK
static native_cpu::state getResizedState(const native_cpu::NDRDescT &ndr,
size_t itemsPerThread) {
static inline native_cpu::state getResizedState(const native_cpu::NDRDescT &ndr,
size_t itemsPerThread) {
native_cpu::state resized_state(
ndr.GlobalSize[0], ndr.GlobalSize[1], ndr.GlobalSize[2], itemsPerThread,
ndr.LocalSize[1], ndr.LocalSize[2], ndr.GlobalOffset[0],
@@ -107,7 +107,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(
auto &tp = hQueue->getDevice()->tp;
const size_t numParallelThreads = tp.num_threads();
hKernel->updateMemPool(numParallelThreads);
std::vector<std::future<void>> futures;
auto Tasks = native_cpu::getScheduler(tp);
std::vector<std::function<void(size_t, ur_kernel_handle_t_)>> groups;
auto numWG0 = ndr.GlobalSize[0] / ndr.LocalSize[0];
auto numWG1 = ndr.GlobalSize[1] / ndr.LocalSize[1];
@@ -159,17 +159,17 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(
for (unsigned g2 = 0; g2 < numWG2; g2++) {
for (unsigned g1 = 0; g1 < numWG1; g1++) {
for (unsigned g0 = 0; g0 < new_num_work_groups_0; g0 += 1) {
futures.emplace_back(tp.schedule_task(
Tasks.schedule(
[ndr, itemsPerThread, kernel = *hKernel, g0, g1, g2](size_t) {
native_cpu::state resized_state =
getResizedState(ndr, itemsPerThread);
resized_state.update(g0, g1, g2);
kernel._subhandler(kernel.getArgs().data(), &resized_state);
}));
});
}
// Peel the remaining work items. Since the local size is 1, we iterate
// over the work groups.
for (unsigned g0 = new_num_work_groups_0 * itemsPerThread; g0 < numWG0;
for (size_t g0 = new_num_work_groups_0 * itemsPerThread; g0 < numWG0;
g0++) {
state.update(g0, g1, g2);
hKernel->_subhandler(hKernel->getArgs().data(), &state);
@@ -179,26 +179,25 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(

} else {
// We are running a parallel_for over an nd_range

if (numWG1 * numWG2 >= numParallelThreads) {
// Dimensions 1 and 2 have enough work, split them across the threadpool
for (unsigned g2 = 0; g2 < numWG2; g2++) {
for (unsigned g1 = 0; g1 < numWG1; g1++) {
futures.emplace_back(
tp.schedule_task([state, kernel = *hKernel, numWG0, g1, g2,
numParallelThreads](size_t threadId) mutable {
for (unsigned g0 = 0; g0 < numWG0; g0++) {
kernel.handleLocalArgs(numParallelThreads, threadId);
state.update(g0, g1, g2);
kernel._subhandler(kernel.getArgs().data(), &state);
}
}));
Tasks.schedule([state, kernel = *hKernel, numWG0, g1, g2,
numParallelThreads](size_t threadId) mutable {
for (unsigned g0 = 0; g0 < numWG0; g0++) {
kernel.handleLocalArgs(numParallelThreads, threadId);
state.update(g0, g1, g2);
kernel._subhandler(kernel.getArgs().data(), &state);
}
});
}
}
} else {
// Split dimension 0 across the threadpool
// Here we try to create groups of workgroups in order to reduce
// synchronization overhead
groups.reserve(numWG2 * numWG1 * numWG0);
for (unsigned g2 = 0; g2 < numWG2; g2++) {
for (unsigned g1 = 0; g1 < numWG1; g1++) {
for (unsigned g0 = 0; g0 < numWG0; g0++) {
@@ -214,35 +213,34 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(
}
auto numGroups = groups.size();
auto groupsPerThread = numGroups / numParallelThreads;
auto remainder = numGroups % numParallelThreads;
for (unsigned thread = 0; thread < numParallelThreads; thread++) {
futures.emplace_back(
tp.schedule_task([groups, thread, groupsPerThread,
kernel = *hKernel](size_t threadId) {
for (unsigned i = 0; i < groupsPerThread; i++) {
auto index = thread * groupsPerThread + i;
groups[index](threadId, kernel);
}
}));
if (groupsPerThread) {
for (unsigned thread = 0; thread < numParallelThreads; thread++) {
Tasks.schedule([groups, thread, groupsPerThread,
kernel = *hKernel](size_t threadId) {
for (unsigned i = 0; i < groupsPerThread; i++) {
auto index = thread * groupsPerThread + i;
groups[index](threadId, kernel);
}
});
}
}

// schedule the remaining tasks
auto remainder = numGroups % numParallelThreads;
if (remainder) {
futures.emplace_back(
tp.schedule_task([groups, remainder,
scheduled = numParallelThreads * groupsPerThread,
kernel = *hKernel](size_t threadId) {
for (unsigned i = 0; i < remainder; i++) {
auto index = scheduled + i;
groups[index](threadId, kernel);
}
}));
Tasks.schedule([groups, remainder,
scheduled = numParallelThreads * groupsPerThread,
kernel = *hKernel](size_t threadId) {
for (unsigned i = 0; i < remainder; i++) {
auto index = scheduled + i;
groups[index](threadId, kernel);
}
});
}
}
}

#endif // NATIVECPU_USE_OCK
event->set_futures(futures);
event->set_futures(Tasks.getTaskInfo());

*phEvent = event;
event->set_callback([hKernel, event]() {
@@ -456,7 +454,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferFill(
// TODO: error checking
// TODO: handle async
void *startingPtr = hBuffer->_mem + offset;
unsigned steps = size / patternSize;
size_t steps = size / patternSize;
for (unsigned i = 0; i < steps; i++) {
memcpy(static_cast<int8_t *>(startingPtr) + i * patternSize, pPattern,
patternSize);
@@ -597,7 +595,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMFill(
break;
}
default: {
for (unsigned int step{0}; step < size; step += patternSize) {
for (size_t step{0}; step < size; step += patternSize) {
auto *dest = reinterpret_cast<void *>(
reinterpret_cast<uint8_t *>(ptr) + step);
memcpy(dest, pPattern, patternSize);
7 changes: 3 additions & 4 deletions source/adapters/native_cpu/event.cpp
@@ -11,6 +11,7 @@
#include "ur_api.h"

#include "common.hpp"
#include "device.hpp"
#include "event.hpp"
#include "queue.hpp"
#include <cstdint>
@@ -123,7 +124,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueTimestampRecordingExp(
ur_event_handle_t_::ur_event_handle_t_(ur_queue_handle_t queue,
ur_command_t command_type)
: queue(queue), context(queue->getContext()), command_type(command_type),
done(false) {
done(false), futures(queue->getDevice()->tp) {
this->queue->addEvent(this);
}

@@ -138,9 +139,7 @@ void ur_event_handle_t_::wait() {
if (done) {
return;
}
for (auto &f : futures) {
f.wait();
}
this->futures.wait_all();
queue->removeEvent(this);
done = true;
// The callback may need to acquire the lock, so we unlock it here
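
The threadpool.hpp added by this PR is not shown in this view. The call sites above (Tasks.schedule(...) in enqueue.cpp, this->futures.wait_all() in event.cpp) suggest a small task-group style interface. The following self-contained sketch shows that pattern with a oneTBB tbb::task_group backend; it is an illustration only, and the real Scheduler/tasksinfo_t definitions in threadpool.hpp may differ.

// Sketch only: schedule work-group chunks on oneTBB and wait for completion,
// mirroring the schedule()/wait_all() usage in the diffs above. The thread id
// argument is faked here; the adapter derives a real per-thread id.
#include <oneapi/tbb/task_group.h>
#include <cstdio>
#include <functional>

struct SketchScheduler {
  tbb::task_group group;
  void schedule(std::function<void(size_t)> task) {
    group.run([task = std::move(task)] { task(/*threadId=*/0); });
  }
  void wait_all() { group.wait(); } // what ur_event_handle_t_::wait() relies on
};

int main() {
  SketchScheduler tasks;
  const size_t numWG0 = 8;
  for (size_t g0 = 0; g0 < numWG0; ++g0)
    tasks.schedule([g0](size_t) { std::printf("running work-group %zu\n", g0); });
  tasks.wait_all();
  return 0;
}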
6 changes: 4 additions & 2 deletions source/adapters/native_cpu/event.hpp
@@ -9,6 +9,7 @@
//===----------------------------------------------------------------------===//
#pragma once
#include "common.hpp"
#include "threadpool.hpp"
#include "ur_api.h"
#include <cstdint>
#include <future>
@@ -40,7 +41,8 @@ struct ur_event_handle_t_ : RefCounted {

ur_command_t getCommandType() const { return command_type; }

void set_futures(std::vector<std::future<void>> &fs) {
// todo: get rid of this function
void set_futures(native_cpu::tasksinfo_t &&fs) {
std::lock_guard<std::mutex> lock(mutex);
futures = std::move(fs);
}
@@ -59,7 +61,7 @@ struct ur_event_handle_t_ : RefCounted {
ur_command_t command_type;
bool done;
std::mutex mutex;
std::vector<std::future<void>> futures;
native_cpu::tasksinfo_t futures;
std::function<void()> callback;
uint64_t timestamp_start = 0;
uint64_t timestamp_end = 0;
4 changes: 2 additions & 2 deletions source/adapters/native_cpu/kernel.hpp
@@ -22,7 +22,7 @@ using nativecpu_task_t = std::function<nativecpu_kernel_t>;
struct local_arg_info_t {
uint32_t argIndex;
size_t argSize;
local_arg_info_t(uint32_t argIndex, size_t argSize)
inline local_arg_info_t(uint32_t argIndex, size_t argSize)
: argIndex(argIndex), argSize(argSize) {}
};

@@ -41,7 +41,7 @@ struct ur_kernel_handle_t_ : RefCounted {
incrementReferenceCount();
}

~ur_kernel_handle_t_() {
inline ~ur_kernel_handle_t_() {
if (decrementReferenceCount() == 0) {
free(_localMemPool);
Args.deallocate();
12 changes: 6 additions & 6 deletions source/adapters/native_cpu/nativecpu_state.hpp
@@ -20,9 +20,9 @@ struct state {
size_t MNumGroups[3];
size_t MGlobalOffset[3];
uint32_t NumSubGroups, SubGroup_id, SubGroup_local_id, SubGroup_size;
state(size_t globalR0, size_t globalR1, size_t globalR2, size_t localR0,
size_t localR1, size_t localR2, size_t globalO0, size_t globalO1,
size_t globalO2)
inline state(size_t globalR0, size_t globalR1, size_t globalR2,
size_t localR0, size_t localR1, size_t localR2, size_t globalO0,
size_t globalO1, size_t globalO2)
: MGlobal_range{globalR0, globalR1, globalR2},
MWorkGroup_size{localR0, localR1, localR2},
MNumGroups{globalR0 / localR0, globalR1 / localR1, globalR2 / localR2},
@@ -42,8 +42,8 @@ struct state {
SubGroup_size = 1;
}

void update(size_t group0, size_t group1, size_t group2, size_t local0,
size_t local1, size_t local2) {
inline void update(size_t group0, size_t group1, size_t group2, size_t local0,
size_t local1, size_t local2) {
MWorkGroup_id[0] = group0;
MWorkGroup_id[1] = group1;
MWorkGroup_id[2] = group2;
@@ -58,7 +58,7 @@ struct state {
MWorkGroup_size[2] * MWorkGroup_id[2] + MLocal_id[2] + MGlobalOffset[2];
}

void update(size_t group0, size_t group1, size_t group2) {
inline void update(size_t group0, size_t group1, size_t group2) {
MWorkGroup_id[0] = group0;
MWorkGroup_id[1] = group1;
MWorkGroup_id[2] = group2;