Skip to content

Commit

Permalink
[SYCL][Graph] Fix CUDA/HIP local mem argument update bug
Browse files Browse the repository at this point in the history
Tests UR PR oneapi-src/unified-runtime#2298
with additional SYCL-Graph local memory argument E2E tests.

PR also sets the `pnext` and `snext` members of
`ur_exp_command_buffer_update_kernel_launch_desc_t ` which were missing when
calling into UR.
  • Loading branch information
EwanC committed Nov 21, 2024
1 parent 5e61f8f commit b6e03ad
Show file tree
Hide file tree
Showing 20 changed files with 533 additions and 27 deletions.
2 changes: 1 addition & 1 deletion sycl/cmake/modules/FetchUnifiedRuntime.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -116,7 +116,7 @@ if(SYCL_UR_USE_FETCH_CONTENT)
CACHE PATH "Path to external '${name}' adapter source dir" FORCE)
endfunction()

set(UNIFIED_RUNTIME_REPO "https://github.com/oneapi-src/unified-runtime.git")
set(UNIFIED_RUNTIME_REPO "https://github.com/Bensuo/unified-runtime.git")
include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/modules/UnifiedRuntimeTag.cmake)

set(UMF_BUILD_EXAMPLES OFF CACHE INTERNAL "EXAMPLES")
Expand Down
8 changes: 1 addition & 7 deletions sycl/cmake/modules/UnifiedRuntimeTag.cmake
Original file line number Diff line number Diff line change
@@ -1,7 +1 @@
# commit 9937d029c7fdcbf101e89f8515f640c145e059c5
# Merge: 9ac6d5d9 10b0e101
# Author: Callum Fare <[email protected]>
# Date: Wed Nov 20 14:49:17 2024 +0000
# Merge pull request #2258 from aarongreig/aaron/tryUseExtensionSubgroupInfo
# Use extension version of clGetKernelSubGroupInfo when necessary.
set(UNIFIED_RUNTIME_TAG 9937d029c7fdcbf101e89f8515f640c145e059c5)
set(UNIFIED_RUNTIME_TAG "ewan/cuda_update_local_size")
5 changes: 4 additions & 1 deletion sycl/source/detail/graph_impl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1510,7 +1510,10 @@ void exec_graph_impl::updateImpl(std::shared_ptr<node_impl> Node) {
PtrDescs.reserve(MaskedArgs.size());
ValueDescs.reserve(MaskedArgs.size());

ur_exp_command_buffer_update_kernel_launch_desc_t UpdateDesc;
ur_exp_command_buffer_update_kernel_launch_desc_t UpdateDesc{};
UpdateDesc.stype =
UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_KERNEL_LAUNCH_DESC;
UpdateDesc.pNext = nullptr;

// Collect arg descriptors and fill kernel launch descriptor
using sycl::detail::kernel_param_kind_t;
Expand Down
10 changes: 10 additions & 0 deletions sycl/test-e2e/Graph/Explicit/local_accessor_multiple_accessors.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
// RUN: %{build} -o %t.out
// RUN: %{run} %t.out
// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG
// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}
// Extra run to check for immediate-command-list in Level Zero
// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}

#define GRAPH_E2E_EXPLICIT

#include "../Inputs/local_accessor_multiple_accessors.cpp"
10 changes: 10 additions & 0 deletions sycl/test-e2e/Graph/Explicit/local_accessor_multiple_nodes.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
// RUN: %{build} -o %t.out
// RUN: %{run} %t.out
// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG
// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}
// Extra run to check for immediate-command-list in Level Zero
// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}

#define GRAPH_E2E_EXPLICIT

#include "../Inputs/local_accessor_multiple_nodes.cpp"
12 changes: 12 additions & 0 deletions sycl/test-e2e/Graph/Explicit/opencl_local_acc.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
// RUN: %{build} -o %t.out
// RUN: %{run} %t.out
// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG
// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}
// Extra run to check for immediate-command-list in Level Zero
// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}

// REQUIRES: ocloc && (opencl || level_zero)

#define GRAPH_E2E_EXPLICIT

#include "../Inputs/opencl_local_acc.cpp"
16 changes: 7 additions & 9 deletions sycl/test-e2e/Graph/Inputs/local_accessor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -10,20 +10,18 @@ int main() {

const size_t LocalSize = 128;

std::vector<T> DataA(Size), DataB(Size), DataC(Size);
std::vector<T> HostData(Size);

std::iota(DataA.begin(), DataA.end(), 10);

std::vector<T> ReferenceA(DataA);
std::iota(HostData.begin(), HostData.end(), 10);

exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()};

T *PtrA = malloc_device<T>(Size, Queue);

Queue.copy(DataA.data(), PtrA, Size);
Queue.copy(HostData.data(), PtrA, Size);
Queue.wait_and_throw();

auto node = add_node(Graph, Queue, [&](handler &CGH) {
auto Node = add_node(Graph, Queue, [&](handler &CGH) {
local_accessor<T, 1> LocalMem(LocalSize, CGH);

CGH.parallel_for(nd_range({Size}, {LocalSize}), [=](nd_item<1> Item) {
Expand All @@ -40,14 +38,14 @@ int main() {

Queue.wait_and_throw();

Queue.copy(PtrA, DataA.data(), Size);
Queue.copy(PtrA, HostData.data(), Size);
Queue.wait_and_throw();

free(PtrA, Queue);

for (size_t i = 0; i < Size; i++) {
T Ref = 10 + i + (i * 2);
check_value(i, Ref, ReferenceA[i], "PtrA");
T Ref = 10 + i + (Iterations * (i * 2));
assert(check_value(i, Ref, HostData[i], "PtrA"));
}

return 0;
Expand Down
62 changes: 62 additions & 0 deletions sycl/test-e2e/Graph/Inputs/local_accessor_multiple_accessors.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
// Tests adding of nodes with more than one local accessor,
// and submitting the graph.

#include "../graph_common.hpp"

int main() {
queue Queue{};

using T = int;

const size_t LocalSize = 128;

std::vector<T> HostData(Size);

std::iota(HostData.begin(), HostData.end(), 10);

exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()};

T *PtrIn = malloc_device<T>(Size, Queue);
T *PtrOut = malloc_device<T>(Size, Queue);

Queue.memset(PtrOut, 0, Size * sizeof(T));
Queue.copy(HostData.data(), PtrIn, Size);
Queue.wait_and_throw();

auto Node = add_node(Graph, Queue, [&](handler &CGH) {
local_accessor<T, 1> LocalMemA(LocalSize, CGH);
local_accessor<T, 1> LocalMemB(LocalSize, CGH);

CGH.parallel_for(nd_range({Size}, {LocalSize}), [=](nd_item<1> Item) {
auto LocalID = Item.get_local_linear_id();
auto GlobalID = Item.get_global_linear_id();
LocalMemA[LocalID] = GlobalID;
LocalMemB[LocalID] = PtrIn[GlobalID];
PtrOut[GlobalID] += LocalMemA[LocalID] * LocalMemB[LocalID];
});
});

auto GraphExec = Graph.finalize();

for (unsigned n = 0; n < Iterations; n++) {
Queue.submit([&](handler &CGH) { CGH.ext_oneapi_graph(GraphExec); });
}

Queue.wait_and_throw();

Queue.copy(PtrOut, HostData.data(), Size);
Queue.wait_and_throw();

free(PtrIn, Queue);
free(PtrOut, Queue);

for (size_t i = 0; i < Size; i++) {
T Ref = 0;
for (size_t n = 0; n < Iterations; n++) {
Ref += (i * (10 + i));
}
assert(check_value(i, Ref, HostData[i], "PtrOut"));
}

return 0;
}
71 changes: 71 additions & 0 deletions sycl/test-e2e/Graph/Inputs/local_accessor_multiple_nodes.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
// Test creating a graph where more than one nodes uses local accessors,
// and submits of the graph.

#include "../graph_common.hpp"

int main() {
queue Queue{};

using T = int;

const size_t LocalSize = 128;

std::vector<T> HostData(Size);

std::iota(HostData.begin(), HostData.end(), 10);

exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()};

T *PtrA = malloc_device<T>(Size, Queue);

Queue.copy(HostData.data(), PtrA, Size);
Queue.wait_and_throw();

auto NodeA = add_node(Graph, Queue, [&](handler &CGH) {
local_accessor<T, 1> LocalMem(LocalSize, CGH);

CGH.parallel_for(nd_range({Size}, {LocalSize}), [=](nd_item<1> Item) {
LocalMem[Item.get_local_linear_id()] = Item.get_global_linear_id() * 2;
PtrA[Item.get_global_linear_id()] += LocalMem[Item.get_local_linear_id()];
});
});

auto NodeB = add_node(
Graph, Queue,
[&](handler &CGH) {
local_accessor<T, 1> LocalMem(LocalSize, CGH);
depends_on_helper(CGH, NodeA);

CGH.parallel_for(nd_range({Size}, {LocalSize}), [=](nd_item<1> Item) {
LocalMem[Item.get_local_linear_id()] = 3;
PtrA[Item.get_global_linear_id()] *=
LocalMem[Item.get_local_linear_id()];
});
},
NodeA);

auto GraphExec = Graph.finalize();

for (unsigned n = 0; n < Iterations; n++) {
Queue.submit([&](handler &CGH) { CGH.ext_oneapi_graph(GraphExec); });
}

Queue.wait_and_throw();

Queue.copy(PtrA, HostData.data(), Size);
Queue.wait_and_throw();

free(PtrA, Queue);

for (size_t i = 0; i < Size; i++) {
T Ref = 10 + i;

for (size_t n = 0; n < Iterations; n++) {
Ref += i * 2;
Ref *= 3;
}
assert(check_value(i, Ref, HostData[i], "PtrA"));
}

return 0;
}
78 changes: 78 additions & 0 deletions sycl/test-e2e/Graph/Inputs/opencl_local_acc.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
// Tests using an OpenCL-C defined kernel with multiple local accessors

#include "../graph_common.hpp"

using source_kb = sycl::kernel_bundle<sycl::bundle_state::ext_oneapi_source>;
using exe_kb = sycl::kernel_bundle<sycl::bundle_state::executable>;

auto constexpr LocalAccCLSource = R"===(
kernel void test_la(__global int *out, __local int* local_ptr1,
__local int2* local_ptr2, int n) {
__local int4 local_data[1];
size_t gid = get_global_id(0);
size_t lid = get_local_id(0);
size_t wg_size = get_num_groups(0);
local_ptr1[lid] = lid;
local_ptr2[lid].x = n;
local_ptr2[lid].y = wg_size;
if (lid == 0) {
local_data[lid] = (int4)(0xA, 0xB, 0xC, 0xD);
}
barrier(CLK_LOCAL_MEM_FENCE);
int acc = local_data[0].x + local_data[0].y + local_data[0].z +
local_data[0].w;
out[gid] = (local_ptr1[lid] * local_ptr2[lid].x) +
(local_ptr2[lid].y * acc);
}
)===";

int main() {
queue Queue;

source_kb kbSrc = exp_ext::create_kernel_bundle_from_source(
Queue.get_context(), exp_ext::source_language::opencl, LocalAccCLSource);
exe_kb kbExe1 = exp_ext::build(kbSrc);
sycl::kernel test_kernel = kbExe1.ext_oneapi_get_kernel("test_la");

exp_ext::command_graph Graph{Queue};

int32_t *Ptr = malloc_device<int32_t>(Size, Queue);

int32_t N = 42;
constexpr size_t LocalSize = 256;
auto Node = add_node(Graph, Queue, [&](handler &cgh) {
local_accessor<int32_t, 1> acc_local1(LocalSize, cgh);
local_accessor<sycl::int2, 1> acc_local2(LocalSize, cgh);

cgh.set_arg(0, Ptr);
cgh.set_arg(1, acc_local1);
cgh.set_arg(2, acc_local2);
cgh.set_arg(3, N);

cgh.parallel_for(nd_range<1>(Size, LocalSize), test_kernel);
});

auto ExecGraph = Graph.finalize();
Queue.ext_oneapi_graph(ExecGraph).wait();

std::vector<int32_t> HostData(Size);
Queue.copy(Ptr, HostData.data(), Size).wait();

constexpr int32_t Acc = 0xA + 0xB + 0xC + 0xD;
constexpr int32_t WorkGroups = Size / LocalSize;
constexpr int32_t Tmp = Acc * WorkGroups;
for (size_t i = 0; i < Size; i++) {
int32_t local_id = i % LocalSize;
int32_t Ref = (local_id * N) + Tmp;
assert(HostData[i] == Ref);
}

sycl::free(Ptr, Queue);

return 0;
}
16 changes: 7 additions & 9 deletions sycl/test-e2e/Graph/Inputs/whole_update_local_acc.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -28,8 +28,6 @@ int main() {
std::iota(DataA.begin(), DataA.end(), 10);
std::iota(DataB.begin(), DataB.end(), 10);

std::vector<T> ReferenceA(DataA), ReferenceB(DataB);

exp_ext::command_graph GraphA{Queue.get_context(), Queue.get_device()};

T *PtrA = malloc_device<T>(Size, Queue);
Expand Down Expand Up @@ -59,10 +57,10 @@ int main() {
Queue.wait_and_throw();

for (size_t i = 0; i < Size; i++) {
T RefA = 10 + i + (i * 2) + LocalSize / 2;
T RefA = 10 + i + Iterations * ((i * 2) + (LocalSize / 2));
T RefB = 10 + i;
check_value(i, RefA, ReferenceA[i], "PtrA");
check_value(i, RefB, ReferenceB[i], "PtrB");
assert(check_value(i, RefA, DataA[i], "PtrA"));
assert(check_value(i, RefB, DataB[i], "PtrB"));
}

// Update GraphExecA using whole graph update
Expand All @@ -81,10 +79,10 @@ int main() {
Queue.wait_and_throw();

for (size_t i = 0; i < Size; i++) {
T RefA = 10 + i + (i * 2) + LocalSize / 2;
T RefB = 10 + i + (i * 2) + LocalSize;
check_value(i, RefA, ReferenceA[i], "PtrA");
check_value(i, RefB, ReferenceB[i], "PtrB");
T RefA = 10 + i + Iterations * ((i * 2) + (LocalSize / 2));
T RefB = 10 + i + Iterations * ((i * 2) + LocalSize);
assert(check_value(i, RefA, DataA[i], "PtrA"));
assert(check_value(i, RefB, DataB[i], "PtrB"));
}

free(PtrA, Queue);
Expand Down
Loading

0 comments on commit b6e03ad

Please sign in to comment.