diff --git a/sycl/cmake/modules/FetchUnifiedRuntime.cmake b/sycl/cmake/modules/FetchUnifiedRuntime.cmake index 72841724fa01d..202bfdcdc19e3 100644 --- a/sycl/cmake/modules/FetchUnifiedRuntime.cmake +++ b/sycl/cmake/modules/FetchUnifiedRuntime.cmake @@ -116,7 +116,7 @@ if(SYCL_UR_USE_FETCH_CONTENT) CACHE PATH "Path to external '${name}' adapter source dir" FORCE) endfunction() - set(UNIFIED_RUNTIME_REPO "https://github.com/oneapi-src/unified-runtime.git") + set(UNIFIED_RUNTIME_REPO "https://github.com/Bensuo/unified-runtime.git") include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/modules/UnifiedRuntimeTag.cmake) set(UMF_BUILD_EXAMPLES OFF CACHE INTERNAL "EXAMPLES") diff --git a/sycl/cmake/modules/UnifiedRuntimeTag.cmake b/sycl/cmake/modules/UnifiedRuntimeTag.cmake index 95a7b4dc86bce..e73e7a3d52331 100644 --- a/sycl/cmake/modules/UnifiedRuntimeTag.cmake +++ b/sycl/cmake/modules/UnifiedRuntimeTag.cmake @@ -1,7 +1 @@ -# commit 9937d029c7fdcbf101e89f8515f640c145e059c5 -# Merge: 9ac6d5d9 10b0e101 -# Author: Callum Fare -# Date: Wed Nov 20 14:49:17 2024 +0000 -# Merge pull request #2258 from aarongreig/aaron/tryUseExtensionSubgroupInfo -# Use extension version of clGetKernelSubGroupInfo when necessary. -set(UNIFIED_RUNTIME_TAG 9937d029c7fdcbf101e89f8515f640c145e059c5) +set(UNIFIED_RUNTIME_TAG "ewan/cuda_update_local_size") diff --git a/sycl/source/detail/graph_impl.cpp b/sycl/source/detail/graph_impl.cpp index 861ec2a883601..d723d8d83511f 100644 --- a/sycl/source/detail/graph_impl.cpp +++ b/sycl/source/detail/graph_impl.cpp @@ -1510,7 +1510,10 @@ void exec_graph_impl::updateImpl(std::shared_ptr Node) { PtrDescs.reserve(MaskedArgs.size()); ValueDescs.reserve(MaskedArgs.size()); - ur_exp_command_buffer_update_kernel_launch_desc_t UpdateDesc; + ur_exp_command_buffer_update_kernel_launch_desc_t UpdateDesc{}; + UpdateDesc.stype = + UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_KERNEL_LAUNCH_DESC; + UpdateDesc.pNext = nullptr; // Collect arg descriptors and fill kernel launch descriptor using sycl::detail::kernel_param_kind_t; diff --git a/sycl/test-e2e/Graph/Explicit/local_accessor_multiple_accessors.cpp b/sycl/test-e2e/Graph/Explicit/local_accessor_multiple_accessors.cpp new file mode 100644 index 0000000000000..c4d704216a082 --- /dev/null +++ b/sycl/test-e2e/Graph/Explicit/local_accessor_multiple_accessors.cpp @@ -0,0 +1,10 @@ +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} +// Extra run to check for immediate-command-list in Level Zero +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} + +#define GRAPH_E2E_EXPLICIT + +#include "../Inputs/local_accessor_multiple_accessors.cpp" diff --git a/sycl/test-e2e/Graph/Explicit/local_accessor_multiple_nodes.cpp b/sycl/test-e2e/Graph/Explicit/local_accessor_multiple_nodes.cpp new file mode 100644 index 0000000000000..83eff54307ea3 --- /dev/null +++ b/sycl/test-e2e/Graph/Explicit/local_accessor_multiple_nodes.cpp @@ -0,0 +1,10 @@ +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} +// Extra run to check for immediate-command-list in Level Zero +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} + +#define GRAPH_E2E_EXPLICIT + +#include "../Inputs/local_accessor_multiple_nodes.cpp" diff --git a/sycl/test-e2e/Graph/Explicit/opencl_local_acc.cpp b/sycl/test-e2e/Graph/Explicit/opencl_local_acc.cpp new file mode 100644 index 0000000000000..48c41ad0e78c5 --- /dev/null +++ b/sycl/test-e2e/Graph/Explicit/opencl_local_acc.cpp @@ -0,0 +1,12 @@ +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} +// Extra run to check for immediate-command-list in Level Zero +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} + +// REQUIRES: ocloc && (opencl || level_zero) + +#define GRAPH_E2E_EXPLICIT + +#include "../Inputs/opencl_local_acc.cpp" diff --git a/sycl/test-e2e/Graph/Inputs/local_accessor.cpp b/sycl/test-e2e/Graph/Inputs/local_accessor.cpp index b3ac9fde67b6e..b6f6a7dd34348 100644 --- a/sycl/test-e2e/Graph/Inputs/local_accessor.cpp +++ b/sycl/test-e2e/Graph/Inputs/local_accessor.cpp @@ -10,20 +10,18 @@ int main() { const size_t LocalSize = 128; - std::vector DataA(Size), DataB(Size), DataC(Size); + std::vector HostData(Size); - std::iota(DataA.begin(), DataA.end(), 10); - - std::vector ReferenceA(DataA); + std::iota(HostData.begin(), HostData.end(), 10); exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()}; T *PtrA = malloc_device(Size, Queue); - Queue.copy(DataA.data(), PtrA, Size); + Queue.copy(HostData.data(), PtrA, Size); Queue.wait_and_throw(); - auto node = add_node(Graph, Queue, [&](handler &CGH) { + auto Node = add_node(Graph, Queue, [&](handler &CGH) { local_accessor LocalMem(LocalSize, CGH); CGH.parallel_for(nd_range({Size}, {LocalSize}), [=](nd_item<1> Item) { @@ -40,14 +38,14 @@ int main() { Queue.wait_and_throw(); - Queue.copy(PtrA, DataA.data(), Size); + Queue.copy(PtrA, HostData.data(), Size); Queue.wait_and_throw(); free(PtrA, Queue); for (size_t i = 0; i < Size; i++) { - T Ref = 10 + i + (i * 2); - check_value(i, Ref, ReferenceA[i], "PtrA"); + T Ref = 10 + i + (Iterations * (i * 2)); + assert(check_value(i, Ref, HostData[i], "PtrA")); } return 0; diff --git a/sycl/test-e2e/Graph/Inputs/local_accessor_multiple_accessors.cpp b/sycl/test-e2e/Graph/Inputs/local_accessor_multiple_accessors.cpp new file mode 100644 index 0000000000000..0ae67a4469a8c --- /dev/null +++ b/sycl/test-e2e/Graph/Inputs/local_accessor_multiple_accessors.cpp @@ -0,0 +1,62 @@ +// Tests adding of nodes with more than one local accessor, +// and submitting the graph. + +#include "../graph_common.hpp" + +int main() { + queue Queue{}; + + using T = int; + + const size_t LocalSize = 128; + + std::vector HostData(Size); + + std::iota(HostData.begin(), HostData.end(), 10); + + exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()}; + + T *PtrIn = malloc_device(Size, Queue); + T *PtrOut = malloc_device(Size, Queue); + + Queue.memset(PtrOut, 0, Size * sizeof(T)); + Queue.copy(HostData.data(), PtrIn, Size); + Queue.wait_and_throw(); + + auto Node = add_node(Graph, Queue, [&](handler &CGH) { + local_accessor LocalMemA(LocalSize, CGH); + local_accessor LocalMemB(LocalSize, CGH); + + CGH.parallel_for(nd_range({Size}, {LocalSize}), [=](nd_item<1> Item) { + auto LocalID = Item.get_local_linear_id(); + auto GlobalID = Item.get_global_linear_id(); + LocalMemA[LocalID] = GlobalID; + LocalMemB[LocalID] = PtrIn[GlobalID]; + PtrOut[GlobalID] += LocalMemA[LocalID] * LocalMemB[LocalID]; + }); + }); + + auto GraphExec = Graph.finalize(); + + for (unsigned n = 0; n < Iterations; n++) { + Queue.submit([&](handler &CGH) { CGH.ext_oneapi_graph(GraphExec); }); + } + + Queue.wait_and_throw(); + + Queue.copy(PtrOut, HostData.data(), Size); + Queue.wait_and_throw(); + + free(PtrIn, Queue); + free(PtrOut, Queue); + + for (size_t i = 0; i < Size; i++) { + T Ref = 0; + for (size_t n = 0; n < Iterations; n++) { + Ref += (i * (10 + i)); + } + assert(check_value(i, Ref, HostData[i], "PtrOut")); + } + + return 0; +} diff --git a/sycl/test-e2e/Graph/Inputs/local_accessor_multiple_nodes.cpp b/sycl/test-e2e/Graph/Inputs/local_accessor_multiple_nodes.cpp new file mode 100644 index 0000000000000..8d38824c680ae --- /dev/null +++ b/sycl/test-e2e/Graph/Inputs/local_accessor_multiple_nodes.cpp @@ -0,0 +1,71 @@ +// Test creating a graph where more than one nodes uses local accessors, +// and submits of the graph. + +#include "../graph_common.hpp" + +int main() { + queue Queue{}; + + using T = int; + + const size_t LocalSize = 128; + + std::vector HostData(Size); + + std::iota(HostData.begin(), HostData.end(), 10); + + exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()}; + + T *PtrA = malloc_device(Size, Queue); + + Queue.copy(HostData.data(), PtrA, Size); + Queue.wait_and_throw(); + + auto NodeA = add_node(Graph, Queue, [&](handler &CGH) { + local_accessor LocalMem(LocalSize, CGH); + + CGH.parallel_for(nd_range({Size}, {LocalSize}), [=](nd_item<1> Item) { + LocalMem[Item.get_local_linear_id()] = Item.get_global_linear_id() * 2; + PtrA[Item.get_global_linear_id()] += LocalMem[Item.get_local_linear_id()]; + }); + }); + + auto NodeB = add_node( + Graph, Queue, + [&](handler &CGH) { + local_accessor LocalMem(LocalSize, CGH); + depends_on_helper(CGH, NodeA); + + CGH.parallel_for(nd_range({Size}, {LocalSize}), [=](nd_item<1> Item) { + LocalMem[Item.get_local_linear_id()] = 3; + PtrA[Item.get_global_linear_id()] *= + LocalMem[Item.get_local_linear_id()]; + }); + }, + NodeA); + + auto GraphExec = Graph.finalize(); + + for (unsigned n = 0; n < Iterations; n++) { + Queue.submit([&](handler &CGH) { CGH.ext_oneapi_graph(GraphExec); }); + } + + Queue.wait_and_throw(); + + Queue.copy(PtrA, HostData.data(), Size); + Queue.wait_and_throw(); + + free(PtrA, Queue); + + for (size_t i = 0; i < Size; i++) { + T Ref = 10 + i; + + for (size_t n = 0; n < Iterations; n++) { + Ref += i * 2; + Ref *= 3; + } + assert(check_value(i, Ref, HostData[i], "PtrA")); + } + + return 0; +} diff --git a/sycl/test-e2e/Graph/Inputs/opencl_local_acc.cpp b/sycl/test-e2e/Graph/Inputs/opencl_local_acc.cpp new file mode 100644 index 0000000000000..c4f51ba9c9232 --- /dev/null +++ b/sycl/test-e2e/Graph/Inputs/opencl_local_acc.cpp @@ -0,0 +1,78 @@ +// Tests using an OpenCL-C defined kernel with multiple local accessors + +#include "../graph_common.hpp" + +using source_kb = sycl::kernel_bundle; +using exe_kb = sycl::kernel_bundle; + +auto constexpr LocalAccCLSource = R"===( + kernel void test_la(__global int *out, __local int* local_ptr1, + __local int2* local_ptr2, int n) { + __local int4 local_data[1]; + + size_t gid = get_global_id(0); + size_t lid = get_local_id(0); + size_t wg_size = get_num_groups(0); + + local_ptr1[lid] = lid; + local_ptr2[lid].x = n; + local_ptr2[lid].y = wg_size; + + if (lid == 0) { + local_data[lid] = (int4)(0xA, 0xB, 0xC, 0xD); + } + + barrier(CLK_LOCAL_MEM_FENCE); + + int acc = local_data[0].x + local_data[0].y + local_data[0].z + + local_data[0].w; + out[gid] = (local_ptr1[lid] * local_ptr2[lid].x) + + (local_ptr2[lid].y * acc); + } +)==="; + +int main() { + queue Queue; + + source_kb kbSrc = exp_ext::create_kernel_bundle_from_source( + Queue.get_context(), exp_ext::source_language::opencl, LocalAccCLSource); + exe_kb kbExe1 = exp_ext::build(kbSrc); + sycl::kernel test_kernel = kbExe1.ext_oneapi_get_kernel("test_la"); + + exp_ext::command_graph Graph{Queue}; + + int32_t *Ptr = malloc_device(Size, Queue); + + int32_t N = 42; + constexpr size_t LocalSize = 256; + auto Node = add_node(Graph, Queue, [&](handler &cgh) { + local_accessor acc_local1(LocalSize, cgh); + local_accessor acc_local2(LocalSize, cgh); + + cgh.set_arg(0, Ptr); + cgh.set_arg(1, acc_local1); + cgh.set_arg(2, acc_local2); + cgh.set_arg(3, N); + + cgh.parallel_for(nd_range<1>(Size, LocalSize), test_kernel); + }); + + auto ExecGraph = Graph.finalize(); + Queue.ext_oneapi_graph(ExecGraph).wait(); + + std::vector HostData(Size); + Queue.copy(Ptr, HostData.data(), Size).wait(); + + constexpr int32_t Acc = 0xA + 0xB + 0xC + 0xD; + constexpr int32_t WorkGroups = Size / LocalSize; + constexpr int32_t Tmp = Acc * WorkGroups; + for (size_t i = 0; i < Size; i++) { + int32_t local_id = i % LocalSize; + int32_t Ref = (local_id * N) + Tmp; + assert(HostData[i] == Ref); + } + + sycl::free(Ptr, Queue); + + return 0; +} diff --git a/sycl/test-e2e/Graph/Inputs/whole_update_local_acc.cpp b/sycl/test-e2e/Graph/Inputs/whole_update_local_acc.cpp index 100792a2e4762..6feb497115b0e 100644 --- a/sycl/test-e2e/Graph/Inputs/whole_update_local_acc.cpp +++ b/sycl/test-e2e/Graph/Inputs/whole_update_local_acc.cpp @@ -28,8 +28,6 @@ int main() { std::iota(DataA.begin(), DataA.end(), 10); std::iota(DataB.begin(), DataB.end(), 10); - std::vector ReferenceA(DataA), ReferenceB(DataB); - exp_ext::command_graph GraphA{Queue.get_context(), Queue.get_device()}; T *PtrA = malloc_device(Size, Queue); @@ -59,10 +57,10 @@ int main() { Queue.wait_and_throw(); for (size_t i = 0; i < Size; i++) { - T RefA = 10 + i + (i * 2) + LocalSize / 2; + T RefA = 10 + i + Iterations * ((i * 2) + (LocalSize / 2)); T RefB = 10 + i; - check_value(i, RefA, ReferenceA[i], "PtrA"); - check_value(i, RefB, ReferenceB[i], "PtrB"); + assert(check_value(i, RefA, DataA[i], "PtrA")); + assert(check_value(i, RefB, DataB[i], "PtrB")); } // Update GraphExecA using whole graph update @@ -81,10 +79,10 @@ int main() { Queue.wait_and_throw(); for (size_t i = 0; i < Size; i++) { - T RefA = 10 + i + (i * 2) + LocalSize / 2; - T RefB = 10 + i + (i * 2) + LocalSize; - check_value(i, RefA, ReferenceA[i], "PtrA"); - check_value(i, RefB, ReferenceB[i], "PtrB"); + T RefA = 10 + i + Iterations * ((i * 2) + (LocalSize / 2)); + T RefB = 10 + i + Iterations * ((i * 2) + LocalSize); + assert(check_value(i, RefA, DataA[i], "PtrA")); + assert(check_value(i, RefB, DataB[i], "PtrB")); } free(PtrA, Queue); diff --git a/sycl/test-e2e/Graph/Inputs/whole_update_local_acc_multi.cpp b/sycl/test-e2e/Graph/Inputs/whole_update_local_acc_multi.cpp new file mode 100644 index 0000000000000..95f55736702a5 --- /dev/null +++ b/sycl/test-e2e/Graph/Inputs/whole_update_local_acc_multi.cpp @@ -0,0 +1,122 @@ +// Tests whole graph update of nodes with 2 local accessors, +// and submission of the graph. + +#include "../graph_common.hpp" + +using T = int; + +void add_graph_nodes( + exp_ext::command_graph &Graph, + queue &Queue, size_t Size, size_t LocalSize, T *Ptr) { + auto NodeA = add_node(Graph, Queue, [&](handler &CGH) { + local_accessor LocalMemA(LocalSize, CGH); + local_accessor LocalMemB(LocalSize, CGH); + CGH.parallel_for(nd_range({Size}, {LocalSize}), [=](nd_item<1> Item) { + auto LocalID = Item.get_local_linear_id(); + auto GlobalID = Item.get_global_linear_id(); + LocalMemA[LocalID] = GlobalID; + LocalMemB[LocalID] = Item.get_local_range(0); + Ptr[GlobalID] += LocalMemA[LocalID] * LocalMemB[LocalID]; + }); + }); + + // Introduce value params so that local arguments are not contiguous indices + // when set as kernel arguments + T Constant = 2; + auto NodeB = add_node( + Graph, Queue, + [&](handler &CGH) { + local_accessor LocalMemA(LocalSize, CGH); + local_accessor LocalMemB(LocalSize * 2, CGH); + + depends_on_helper(CGH, NodeA); + + CGH.parallel_for(nd_range({Size}, {LocalSize}), [=](nd_item<1> Item) { + auto LocalID = Item.get_local_linear_id(); + auto GlobalID = Item.get_global_linear_id(); + LocalMemA[LocalID] = GlobalID; + LocalMemB[LocalID * 2] = Constant; + LocalMemB[(LocalID * 2) + 1] = Constant; + Ptr[GlobalID] += LocalMemA[LocalID] * LocalMemB[LocalID * 2] * + LocalMemB[(LocalID * 2) + 1]; + }); + }, + NodeA); +} +int main() { + queue Queue{}; + + const size_t LocalSize = 128; + + std::vector DataA(Size), DataB(Size); + + std::iota(DataA.begin(), DataA.end(), 10); + std::iota(DataB.begin(), DataB.end(), 10); + + exp_ext::command_graph GraphA{Queue.get_context(), Queue.get_device()}; + + T *PtrA = malloc_device(Size, Queue); + T *PtrB = malloc_device(Size, Queue); + + Queue.copy(DataA.data(), PtrA, Size); + Queue.copy(DataB.data(), PtrB, Size); + Queue.wait_and_throw(); + + size_t GraphALocalSize = LocalSize / 2; + add_graph_nodes(GraphA, Queue, Size, GraphALocalSize, PtrA); + + auto GraphExecA = GraphA.finalize(exp_ext::property::graph::updatable{}); + + // Create second graph for whole graph update with a different local size + exp_ext::command_graph GraphB{Queue.get_context(), Queue.get_device()}; + add_graph_nodes(GraphB, Queue, Size, LocalSize, PtrB); + + // Execute graphs before updating and check outputs + for (unsigned n = 0; n < Iterations; n++) { + Queue.submit([&](handler &CGH) { CGH.ext_oneapi_graph(GraphExecA); }); + } + + Queue.wait_and_throw(); + + Queue.copy(PtrA, DataA.data(), Size); + Queue.copy(PtrB, DataB.data(), Size); + Queue.wait_and_throw(); + + for (size_t i = 0; i < Size; i++) { + T Init = 10 + i; + T NodeA = i * GraphALocalSize; + T NodeB = i * 4; + T RefA = Init + Iterations * (NodeA + NodeB); + assert(check_value(i, RefA, DataA[i], "PtrA")); + assert(check_value(i, Init, DataB[i], "PtrB")); + } + + // Update GraphExecA using whole graph update + GraphExecA.update(GraphB); + + // Execute graphs again and check outputs + for (unsigned n = 0; n < Iterations; n++) { + Queue.submit([&](handler &CGH) { CGH.ext_oneapi_graph(GraphExecA); }); + } + + Queue.wait_and_throw(); + + Queue.copy(PtrA, DataA.data(), Size); + Queue.copy(PtrB, DataB.data(), Size); + Queue.wait_and_throw(); + + for (size_t i = 0; i < Size; i++) { + T Init = 10 + i; + T NodeAGraphA = i * GraphALocalSize; + T NodeAGraphB = i * LocalSize; + T NodeB = i * 4; + T RefA = Init + Iterations * (NodeAGraphA + NodeB); + T RefB = Init + Iterations * (NodeAGraphB + NodeB); + assert(check_value(i, RefA, DataA[i], "PtrA")); + assert(check_value(i, RefB, DataB[i], "PtrB")); + } + + free(PtrA, Queue); + free(PtrB, Queue); + return 0; +} diff --git a/sycl/test-e2e/Graph/RecordReplay/local_accessor_multiple_accessors.cpp b/sycl/test-e2e/Graph/RecordReplay/local_accessor_multiple_accessors.cpp new file mode 100644 index 0000000000000..aa044c80e19ec --- /dev/null +++ b/sycl/test-e2e/Graph/RecordReplay/local_accessor_multiple_accessors.cpp @@ -0,0 +1,10 @@ +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} +// Extra run to check for immediate-command-list in Level Zero +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} + +#define GRAPH_E2E_RECORD_REPLAY + +#include "../Inputs/local_accessor_multiple_accessors.cpp" diff --git a/sycl/test-e2e/Graph/RecordReplay/local_accessor_multiple_nodes.cpp b/sycl/test-e2e/Graph/RecordReplay/local_accessor_multiple_nodes.cpp new file mode 100644 index 0000000000000..5830c21a57431 --- /dev/null +++ b/sycl/test-e2e/Graph/RecordReplay/local_accessor_multiple_nodes.cpp @@ -0,0 +1,10 @@ +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} +// Extra run to check for immediate-command-list in Level Zero +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} + +#define GRAPH_E2E_RECORD_REPLAY + +#include "../Inputs/local_accessor_multiple_nodes.cpp" diff --git a/sycl/test-e2e/Graph/RecordReplay/opencl_local_acc.cpp b/sycl/test-e2e/Graph/RecordReplay/opencl_local_acc.cpp new file mode 100644 index 0000000000000..a924d018ba8ec --- /dev/null +++ b/sycl/test-e2e/Graph/RecordReplay/opencl_local_acc.cpp @@ -0,0 +1,12 @@ +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} +// Extra run to check for immediate-command-list in Level Zero +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} + +// REQUIRES: ocloc && (opencl || level_zero) + +#define GRAPH_E2E_RECORD_REPLAY + +#include "../Inputs/opencl_local_acc.cpp" diff --git a/sycl/test-e2e/Graph/Update/Explicit/whole_update_local_acc.cpp b/sycl/test-e2e/Graph/Update/Explicit/whole_update_local_acc.cpp index 1db9905457ae7..a99eec42afa02 100644 --- a/sycl/test-e2e/Graph/Update/Explicit/whole_update_local_acc.cpp +++ b/sycl/test-e2e/Graph/Update/Explicit/whole_update_local_acc.cpp @@ -5,6 +5,9 @@ // Extra run to check for immediate-command-list in Level Zero // RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} +// XFAIL: level_zero +// XFAIL-TRACKER: OFNAAO-422 + #define GRAPH_E2E_EXPLICIT #include "../../Inputs/whole_update_local_acc.cpp" diff --git a/sycl/test-e2e/Graph/Update/Explicit/whole_update_local_acc_multi.cpp b/sycl/test-e2e/Graph/Update/Explicit/whole_update_local_acc_multi.cpp new file mode 100644 index 0000000000000..d15ec880d89ad --- /dev/null +++ b/sycl/test-e2e/Graph/Update/Explicit/whole_update_local_acc_multi.cpp @@ -0,0 +1,13 @@ +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} +// Extra run to check for immediate-command-list in Level Zero +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} + +// XFAIL: level_zero +// XFAIL-TRACKER: OFNAAO-422 + +#define GRAPH_E2E_EXPLICIT + +#include "../../Inputs/whole_update_local_acc_multi.cpp" diff --git a/sycl/test-e2e/Graph/Update/RecordReplay/whole_update_local_acc.cpp b/sycl/test-e2e/Graph/Update/RecordReplay/whole_update_local_acc.cpp index 03645b2f19bfd..ab9bb3fe37fa5 100644 --- a/sycl/test-e2e/Graph/Update/RecordReplay/whole_update_local_acc.cpp +++ b/sycl/test-e2e/Graph/Update/RecordReplay/whole_update_local_acc.cpp @@ -5,6 +5,9 @@ // Extra run to check for immediate-command-list in Level Zero // RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} +// XFAIL: level_zero +// XFAIL-TRACKER: OFNAAO-422 + #define GRAPH_E2E_RECORD_REPLAY #include "../../Inputs/whole_update_local_acc.cpp" diff --git a/sycl/test-e2e/Graph/Update/RecordReplay/whole_update_local_acc_multi.cpp b/sycl/test-e2e/Graph/Update/RecordReplay/whole_update_local_acc_multi.cpp new file mode 100644 index 0000000000000..97a83de1129b3 --- /dev/null +++ b/sycl/test-e2e/Graph/Update/RecordReplay/whole_update_local_acc_multi.cpp @@ -0,0 +1,13 @@ +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} +// Extra run to check for immediate-command-list in Level Zero +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} + +// XFAIL: level_zero +// XFAIL-TRACKER: OFNAAO-422 + +#define GRAPH_E2E_RECORD_REPLAY + +#include "../../Inputs/whole_update_local_acc_multi.cpp" diff --git a/sycl/test-e2e/KernelCompiler/opencl_local_mem.cpp b/sycl/test-e2e/KernelCompiler/opencl_local_mem.cpp new file mode 100644 index 0000000000000..dfc54b786703a --- /dev/null +++ b/sycl/test-e2e/KernelCompiler/opencl_local_mem.cpp @@ -0,0 +1,84 @@ +// Tests using an OpenCL-C defined kernel with multiple local accessors + +// REQUIRES: ocloc && (opencl || level_zero) +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out + +#include +#include + +using namespace sycl; +namespace syclex = sycl::ext::oneapi::experimental; +using source_kb = sycl::kernel_bundle; +using exe_kb = sycl::kernel_bundle; + +auto constexpr LocalAccCLSource = R"===( + kernel void test_la(__global int *out, __local int* local_ptr1, + __local int2* local_ptr2, int n) { + __local int4 local_data[1]; + + size_t gid = get_global_id(0); + size_t lid = get_local_id(0); + size_t wg_size = get_num_groups(0); + + local_ptr1[lid] = lid; + local_ptr2[lid].x = n; + local_ptr2[lid].y = wg_size; + + if (lid == 0) { + local_data[lid] = (int4)(0xA, 0xB, 0xC, 0xD); + } + + barrier(CLK_LOCAL_MEM_FENCE); + + int acc = local_data[0].x + local_data[0].y + local_data[0].z + + local_data[0].w; + out[gid] = (local_ptr1[lid] * local_ptr2[lid].x) + + (local_ptr2[lid].y * acc); + } +)==="; + +int main() { + sycl::queue Queue; + + source_kb kbSrc = syclex::create_kernel_bundle_from_source( + Queue.get_context(), syclex::source_language::opencl, LocalAccCLSource); + exe_kb kbExe1 = syclex::build(kbSrc); + sycl::kernel test_kernel = kbExe1.ext_oneapi_get_kernel("test_la"); + + constexpr size_t Size = 1024; + constexpr size_t LocalSize = 256; + + int32_t *Ptr = malloc_device(Size, Queue); + + int32_t N = 42; + Queue + .submit([&](handler &cgh) { + local_accessor acc_local1(LocalSize, cgh); + local_accessor acc_local2(LocalSize, cgh); + + cgh.set_arg(0, Ptr); + cgh.set_arg(1, acc_local1); + cgh.set_arg(2, acc_local2); + cgh.set_arg(3, N); + + cgh.parallel_for(nd_range<1>(Size, LocalSize), test_kernel); + }) + .wait(); + + std::vector HostData(Size); + Queue.copy(Ptr, HostData.data(), Size).wait(); + + constexpr int32_t Acc = 0xA + 0xB + 0xC + 0xD; + constexpr int32_t WorkGroups = Size / LocalSize; + constexpr int32_t Tmp = Acc * WorkGroups; + for (size_t i = 0; i < Size; i++) { + int32_t local_id = i % LocalSize; + int32_t Ref = (local_id * N) + Tmp; + assert(HostData[i] == Ref); + } + + sycl::free(Ptr, Queue); + + return 0; +}