diff --git a/sycl/test-e2e/Graph/ValidUsage/linear_graph_copy.cpp b/sycl/test-e2e/Graph/ValidUsage/linear_graph_copy.cpp index 4d34da5bc2ffb..d10fafe54bc4b 100644 --- a/sycl/test-e2e/Graph/ValidUsage/linear_graph_copy.cpp +++ b/sycl/test-e2e/Graph/ValidUsage/linear_graph_copy.cpp @@ -50,7 +50,6 @@ int main() { Queue.copy(DataA.data(), PtrA, Size); Queue.copy(DataB.data(), PtrB, Size); Queue.copy(DataC.data(), PtrC, Size); - Queue.wait_and_throw(); Graph.begin_recording(Queue); Queue.submit([&](handler &CGH) { @@ -82,15 +81,13 @@ int main() { auto GraphExec = Graph.finalize(); - event Event; for (unsigned n = 0; n < Iterations; n++) { - Event = - Queue.submit([&](handler &CGH) { CGH.ext_oneapi_graph(GraphExec); }); + Queue.submit([&](handler &CGH) { CGH.ext_oneapi_graph(GraphExec); }); } - Queue.copy(PtrA, DataA.data(), Size, Event); - Queue.copy(PtrB, DataB.data(), Size, Event); - Queue.copy(PtrC, DataC.data(), Size, Event); + Queue.copy(PtrA, DataA.data(), Size); + Queue.copy(PtrB, DataB.data(), Size); + Queue.copy(PtrC, DataC.data(), Size); Queue.wait_and_throw(); free(PtrA, Queue); diff --git a/sycl/test-e2e/Graph/ValidUsage/linear_graph_copy_D2H.cpp b/sycl/test-e2e/Graph/ValidUsage/linear_graph_copy_D2H.cpp new file mode 100644 index 0000000000000..d5ad10257d239 --- /dev/null +++ b/sycl/test-e2e/Graph/ValidUsage/linear_graph_copy_D2H.cpp @@ -0,0 +1,115 @@ +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} +// Extra run to check for immediate-command-list in Level Zero +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} +// +// Extra runs to test in-order command lists path +// RUN: %if level_zero %{env UR_L0_USE_DRIVER_INORDER_LISTS=1 SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} +// RUN: %if level_zero %{env UR_L0_USE_DRIVER_INORDER_LISTS=1 SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 UR_L0_USE_DRIVER_COUNTER_BASED_EVENTS=0 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} +// +// Tests that the optimization to use the L0 Copy Engine for memory commands +// does not interfere with the linear graph optimization +// +// REQUIRES: aspect-usm_host_allocations + +#include "../graph_common.hpp" + +#include + +int main() { + queue Queue{{sycl::property::queue::in_order{}}}; + + using T = int; + + const T ModValue = 7; + std::vector DataA(Size), DataB(Size), DataC(Size); + + std::iota(DataA.begin(), DataA.end(), 1); + std::iota(DataB.begin(), DataB.end(), 10); + std::iota(DataC.begin(), DataC.end(), 1000); + + // Create reference data for output + std::vector ReferenceA(DataA), ReferenceB(DataB), ReferenceC(DataC); + for (size_t i = 0; i < Iterations; i++) { + for (size_t j = 0; j < Size; j++) { + ReferenceA[j] += ModValue; + ReferenceB[j] = ReferenceA[j]; + ReferenceB[j] -= ModValue; + ReferenceC[j] = ReferenceB[j]; + ReferenceC[j] += ModValue; + } + } + + ext::oneapi::experimental::command_graph Graph{Queue.get_context(), + Queue.get_device()}; + + T *PtrA = malloc_device(Size, Queue); + T *PtrB = malloc_device(Size, Queue); + T *PtrC = malloc_device(Size, Queue); + + // L0 copy engine is disabled for D2D copies so we need to create additional + // D2H copy events in between to invoke it. + T *PtrBHost = malloc_host(Size, Queue); + T *PtrCHost = malloc_host(Size, Queue); + + Queue.copy(DataA.data(), PtrA, Size); + Queue.copy(DataB.data(), PtrB, Size); + Queue.copy(DataC.data(), PtrC, Size); + + Graph.begin_recording(Queue); + Queue.submit([&](handler &CGH) { + CGH.parallel_for(range<1>(Size), [=](item<1> id) { + auto LinID = id.get_linear_id(); + PtrA[LinID] += ModValue; + }); + }); + + Queue.submit( + [&](handler &CGH) { CGH.memcpy(PtrBHost, PtrA, Size * sizeof(T)); }); + Queue.submit( + [&](handler &CGH) { CGH.memcpy(PtrB, PtrBHost, Size * sizeof(T)); }); + + Queue.submit([&](handler &CGH) { + CGH.parallel_for(range<1>(Size), [=](item<1> id) { + auto LinID = id.get_linear_id(); + PtrB[LinID] -= ModValue; + }); + }); + + Queue.submit( + [&](handler &CGH) { CGH.memcpy(PtrCHost, PtrB, Size * sizeof(T)); }); + Queue.submit( + [&](handler &CGH) { CGH.memcpy(PtrC, PtrCHost, Size * sizeof(T)); }); + + Queue.submit([&](handler &CGH) { + CGH.parallel_for(range<1>(Size), [=](item<1> id) { + auto LinID = id.get_linear_id(); + PtrC[LinID] += ModValue; + }); + }); + + Graph.end_recording(); + + auto GraphExec = Graph.finalize(); + + for (unsigned n = 0; n < Iterations; n++) { + Queue.submit([&](handler &CGH) { CGH.ext_oneapi_graph(GraphExec); }); + } + + Queue.copy(PtrA, DataA.data(), Size); + Queue.copy(PtrB, DataB.data(), Size); + Queue.copy(PtrC, DataC.data(), Size); + Queue.wait_and_throw(); + + free(PtrA, Queue); + free(PtrB, Queue); + free(PtrC, Queue); + + for (size_t i = 0; i < Size; i++) { + assert(check_value(i, ReferenceA[i], DataA[i], "DataA")); + assert(check_value(i, ReferenceB[i], DataB[i], "DataB")); + assert(check_value(i, ReferenceC[i], DataC[i], "DataC")); + } +}