[SYCL][Graph][E2E] Add an extra test to invoke copy engine on L0 #17324

Merged 1 commit on Mar 11, 2025
sycl/test-e2e/Graph/ValidUsage/linear_graph_copy.cpp (4 additions, 7 deletions)
@@ -50,7 +50,6 @@ int main() {
   Queue.copy(DataA.data(), PtrA, Size);
   Queue.copy(DataB.data(), PtrB, Size);
   Queue.copy(DataC.data(), PtrC, Size);
-  Queue.wait_and_throw();
 
   Graph.begin_recording(Queue);
   Queue.submit([&](handler &CGH) {
@@ -82,15 +81,13 @@ int main() {
 
   auto GraphExec = Graph.finalize();
 
-  event Event;
   for (unsigned n = 0; n < Iterations; n++) {
-    Event =
-        Queue.submit([&](handler &CGH) { CGH.ext_oneapi_graph(GraphExec); });
+    Queue.submit([&](handler &CGH) { CGH.ext_oneapi_graph(GraphExec); });
   }
 
-  Queue.copy(PtrA, DataA.data(), Size, Event);
-  Queue.copy(PtrB, DataB.data(), Size, Event);
-  Queue.copy(PtrC, DataC.data(), Size, Event);
+  Queue.copy(PtrA, DataA.data(), Size);
+  Queue.copy(PtrB, DataB.data(), Size);
+  Queue.copy(PtrC, DataC.data(), Size);
   Queue.wait_and_throw();
 
   free(PtrA, Queue);
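Note: dropping the intermediate `Event` (and the early `wait_and_throw`) does not change behavior, since this test, like the new D2H variant below, uses an in-order queue: submissions execute in submission order, and the final `wait_and_throw` still synchronizes before the results are read back.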
sycl/test-e2e/Graph/ValidUsage/linear_graph_copy_D2H.cpp (new file)
// RUN: %{build} -o %t.out
// RUN: %{run} %t.out
// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG
// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}
// Extra run to check for immediate-command-list in Level Zero
// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}
//
// Extra runs to test in-order command lists path
// RUN: %if level_zero %{env UR_L0_USE_DRIVER_INORDER_LISTS=1 SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}
// RUN: %if level_zero %{env UR_L0_USE_DRIVER_INORDER_LISTS=1 SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 UR_L0_USE_DRIVER_COUNTER_BASED_EVENTS=0 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}
//
// Tests that the optimization to use the L0 Copy Engine for memory commands
// does not interfere with the linear graph optimization
//
// REQUIRES: aspect-usm_host_allocations

#include "../graph_common.hpp"

#include <sycl/properties/queue_properties.hpp>

int main() {
  queue Queue{{sycl::property::queue::in_order{}}};

  using T = int;

  const T ModValue = 7;
  std::vector<T> DataA(Size), DataB(Size), DataC(Size);

  std::iota(DataA.begin(), DataA.end(), 1);
  std::iota(DataB.begin(), DataB.end(), 10);
  std::iota(DataC.begin(), DataC.end(), 1000);

  // Create reference data for output
  std::vector<T> ReferenceA(DataA), ReferenceB(DataB), ReferenceC(DataC);
  for (size_t i = 0; i < Iterations; i++) {
    for (size_t j = 0; j < Size; j++) {
      ReferenceA[j] += ModValue;
      ReferenceB[j] = ReferenceA[j];
      ReferenceB[j] -= ModValue;
      ReferenceC[j] = ReferenceB[j];
      ReferenceC[j] += ModValue;
    }
  }

  ext::oneapi::experimental::command_graph Graph{Queue.get_context(),
                                                 Queue.get_device()};

  T *PtrA = malloc_device<T>(Size, Queue);
  T *PtrB = malloc_device<T>(Size, Queue);
  T *PtrC = malloc_device<T>(Size, Queue);

  // L0 copy engine is disabled for D2D copies so we need to create additional
  // D2H copy events in between to invoke it.
  T *PtrBHost = malloc_host<T>(Size, Queue);
  T *PtrCHost = malloc_host<T>(Size, Queue);

  Queue.copy(DataA.data(), PtrA, Size);
  Queue.copy(DataB.data(), PtrB, Size);
  Queue.copy(DataC.data(), PtrC, Size);

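  // Record a linear chain of kernels interleaved with the D2H/H2D staging
  // copies described above; the host-visible copies are what invoke the
  // L0 copy engine inside the graph.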
  Graph.begin_recording(Queue);
  Queue.submit([&](handler &CGH) {
    CGH.parallel_for(range<1>(Size), [=](item<1> id) {
      auto LinID = id.get_linear_id();
      PtrA[LinID] += ModValue;
    });
  });

  Queue.submit(
      [&](handler &CGH) { CGH.memcpy(PtrBHost, PtrA, Size * sizeof(T)); });
  Queue.submit(
      [&](handler &CGH) { CGH.memcpy(PtrB, PtrBHost, Size * sizeof(T)); });

  Queue.submit([&](handler &CGH) {
    CGH.parallel_for(range<1>(Size), [=](item<1> id) {
      auto LinID = id.get_linear_id();
      PtrB[LinID] -= ModValue;
    });
  });

  Queue.submit(
      [&](handler &CGH) { CGH.memcpy(PtrCHost, PtrB, Size * sizeof(T)); });
  Queue.submit(
      [&](handler &CGH) { CGH.memcpy(PtrC, PtrCHost, Size * sizeof(T)); });

  Queue.submit([&](handler &CGH) {
    CGH.parallel_for(range<1>(Size), [=](item<1> id) {
      auto LinID = id.get_linear_id();
      PtrC[LinID] += ModValue;
    });
  });

  Graph.end_recording();

  auto GraphExec = Graph.finalize();

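  // Each execution of the graph applies the recorded kernel/copy chain once,
  // so after Iterations submissions the data must match the reference loop.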
  for (unsigned n = 0; n < Iterations; n++) {
    Queue.submit([&](handler &CGH) { CGH.ext_oneapi_graph(GraphExec); });
  }

  Queue.copy(PtrA, DataA.data(), Size);
  Queue.copy(PtrB, DataB.data(), Size);
  Queue.copy(PtrC, DataC.data(), Size);
  Queue.wait_and_throw();

  free(PtrA, Queue);
  free(PtrB, Queue);
  free(PtrC, Queue);
  // Also release the host staging buffers.
  free(PtrBHost, Queue);
  free(PtrCHost, Queue);

  for (size_t i = 0; i < Size; i++) {
    assert(check_value(i, ReferenceA[i], DataA[i], "DataA"));
    assert(check_value(i, ReferenceB[i], DataB[i], "DataB"));
    assert(check_value(i, ReferenceC[i], DataC[i], "DataC"));
  }
}