From 05982098f4708840e8b8d8daafacf8ad46198b68 Mon Sep 17 00:00:00 2001 From: Ben Tracy Date: Mon, 31 Jul 2023 16:59:24 +0100 Subject: [PATCH 1/8] [SYCL][Graph] Add error checking to make_edge (#264) - make_edge now checks for cycles - no_cycle_check property can now be passed to skip them - Various other error checks in make_edge - Generic depth first search mechanism added to graph_impl - New e2e tests for cycle checks - Unit tests for other basic errors - Prevent adding duplicate edges - Adds testing for the graph structure after a cycle error is caught to ensure it is unchanged. - Skip cycle checks when dst has no successors --- sycl/source/detail/graph_impl.cpp | 144 ++++++++++++++++++- sycl/source/detail/graph_impl.hpp | 52 ++++++- sycl/test-e2e/Graph/Explicit/cycle_error.cpp | 86 +++++++++++ sycl/unittests/Extensions/CommandGraph.cpp | 102 +++++++++++++ 4 files changed, 377 insertions(+), 7 deletions(-) create mode 100644 sycl/test-e2e/Graph/Explicit/cycle_error.cpp diff --git a/sycl/source/detail/graph_impl.cpp b/sycl/source/detail/graph_impl.cpp index 1a4f07285fe4b..f6f26ef0b049f 100644 --- a/sycl/source/detail/graph_impl.cpp +++ b/sycl/source/detail/graph_impl.cpp @@ -15,6 +15,8 @@ #include #include +#include + // Developer switch to use emulation mode on all backends, even those that // report native support, this is useful for debugging. #define FORCE_EMULATION_MODE 0 @@ -71,6 +73,40 @@ bool checkForRequirement(sycl::detail::AccessorImplHost *Req, } return SuccessorAddedDep; } + +/// Visits a node on the graph and it's successors recursively in a depth-first +/// approach. +/// @param[in] Node The current node being visited. +/// @param[in,out] VisitedNodes A set of unique nodes which have already been +/// visited. +/// @param[in] NodeStack Stack of nodes which are currently being visited on the +/// current path through the graph. +/// @param[in] NodeFunc The function object to be run on each node. A return +/// value of true indicates the search should be ended immediately and the +/// function will return. +/// @return True if the search should end immediately, false if not. +bool visitNodeDepthFirst( + std::shared_ptr Node, + std::set> &VisitedNodes, + std::deque> &NodeStack, + std::function &, + std::deque> &)> + NodeFunc) { + auto EarlyReturn = NodeFunc(Node, NodeStack); + if (EarlyReturn) { + return true; + } + NodeStack.push_back(Node); + Node->MVisited = true; + VisitedNodes.emplace(Node); + for (auto &Successor : Node->MSuccessors) { + if (visitNodeDepthFirst(Successor, VisitedNodes, NodeStack, NodeFunc)) { + return true; + } + } + NodeStack.pop_back(); + return false; +} } // anonymous namespace void exec_graph_impl::schedule() { @@ -226,6 +262,105 @@ bool graph_impl::clearQueues() { return AnyQueuesCleared; } +void graph_impl::searchDepthFirst( + std::function &, + std::deque> &)> + NodeFunc) { + // Track nodes visited during the search which can be used by NodeFunc in + // depth first search queries. Currently unusued but is an + // integral part of depth first searches. + std::set> VisitedNodes; + + for (auto &Root : MRoots) { + std::deque> NodeStack; + if (visitNodeDepthFirst(Root, VisitedNodes, NodeStack, NodeFunc)) { + break; + } + } + + // Reset the visited status of all nodes encountered in the search. + for (auto &Node : VisitedNodes) { + Node->MVisited = false; + } +} + +bool graph_impl::checkForCycles() { + // Using a depth-first search and checking if we vist a node more than once in + // the current path to identify if there are cycles. + bool CycleFound = false; + auto CheckFunc = [&](std::shared_ptr &Node, + std::deque> &NodeStack) { + // If the current node has previously been found in the current path through + // the graph then we have a cycle and we end the search early. + if (std::find(NodeStack.begin(), NodeStack.end(), Node) != + NodeStack.end()) { + CycleFound = true; + return true; + } + return false; + }; + searchDepthFirst(CheckFunc); + return CycleFound; +} + +void graph_impl::makeEdge(std::shared_ptr Src, + std::shared_ptr Dest) { + if (MRecordingQueues.size()) { + throw sycl::exception(make_error_code(sycl::errc::invalid), + "make_edge() cannot be called when a queue is " + "currently recording commands to a graph."); + } + if (Src == Dest) { + throw sycl::exception( + make_error_code(sycl::errc::invalid), + "make_edge() cannot be called when Src and Dest are the same."); + } + + bool SrcFound = false; + bool DestFound = false; + auto CheckForNodes = [&](std::shared_ptr &Node, + std::deque> &) { + if (Node == Src) { + SrcFound = true; + } + if (Node == Dest) { + DestFound = true; + } + return SrcFound && DestFound; + }; + + searchDepthFirst(CheckForNodes); + + if (!SrcFound) { + throw sycl::exception(make_error_code(sycl::errc::invalid), + "Src must be a node inside the graph."); + } + if (!DestFound) { + throw sycl::exception(make_error_code(sycl::errc::invalid), + "Dest must be a node inside the graph."); + } + + // We need to add the edges first before checking for cycles + Src->registerSuccessor(Dest, Src); + + // We can skip cycle checks if either Dest has no successors (cycle not + // possible) or cycle checks have been disabled with the no_cycle_check + // property; + if (Dest->MSuccessors.empty() || !MSkipCycleChecks) { + bool CycleFound = checkForCycles(); + + if (CycleFound) { + // Remove the added successor and predecessor + Src->MSuccessors.pop_back(); + Dest->MPredecessors.pop_back(); + + throw sycl::exception(make_error_code(sycl::errc::invalid), + "Command graphs cannot contain cycles."); + } + } + removeRoot(Dest); // remove receiver from root node list +} + // Check if nodes are empty and if so loop back through predecessors until we // find the real dependency. void exec_graph_impl::findRealDeps( @@ -463,8 +598,9 @@ exec_graph_impl::enqueue(const std::shared_ptr &Queue, modifiable_command_graph::modifiable_command_graph( const sycl::context &SyclContext, const sycl::device &SyclDevice, - const sycl::property_list &) - : impl(std::make_shared(SyclContext, SyclDevice)) {} + const sycl::property_list &PropList) + : impl(std::make_shared(SyclContext, SyclDevice, + PropList)) {} node modifiable_command_graph::addImpl(const std::vector &Deps) { std::vector> DepImpls; @@ -494,9 +630,7 @@ void modifiable_command_graph::make_edge(node &Src, node &Dest) { std::shared_ptr ReceiverImpl = sycl::detail::getSyclObjImpl(Dest); - SenderImpl->registerSuccessor(ReceiverImpl, - SenderImpl); // register successor - impl->removeRoot(ReceiverImpl); // remove receiver from root node list + impl->makeEdge(SenderImpl, ReceiverImpl); } command_graph diff --git a/sycl/source/detail/graph_impl.hpp b/sycl/source/detail/graph_impl.hpp index 5526aeaccef44..31313d500c392 100644 --- a/sycl/source/detail/graph_impl.hpp +++ b/sycl/source/detail/graph_impl.hpp @@ -17,6 +17,7 @@ #include #include +#include #include #include #include @@ -43,6 +44,9 @@ class node_impl { /// Command group object which stores all args etc needed to enqueue the node std::unique_ptr MCommandGroup; + /// Used for tracking visited status during cycle checks. + bool MVisited = false; + /// Add successor to the node. /// @param Node Node to add as a successor. /// @param Prev Predecessor to \p node being added as successor. @@ -51,6 +55,10 @@ class node_impl { /// use a raw \p this pointer, so the extra \Prev parameter is passed. void registerSuccessor(const std::shared_ptr &Node, const std::shared_ptr &Prev) { + if (std::find(MSuccessors.begin(), MSuccessors.end(), Node) != + MSuccessors.end()) { + return; + } MSuccessors.push_back(Node); Node->registerPredecessor(Prev); } @@ -58,6 +66,12 @@ class node_impl { /// Add predecessor to the node. /// @param Node Node to add as a predecessor. void registerPredecessor(const std::shared_ptr &Node) { + if (std::find_if(MPredecessors.begin(), MPredecessors.end(), + [&Node](const std::weak_ptr &Ptr) { + return Ptr.lock() == Node; + }) != MPredecessors.end()) { + return; + } MPredecessors.push_back(Node); } @@ -183,9 +197,15 @@ class graph_impl { /// Constructor. /// @param SyclContext Context to use for graph. /// @param SyclDevice Device to create nodes with. - graph_impl(const sycl::context &SyclContext, const sycl::device &SyclDevice) + /// @param PropList Optional list of properties. + graph_impl(const sycl::context &SyclContext, const sycl::device &SyclDevice, + const sycl::property_list &PropList = {}) : MContext(SyclContext), MDevice(SyclDevice), MRecordingQueues(), - MEventsMap(), MInorderQueueMap() {} + MEventsMap(), MInorderQueueMap() { + if (PropList.has_property()) { + MSkipCycleChecks = true; + } + } /// Insert node into list of root nodes. /// @param Root Node to add to list of root nodes. @@ -315,7 +335,32 @@ class graph_impl { MInorderQueueMap[QueueWeakPtr] = Node; } + /// Make an edge between two nodes in the graph. Performs some mandatory + /// error checks as well as an optional check for cycles introduced by making + /// this edge. + /// @param Src The source of the new edge. + /// @param Dest The destination of the new edge. + void makeEdge(std::shared_ptr Src, + std::shared_ptr Dest); + private: + /// Iterate over the graph depth-first and run \p NodeFunc on each node. + /// @param NodeFunc A function which receives as input a node in the graph to + /// perform operations on as well as the stack of nodes encountered in the + /// current path. The return value of this function determines whether an + /// early exit is triggered, if true the depth-first search will end + /// immediately and no further nodes will be visited. + void + searchDepthFirst(std::function &, + std::deque> &)> + NodeFunc); + + /// Check the graph for cycles by performing a depth-first search of the + /// graph. If a node is visited more than once in a given path through the + /// graph, a cycle is present and the search ends immediately. + /// @return True if a cycle is detected, false if not. + bool checkForCycles(); + /// Context associated with this graph. sycl::context MContext; /// Device associated with this graph. All graph nodes will execute on this @@ -333,6 +378,9 @@ class graph_impl { std::map, std::shared_ptr, std::owner_less>> MInorderQueueMap; + /// Controls whether we skip the cycle checks in makeEdge, set by the presence + /// of the no_cycle_check property on construction. + bool MSkipCycleChecks = false; }; /// Class representing the implementation of command_graph. diff --git a/sycl/test-e2e/Graph/Explicit/cycle_error.cpp b/sycl/test-e2e/Graph/Explicit/cycle_error.cpp new file mode 100644 index 0000000000000..2ca29aa67b9cf --- /dev/null +++ b/sycl/test-e2e/Graph/Explicit/cycle_error.cpp @@ -0,0 +1,86 @@ +// REQUIRES: level_zero, gpu +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out + +// Tests that introducing a cycle to the graph will throw when +// property::graph::no_cycle_check is not passed to the graph constructor and +// will not throw when it is. + +#include "../graph_common.hpp" + +void CreateGraphWithCyclesTest(bool DisableCycleChecks) { + + // If we are testing without cycle checks we need to do multiple iterations so + // we can test multiple types of cycle, since introducing a cycle with no + // checks may put the graph into an undefined state. + const size_t Iterations = DisableCycleChecks ? 2 : 1; + + queue Queue; + + property_list Props; + + if (DisableCycleChecks) { + Props = {ext::oneapi::experimental::property::graph::no_cycle_check{}}; + } + + for (size_t i = 0; i < Iterations; i++) { + ext::oneapi::experimental::command_graph Graph{Queue.get_context(), + Queue.get_device(), Props}; + + auto NodeA = Graph.add([&](sycl::handler &CGH) { + CGH.single_task([=]() {}); + }); + auto NodeB = Graph.add([&](sycl::handler &CGH) { + CGH.single_task([=]() {}); + }); + auto NodeC = Graph.add([&](sycl::handler &CGH) { + CGH.single_task([=]() {}); + }); + + // Make normal edges + std::error_code ErrorCode = sycl::make_error_code(sycl::errc::success); + try { + Graph.make_edge(NodeA, NodeB); + Graph.make_edge(NodeB, NodeC); + } catch (const sycl::exception &e) { + ErrorCode = e.code(); + } + + assert(ErrorCode == sycl::errc::success); + + // Introduce cycles to the graph. If we are performing cycle checks we can + // test both cycles, if they are disabled we need to test one per iteration. + if (i == 0 || !DisableCycleChecks) { + ErrorCode = sycl::make_error_code(sycl::errc::success); + try { + Graph.make_edge(NodeC, NodeA); + } catch (const sycl::exception &e) { + ErrorCode = e.code(); + } + + assert(ErrorCode == + (DisableCycleChecks ? sycl::errc::success : sycl::errc::invalid)); + } + + if (i == 1 || !DisableCycleChecks) { + ErrorCode = sycl::make_error_code(sycl::errc::success); + try { + Graph.make_edge(NodeC, NodeB); + } catch (const sycl::exception &e) { + ErrorCode = e.code(); + } + + assert(ErrorCode == + (DisableCycleChecks ? sycl::errc::success : sycl::errc::invalid)); + } + } +} + +int main() { + // Test with cycle checks + CreateGraphWithCyclesTest(false); + // Test without cycle checks + CreateGraphWithCyclesTest(true); + + return 0; +} diff --git a/sycl/unittests/Extensions/CommandGraph.cpp b/sycl/unittests/Extensions/CommandGraph.cpp index 590d86b8e0019..574cbc7bebf37 100644 --- a/sycl/unittests/Extensions/CommandGraph.cpp +++ b/sycl/unittests/Extensions/CommandGraph.cpp @@ -606,3 +606,105 @@ TEST_F(CommandGraphTest, InOrderQueueWithEmptyLast) { ASSERT_EQ(*ScheduleIt, PtrNode2); ASSERT_EQ(InOrderQueue.get_context(), GraphExecImpl->getContext()); } + +TEST_F(CommandGraphTest, MakeEdgeErrors) { + // Set up some nodes in the graph + auto NodeA = Graph.add( + [&](sycl::handler &cgh) { cgh.single_task([]() {}); }); + auto NodeB = Graph.add( + [&](sycl::handler &cgh) { cgh.single_task([]() {}); }); + + // Test error on calling make_edge when a queue is recording to the graph + Graph.begin_recording(Queue); + ASSERT_THROW( + { + try { + Graph.make_edge(NodeA, NodeB); + } catch (const sycl::exception &e) { + ASSERT_EQ(e.code(), make_error_code(sycl::errc::invalid)); + throw; + } + }, + sycl::exception); + + Graph.end_recording(Queue); + + // Test error on Src and Dest being the same + ASSERT_THROW( + { + try { + Graph.make_edge(NodeA, NodeA); + } catch (const sycl::exception &e) { + ASSERT_EQ(e.code(), make_error_code(sycl::errc::invalid)); + throw; + } + }, + sycl::exception); + + // Test Src or Dest not being found in the graph + experimental::command_graph GraphOther{ + Queue.get_context(), Queue.get_device()}; + auto NodeOther = GraphOther.add( + [&](sycl::handler &cgh) { cgh.single_task([]() {}); }); + + ASSERT_THROW( + { + try { + Graph.make_edge(NodeA, NodeOther); + } catch (const sycl::exception &e) { + ASSERT_EQ(e.code(), make_error_code(sycl::errc::invalid)); + throw; + } + }, + sycl::exception); + ASSERT_THROW( + { + try { + Graph.make_edge(NodeOther, NodeB); + } catch (const sycl::exception &e) { + ASSERT_EQ(e.code(), make_error_code(sycl::errc::invalid)); + throw; + } + }, + sycl::exception); + + // Test that adding a cycle with cycle checks leaves the graph in the correct + // state. + + auto CheckGraphStructure = [&]() { + auto GraphImpl = sycl::detail::getSyclObjImpl(Graph); + auto NodeAImpl = sycl::detail::getSyclObjImpl(NodeA); + auto NodeBImpl = sycl::detail::getSyclObjImpl(NodeB); + + ASSERT_EQ(GraphImpl->MRoots.size(), 1lu); + ASSERT_EQ(*(GraphImpl->MRoots.begin()), NodeAImpl); + + ASSERT_EQ(NodeAImpl->MSuccessors.size(), 1lu); + ASSERT_EQ(NodeAImpl->MPredecessors.size(), 0lu); + ASSERT_EQ(NodeAImpl->MSuccessors.front(), NodeBImpl); + + ASSERT_EQ(NodeBImpl->MSuccessors.size(), 0lu); + ASSERT_EQ(NodeBImpl->MPredecessors.size(), 1lu); + ASSERT_EQ(NodeBImpl->MPredecessors.front().lock(), NodeAImpl); + }; + // Make a normal edge + ASSERT_NO_THROW(Graph.make_edge(NodeA, NodeB)); + + // Check the expected structure of the graph + CheckGraphStructure(); + + // Introduce a cycle, make sure it throws + ASSERT_THROW( + { + try { + Graph.make_edge(NodeB, NodeA); + } catch (const sycl::exception &e) { + ASSERT_EQ(e.code(), make_error_code(sycl::errc::invalid)); + throw; + } + }, + sycl::exception); + + // Re-check graph structure to make sure the graph state has not been modified + CheckGraphStructure(); +} From 99e69449c6de9fc74dd96b09fb729cf9772446d6 Mon Sep 17 00:00:00 2001 From: Ben Tracy Date: Mon, 31 Jul 2023 16:59:40 +0100 Subject: [PATCH 2/8] [SYCL][Graph] Error when immediate command lists are used (#277) - Error when we detected immediate command lists - Throws exception with sycl::invalid - Test which uses property::queue::immediate_command_list to test errors. --- .../ur/adapters/level_zero/command_buffer.cpp | 6 +++ sycl/source/detail/graph_impl.cpp | 9 +++- .../Graph/immediate_command_list_error.cpp | 47 +++++++++++++++++++ 3 files changed, 61 insertions(+), 1 deletion(-) create mode 100644 sycl/test-e2e/Graph/immediate_command_list_error.cpp diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/command_buffer.cpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/command_buffer.cpp index edf1f7c81663f..509cc5c4ff418 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/command_buffer.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/command_buffer.cpp @@ -661,6 +661,12 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferEnqueueExp( ur_exp_command_buffer_handle_t CommandBuffer, ur_queue_handle_t Queue, uint32_t NumEventsInWaitList, const ur_event_handle_t *EventWaitList, ur_event_handle_t *Event) { + // There are issues with immediate command lists so return an error if the + // queue is in that mode. + if (Queue->UsingImmCmdLists) { + return UR_RESULT_ERROR_INVALID_QUEUE_PROPERTIES; + } + std::scoped_lock lock(Queue->Mutex); // Use compute engine rather than copy engine const auto UseCopyEngine = false; diff --git a/sycl/source/detail/graph_impl.cpp b/sycl/source/detail/graph_impl.cpp index f6f26ef0b049f..05c34263f803c 100644 --- a/sycl/source/detail/graph_impl.cpp +++ b/sycl/source/detail/graph_impl.cpp @@ -528,7 +528,14 @@ exec_graph_impl::enqueue(const std::shared_ptr &Queue, ->call_nocheck< sycl::detail::PiApiKind::piextEnqueueCommandBuffer>( CommandBuffer, Queue->getHandleRef(), 0, nullptr, OutEvent); - if (Res != pi_result::PI_SUCCESS) { + if (Res == pi_result::PI_ERROR_INVALID_QUEUE_PROPERTIES) { + throw sycl::exception( + make_error_code(errc::invalid), + "Graphs cannot be submitted to a queue which uses " + "immediate command lists. Use " + "sycl::ext::intel::property::queue::no_immediate_" + "command_list to disable them."); + } else if (Res != pi_result::PI_SUCCESS) { throw sycl::exception( errc::event, "Failed to enqueue event for command buffer submission"); diff --git a/sycl/test-e2e/Graph/immediate_command_list_error.cpp b/sycl/test-e2e/Graph/immediate_command_list_error.cpp new file mode 100644 index 0000000000000..bad3fac48007c --- /dev/null +++ b/sycl/test-e2e/Graph/immediate_command_list_error.cpp @@ -0,0 +1,47 @@ +// REQUIRES: level_zero, gpu +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// RUN: %if ext_oneapi_level_zero %{env ZE_DEBUG=4 %{run} %t.out 2>&1 | FileCheck %s %} +// +// CHECK-NOT: LEAK + +// Tests that graph submission will throw if the target queue is using immediate +// command lists and not throw if they are using regular command queues. + +#include "graph_common.hpp" + +int main() { + queue QueueImmediate{ + {sycl::ext::intel::property::queue::immediate_command_list{}}}; + queue QueueNoImmediate{ + QueueImmediate.get_context(), + QueueImmediate.get_device(), + {sycl::ext::intel::property::queue::no_immediate_command_list{}}}; + + exp_ext::command_graph Graph{QueueNoImmediate.get_context(), + QueueNoImmediate.get_device()}; + + std::error_code ErrorCode = make_error_code(sycl::errc::success); + try { + auto GraphExec = Graph.finalize(); + QueueNoImmediate.submit( + [&](handler &CGH) { CGH.ext_oneapi_graph(GraphExec); }); + } catch (sycl::exception &E) { + ErrorCode = E.code(); + } + + assert(ErrorCode == make_error_code(errc::success)); + + ErrorCode = make_error_code(sycl::errc::success); + try { + auto GraphExec = Graph.finalize(); + QueueImmediate.submit( + [&](handler &CGH) { CGH.ext_oneapi_graph(GraphExec); }); + } catch (sycl::exception &E) { + ErrorCode = E.code(); + } + + assert(ErrorCode == make_error_code(errc::invalid)); + + return 0; +} From 4b980d000e53cf2a4329cee80cc3718aa16efa64 Mon Sep 17 00:00:00 2001 From: Maxime France-Pillois Date: Thu, 3 Aug 2023 11:10:10 +0100 Subject: [PATCH 3/8] [SYCL][Graph] Throw exception when explicit add called on a graph recording a queue (#283) Factorizes the exception throwing method when the explicit API is used on a graph recording a queue Improves the test while_recording to test throwing an invalid exception for the two explicit graph:add entry points. Addresses issue #271 * Update sycl/source/detail/graph_impl.hpp Co-authored-by: Ben Tracy * Update sycl/test-e2e/Graph/Explicit/while_recording.cpp Co-authored-by: Ben Tracy * Update sycl/source/detail/graph_impl.hpp Co-authored-by: Ewan Crawford --------- Co-authored-by: Ben Tracy Co-authored-by: Ewan Crawford --- sycl/source/detail/graph_impl.cpp | 8 +++----- sycl/source/detail/graph_impl.hpp | 12 ++++++++++++ .../Graph/Explicit/add_node_while_recording.cpp | 14 ++++++++++---- 3 files changed, 25 insertions(+), 9 deletions(-) diff --git a/sycl/source/detail/graph_impl.cpp b/sycl/source/detail/graph_impl.cpp index 05c34263f803c..6fa7d0d6fdf07 100644 --- a/sycl/source/detail/graph_impl.cpp +++ b/sycl/source/detail/graph_impl.cpp @@ -305,11 +305,7 @@ bool graph_impl::checkForCycles() { void graph_impl::makeEdge(std::shared_ptr Src, std::shared_ptr Dest) { - if (MRecordingQueues.size()) { - throw sycl::exception(make_error_code(sycl::errc::invalid), - "make_edge() cannot be called when a queue is " - "currently recording commands to a graph."); - } + throwIfGraphRecordingQueue("make_edge()"); if (Src == Dest) { throw sycl::exception( make_error_code(sycl::errc::invalid), @@ -610,6 +606,7 @@ modifiable_command_graph::modifiable_command_graph( PropList)) {} node modifiable_command_graph::addImpl(const std::vector &Deps) { + impl->throwIfGraphRecordingQueue("Explicit API \"Add()\" function"); std::vector> DepImpls; for (auto &D : Deps) { DepImpls.push_back(sycl::detail::getSyclObjImpl(D)); @@ -621,6 +618,7 @@ node modifiable_command_graph::addImpl(const std::vector &Deps) { node modifiable_command_graph::addImpl(std::function CGF, const std::vector &Deps) { + impl->throwIfGraphRecordingQueue("Explicit API \"Add()\" function"); std::vector> DepImpls; for (auto &D : Deps) { DepImpls.push_back(sycl::detail::getSyclObjImpl(D)); diff --git a/sycl/source/detail/graph_impl.hpp b/sycl/source/detail/graph_impl.hpp index 31313d500c392..184ca326e0e76 100644 --- a/sycl/source/detail/graph_impl.hpp +++ b/sycl/source/detail/graph_impl.hpp @@ -343,6 +343,18 @@ class graph_impl { void makeEdge(std::shared_ptr Src, std::shared_ptr Dest); + /// Throws an invalid exception if this function is called + /// while a queue is recording commands to the graph. + /// @param ExceptionMsg Message to append to the exception message + void throwIfGraphRecordingQueue(const std::string ExceptionMsg) const { + if (MRecordingQueues.size()) { + throw sycl::exception(make_error_code(sycl::errc::invalid), + ExceptionMsg + + " cannot be called when a queue " + "is currently recording commands to a graph."); + } + } + private: /// Iterate over the graph depth-first and run \p NodeFunc on each node. /// @param NodeFunc A function which receives as input a node in the graph to diff --git a/sycl/test-e2e/Graph/Explicit/add_node_while_recording.cpp b/sycl/test-e2e/Graph/Explicit/add_node_while_recording.cpp index 2f5376a3a536c..44901e3fb452c 100644 --- a/sycl/test-e2e/Graph/Explicit/add_node_while_recording.cpp +++ b/sycl/test-e2e/Graph/Explicit/add_node_while_recording.cpp @@ -6,9 +6,6 @@ // // CHECK-NOT: LEAK -// Expected Fail as exception not implemented yet -// XFAIL: * - // Tests attempting to add a node to a command_graph while it is being // recorded to by a queue is an error. @@ -30,8 +27,17 @@ int main() { Success = true; } } + assert(Success); - Graph.end_recording(); + Success = false; + try { + Graph.add({}); + } catch (sycl::exception &E) { + auto StdErrc = E.code().value(); + Success = (StdErrc == static_cast(errc::invalid)); + } assert(Success); + + Graph.end_recording(); return 0; } From 8922f41d1e8e8e557f577e8dadc160af3b148be3 Mon Sep 17 00:00:00 2001 From: Maxime France-Pillois Date: Thu, 3 Aug 2023 15:32:19 +0100 Subject: [PATCH 4/8] [SYCL][Graph] Implement exceptions for incompatible extensions (#276) * [SYCL][Graph] Implement exceptions for incompatible extensions Throws an invalid exception when trying to use the following extensions along with Graph. - sycl_ext_oneapi_enqueue_barrier - sycl_ext_oneapi_memcpy2d - sycl_ext_codeplay_kernel_fusion - sycl_ext_oneapi_kernel_properties - sycl_ext_oneapi_device_global Closes Issue: #154 * [SYCL][Graph] Implement exceptions for incompatible extensions Adds info to exception message Moves tests from e2e to unitests when possible * [SYCL][Graph] Implement exceptions for incompatible extensions Corrects some typos and adds comments. * [SYCL][Graph] Implement exceptions for incompatible extensions Used a template function to throw exception instead of a parametrized function. * [SYCL][Graph] Implement exceptions for incompatible extensions Moves Sycl-extension enum definition. Limits graph recording to non-explicit path in the new tests. * [SYCL][Graph] Implement exceptions for incompatible extensions Updates Linux ABI dump file with the new handler function throwing exception. --- sycl/include/sycl/handler.hpp | 44 ++- .../detail/fusion/fusion_wrapper_impl.cpp | 5 + sycl/source/detail/graph_impl.cpp | 6 + sycl/source/handler.cpp | 46 +++ ...raph_exception_global_device_extension.cpp | 150 +++++++ sycl/test/abi/sycl_symbols_linux.dump | 4 + sycl/unittests/Extensions/CommandGraph.cpp | 366 ++++++++++++++++++ 7 files changed, 620 insertions(+), 1 deletion(-) create mode 100644 sycl/test-e2e/Graph/graph_exception_global_device_extension.cpp diff --git a/sycl/include/sycl/handler.hpp b/sycl/include/sycl/handler.hpp index 862e7ffcb5a24..b4599bc6e18d9 100644 --- a/sycl/include/sycl/handler.hpp +++ b/sycl/include/sycl/handler.hpp @@ -110,8 +110,19 @@ class pipe; } namespace ext::oneapi::experimental::detail { +// List of sycl experimental extensions +// This enum is used to define the extension from which a function is called. +// This is used in handler::throwIfGraphAssociated() to specify +// the message of the thrown expection. +enum SyclExtensions { + sycl_ext_oneapi_kernel_properties, + sycl_ext_oneapi_enqueue_barrier, + sycl_ext_oneapi_memcpy2d, + sycl_ext_oneapi_device_global +}; + class graph_impl; -} +} // namespace ext::oneapi::experimental::detail namespace detail { class handler_impl; @@ -2085,6 +2096,7 @@ class __SYCL_EXPORT handler { std::enable_if_t< ext::oneapi::experimental::is_property_list::value> single_task(PropertiesT Props, _KERNELFUNCPARAM(KernelFunc)) { + throwIfGraphAssociatedAndKernelProperties(); single_task_lambda_impl(Props, KernelFunc); } @@ -2095,6 +2107,7 @@ class __SYCL_EXPORT handler { ext::oneapi::experimental::is_property_list::value> parallel_for(range<1> NumWorkItems, PropertiesT Props, _KERNELFUNCPARAM(KernelFunc)) { + throwIfGraphAssociatedAndKernelProperties(); parallel_for_lambda_impl( NumWorkItems, Props, std::move(KernelFunc)); } @@ -2105,6 +2118,7 @@ class __SYCL_EXPORT handler { ext::oneapi::experimental::is_property_list::value> parallel_for(range<2> NumWorkItems, PropertiesT Props, _KERNELFUNCPARAM(KernelFunc)) { + throwIfGraphAssociatedAndKernelProperties(); parallel_for_lambda_impl( NumWorkItems, Props, std::move(KernelFunc)); } @@ -2115,6 +2129,7 @@ class __SYCL_EXPORT handler { ext::oneapi::experimental::is_property_list::value> parallel_for(range<3> NumWorkItems, PropertiesT Props, _KERNELFUNCPARAM(KernelFunc)) { + throwIfGraphAssociatedAndKernelProperties(); parallel_for_lambda_impl( NumWorkItems, Props, std::move(KernelFunc)); } @@ -2125,6 +2140,7 @@ class __SYCL_EXPORT handler { ext::oneapi::experimental::is_property_list::value> parallel_for(nd_range Range, PropertiesT Properties, _KERNELFUNCPARAM(KernelFunc)) { + throwIfGraphAssociatedAndKernelProperties(); parallel_for_impl(Range, Properties, std::move(KernelFunc)); } @@ -2137,6 +2153,7 @@ class __SYCL_EXPORT handler { detail::AreAllButLastReductions::value && ext::oneapi::experimental::is_property_list::value> parallel_for(range<1> Range, PropertiesT Properties, RestT &&...Rest) { + throwIfGraphAssociatedAndKernelProperties(); detail::reduction_parallel_for(*this, Range, Properties, std::forward(Rest)...); } @@ -2148,6 +2165,7 @@ class __SYCL_EXPORT handler { detail::AreAllButLastReductions::value && ext::oneapi::experimental::is_property_list::value> parallel_for(range<2> Range, PropertiesT Properties, RestT &&...Rest) { + throwIfGraphAssociatedAndKernelProperties(); detail::reduction_parallel_for(*this, Range, Properties, std::forward(Rest)...); } @@ -2159,6 +2177,7 @@ class __SYCL_EXPORT handler { detail::AreAllButLastReductions::value && ext::oneapi::experimental::is_property_list::value> parallel_for(range<3> Range, PropertiesT Properties, RestT &&...Rest) { + throwIfGraphAssociatedAndKernelProperties(); detail::reduction_parallel_for(*this, Range, Properties, std::forward(Rest)...); } @@ -2213,6 +2232,7 @@ class __SYCL_EXPORT handler { int Dims, typename PropertiesT> void parallel_for_work_group(range NumWorkGroups, PropertiesT Props, _KERNELFUNCPARAM(KernelFunc)) { + throwIfGraphAssociatedAndKernelProperties(); parallel_for_work_group_lambda_impl(NumWorkGroups, Props, KernelFunc); @@ -2223,6 +2243,7 @@ class __SYCL_EXPORT handler { void parallel_for_work_group(range NumWorkGroups, range WorkGroupSize, PropertiesT Props, _KERNELFUNCPARAM(KernelFunc)) { + throwIfGraphAssociatedAndKernelProperties(); parallel_for_work_group_lambda_impl( NumWorkGroups, WorkGroupSize, Props, KernelFunc); @@ -2530,6 +2551,8 @@ class __SYCL_EXPORT handler { /// until all commands previously submitted to this queue have entered the /// complete state. void ext_oneapi_barrier() { + throwIfGraphAssociated(); throwIfActionIsCreated(); setType(detail::CG::Barrier); } @@ -2615,6 +2638,8 @@ class __SYCL_EXPORT handler { typename = std::enable_if_t>> void ext_oneapi_memcpy2d(void *Dest, size_t DestPitch, const void *Src, size_t SrcPitch, size_t Width, size_t Height) { + throwIfGraphAssociated(); throwIfActionIsCreated(); if (Width > DestPitch) throw sycl::exception(sycl::make_error_code(errc::invalid), @@ -2793,6 +2818,8 @@ class __SYCL_EXPORT handler { void memcpy(ext::oneapi::experimental::device_global &Dest, const void *Src, size_t NumBytes = sizeof(T), size_t DestOffset = 0) { + throwIfGraphAssociated(); if (sizeof(T) < DestOffset + NumBytes) throw sycl::exception(make_error_code(errc::invalid), "Copy to device_global is out of bounds."); @@ -2825,6 +2852,8 @@ class __SYCL_EXPORT handler { memcpy(void *Dest, const ext::oneapi::experimental::device_global &Src, size_t NumBytes = sizeof(T), size_t SrcOffset = 0) { + throwIfGraphAssociated(); if (sizeof(T) < SrcOffset + NumBytes) throw sycl::exception(make_error_code(errc::invalid), "Copy from device_global is out of bounds."); @@ -3346,8 +3375,21 @@ class __SYCL_EXPORT handler { "handler::require() before it can be used."); } + template + std::enable_if_t< + ext::oneapi::experimental::is_property_list::value> + throwIfGraphAssociatedAndKernelProperties() { + if (!std::is_same_v) + throwIfGraphAssociated(); + } + // Set value of the gpu cache configuration for the kernel. void setKernelCacheConfig(sycl::detail::pi::PiKernelCacheConfig); + + template + void throwIfGraphAssociated(); }; } // namespace _V1 } // namespace sycl diff --git a/sycl/source/detail/fusion/fusion_wrapper_impl.cpp b/sycl/source/detail/fusion/fusion_wrapper_impl.cpp index 492b0bc4aa852..d846b018ab64c 100644 --- a/sycl/source/detail/fusion/fusion_wrapper_impl.cpp +++ b/sycl/source/detail/fusion/fusion_wrapper_impl.cpp @@ -27,6 +27,11 @@ bool fusion_wrapper_impl::is_in_fusion_mode() const { } void fusion_wrapper_impl::start_fusion() { + if (MQueue->getCommandGraph()) { + throw sycl::exception(sycl::make_error_code(errc::invalid), + "SYCL kernel fusion can NOT be started " + "on a queue that is in a recording state."); + } detail::Scheduler::getInstance().startFusion(MQueue); } diff --git a/sycl/source/detail/graph_impl.cpp b/sycl/source/detail/graph_impl.cpp index 6fa7d0d6fdf07..547373c515e79 100644 --- a/sycl/source/detail/graph_impl.cpp +++ b/sycl/source/detail/graph_impl.cpp @@ -647,6 +647,12 @@ modifiable_command_graph::finalize(const sycl::property_list &) const { bool modifiable_command_graph::begin_recording(queue &RecordingQueue) { auto QueueImpl = sycl::detail::getSyclObjImpl(RecordingQueue); + if (QueueImpl->is_in_fusion_mode()) { + throw sycl::exception(sycl::make_error_code(errc::invalid), + "SYCL queue in kernel in fusion mode " + "can NOT be recorded."); + } + if (QueueImpl->get_context() != impl->getContext()) { throw sycl::exception(sycl::make_error_code(errc::invalid), "begin_recording called for a queue whose context " diff --git a/sycl/source/handler.cpp b/sycl/source/handler.cpp index 95db5c5eb66af..f12478429d1f2 100644 --- a/sycl/source/handler.cpp +++ b/sycl/source/handler.cpp @@ -794,6 +794,8 @@ void handler::verifyUsedKernelBundle(const std::string &KernelName) { } void handler::ext_oneapi_barrier(const std::vector &WaitList) { + throwIfGraphAssociated(); throwIfActionIsCreated(); MCGType = detail::CG::BarrierWaitlist; MEventsWaitWithBarrier.resize(WaitList.size()); @@ -1338,5 +1340,49 @@ handler::getCommandGraph() const { return MQueue->getCommandGraph(); } +template void handler::throwIfGraphAssociated< + ext::oneapi::experimental::detail::SyclExtensions:: + sycl_ext_oneapi_kernel_properties>(); +template void handler::throwIfGraphAssociated< + ext::oneapi::experimental::detail::SyclExtensions:: + sycl_ext_oneapi_enqueue_barrier>(); +template void +handler::throwIfGraphAssociated(); +template void handler::throwIfGraphAssociated< + ext::oneapi::experimental::detail::SyclExtensions:: + sycl_ext_oneapi_device_global>(); + +template +void handler::throwIfGraphAssociated() { + std::string ExceptionMsg = ""; + + if constexpr (ExtensionT == + ext::oneapi::experimental::detail::SyclExtensions:: + sycl_ext_oneapi_kernel_properties) { + ExceptionMsg = "sycl_ext_oneapi_kernel_properties"; + } + if constexpr (ExtensionT == + ext::oneapi::experimental::detail::SyclExtensions:: + sycl_ext_oneapi_enqueue_barrier) { + ExceptionMsg = "sycl_ext_oneapi_enqueue_barrier"; + } + if constexpr (ExtensionT == ext::oneapi::experimental::detail:: + SyclExtensions::sycl_ext_oneapi_memcpy2d) { + ExceptionMsg = "sycl_ext_oneapi_memcpy2d"; + } + if constexpr (ExtensionT == + ext::oneapi::experimental::detail::SyclExtensions:: + sycl_ext_oneapi_device_global) { + ExceptionMsg = "sycl_ext_oneapi_device_global"; + } + + if (MGraph || MQueue->getCommandGraph()) { + throw sycl::exception(sycl::make_error_code(errc::invalid), + "The feature " + ExceptionMsg + + " is not yet available " + "along with SYCL Graph extension."); + } +} } // namespace _V1 } // namespace sycl diff --git a/sycl/test-e2e/Graph/graph_exception_global_device_extension.cpp b/sycl/test-e2e/Graph/graph_exception_global_device_extension.cpp new file mode 100644 index 0000000000000..e674beec4693f --- /dev/null +++ b/sycl/test-e2e/Graph/graph_exception_global_device_extension.cpp @@ -0,0 +1,150 @@ +// REQUIRES: level_zero, gpu +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// +// The test checks that invalid exception is thrown +// when trying to use sycl_ext_oneapi_device_global +// along with Graph. + +#include "graph_common.hpp" + +using TestProperties = decltype(sycl::ext::oneapi::experimental::properties{}); + +sycl::ext::oneapi::experimental::device_global + MemcpyDeviceGlobal; +sycl::ext::oneapi::experimental::device_global + CopyDeviceGlobal; + +enum OperationPath { Explicit, RecordReplay, Shortcut }; + +template void test() { + queue Q; + int MemcpyWrite = 42, CopyWrite = 24, MemcpyRead = 1, CopyRead = 2; + + exp_ext::command_graph Graph{Q.get_context(), Q.get_device()}; + + if constexpr (PathKind != OperationPath::Explicit) { + Graph.begin_recording(Q); + } + + // Copy from device globals before having written anything. + std::error_code ExceptionCode = make_error_code(sycl::errc::success); + try { + if constexpr (PathKind == OperationPath::Shortcut) { + Q.memcpy(&MemcpyRead, MemcpyDeviceGlobal); + } + if constexpr (PathKind == OperationPath::RecordReplay) { + Q.submit([&](handler &CGH) { + return CGH.memcpy(&MemcpyRead, MemcpyDeviceGlobal); + }); + } + if constexpr (PathKind == OperationPath::Explicit) { + Graph.add([&](handler &CGH) { + return CGH.memcpy(&MemcpyRead, MemcpyDeviceGlobal); + }); + } + } catch (exception &Exception) { + ExceptionCode = Exception.code(); + } + assert(ExceptionCode == sycl::errc::invalid); + + ExceptionCode = make_error_code(sycl::errc::success); + try { + if constexpr (PathKind == OperationPath::Shortcut) { + Q.copy(CopyDeviceGlobal, &CopyRead); + } + if constexpr (PathKind == OperationPath::RecordReplay) { + Q.submit( + [&](handler &CGH) { return CGH.copy(CopyDeviceGlobal, &CopyRead); }); + } + if constexpr (PathKind == OperationPath::Explicit) { + Graph.add( + [&](handler &CGH) { return CGH.copy(CopyDeviceGlobal, &CopyRead); }); + } + } catch (exception &Exception) { + ExceptionCode = Exception.code(); + } + assert(ExceptionCode == sycl::errc::invalid); + + // Write to device globals and then read their values. + ExceptionCode = make_error_code(sycl::errc::success); + try { + if constexpr (PathKind == OperationPath::Shortcut) { + Q.memcpy(MemcpyDeviceGlobal, &MemcpyWrite); + } + if constexpr (PathKind == OperationPath::RecordReplay) { + Q.submit([&](handler &CGH) { + return CGH.memcpy(MemcpyDeviceGlobal, &MemcpyWrite); + }); + } + if constexpr (PathKind == OperationPath::Explicit) { + Graph.add([&](handler &CGH) { + return CGH.memcpy(MemcpyDeviceGlobal, &MemcpyWrite); + }); + } + } catch (exception &Exception) { + ExceptionCode = Exception.code(); + } + assert(ExceptionCode == sycl::errc::invalid); + + ExceptionCode = make_error_code(sycl::errc::success); + try { + if constexpr (PathKind == OperationPath::Shortcut) { + Q.copy(&CopyWrite, CopyDeviceGlobal); + } else if constexpr (PathKind == OperationPath::RecordReplay) { + Q.submit( + [&](handler &CGH) { return CGH.copy(&CopyWrite, CopyDeviceGlobal); }); + } else if constexpr (PathKind == OperationPath::Explicit) { + Graph.add( + [&](handler &CGH) { return CGH.copy(&CopyWrite, CopyDeviceGlobal); }); + } + } catch (exception &Exception) { + ExceptionCode = Exception.code(); + } + assert(ExceptionCode == sycl::errc::invalid); + + ExceptionCode = make_error_code(sycl::errc::success); + try { + if constexpr (PathKind == OperationPath::Shortcut) { + Q.memcpy(&MemcpyRead, MemcpyDeviceGlobal); + } else if constexpr (PathKind == OperationPath::RecordReplay) { + Q.submit([&](handler &CGH) { + return CGH.memcpy(&MemcpyRead, MemcpyDeviceGlobal); + }); + } else if constexpr (PathKind == OperationPath::Explicit) { + Graph.add([&](handler &CGH) { + return CGH.memcpy(&MemcpyRead, MemcpyDeviceGlobal); + }); + } + } catch (exception &Exception) { + ExceptionCode = Exception.code(); + } + assert(ExceptionCode == sycl::errc::invalid); + + ExceptionCode = make_error_code(sycl::errc::success); + try { + if constexpr (PathKind == OperationPath::Shortcut) { + Q.copy(CopyDeviceGlobal, &CopyRead); + } else if constexpr (PathKind == OperationPath::RecordReplay) { + Q.submit( + [&](handler &CGH) { return CGH.copy(CopyDeviceGlobal, &CopyRead); }); + } else if constexpr (PathKind == OperationPath::Explicit) { + Graph.add( + [&](handler &CGH) { return CGH.copy(CopyDeviceGlobal, &CopyRead); }); + } + } catch (exception &Exception) { + ExceptionCode = Exception.code(); + } + assert(ExceptionCode == sycl::errc::invalid); + + if constexpr (PathKind != OperationPath::Explicit) { + Graph.end_recording(); + } +} + +int main() { + test(); + test(); + test(); + return 0; +} diff --git a/sycl/test/abi/sycl_symbols_linux.dump b/sycl/test/abi/sycl_symbols_linux.dump index c5b1a56dae6d4..d617ffbc1633e 100644 --- a/sycl/test/abi/sycl_symbols_linux.dump +++ b/sycl/test/abi/sycl_symbols_linux.dump @@ -4098,6 +4098,10 @@ _ZN4sycl3_V17handler22memcpyFromDeviceGlobalEPvPKvbmm _ZN4sycl3_V17handler22setHandlerKernelBundleENS0_6kernelE _ZN4sycl3_V17handler22setHandlerKernelBundleERKSt10shared_ptrINS0_6detail18kernel_bundle_implEE _ZN4sycl3_V17handler22verifyUsedKernelBundleERKNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEE +_ZN4sycl3_V17handler22throwIfGraphAssociatedILNS0_3ext6oneapi12experimental6detail14SyclExtensionsE0EEEvv +_ZN4sycl3_V17handler22throwIfGraphAssociatedILNS0_3ext6oneapi12experimental6detail14SyclExtensionsE1EEEvv +_ZN4sycl3_V17handler22throwIfGraphAssociatedILNS0_3ext6oneapi12experimental6detail14SyclExtensionsE3EEEvv +_ZN4sycl3_V17handler22throwIfGraphAssociatedILNS0_3ext6oneapi12experimental6detail14SyclExtensionsE2EEEvv _ZN4sycl3_V17handler24GetRangeRoundingSettingsERmS2_S2_ _ZN4sycl3_V17handler24ext_intel_read_host_pipeERKNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEEPvmb _ZN4sycl3_V17handler24ext_oneapi_memcpy2d_implEPvmPKvmmm diff --git a/sycl/unittests/Extensions/CommandGraph.cpp b/sycl/unittests/Extensions/CommandGraph.cpp index 574cbc7bebf37..6015706569d49 100644 --- a/sycl/unittests/Extensions/CommandGraph.cpp +++ b/sycl/unittests/Extensions/CommandGraph.cpp @@ -20,6 +20,279 @@ using namespace sycl; using namespace sycl::ext::oneapi; +namespace { +/// Define the three possible path to add node to a SYCL Graph. +/// Shortcut is a sub-type of Record&Replay using Queue shortcut +/// instead of standard kernel submitions. +enum OperationPath { Explicit, RecordReplay, Shortcut }; + +/// Function types and classes for testing Kernel with properties extension +enum class Variant { Function, Functor, FunctorAndProperty }; + +template +class ReqdWGSizePositiveA; +template class ReqPositiveA; + +template range repeatRange(size_t Val); +template <> range<1> repeatRange<1>(size_t Val) { return range<1>{Val}; } +template <> range<2> repeatRange<2>(size_t Val) { return range<2>{Val, Val}; } +template <> range<3> repeatRange<3>(size_t Val) { + return range<3>{Val, Val, Val}; +} + +template struct KernelFunctorWithWGSizeProp { + void operator()(nd_item) const {} + void operator()(item) const {} + + auto get(sycl::ext::oneapi::experimental::properties_tag) { + return sycl::ext::oneapi::experimental::properties{ + sycl::ext::oneapi::experimental::work_group_size}; + } +}; + +/// Tries to add a Parallel_for node with kernel properties to the graph G +/// It tests that an invalid exception has been thrown +/// Since sycl_ext_oneapi_kernel_properties extension can not be used +/// along with SYCL Graph. +/// +/// @param G Modifiable graph to add commands to. +/// @param Q Queue to submit nodes to. +/// @param Props Properties associated to the submitted kernel +/// @param KernelFunc pointer to the kernel +template +void addKernelWithProperties( + sycl::ext::oneapi::experimental::detail::modifiable_command_graph &G, + queue &Q, PropertiesT Props, KernelType KernelFunc) { + constexpr size_t Dims = sizeof...(Is); + + // Test Parallel_for + std::error_code ExceptionCode = make_error_code(sycl::errc::success); + try { + if constexpr (PathKind == OperationPath::RecordReplay) { + Q.submit([&](handler &CGH) { + CGH.parallel_for>( + nd_range(repeatRange(8), range(Is...)), Props, + KernelFunc); + }); + } + if constexpr (PathKind == OperationPath::Shortcut) { + Q.parallel_for>( + nd_range(repeatRange(8), range(Is...)), Props, + KernelFunc); + } + if constexpr (PathKind == OperationPath::Explicit) { + G.add([&](handler &CGH) { + CGH.parallel_for>( + nd_range(repeatRange(8), range(Is...)), Props, + KernelFunc); + }); + } + } catch (exception &Exception) { + ExceptionCode = Exception.code(); + } + ASSERT_EQ(ExceptionCode, sycl::errc::invalid); +} + +/// Tries to add a Single task node with kernel properties to the graph G +/// It tests that an invalid exception has been thrown +/// Since sycl_ext_oneapi_kernel_properties extension can not be used +/// along with SYCL Graph. +/// +/// @param G Modifiable graph to add commands to. +/// @param Q Queue to submit nodes to. +/// @param Props Properties associated to the submitted kernel +/// @param KernelFunc pointer to the kernel +template +void testSingleTaskProperties(experimental::detail::modifiable_command_graph &G, + queue &Q, PropertiesT Props, + KernelType KernelFunc) { + + // Test Single_task + std::error_code ExceptionCode = make_error_code(sycl::errc::success); + try { + if constexpr (PathKind == OperationPath::RecordReplay) { + G.begin_recording(Q); + Q.submit([&](sycl::handler &CGH) { + CGH.single_task>(Props, + KernelFunc); + }); + G.end_recording(); + } + if constexpr (PathKind == OperationPath::Explicit) { + G.add([&](sycl::handler &CGH) { + CGH.single_task>(Props, + KernelFunc); + }); + } + } catch (exception &Exception) { + ExceptionCode = Exception.code(); + } + ASSERT_EQ(ExceptionCode, sycl::errc::invalid); +} + +/// This function groups all the different test cases +/// when adding a Parallel_for node with kernel properties to the graph G +/// +/// @param G Modifiable graph to add commands to. +/// @param Q Queue to submit nodes to. +template +void testParallelForProperties( + queue &Q, experimental::detail::modifiable_command_graph &G) { + auto Props = ext::oneapi::experimental::properties{ + experimental::work_group_size}; + auto KernelFunction = [](auto) {}; + + KernelFunctorWithWGSizeProp KernelFunctor; + + G.begin_recording(Q); + + addKernelWithProperties(G, Q, Props, KernelFunction); + addKernelWithProperties(G, Q, Props, + KernelFunctor); + + addKernelWithProperties( + G, Q, Props, KernelFunction); + addKernelWithProperties(G, Q, Props, KernelFunctor); + + G.end_recording(); + + addKernelWithProperties( + G, Q, Props, KernelFunction); + addKernelWithProperties(G, Q, Props, KernelFunctor); +} + +/// Tries to enqueue oneapi barrier to the graph G +/// It tests that an invalid exception has been thrown +/// Since sycl_ext_oneapi_enqueue_barrier extension can not be used +/// along with SYCL Graph. +template void testEnqueueBarrier() { + sycl::context Context; + sycl::queue Q1(Context, sycl::default_selector_v); + + experimental::command_graph Graph1{ + Q1.get_context(), Q1.get_device()}; + + Graph1.add([&](sycl::handler &cgh) {}); + Graph1.add([&](sycl::handler &cgh) {}); + + if constexpr (PathKind != OperationPath::Explicit) { + Graph1.begin_recording(Q1); + } + + // call queue::ext_oneapi_submit_barrier() + std::error_code ExceptionCode = make_error_code(sycl::errc::success); + try { + if constexpr (PathKind == OperationPath::Shortcut) { + Q1.ext_oneapi_submit_barrier(); + } + if constexpr (PathKind == OperationPath::RecordReplay) { + Q1.submit([&](sycl::handler &CGH) { CGH.ext_oneapi_barrier(); }); + } + if constexpr (PathKind == OperationPath::Explicit) { + Graph1.add([&](handler &CGH) { CGH.ext_oneapi_barrier(); }); + } + + } catch (exception &Exception) { + ExceptionCode = Exception.code(); + } + ASSERT_EQ(ExceptionCode, sycl::errc::invalid); + + if constexpr (PathKind != OperationPath::Explicit) { + Graph1.end_recording(); + } + + sycl::queue Q2(Context, sycl::default_selector_v); + sycl::queue Q3(Context, sycl::default_selector_v); + + experimental::command_graph Graph2{ + Q2.get_context(), Q2.get_device()}; + experimental::command_graph Graph3{ + Q3.get_context(), Q3.get_device()}; + + Graph2.begin_recording(Q2); + Graph3.begin_recording(Q3); + + auto Event1 = Q2.submit([&](sycl::handler &cgh) {}); + auto Event2 = Q3.submit([&](sycl::handler &cgh) {}); + + if constexpr (PathKind == OperationPath::Explicit) { + Graph2.end_recording(); + Graph3.end_recording(); + } + + // call handler::barrier(const std::vector &WaitList) + ExceptionCode = make_error_code(sycl::errc::success); + try { + if constexpr (PathKind == OperationPath::Shortcut) { + Q3.ext_oneapi_submit_barrier({Event1, Event2}); + } + if constexpr (PathKind == OperationPath::RecordReplay) { + Q3.submit([&](sycl::handler &CGH) { + CGH.ext_oneapi_barrier({Event1, Event2}); + }); + } + if constexpr (PathKind == OperationPath::Explicit) { + Graph3.add([&](handler &CGH) { + CGH.ext_oneapi_barrier({Event1, Event2}); + }); + } + + } catch (exception &Exception) { + ExceptionCode = Exception.code(); + } + ASSERT_EQ(ExceptionCode, sycl::errc::invalid); + + if constexpr (PathKind != OperationPath::Explicit) { + Graph2.end_recording(); + Graph3.end_recording(); + } +} + +/// Tries to add a memcpy2D node to the graph G +/// It tests that an invalid exception has been thrown +/// Since sycl_ext_oneapi_memcpy2d extension can not be used +/// along with SYCL Graph. +/// +/// @param G Modifiable graph to add commands to. +/// @param Q Queue to submit nodes to. +/// @param Dest Pointer to the memory destination +/// @param DestPitch pitch at the destination +/// @param Src Pointer to the memory source +/// @param SrcPitch pitch at the source +/// @param Witdh width of the data to copy +/// @param Height height of the data to copy +template +void addMemcpy2D(experimental::detail::modifiable_command_graph &G, queue &Q, + void *Dest, size_t DestPitch, const void *Src, size_t SrcPitch, + size_t Width, size_t Height) { + std::error_code ExceptionCode = make_error_code(sycl::errc::success); + try { + if constexpr (PathKind == OperationPath::RecordReplay) { + Q.submit([&](handler &CGH) { + CGH.ext_oneapi_memcpy2d(Dest, DestPitch, Src, SrcPitch, Width, Height); + }); + } + if constexpr (PathKind == OperationPath::Shortcut) { + Q.ext_oneapi_memcpy2d(Dest, DestPitch, Src, SrcPitch, Width, Height); + } + if constexpr (PathKind == OperationPath::Explicit) { + G.add([&](handler &CGH) { + CGH.ext_oneapi_memcpy2d(Dest, DestPitch, Src, SrcPitch, Width, Height); + }); + } + } catch (exception &Exception) { + ExceptionCode = Exception.code(); + } + ASSERT_EQ(ExceptionCode, sycl::errc::invalid); +} + +} // namespace + class CommandGraphTest : public ::testing::Test { public: CommandGraphTest() @@ -708,3 +981,96 @@ TEST_F(CommandGraphTest, MakeEdgeErrors) { // Re-check graph structure to make sure the graph state has not been modified CheckGraphStructure(); } + +TEST_F(CommandGraphTest, EnqueueBarrierExceptionCheck) { + testEnqueueBarrier(); + testEnqueueBarrier(); + testEnqueueBarrier(); +} + +TEST_F(CommandGraphTest, FusionExtensionExceptionCheck) { + queue Q{ext::codeplay::experimental::property::queue::enable_fusion{}}; + + experimental::command_graph Graph{ + Q.get_context(), Q.get_device()}; + + ext::codeplay::experimental::fusion_wrapper fw{Q}; + + // Test: Start fusion on a queue that is in recording mode + Graph.begin_recording(Q); + + std::error_code ExceptionCode = make_error_code(sycl::errc::success); + try { + fw.start_fusion(); + } catch (exception &Exception) { + ExceptionCode = Exception.code(); + } + ASSERT_EQ(ExceptionCode, sycl::errc::invalid); + + Graph.end_recording(Q); + + // Test: begin recording a queue in fusion mode + + fw.start_fusion(); + + ExceptionCode = make_error_code(sycl::errc::success); + try { + Graph.begin_recording(Q); + } catch (exception &Exception) { + ExceptionCode = Exception.code(); + } + ASSERT_EQ(ExceptionCode, sycl::errc::invalid); +} + +TEST_F(CommandGraphTest, KernelPropertiesExceptionCheck) { + + // Test Parallel for entry point + testParallelForProperties<4>(Queue, Graph); + testParallelForProperties<4, 4>(Queue, Graph); + testParallelForProperties<8, 4>(Queue, Graph); + testParallelForProperties<4, 8>(Queue, Graph); + testParallelForProperties<4, 4, 4>(Queue, Graph); + testParallelForProperties<4, 4, 8>(Queue, Graph); + testParallelForProperties<8, 4, 4>(Queue, Graph); + testParallelForProperties<4, 8, 4>(Queue, Graph); + + // Test Single Task entry point + auto Props = ext::oneapi::experimental::properties{ + ext::oneapi::experimental::work_group_size<4>}; + auto KernelFunction = [](auto) {}; + testSingleTaskProperties(Graph, Queue, Props, + KernelFunction); + testSingleTaskProperties(Graph, Queue, Props, + KernelFunction); +} + +TEST_F(CommandGraphTest, Memcpy2DExceptionCheck) { + constexpr size_t RECT_WIDTH = 30; + constexpr size_t RECT_HEIGHT = 21; + constexpr size_t SRC_ELEMS = RECT_WIDTH * RECT_HEIGHT; + constexpr size_t DST_ELEMS = SRC_ELEMS; + + using T = int; + + Graph.begin_recording(Queue); + + T *USMMemSrc = malloc_device(SRC_ELEMS, Queue); + T *USMMemDst = malloc_device(DST_ELEMS, Queue); + + addMemcpy2D( + Graph, Queue, USMMemDst, RECT_WIDTH * sizeof(T), USMMemSrc, + RECT_WIDTH * sizeof(T), RECT_WIDTH * sizeof(T), RECT_HEIGHT); + + addMemcpy2D( + Graph, Queue, USMMemDst, RECT_WIDTH * sizeof(T), USMMemSrc, + RECT_WIDTH * sizeof(T), RECT_WIDTH * sizeof(T), RECT_HEIGHT); + + Graph.end_recording(); + + addMemcpy2D( + Graph, Queue, USMMemDst, RECT_WIDTH * sizeof(T), USMMemSrc, + RECT_WIDTH * sizeof(T), RECT_WIDTH * sizeof(T), RECT_HEIGHT); + + sycl::free(USMMemSrc, Queue); + sycl::free(USMMemDst, Queue); +} From bfcd55171d7db440725f06cfd70175cfca91b288 Mon Sep 17 00:00:00 2001 From: Maxime France-Pillois Date: Fri, 4 Aug 2023 11:53:17 +0100 Subject: [PATCH 5/8] [SYCL][Graph] Makes command graph functions thread-safe (#265) * [SYCL][Graph] Makes command graph functions thread-safe Addresses comments made on the first PR commit. Mutexes are now added to Graph implementation entry points instead of end points as was the case in the previous commit. Adds "build_pthread_inc" lit test macro to facilitate the compilation of the threading tests. Removes std::barrier (std-20) dependency in threading tests. Addresses Issue: #85 * [SYCL][Graph] Makes command graph functions thread-safe Moves threading tests that do not require a device to run to unitests * Update sycl/source/detail/graph_impl.cpp Co-authored-by: Ben Tracy * [SYCL][Graph] Makes command graph functions thread-safe Adds some comments. * Update sycl/source/handler.cpp Co-authored-by: Pablo Reble * Update sycl/source/detail/graph_impl.hpp Co-authored-by: Ewan Crawford * [SYCL][Graph] Makes command graph functions thread-safe Adds dedidacted sub-class to unitests for multi-threading unitests * [SYCL][Graph] Makes command graph functions thread-safe Adds comments * [SYCL][Graph] thread-safe: bug fix after rebase --------- Co-authored-by: Ben Tracy Co-authored-by: Pablo Reble Co-authored-by: Ewan Crawford --- sycl/source/detail/graph_impl.cpp | 18 +- sycl/source/detail/graph_impl.hpp | 277 +++++++++++++++++- sycl/source/detail/queue_impl.hpp | 1 + sycl/source/handler.cpp | 18 ++ sycl/test-e2e/Graph/Explicit/basic_usm.cpp | 2 +- .../Graph/Explicit/multiple_exec_graphs.cpp | 2 +- sycl/test-e2e/Graph/Inputs/basic_usm.cpp | 9 + .../Graph/Inputs/multiple_exec_graphs.cpp | 93 +++++- .../test-e2e/Graph/RecordReplay/basic_usm.cpp | 4 +- .../RecordReplay/multiple_exec_graphs.cpp | 2 +- sycl/test-e2e/Graph/graph_common.hpp | 25 ++ sycl/test-e2e/format.py | 12 + sycl/unittests/Extensions/CommandGraph.cpp | 229 ++++++++++++++- 13 files changed, 668 insertions(+), 24 deletions(-) diff --git a/sycl/source/detail/graph_impl.cpp b/sycl/source/detail/graph_impl.cpp index 547373c515e79..63e009804fa0a 100644 --- a/sycl/source/detail/graph_impl.cpp +++ b/sycl/source/detail/graph_impl.cpp @@ -220,7 +220,6 @@ graph_impl::add(sycl::detail::CG::CGTYPE CGType, checkForRequirement(Req, NodePtr, UniqueDeps); } } - // Add any nodes specified by event dependencies into the dependency list for (auto Dep : CommandGroup->getEvents()) { if (auto NodeImpl = MEventsMap.find(Dep); NodeImpl != MEventsMap.end()) { @@ -474,6 +473,8 @@ void exec_graph_impl::createCommandBuffers(sycl::device Device) { } exec_graph_impl::~exec_graph_impl() { + WriteLock Lock(MMutex); + // clear all recording queue if not done before (no call to end_recording) MGraphImpl->clearQueues(); @@ -499,6 +500,8 @@ exec_graph_impl::~exec_graph_impl() { sycl::event exec_graph_impl::enqueue(const std::shared_ptr &Queue, sycl::detail::CG::StorageInitHelper CGData) { + WriteLock Lock(MMutex); + auto CreateNewEvent([&]() { auto NewEvent = std::make_shared(Queue); NewEvent->setContextImpl(Queue->getContextImplPtr()); @@ -612,6 +615,7 @@ node modifiable_command_graph::addImpl(const std::vector &Deps) { DepImpls.push_back(sycl::detail::getSyclObjImpl(D)); } + graph_impl::WriteLock Lock(impl->MMutex); std::shared_ptr NodeImpl = impl->add(DepImpls); return sycl::detail::createSyclObjFromImpl(NodeImpl); } @@ -624,6 +628,7 @@ node modifiable_command_graph::addImpl(std::function CGF, DepImpls.push_back(sycl::detail::getSyclObjImpl(D)); } + graph_impl::WriteLock Lock(impl->MMutex); std::shared_ptr NodeImpl = impl->add(impl, CGF, {}, DepImpls); return sycl::detail::createSyclObjFromImpl(NodeImpl); @@ -635,6 +640,7 @@ void modifiable_command_graph::make_edge(node &Src, node &Dest) { std::shared_ptr ReceiverImpl = sycl::detail::getSyclObjImpl(Dest); + graph_impl::WriteLock Lock(impl->MMutex); impl->makeEdge(SenderImpl, ReceiverImpl); } @@ -666,6 +672,7 @@ bool modifiable_command_graph::begin_recording(queue &RecordingQueue) { if (QueueImpl->getCommandGraph() == nullptr) { QueueImpl->setCommandGraph(impl); + graph_impl::WriteLock Lock(impl->MMutex); impl->addQueue(QueueImpl); return true; } @@ -687,12 +694,16 @@ bool modifiable_command_graph::begin_recording( return QueueStateChanged; } -bool modifiable_command_graph::end_recording() { return impl->clearQueues(); } +bool modifiable_command_graph::end_recording() { + graph_impl::WriteLock Lock(impl->MMutex); + return impl->clearQueues(); +} bool modifiable_command_graph::end_recording(queue &RecordingQueue) { auto QueueImpl = sycl::detail::getSyclObjImpl(RecordingQueue); if (QueueImpl->getCommandGraph() == impl) { QueueImpl->setCommandGraph(nullptr); + graph_impl::WriteLock Lock(impl->MMutex); impl->removeQueue(QueueImpl); return true; } @@ -719,6 +730,9 @@ executable_command_graph::executable_command_graph( const std::shared_ptr &Graph, const sycl::context &Ctx) : MTag(rand()), impl(std::make_shared(Ctx, Graph)) { + // Graph is read and written in this scope so we lock + // this graph with full priviledges. + graph_impl::WriteLock Lock(Graph->MMutex); finalizeImpl(); // Create backend representation for executable graph } diff --git a/sycl/source/detail/graph_impl.hpp b/sycl/source/detail/graph_impl.hpp index 184ca326e0e76..731b7914b2ae1 100644 --- a/sycl/source/detail/graph_impl.hpp +++ b/sycl/source/detail/graph_impl.hpp @@ -21,6 +21,7 @@ #include #include #include +#include namespace sycl { inline namespace _V1 { @@ -181,6 +182,167 @@ class node_impl { return nullptr; } + /// Prints Node information to Stream + /// @param Stream where to print the Node information + void printDotCG(std::ostream &Stream) { + sycl::detail::CG::CGTYPE CGType = MCommandGroup->getType(); + + Stream << "\"" << MCommandGroup.get() + << "\" [style=filled, fillcolor=\"#FFD28A\", label=\""; + + Stream << "ID = " << MCommandGroup.get() << "\\n"; + Stream << "TYPE = "; + + switch (CGType) { + case sycl::detail::CG::CGTYPE::None: + Stream << "None \\n"; + break; + case sycl::detail::CG::CGTYPE::Kernel: { + Stream << "CGExecKernel \\n"; + sycl::detail::CGExecKernel *kernel = + static_cast(MCommandGroup.get()); + Stream << "NAME = " << kernel->MKernelName << "\\n"; + break; + } + case sycl::detail::CG::CGTYPE::CopyAccToPtr: + case sycl::detail::CG::CGTYPE::CopyPtrToAcc: + case sycl::detail::CG::CGTYPE::CopyAccToAcc: + Stream << "CGCopy \\n"; + break; + case sycl::detail::CG::CGTYPE::Fill: + Stream << "CGFill \\n"; + break; + case sycl::detail::CG::CGTYPE::UpdateHost: + Stream << "CGCUpdateHost \\n"; + break; + case sycl::detail::CG::CGTYPE::CopyUSM: + Stream << "CGCopyUSM \\n"; + break; + case sycl::detail::CG::CGTYPE::FillUSM: + Stream << "CGFillUSM \\n"; + break; + case sycl::detail::CG::CGTYPE::PrefetchUSM: + Stream << "CGPrefetchUSM \\n"; + break; + case sycl::detail::CG::CGTYPE::AdviseUSM: + Stream << "CGAdviseUSM \\n"; + break; + case sycl::detail::CG::CGTYPE::CodeplayHostTask: + Stream << "CGHostTask \\n"; + break; + case sycl::detail::CG::CGTYPE::Barrier: + Stream << "CGBarrier \\n"; + break; + case sycl::detail::CG::CGTYPE::Copy2DUSM: + Stream << "CGCopy2DUSM \\n"; + break; + case sycl::detail::CG::CGTYPE::Fill2DUSM: + Stream << "CGFill2DUSM \\n"; + break; + case sycl::detail::CG::CGTYPE::Memset2DUSM: + Stream << "CGMemset2DUSM \\n"; + break; + case sycl::detail::CG::CGTYPE::ReadWriteHostPipe: + Stream << "CGReadWriteHostPipe \\n"; + break; + case sycl::detail::CG::CGTYPE::CopyToDeviceGlobal: + Stream << "CGCopyToDeviceGlobal \\n"; + break; + case sycl::detail::CG::CGTYPE::CopyFromDeviceGlobal: + Stream << "CGCopyFromDeviceGlobal \\n"; + break; + case sycl::detail::CG::CGTYPE::ExecCommandBuffer: + Stream << "CGExecCommandBuffer \\n"; + break; + default: + Stream << "Other \\n"; + break; + } + Stream << "\"];" << std::endl; + } + + /// Recursive Depth first traversal of linked nodes + /// to print node information and connection to Stream + /// @param Stream where to print node information + /// @Visited vector of the already visited nodes + void printDotRecursive(std::ostream &Stream, + std::vector &Visited) { + // if Node has been already visited, we skip it + if (std::find(Visited.begin(), Visited.end(), this) != Visited.end()) + return; + + Visited.push_back(this); + + printDotCG(Stream); + for (const auto &Dep : MPredecessors) { + auto NodeDep = Dep.lock(); + Stream << " \"" << MCommandGroup.get() << "\" -> \"" + << NodeDep->MCommandGroup.get() << "\"" << std::endl; + } + + for (std::shared_ptr Succ : MSuccessors) { + Succ->printDotRecursive(Stream, Visited); + } + } + + /// Tests is the caller is similar to Node + /// @return True if the two nodes are similars + bool isSimilar(std::shared_ptr Node) { + if (MCGType != Node->MCGType) + return false; + + if (MSuccessors.size() != Node->MSuccessors.size()) + return false; + + if (MPredecessors.size() != Node->MPredecessors.size()) + return false; + + if ((MCGType == sycl::detail::CG::CGTYPE::Kernel) && + (MCGType == sycl::detail::CG::CGTYPE::Kernel)) { + sycl::detail::CGExecKernel *ExecKernelA = + static_cast(MCommandGroup.get()); + sycl::detail::CGExecKernel *ExecKernelB = + static_cast(Node->MCommandGroup.get()); + + if (ExecKernelA->MKernelName.compare(ExecKernelB->MKernelName) != 0) + return false; + } + return true; + } + + /// Recursive traversal of successor nodes checking for + /// equivalent node successions in Node + /// @param Node pointer to the starting node for structure comparison + /// @return true is same structure found, false otherwise + bool checkNodeRecursive(std::shared_ptr Node) { + size_t FoundCnt = 0; + for (std::shared_ptr SuccA : MSuccessors) { + for (std::shared_ptr SuccB : Node->MSuccessors) { + if (isSimilar(Node)) { + if (SuccA->checkNodeRecursive(SuccB)) { + FoundCnt++; + break; + } + } + } + } + if (FoundCnt != MSuccessors.size()) { + return false; + } + + return true; + } + + /// Recusively computes the number of successor nodes + /// @return number of successor nodes + size_t depthSearchCount() const { + size_t NumberOfNodes = 1; + for (const auto &Succ : MSuccessors) { + NumberOfNodes += Succ->depthSearchCount(); + } + return NumberOfNodes; + } + private: /// Creates a copy of the node's CG by casting to it's actual type, then using /// that to copy construct and create a new unique ptr from that copy. @@ -194,6 +356,12 @@ class node_impl { /// Implementation details of command_graph. class graph_impl { public: + using ReadLock = std::shared_lock; + using WriteLock = std::unique_lock; + + /// Protects all the fields that can be changed by class' methods. + mutable std::shared_mutex MMutex; + /// Constructor. /// @param SyclContext Context to use for graph. /// @param SyclDevice Device to create nodes with. @@ -207,10 +375,6 @@ class graph_impl { } } - /// Insert node into list of root nodes. - /// @param Root Node to add to list of root nodes. - void addRoot(const std::shared_ptr &Root); - /// Remove node from list of root nodes. /// @param Root Node to remove from list of root nodes. void removeRoot(const std::shared_ptr &Root); @@ -284,13 +448,13 @@ class graph_impl { /// @return Event associated with node. std::shared_ptr getEventForNode(std::shared_ptr NodeImpl) const { + ReadLock Lock(MMutex); if (auto EventImpl = std::find_if( MEventsMap.begin(), MEventsMap.end(), [NodeImpl](auto &it) { return it.second == NodeImpl; }); EventImpl != MEventsMap.end()) { return EventImpl->first; } - throw sycl::exception( sycl::make_error_code(errc::invalid), "No event has been recorded for the specified graph node"); @@ -335,6 +499,85 @@ class graph_impl { MInorderQueueMap[QueueWeakPtr] = Node; } + /// Checks if the graph_impl of Graph has a similar structure to + /// the graph_impl of the caller. + /// Graphs are considered similar if they have same numbers of nodes + /// of the same type with similar predecessor and successor nodes (number and + /// type). Two nodes are considered similar if they have the same + /// command-group type. For command-groups of type "kernel", the "signature" + /// of the kernel is also compared (i.e. the name of the command-group). + /// @param Graph if reference to the graph to compare with. + /// @param DebugPrint if set to true throw exception with additional debug + /// information about the spotted graph differences. + /// @return true if the two graphs are similar, false otherwise + bool hasSimilarStructure(std::shared_ptr Graph, + bool DebugPrint = false) const { + if (this == Graph.get()) + return true; + + if (MContext != Graph->MContext) { + if (DebugPrint) { + throw sycl::exception(sycl::make_error_code(errc::invalid), + "MContext are not the same."); + } + return false; + } + + if (MDevice != Graph->MDevice) { + if (DebugPrint) { + throw sycl::exception(sycl::make_error_code(errc::invalid), + "MDevice are not the same."); + } + return false; + } + + if (MEventsMap.size() != Graph->MEventsMap.size()) { + if (DebugPrint) { + throw sycl::exception(sycl::make_error_code(errc::invalid), + "MEventsMap sizes are not the same."); + } + return false; + } + + if (MInorderQueueMap.size() != Graph->MInorderQueueMap.size()) { + if (DebugPrint) { + throw sycl::exception(sycl::make_error_code(errc::invalid), + "MInorderQueueMap sizes are not the same."); + } + return false; + } + + if (MRoots.size() != Graph->MRoots.size()) { + if (DebugPrint) { + throw sycl::exception(sycl::make_error_code(errc::invalid), + "MRoots sizes are not the same."); + } + return false; + } + + size_t RootsFound = 0; + for (std::shared_ptr NodeA : MRoots) { + for (std::shared_ptr NodeB : Graph->MRoots) { + if (NodeA->isSimilar(NodeB)) { + if (NodeA->checkNodeRecursive(NodeB)) { + RootsFound++; + break; + } + } + } + } + + if (RootsFound != MRoots.size()) { + if (DebugPrint) { + throw sycl::exception(sycl::make_error_code(errc::invalid), + "Root Nodes do NOT match."); + } + return false; + } + + return true; + } + /// Make an edge between two nodes in the graph. Performs some mandatory /// error checks as well as an optional check for cycles introduced by making /// this edge. @@ -355,6 +598,16 @@ class graph_impl { } } + // Returns the number of nodes in the Graph + // @return Number of nodes in the Graph + size_t getNumberOfNodes() const { + size_t NumberOfNodes = 0; + for (const auto &Node : MRoots) { + NumberOfNodes += Node->depthSearchCount(); + } + return NumberOfNodes; + } + private: /// Iterate over the graph depth-first and run \p NodeFunc on each node. /// @param NodeFunc A function which receives as input a node in the graph to @@ -393,11 +646,21 @@ class graph_impl { /// Controls whether we skip the cycle checks in makeEdge, set by the presence /// of the no_cycle_check property on construction. bool MSkipCycleChecks = false; + + /// Insert node into list of root nodes. + /// @param Root Node to add to list of root nodes. + void addRoot(const std::shared_ptr &Root); }; /// Class representing the implementation of command_graph. class exec_graph_impl { public: + using ReadLock = std::shared_lock; + using WriteLock = std::unique_lock; + + /// Protects all the fields that can be changed by class' methods. + mutable std::shared_mutex MMutex; + /// Constructor. /// @param Context Context to create graph with. /// @param GraphImpl Modifiable graph implementation to create with. @@ -473,6 +736,10 @@ class exec_graph_impl { std::list> MSchedule; /// Pointer to the modifiable graph impl associated with this executable /// graph. + /// Thread-safe implementation note: in the current implementation + /// multiple exec_graph_impl can reference the same graph_impl object. + /// This specificity must be taken into account when trying to lock + /// the graph_impl mutex from an exec_graph_impl to avoid deadlock. std::shared_ptr MGraphImpl; /// Map of devices to command buffers. std::unordered_map diff --git a/sycl/source/detail/queue_impl.hpp b/sycl/source/detail/queue_impl.hpp index cf3e06e5e8e30..803bafeb2333a 100644 --- a/sycl/source/detail/queue_impl.hpp +++ b/sycl/source/detail/queue_impl.hpp @@ -683,6 +683,7 @@ class queue_impl { void setCommandGraph( std::shared_ptr Graph) { + std::lock_guard Lock(MMutex); MGraph = Graph; } diff --git a/sycl/source/handler.cpp b/sycl/source/handler.cpp index f12478429d1f2..c5bbfd8818509 100644 --- a/sycl/source/handler.cpp +++ b/sycl/source/handler.cpp @@ -456,6 +456,11 @@ event handler::finalize() { std::shared_ptr NodeImpl = nullptr; + // GraphImpl is read and written in this scope so we lock this graph + // with full priviledges. + ext::oneapi::experimental::detail::graph_impl::WriteLock Lock( + GraphImpl->MMutex); + // Create a new node in the graph representing this command-group if (MQueue->isInOrder()) { // In-order queues create implicit linear dependencies between nodes. @@ -1304,6 +1309,10 @@ void handler::ext_oneapi_graph( Graph) { MCGType = detail::CG::ExecCommandBuffer; auto GraphImpl = detail::getSyclObjImpl(Graph); + // GraphImpl is only read in this scope so we lock this graph for read only + ext::oneapi::experimental::detail::graph_impl::ReadLock Lock( + GraphImpl->MMutex); + std::shared_ptr ParentGraph; if (MQueue) { ParentGraph = MQueue->getCommandGraph(); @@ -1311,8 +1320,17 @@ void handler::ext_oneapi_graph( ParentGraph = MGraph; } + ext::oneapi::experimental::detail::graph_impl::WriteLock ParentLock; // If a parent graph is set that means we are adding or recording a subgraph if (ParentGraph) { + // ParentGraph is read and written in this scope so we lock this graph + // with full priviledges. + // We only lock for Record&Replay API because the graph has already been + // lock if this function was called from the explicit API function add + if (MQueue) { + ParentLock = ext::oneapi::experimental::detail::graph_impl::WriteLock( + ParentGraph->MMutex); + } // Store the node representing the subgraph in the handler so that we can // return it to the user later. MSubgraphNode = ParentGraph->addSubgraphNodes(GraphImpl->getSchedule()); diff --git a/sycl/test-e2e/Graph/Explicit/basic_usm.cpp b/sycl/test-e2e/Graph/Explicit/basic_usm.cpp index c7adb7f282da4..c7a8879bc918a 100644 --- a/sycl/test-e2e/Graph/Explicit/basic_usm.cpp +++ b/sycl/test-e2e/Graph/Explicit/basic_usm.cpp @@ -1,5 +1,5 @@ // REQUIRES: level_zero, gpu -// RUN: %{build} -o %t.out +// RUN: %{build_pthread_inc} -o %t.out // RUN: %{run} %t.out // Extra run to check for leaks in Level Zero using ZE_DEBUG // RUN: %if ext_oneapi_level_zero %{env ZE_DEBUG=4 %{run} %t.out 2>&1 | FileCheck %s %} diff --git a/sycl/test-e2e/Graph/Explicit/multiple_exec_graphs.cpp b/sycl/test-e2e/Graph/Explicit/multiple_exec_graphs.cpp index a414e3f4b8d6c..ac45368ed4e02 100644 --- a/sycl/test-e2e/Graph/Explicit/multiple_exec_graphs.cpp +++ b/sycl/test-e2e/Graph/Explicit/multiple_exec_graphs.cpp @@ -1,5 +1,5 @@ // REQUIRES: level_zero, gpu -// RUN: %{build} -o %t.out +// RUN: %{build_pthread_inc} -o %t.out // RUN: %{run} %t.out // Extra run to check for leaks in Level Zero using ZE_DEBUG // RUN: %if ext_oneapi_level_zero %{env ZE_DEBUG=4 %{run} %t.out 2>&1 | FileCheck %s %} diff --git a/sycl/test-e2e/Graph/Inputs/basic_usm.cpp b/sycl/test-e2e/Graph/Inputs/basic_usm.cpp index 5a8c9291ff0ef..476d84f95b956 100644 --- a/sycl/test-e2e/Graph/Inputs/basic_usm.cpp +++ b/sycl/test-e2e/Graph/Inputs/basic_usm.cpp @@ -2,12 +2,14 @@ // and submission of the graph. #include "../graph_common.hpp" +#include int main() { queue Queue; using T = int; + const unsigned NumThreads = std::thread::hardware_concurrency(); std::vector DataA(Size), DataB(Size), DataC(Size); std::iota(DataA.begin(), DataA.end(), 1); @@ -32,8 +34,15 @@ int main() { // Add commands to graph add_nodes(Graph, Queue, Size, PtrA, PtrB, PtrC); + Barrier SyncPoint{NumThreads}; + auto GraphExec = Graph.finalize(); + auto SubmitGraph = [&]() { + SyncPoint.wait(); + Queue.submit([&](handler &CGH) { CGH.ext_oneapi_graph(GraphExec); }); + }; + event Event; for (unsigned n = 0; n < Iterations; n++) { Event = Queue.submit([&](handler &CGH) { diff --git a/sycl/test-e2e/Graph/Inputs/multiple_exec_graphs.cpp b/sycl/test-e2e/Graph/Inputs/multiple_exec_graphs.cpp index 8bdf3caa1415d..ab74b3aa89625 100644 --- a/sycl/test-e2e/Graph/Inputs/multiple_exec_graphs.cpp +++ b/sycl/test-e2e/Graph/Inputs/multiple_exec_graphs.cpp @@ -2,12 +2,40 @@ // graph. #include "../graph_common.hpp" +#include + +#include + +bool checkExecGraphSchedule( + std::shared_ptr + GraphA, + std::shared_ptr + GraphB) { + auto ScheduleA = GraphA->getSchedule(); + auto ScheduleB = GraphB->getSchedule(); + if (ScheduleA.size() != ScheduleB.size()) + return false; + + std::vector< + std::shared_ptr> + VScheduleA{std::begin(ScheduleA), std::end(ScheduleA)}; + std::vector< + std::shared_ptr> + VScheduleB{std::begin(ScheduleB), std::end(ScheduleB)}; + + for (size_t i = 0; i < VScheduleA.size(); i++) { + if (!VScheduleA[i]->isSimilar(VScheduleB[i])) + return false; + } + return true; +} int main() { queue Queue; using T = int; + const unsigned NumThreads = std::thread::hardware_concurrency(); std::vector DataA(Size), DataB(Size), DataC(Size); std::iota(DataA.begin(), DataA.end(), 1); @@ -34,15 +62,31 @@ int main() { add_nodes(Graph, Queue, Size, PtrA, PtrB, PtrC); - // Finalize and execute several iterations of the graph - event Event; - for (unsigned n = 0; n < Iterations; n++) { + Barrier SyncPoint{NumThreads}; + + std::map> + GraphsExecMap; + auto FinalizeGraph = [&](int ThreadNum) { + SyncPoint.wait(); auto GraphExec = Graph.finalize(); - Event = Queue.submit([&](handler &CGH) { - CGH.depends_on(Event); - CGH.ext_oneapi_graph(GraphExec); - }); + GraphsExecMap.insert( + std::map>:: + value_type(ThreadNum, GraphExec)); + Queue.submit([&](sycl::handler &CGH) { CGH.ext_oneapi_graph(GraphExec); }); + }; + + std::vector Threads; + Threads.reserve(NumThreads); + + for (unsigned i = 0; i < NumThreads; ++i) { + Threads.emplace_back(FinalizeGraph, i); } + + for (unsigned i = 0; i < NumThreads; ++i) { + Threads[i].join(); + } + Queue.wait_and_throw(); Queue.copy(PtrA, DataA.data(), Size); @@ -50,13 +94,40 @@ int main() { Queue.copy(PtrC, DataC.data(), Size); Queue.wait_and_throw(); + // Ref computation + queue QueueRef{Queue.get_context(), Queue.get_device()}; + exp_ext::command_graph GraphRef{Queue.get_context(), Queue.get_device()}; + + T *PtrARef = malloc_device(Size, QueueRef); + T *PtrBRef = malloc_device(Size, QueueRef); + T *PtrCRef = malloc_device(Size, QueueRef); + + QueueRef.copy(DataA.data(), PtrARef, Size); + QueueRef.copy(DataB.data(), PtrBRef, Size); + QueueRef.copy(DataC.data(), PtrCRef, Size); + QueueRef.wait_and_throw(); + + add_nodes(GraphRef, QueueRef, Size, PtrARef, PtrBRef, PtrCRef); + + for (unsigned i = 0; i < NumThreads; ++i) { + auto GraphExecRef = GraphRef.finalize(); + QueueRef.submit( + [&](sycl::handler &CGH) { CGH.ext_oneapi_graph(GraphExecRef); }); + auto GraphExecImpl = + sycl::detail::getSyclObjImpl(GraphsExecMap.find(i)->second); + auto GraphExecRefImpl = sycl::detail::getSyclObjImpl(GraphExecRef); + assert(checkExecGraphSchedule(GraphExecImpl, GraphExecRefImpl)); + } + + QueueRef.wait_and_throw(); + + free(PtrARef, QueueRef); + free(PtrBRef, QueueRef); + free(PtrCRef, QueueRef); + free(PtrA, Queue); free(PtrB, Queue); free(PtrC, Queue); - assert(ReferenceA == DataA); - assert(ReferenceB == DataB); - assert(ReferenceC == DataC); - return 0; } diff --git a/sycl/test-e2e/Graph/RecordReplay/basic_usm.cpp b/sycl/test-e2e/Graph/RecordReplay/basic_usm.cpp index 65b7a146fbf95..78616fc80a13b 100644 --- a/sycl/test-e2e/Graph/RecordReplay/basic_usm.cpp +++ b/sycl/test-e2e/Graph/RecordReplay/basic_usm.cpp @@ -1,5 +1,5 @@ // REQUIRES: level_zero, gpu -// RUN: %{build} -o %t.out +// RUN: %{build_pthread_inc} -o %t.out // RUN: %{run} %t.out // Extra run to check for leaks in Level Zero using ZE_DEBUG // RUN: %if ext_oneapi_level_zero %{env ZE_DEBUG=4 %{run} %t.out 2>&1 | FileCheck %s %} @@ -8,4 +8,4 @@ #define GRAPH_E2E_RECORD_REPLAY -#include "../Inputs/basic_usm.cpp" \ No newline at end of file +#include "../Inputs/basic_usm.cpp" diff --git a/sycl/test-e2e/Graph/RecordReplay/multiple_exec_graphs.cpp b/sycl/test-e2e/Graph/RecordReplay/multiple_exec_graphs.cpp index 8a59f12d316b4..9b84b62380e5b 100644 --- a/sycl/test-e2e/Graph/RecordReplay/multiple_exec_graphs.cpp +++ b/sycl/test-e2e/Graph/RecordReplay/multiple_exec_graphs.cpp @@ -1,5 +1,5 @@ // REQUIRES: level_zero, gpu -// RUN: %{build} -o %t.out +// RUN: %{build_pthread_inc} -o %t.out // RUN: %{run} %t.out // Extra run to check for leaks in Level Zero using ZE_DEBUG // RUN: %if ext_oneapi_level_zero %{env ZE_DEBUG=4 %{run} %t.out 2>&1 | FileCheck %s %} diff --git a/sycl/test-e2e/Graph/graph_common.hpp b/sycl/test-e2e/Graph/graph_common.hpp index c5ccc1ba6d271..1a059abf271b9 100644 --- a/sycl/test-e2e/Graph/graph_common.hpp +++ b/sycl/test-e2e/Graph/graph_common.hpp @@ -2,6 +2,8 @@ #include +#include // std::conditional_variable +#include // std::mutex, std::unique_lock #include // Test constants. @@ -407,3 +409,26 @@ constexpr float Gamma = 3.0f; float dotp_reference_result(size_t N) { return N * (Alpha * 1.0f + Beta * 2.0f) * (Gamma * 3.0f + Beta * 2.0f); } + +/* Single use thread barrier which makes threads wait until defined number of + * threads reach it. + * std:barrier should be used instead once compiler is moved to C++20 standard. + */ +class Barrier { +public: + Barrier() = delete; + explicit Barrier(std::size_t count) : threadNum(count) {} + void wait() { + std::unique_lock lock(mutex); + if (--threadNum == 0) { + cv.notify_all(); + } else { + cv.wait(lock, [this] { return threadNum == 0; }); + } + } + +private: + std::mutex mutex; + std::condition_variable cv; + std::size_t threadNum; +}; diff --git a/sycl/test-e2e/format.py b/sycl/test-e2e/format.py index 17750ddb2350c..53031ecadbb7c 100644 --- a/sycl/test-e2e/format.py +++ b/sycl/test-e2e/format.py @@ -101,6 +101,18 @@ def execute(self, test, litConfig): # -other restrictions). substitutions.append(('%{build}', '%clangxx -fsycl -fsycl-targets=%{sycl_triple} %s')) + # get GIT root path + stream = os.popen('git rev-parse --show-toplevel') + git_root_path = stream.read()[:-1] + + if 'windows' in test.config.available_features: + source_files_path = git_root_path+"\sycl\source" + else: + source_files_path = git_root_path+"/sycl/source" + + compilation_cmd_pthread = "%clangxx -I" + source_files_path + " -pthread -fsycl -fsycl-targets=%{sycl_triple} %s" + substitutions.append(('%{build_pthread_inc}', compilation_cmd_pthread)) + def get_extra_env(sycl_devices): # Note: It's possible that the system has a device from below but # current llvm-lit invocation isn't configured to include it. We diff --git a/sycl/unittests/Extensions/CommandGraph.cpp b/sycl/unittests/Extensions/CommandGraph.cpp index 6015706569d49..beacadce8bc00 100644 --- a/sycl/unittests/Extensions/CommandGraph.cpp +++ b/sycl/unittests/Extensions/CommandGraph.cpp @@ -9,6 +9,7 @@ #include "sycl/ext/oneapi/experimental/graph.hpp" #include +#include "../thread_safety/ThreadUtils.h" #include "detail/graph_impl.hpp" #include @@ -20,6 +21,9 @@ using namespace sycl; using namespace sycl::ext::oneapi; +// anonymous namespace used to avoid code redundancy by defining functions +// used by multiple times by unitests. +// Defining anonymous namespace prevents from function naming conflits namespace { /// Define the three possible path to add node to a SYCL Graph. /// Shortcut is a sub-type of Record&Replay using Queue shortcut @@ -291,7 +295,67 @@ void addMemcpy2D(experimental::detail::modifiable_command_graph &G, queue &Q, ASSERT_EQ(ExceptionCode, sycl::errc::invalid); } -} // namespace +bool depthSearchSuccessorCheck( + std::shared_ptr Node) { + if (Node->MSuccessors.size() > 1) + return false; + + for (const auto &Succ : Node->MSuccessors) { + return Succ->depthSearchCount(); + } + return true; +} + +/// Submits four kernels with diamond dependency to the queue Q +/// @param Q Queue to submit nodes to. +void runKernels(queue Q) { + auto NodeA = Q.submit( + [&](sycl::handler &cgh) { cgh.single_task([]() {}); }); + auto NodeB = Q.submit([&](sycl::handler &cgh) { + cgh.depends_on(NodeA); + cgh.single_task([]() {}); + }); + auto NodeC = Q.submit([&](sycl::handler &cgh) { + cgh.depends_on(NodeA); + cgh.single_task([]() {}); + }); + auto NodeD = Q.submit([&](sycl::handler &cgh) { + cgh.depends_on({NodeB, NodeC}); + cgh.single_task([]() {}); + }); +} + +/// Submits four kernels without any additional dependencies the queue Q +/// @param Q Queue to submit nodes to. +void runKernelsInOrder(queue Q) { + auto NodeA = Q.submit( + [&](sycl::handler &cgh) { cgh.single_task([]() {}); }); + auto NodeB = Q.submit( + [&](sycl::handler &cgh) { cgh.single_task([]() {}); }); + auto NodeC = Q.submit( + [&](sycl::handler &cgh) { cgh.single_task([]() {}); }); + auto NodeD = Q.submit( + [&](sycl::handler &cgh) { cgh.single_task([]() {}); }); +} + +/// Adds four kernels with diamond dependency to the Graph G +/// @param G Modifiable graph to add commands to. +void addKernels( + experimental::command_graph G) { + auto NodeA = G.add( + [&](sycl::handler &cgh) { cgh.single_task([]() {}); }); + auto NodeB = G.add( + [&](sycl::handler &cgh) { cgh.single_task([]() {}); }, + {experimental::property::node::depends_on(NodeA)}); + auto NodeC = G.add( + [&](sycl::handler &cgh) { cgh.single_task([]() {}); }, + {experimental::property::node::depends_on(NodeA)}); + auto NodeD = G.add( + [&](sycl::handler &cgh) { cgh.single_task([]() {}); }, + {experimental::property::node::depends_on(NodeB, NodeC)}); +} + +} // anonymous namespace class CommandGraphTest : public ::testing::Test { public: @@ -1074,3 +1138,166 @@ TEST_F(CommandGraphTest, Memcpy2DExceptionCheck) { sycl::free(USMMemSrc, Queue); sycl::free(USMMemDst, Queue); } + +class MultiThreadGraphTest : public CommandGraphTest { +public: + MultiThreadGraphTest() + : CommandGraphTest(), NumThreads(std::thread::hardware_concurrency()), + SyncPoint(NumThreads) { + Threads.reserve(NumThreads); + } + +protected: + const unsigned NumThreads; + Barrier SyncPoint; + std::vector Threads; +}; + +TEST_F(MultiThreadGraphTest, BeginEndRecording) { + auto RecordGraph = [&]() { + queue MyQueue{Queue.get_context(), Queue.get_device()}; + + SyncPoint.wait(); + + Graph.begin_recording(MyQueue); + runKernels(MyQueue); + Graph.end_recording(MyQueue); + }; + + for (unsigned i = 0; i < NumThreads; ++i) { + Threads.emplace_back(RecordGraph); + } + + for (unsigned i = 0; i < NumThreads; ++i) { + Threads[i].join(); + } + + // Reference computation + queue QueueRef; + experimental::command_graph GraphRef{ + Queue.get_context(), Queue.get_device()}; + + for (unsigned i = 0; i < NumThreads; ++i) { + queue MyQueue; + GraphRef.begin_recording(MyQueue); + runKernels(MyQueue); + GraphRef.end_recording(MyQueue); + } + + auto GraphImpl = sycl::detail::getSyclObjImpl(Graph); + auto GraphRefImpl = sycl::detail::getSyclObjImpl(GraphRef); + ASSERT_EQ(GraphImpl->hasSimilarStructure(GraphRefImpl), true); +} + +TEST_F(MultiThreadGraphTest, ExplicitAddNodes) { + auto RecordGraph = [&]() { + queue MyQueue{Queue.get_context(), Queue.get_device()}; + + SyncPoint.wait(); + addKernels(Graph); + }; + + for (unsigned i = 0; i < NumThreads; ++i) { + Threads.emplace_back(RecordGraph); + } + + for (unsigned i = 0; i < NumThreads; ++i) { + Threads[i].join(); + } + + // Reference computation + queue QueueRef; + experimental::command_graph GraphRef{ + Queue.get_context(), Queue.get_device()}; + + for (unsigned i = 0; i < NumThreads; ++i) { + addKernels(GraphRef); + } + + auto GraphImpl = sycl::detail::getSyclObjImpl(Graph); + auto GraphRefImpl = sycl::detail::getSyclObjImpl(GraphRef); + ASSERT_EQ(GraphImpl->hasSimilarStructure(GraphRefImpl), true); +} + +TEST_F(MultiThreadGraphTest, RecordAddNodes) { + Graph.begin_recording(Queue); + auto RecordGraph = [&]() { + SyncPoint.wait(); + runKernels(Queue); + }; + + for (unsigned i = 0; i < NumThreads; ++i) { + Threads.emplace_back(RecordGraph); + } + + for (unsigned i = 0; i < NumThreads; ++i) { + Threads[i].join(); + } + + // We stop recording the Queue when all threads have finished their processing + Graph.end_recording(Queue); + + // Reference computation + queue QueueRef; + experimental::command_graph GraphRef{ + Queue.get_context(), Queue.get_device()}; + + GraphRef.begin_recording(QueueRef); + for (unsigned i = 0; i < NumThreads; ++i) { + runKernels(QueueRef); + } + GraphRef.end_recording(QueueRef); + + auto GraphImpl = sycl::detail::getSyclObjImpl(Graph); + auto GraphRefImpl = sycl::detail::getSyclObjImpl(GraphRef); + ASSERT_EQ(GraphImpl->hasSimilarStructure(GraphRefImpl), true); +} + +TEST_F(MultiThreadGraphTest, RecordAddNodesInOrderQueue) { + sycl::property_list Properties{sycl::property::queue::in_order()}; + queue InOrderQueue{Dev, Properties}; + + experimental::command_graph + InOrderGraph{InOrderQueue.get_context(), InOrderQueue.get_device()}; + + InOrderGraph.begin_recording(InOrderQueue); + auto RecordGraph = [&]() { + SyncPoint.wait(); + runKernelsInOrder(InOrderQueue); + }; + + for (unsigned i = 0; i < NumThreads; ++i) { + Threads.emplace_back(RecordGraph); + } + + for (unsigned i = 0; i < NumThreads; ++i) { + Threads[i].join(); + } + + // We stop recording the Queue when all threads have finished their processing + InOrderGraph.end_recording(InOrderQueue); + + // Reference computation + queue InOrderQueueRef{Dev, Properties}; + experimental::command_graph + InOrderGraphRef{InOrderQueueRef.get_context(), + InOrderQueueRef.get_device()}; + + InOrderGraphRef.begin_recording(InOrderQueueRef); + for (unsigned i = 0; i < NumThreads; ++i) { + runKernelsInOrder(InOrderQueueRef); + } + InOrderGraphRef.end_recording(InOrderQueueRef); + + auto GraphImpl = sycl::detail::getSyclObjImpl(InOrderGraph); + auto GraphRefImpl = sycl::detail::getSyclObjImpl(InOrderGraphRef); + ASSERT_EQ(GraphImpl->getNumberOfNodes(), GraphRefImpl->getNumberOfNodes()); + + // In-order graph must have only a single root + ASSERT_EQ(GraphImpl->MRoots.size(), 1lu); + + // Check structure graph + for (auto Node : GraphImpl->MRoots) { + ASSERT_EQ(depthSearchSuccessorCheck(Node), true); + } +} From e578f69102c95ab2ac2ba8a1564aacdafe914467 Mon Sep 17 00:00:00 2001 From: Maxime France-Pillois Date: Mon, 7 Aug 2023 14:26:22 +0100 Subject: [PATCH 6/8] [SYCL][Graph] Makes command graph functions thread-safe (bugfix) Removes the test-e2e dependency to graph_impl.hpp by changing the e2e test to an unitests. --- .../Graph/Explicit/multiple_exec_graphs.cpp | 2 +- .../Graph/Inputs/multiple_exec_graphs.cpp | 93 ++--------- .../RecordReplay/multiple_exec_graphs.cpp | 2 +- sycl/test-e2e/format.py | 11 +- sycl/unittests/Extensions/CommandGraph.cpp | 152 +++++++++++++----- 5 files changed, 124 insertions(+), 136 deletions(-) diff --git a/sycl/test-e2e/Graph/Explicit/multiple_exec_graphs.cpp b/sycl/test-e2e/Graph/Explicit/multiple_exec_graphs.cpp index ac45368ed4e02..a414e3f4b8d6c 100644 --- a/sycl/test-e2e/Graph/Explicit/multiple_exec_graphs.cpp +++ b/sycl/test-e2e/Graph/Explicit/multiple_exec_graphs.cpp @@ -1,5 +1,5 @@ // REQUIRES: level_zero, gpu -// RUN: %{build_pthread_inc} -o %t.out +// RUN: %{build} -o %t.out // RUN: %{run} %t.out // Extra run to check for leaks in Level Zero using ZE_DEBUG // RUN: %if ext_oneapi_level_zero %{env ZE_DEBUG=4 %{run} %t.out 2>&1 | FileCheck %s %} diff --git a/sycl/test-e2e/Graph/Inputs/multiple_exec_graphs.cpp b/sycl/test-e2e/Graph/Inputs/multiple_exec_graphs.cpp index ab74b3aa89625..8bdf3caa1415d 100644 --- a/sycl/test-e2e/Graph/Inputs/multiple_exec_graphs.cpp +++ b/sycl/test-e2e/Graph/Inputs/multiple_exec_graphs.cpp @@ -2,40 +2,12 @@ // graph. #include "../graph_common.hpp" -#include - -#include - -bool checkExecGraphSchedule( - std::shared_ptr - GraphA, - std::shared_ptr - GraphB) { - auto ScheduleA = GraphA->getSchedule(); - auto ScheduleB = GraphB->getSchedule(); - if (ScheduleA.size() != ScheduleB.size()) - return false; - - std::vector< - std::shared_ptr> - VScheduleA{std::begin(ScheduleA), std::end(ScheduleA)}; - std::vector< - std::shared_ptr> - VScheduleB{std::begin(ScheduleB), std::end(ScheduleB)}; - - for (size_t i = 0; i < VScheduleA.size(); i++) { - if (!VScheduleA[i]->isSimilar(VScheduleB[i])) - return false; - } - return true; -} int main() { queue Queue; using T = int; - const unsigned NumThreads = std::thread::hardware_concurrency(); std::vector DataA(Size), DataB(Size), DataC(Size); std::iota(DataA.begin(), DataA.end(), 1); @@ -62,31 +34,15 @@ int main() { add_nodes(Graph, Queue, Size, PtrA, PtrB, PtrC); - Barrier SyncPoint{NumThreads}; - - std::map> - GraphsExecMap; - auto FinalizeGraph = [&](int ThreadNum) { - SyncPoint.wait(); + // Finalize and execute several iterations of the graph + event Event; + for (unsigned n = 0; n < Iterations; n++) { auto GraphExec = Graph.finalize(); - GraphsExecMap.insert( - std::map>:: - value_type(ThreadNum, GraphExec)); - Queue.submit([&](sycl::handler &CGH) { CGH.ext_oneapi_graph(GraphExec); }); - }; - - std::vector Threads; - Threads.reserve(NumThreads); - - for (unsigned i = 0; i < NumThreads; ++i) { - Threads.emplace_back(FinalizeGraph, i); + Event = Queue.submit([&](handler &CGH) { + CGH.depends_on(Event); + CGH.ext_oneapi_graph(GraphExec); + }); } - - for (unsigned i = 0; i < NumThreads; ++i) { - Threads[i].join(); - } - Queue.wait_and_throw(); Queue.copy(PtrA, DataA.data(), Size); @@ -94,40 +50,13 @@ int main() { Queue.copy(PtrC, DataC.data(), Size); Queue.wait_and_throw(); - // Ref computation - queue QueueRef{Queue.get_context(), Queue.get_device()}; - exp_ext::command_graph GraphRef{Queue.get_context(), Queue.get_device()}; - - T *PtrARef = malloc_device(Size, QueueRef); - T *PtrBRef = malloc_device(Size, QueueRef); - T *PtrCRef = malloc_device(Size, QueueRef); - - QueueRef.copy(DataA.data(), PtrARef, Size); - QueueRef.copy(DataB.data(), PtrBRef, Size); - QueueRef.copy(DataC.data(), PtrCRef, Size); - QueueRef.wait_and_throw(); - - add_nodes(GraphRef, QueueRef, Size, PtrARef, PtrBRef, PtrCRef); - - for (unsigned i = 0; i < NumThreads; ++i) { - auto GraphExecRef = GraphRef.finalize(); - QueueRef.submit( - [&](sycl::handler &CGH) { CGH.ext_oneapi_graph(GraphExecRef); }); - auto GraphExecImpl = - sycl::detail::getSyclObjImpl(GraphsExecMap.find(i)->second); - auto GraphExecRefImpl = sycl::detail::getSyclObjImpl(GraphExecRef); - assert(checkExecGraphSchedule(GraphExecImpl, GraphExecRefImpl)); - } - - QueueRef.wait_and_throw(); - - free(PtrARef, QueueRef); - free(PtrBRef, QueueRef); - free(PtrCRef, QueueRef); - free(PtrA, Queue); free(PtrB, Queue); free(PtrC, Queue); + assert(ReferenceA == DataA); + assert(ReferenceB == DataB); + assert(ReferenceC == DataC); + return 0; } diff --git a/sycl/test-e2e/Graph/RecordReplay/multiple_exec_graphs.cpp b/sycl/test-e2e/Graph/RecordReplay/multiple_exec_graphs.cpp index 9b84b62380e5b..8a59f12d316b4 100644 --- a/sycl/test-e2e/Graph/RecordReplay/multiple_exec_graphs.cpp +++ b/sycl/test-e2e/Graph/RecordReplay/multiple_exec_graphs.cpp @@ -1,5 +1,5 @@ // REQUIRES: level_zero, gpu -// RUN: %{build_pthread_inc} -o %t.out +// RUN: %{build} -o %t.out // RUN: %{run} %t.out // Extra run to check for leaks in Level Zero using ZE_DEBUG // RUN: %if ext_oneapi_level_zero %{env ZE_DEBUG=4 %{run} %t.out 2>&1 | FileCheck %s %} diff --git a/sycl/test-e2e/format.py b/sycl/test-e2e/format.py index 53031ecadbb7c..2a90ce4f8cd3f 100644 --- a/sycl/test-e2e/format.py +++ b/sycl/test-e2e/format.py @@ -100,17 +100,8 @@ def execute(self, test, litConfig): # -that new tests by default would runnable there (unless they have # -other restrictions). substitutions.append(('%{build}', '%clangxx -fsycl -fsycl-targets=%{sycl_triple} %s')) - - # get GIT root path - stream = os.popen('git rev-parse --show-toplevel') - git_root_path = stream.read()[:-1] - - if 'windows' in test.config.available_features: - source_files_path = git_root_path+"\sycl\source" - else: - source_files_path = git_root_path+"/sycl/source" - compilation_cmd_pthread = "%clangxx -I" + source_files_path + " -pthread -fsycl -fsycl-targets=%{sycl_triple} %s" + compilation_cmd_pthread = "%clangxx -pthread -fsycl -fsycl-targets=%{sycl_triple} %s" substitutions.append(('%{build_pthread_inc}', compilation_cmd_pthread)) def get_extra_env(sycl_devices): diff --git a/sycl/unittests/Extensions/CommandGraph.cpp b/sycl/unittests/Extensions/CommandGraph.cpp index beacadce8bc00..a413b291c1c88 100644 --- a/sycl/unittests/Extensions/CommandGraph.cpp +++ b/sycl/unittests/Extensions/CommandGraph.cpp @@ -15,6 +15,7 @@ #include #include #include +#include #include @@ -310,18 +311,18 @@ bool depthSearchSuccessorCheck( /// @param Q Queue to submit nodes to. void runKernels(queue Q) { auto NodeA = Q.submit( - [&](sycl::handler &cgh) { cgh.single_task([]() {}); }); + [&](sycl::handler &cgh) { cgh.single_task>([]() {}); }); auto NodeB = Q.submit([&](sycl::handler &cgh) { cgh.depends_on(NodeA); - cgh.single_task([]() {}); + cgh.single_task>([]() {}); }); auto NodeC = Q.submit([&](sycl::handler &cgh) { cgh.depends_on(NodeA); - cgh.single_task([]() {}); + cgh.single_task>([]() {}); }); auto NodeD = Q.submit([&](sycl::handler &cgh) { cgh.depends_on({NodeB, NodeC}); - cgh.single_task([]() {}); + cgh.single_task>([]() {}); }); } @@ -329,13 +330,13 @@ void runKernels(queue Q) { /// @param Q Queue to submit nodes to. void runKernelsInOrder(queue Q) { auto NodeA = Q.submit( - [&](sycl::handler &cgh) { cgh.single_task([]() {}); }); + [&](sycl::handler &cgh) { cgh.single_task>([]() {}); }); auto NodeB = Q.submit( - [&](sycl::handler &cgh) { cgh.single_task([]() {}); }); + [&](sycl::handler &cgh) { cgh.single_task>([]() {}); }); auto NodeC = Q.submit( - [&](sycl::handler &cgh) { cgh.single_task([]() {}); }); + [&](sycl::handler &cgh) { cgh.single_task>([]() {}); }); auto NodeD = Q.submit( - [&](sycl::handler &cgh) { cgh.single_task([]() {}); }); + [&](sycl::handler &cgh) { cgh.single_task>([]() {}); }); } /// Adds four kernels with diamond dependency to the Graph G @@ -343,16 +344,40 @@ void runKernelsInOrder(queue Q) { void addKernels( experimental::command_graph G) { auto NodeA = G.add( - [&](sycl::handler &cgh) { cgh.single_task([]() {}); }); - auto NodeB = G.add( - [&](sycl::handler &cgh) { cgh.single_task([]() {}); }, - {experimental::property::node::depends_on(NodeA)}); - auto NodeC = G.add( - [&](sycl::handler &cgh) { cgh.single_task([]() {}); }, - {experimental::property::node::depends_on(NodeA)}); - auto NodeD = G.add( - [&](sycl::handler &cgh) { cgh.single_task([]() {}); }, - {experimental::property::node::depends_on(NodeB, NodeC)}); + [&](sycl::handler &cgh) { cgh.single_task>([]() {}); }); + auto NodeB = + G.add([&](sycl::handler &cgh) { cgh.single_task>([]() {}); }, + {experimental::property::node::depends_on(NodeA)}); + auto NodeC = + G.add([&](sycl::handler &cgh) { cgh.single_task>([]() {}); }, + {experimental::property::node::depends_on(NodeA)}); + auto NodeD = + G.add([&](sycl::handler &cgh) { cgh.single_task>([]() {}); }, + {experimental::property::node::depends_on(NodeB, NodeC)}); +} + +bool checkExecGraphSchedule( + std::shared_ptr + GraphA, + std::shared_ptr + GraphB) { + auto ScheduleA = GraphA->getSchedule(); + auto ScheduleB = GraphB->getSchedule(); + if (ScheduleA.size() != ScheduleB.size()) + return false; + + std::vector< + std::shared_ptr> + VScheduleA{std::begin(ScheduleA), std::end(ScheduleA)}; + std::vector< + std::shared_ptr> + VScheduleB{std::begin(ScheduleB), std::end(ScheduleB)}; + + for (size_t i = 0; i < VScheduleA.size(); i++) { + if (!VScheduleA[i]->isSimilar(VScheduleB[i])) + return false; + } + return true; } } // anonymous namespace @@ -394,7 +419,7 @@ TEST_F(CommandGraphTest, AddNode) { ASSERT_TRUE(GraphImpl->MRoots.empty()); auto Node1 = Graph.add( - [&](sycl::handler &cgh) { cgh.single_task([]() {}); }); + [&](sycl::handler &cgh) { cgh.single_task>([]() {}); }); ASSERT_NE(sycl::detail::getSyclObjImpl(Node1), nullptr); ASSERT_FALSE(sycl::detail::getSyclObjImpl(Node1)->isEmpty()); ASSERT_EQ(GraphImpl->MRoots.size(), 1lu); @@ -454,7 +479,7 @@ TEST_F(CommandGraphTest, Finalize) { // Add independent node auto Node2 = Graph.add( - [&](sycl::handler &cgh) { cgh.single_task([]() {}); }); + [&](sycl::handler &cgh) { cgh.single_task>([]() {}); }); // Add a node that depends on Node1 due to the accessor auto Node3 = Graph.add([&](sycl::handler &cgh) { @@ -485,7 +510,7 @@ TEST_F(CommandGraphTest, MakeEdge) { // Add two independent nodes auto Node1 = Graph.add( - [&](sycl::handler &cgh) { cgh.single_task([]() {}); }); + [&](sycl::handler &cgh) { cgh.single_task>([]() {}); }); auto Node2 = Graph.add([&](sycl::handler &cgh) {}); ASSERT_EQ(GraphImpl->MRoots.size(), 2ul); ASSERT_TRUE(sycl::detail::getSyclObjImpl(Node1)->MSuccessors.empty()); @@ -579,7 +604,7 @@ TEST_F(CommandGraphTest, BeginEndRecording) { TEST_F(CommandGraphTest, GetCGCopy) { auto Node1 = Graph.add([&](sycl::handler &cgh) {}); auto Node2 = Graph.add( - [&](sycl::handler &cgh) { cgh.single_task([]() {}); }, + [&](sycl::handler &cgh) { cgh.single_task>([]() {}); }, {experimental::property::node::depends_on(Node1)}); // Get copy of CG of Node2 and check equality @@ -601,21 +626,21 @@ TEST_F(CommandGraphTest, GetCGCopy) { TEST_F(CommandGraphTest, SubGraph) { // Add sub-graph with two nodes auto Node1Graph = Graph.add( - [&](sycl::handler &cgh) { cgh.single_task([]() {}); }); + [&](sycl::handler &cgh) { cgh.single_task>([]() {}); }); auto Node2Graph = Graph.add( - [&](sycl::handler &cgh) { cgh.single_task([]() {}); }, + [&](sycl::handler &cgh) { cgh.single_task>([]() {}); }, {experimental::property::node::depends_on(Node1Graph)}); auto GraphExec = Graph.finalize(); // Add node to main graph followed by sub-graph and another node experimental::command_graph MainGraph(Queue.get_context(), Dev); auto Node1MainGraph = MainGraph.add( - [&](sycl::handler &cgh) { cgh.single_task([]() {}); }); + [&](sycl::handler &cgh) { cgh.single_task>([]() {}); }); auto Node2MainGraph = MainGraph.add([&](handler &CGH) { CGH.ext_oneapi_graph(GraphExec); }, {experimental::property::node::depends_on(Node1MainGraph)}); auto Node3MainGraph = MainGraph.add( - [&](sycl::handler &cgh) { cgh.single_task([]() {}); }, + [&](sycl::handler &cgh) { cgh.single_task>([]() {}); }, {experimental::property::node::depends_on(Node2MainGraph)}); // Assert order of the added sub-graph @@ -653,10 +678,10 @@ TEST_F(CommandGraphTest, RecordSubGraph) { // Record sub-graph with two nodes Graph.begin_recording(Queue); auto Node1Graph = Queue.submit( - [&](sycl::handler &cgh) { cgh.single_task([]() {}); }); + [&](sycl::handler &cgh) { cgh.single_task>([]() {}); }); auto Node2Graph = Queue.submit([&](sycl::handler &cgh) { cgh.depends_on(Node1Graph); - cgh.single_task([]() {}); + cgh.single_task>([]() {}); }); Graph.end_recording(Queue); auto GraphExec = Graph.finalize(); @@ -665,14 +690,14 @@ TEST_F(CommandGraphTest, RecordSubGraph) { experimental::command_graph MainGraph(Queue.get_context(), Dev); MainGraph.begin_recording(Queue); auto Node1MainGraph = Queue.submit( - [&](sycl::handler &cgh) { cgh.single_task([]() {}); }); + [&](sycl::handler &cgh) { cgh.single_task>([]() {}); }); auto Node2MainGraph = Queue.submit([&](handler &cgh) { cgh.depends_on(Node1MainGraph); cgh.ext_oneapi_graph(GraphExec); }); auto Node3MainGraph = Queue.submit([&](sycl::handler &cgh) { cgh.depends_on(Node2MainGraph); - cgh.single_task([]() {}); + cgh.single_task>([]() {}); }); MainGraph.end_recording(Queue); @@ -722,7 +747,7 @@ TEST_F(CommandGraphTest, InOrderQueue) { // Record in-order queue with three nodes InOrderGraph.begin_recording(InOrderQueue); auto Node1Graph = InOrderQueue.submit( - [&](sycl::handler &cgh) { cgh.single_task([]() {}); }); + [&](sycl::handler &cgh) { cgh.single_task>([]() {}); }); auto PtrNode1 = sycl::detail::getSyclObjImpl(InOrderGraph) @@ -731,7 +756,7 @@ TEST_F(CommandGraphTest, InOrderQueue) { ASSERT_TRUE(PtrNode1->MPredecessors.empty()); auto Node2Graph = InOrderQueue.submit( - [&](sycl::handler &cgh) { cgh.single_task([]() {}); }); + [&](sycl::handler &cgh) { cgh.single_task>([]() {}); }); auto PtrNode2 = sycl::detail::getSyclObjImpl(InOrderGraph) @@ -744,7 +769,7 @@ TEST_F(CommandGraphTest, InOrderQueue) { ASSERT_EQ(PtrNode2->MPredecessors.front().lock(), PtrNode1); auto Node3Graph = InOrderQueue.submit( - [&](sycl::handler &cgh) { cgh.single_task([]() {}); }); + [&](sycl::handler &cgh) { cgh.single_task>([]() {}); }); auto PtrNode3 = sycl::detail::getSyclObjImpl(InOrderGraph) @@ -782,7 +807,7 @@ TEST_F(CommandGraphTest, InOrderQueueWithEmpty) { // node InOrderGraph.begin_recording(InOrderQueue); auto Node1Graph = InOrderQueue.submit( - [&](sycl::handler &cgh) { cgh.single_task([]() {}); }); + [&](sycl::handler &cgh) { cgh.single_task>([]() {}); }); auto PtrNode1 = sycl::detail::getSyclObjImpl(InOrderGraph) @@ -803,7 +828,7 @@ TEST_F(CommandGraphTest, InOrderQueueWithEmpty) { ASSERT_EQ(PtrNode2->MPredecessors.front().lock(), PtrNode1); auto Node3Graph = InOrderQueue.submit( - [&](sycl::handler &cgh) { cgh.single_task([]() {}); }); + [&](sycl::handler &cgh) { cgh.single_task>([]() {}); }); auto PtrNode3 = sycl::detail::getSyclObjImpl(InOrderGraph) @@ -847,7 +872,7 @@ TEST_F(CommandGraphTest, InOrderQueueWithEmptyFirst) { ASSERT_TRUE(PtrNode1->MPredecessors.empty()); auto Node2Graph = InOrderQueue.submit( - [&](sycl::handler &cgh) { cgh.single_task([]() {}); }); + [&](sycl::handler &cgh) { cgh.single_task>([]() {}); }); auto PtrNode2 = sycl::detail::getSyclObjImpl(InOrderGraph) @@ -860,7 +885,7 @@ TEST_F(CommandGraphTest, InOrderQueueWithEmptyFirst) { ASSERT_EQ(PtrNode2->MPredecessors.front().lock(), PtrNode1); auto Node3Graph = InOrderQueue.submit( - [&](sycl::handler &cgh) { cgh.single_task([]() {}); }); + [&](sycl::handler &cgh) { cgh.single_task>([]() {}); }); auto PtrNode3 = sycl::detail::getSyclObjImpl(InOrderGraph) @@ -896,7 +921,7 @@ TEST_F(CommandGraphTest, InOrderQueueWithEmptyLast) { // Record in-order queue with two regular nodes then an empty node InOrderGraph.begin_recording(InOrderQueue); auto Node1Graph = InOrderQueue.submit( - [&](sycl::handler &cgh) { cgh.single_task([]() {}); }); + [&](sycl::handler &cgh) { cgh.single_task>([]() {}); }); auto PtrNode1 = sycl::detail::getSyclObjImpl(InOrderGraph) @@ -905,7 +930,7 @@ TEST_F(CommandGraphTest, InOrderQueueWithEmptyLast) { ASSERT_TRUE(PtrNode1->MPredecessors.empty()); auto Node2Graph = InOrderQueue.submit( - [&](sycl::handler &cgh) { cgh.single_task([]() {}); }); + [&](sycl::handler &cgh) { cgh.single_task>([]() {}); }); auto PtrNode2 = sycl::detail::getSyclObjImpl(InOrderGraph) @@ -947,9 +972,9 @@ TEST_F(CommandGraphTest, InOrderQueueWithEmptyLast) { TEST_F(CommandGraphTest, MakeEdgeErrors) { // Set up some nodes in the graph auto NodeA = Graph.add( - [&](sycl::handler &cgh) { cgh.single_task([]() {}); }); + [&](sycl::handler &cgh) { cgh.single_task>([]() {}); }); auto NodeB = Graph.add( - [&](sycl::handler &cgh) { cgh.single_task([]() {}); }); + [&](sycl::handler &cgh) { cgh.single_task>([]() {}); }); // Test error on calling make_edge when a queue is recording to the graph Graph.begin_recording(Queue); @@ -982,7 +1007,7 @@ TEST_F(CommandGraphTest, MakeEdgeErrors) { experimental::command_graph GraphOther{ Queue.get_context(), Queue.get_device()}; auto NodeOther = GraphOther.add( - [&](sycl::handler &cgh) { cgh.single_task([]() {}); }); + [&](sycl::handler &cgh) { cgh.single_task>([]() {}); }); ASSERT_THROW( { @@ -1301,3 +1326,46 @@ TEST_F(MultiThreadGraphTest, RecordAddNodesInOrderQueue) { ASSERT_EQ(depthSearchSuccessorCheck(Node), true); } } + +TEST_F(MultiThreadGraphTest, Finalize) { + addKernels(Graph); + + std::map> + GraphsExecMap; + auto FinalizeGraph = [&](int ThreadNum) { + SyncPoint.wait(); + auto GraphExec = Graph.finalize(); + + GraphsExecMap.insert( + std::map>:: + value_type(ThreadNum, GraphExec)); + Queue.submit([&](sycl::handler &CGH) { CGH.ext_oneapi_graph(GraphExec); }); + }; + + for (unsigned i = 0; i < NumThreads; ++i) { + Threads.emplace_back(FinalizeGraph, i); + } + + for (unsigned i = 0; i < NumThreads; ++i) { + Threads[i].join(); + } + + // Reference computation + queue QueueRef; + experimental::command_graph GraphRef{ + Queue.get_context(), Queue.get_device()}; + + addKernels(GraphRef); + + for (unsigned i = 0; i < NumThreads; ++i) { + auto GraphExecRef = GraphRef.finalize(); + QueueRef.submit( + [&](sycl::handler &CGH) { CGH.ext_oneapi_graph(GraphExecRef); }); + auto GraphExecImpl = + sycl::detail::getSyclObjImpl(GraphsExecMap.find(i)->second); + auto GraphExecRefImpl = sycl::detail::getSyclObjImpl(GraphExecRef); + ASSERT_EQ(checkExecGraphSchedule(GraphExecImpl, GraphExecRefImpl), true); + } +} From dc236a01be27be5cf922624e8ce1ff5f438b33cf Mon Sep 17 00:00:00 2001 From: Maxime France-Pillois Date: Mon, 7 Aug 2023 17:24:23 +0100 Subject: [PATCH 7/8] [SYCL][Graph] Throw exception when creating graph for unsupported backend (#280) * [SYCL][Graph] Throw exception when creating graph for unsupported backend - Checks backend when creating graphs and throws an exception is the backend is not supported. - Adds an e2e test to verify this exception throwing. - Updates some comments - Improves mock usage in Unitest to avoid having to force emulation mode --------- Co-authored-by: Pablo Reble Co-authored-by: Julian Miller --- sycl/source/detail/graph_impl.hpp | 10 ++++++ .../Graph/exception_unsupported_backend.cpp | 35 +++++++++++++++++++ sycl/unittests/Extensions/CommandGraph.cpp | 4 +-- sycl/unittests/helpers/PiMockPlugin.hpp | 2 +- 4 files changed, 48 insertions(+), 3 deletions(-) create mode 100644 sycl/test-e2e/Graph/exception_unsupported_backend.cpp diff --git a/sycl/source/detail/graph_impl.hpp b/sycl/source/detail/graph_impl.hpp index 731b7914b2ae1..a154ae8f8b18f 100644 --- a/sycl/source/detail/graph_impl.hpp +++ b/sycl/source/detail/graph_impl.hpp @@ -373,6 +373,16 @@ class graph_impl { if (PropList.has_property()) { MSkipCycleChecks = true; } + if (SyclDevice.get_info< + ext::oneapi::experimental::info::device::graph_support>() == + info::graph_support_level::unsupported) { + std::stringstream Stream; + Stream << SyclDevice.get_backend(); + std::string BackendString = Stream.str(); + throw sycl::exception( + sycl::make_error_code(errc::invalid), + BackendString + " backend is not supported by SYCL Graph extension."); + } } /// Remove node from list of root nodes. diff --git a/sycl/test-e2e/Graph/exception_unsupported_backend.cpp b/sycl/test-e2e/Graph/exception_unsupported_backend.cpp new file mode 100644 index 0000000000000..8b3e079aa476f --- /dev/null +++ b/sycl/test-e2e/Graph/exception_unsupported_backend.cpp @@ -0,0 +1,35 @@ +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out + +// Tests the ability to finalize a empty command graph +// The test checks that invalid exception is thrown +// when trying to create a graph with an unsupported backend. + +#include "graph_common.hpp" + +int GetUnsupportedBackend(const sycl::device &Dev) { + // Return 1 if the device backend is unsupported or 0 else. + // 0 does not prevent another device to be picked as a second choice + return Dev.get_info< + ext::oneapi::experimental::info::device::graph_support>() == + ext::oneapi::experimental::info::graph_support_level::unsupported; +} + +int main() { + sycl::device Dev{GetUnsupportedBackend}; + queue Queue{Dev}; + + if (Dev.get_info() != + ext::oneapi::experimental::info::graph_support_level::unsupported) + return 0; + + std::error_code ExceptionCode = make_error_code(sycl::errc::success); + try { + exp_ext::command_graph Graph{Queue.get_context(), Dev}; + } catch (exception &Exception) { + ExceptionCode = Exception.code(); + } + assert(ExceptionCode == sycl::errc::invalid); + + return 0; +} diff --git a/sycl/unittests/Extensions/CommandGraph.cpp b/sycl/unittests/Extensions/CommandGraph.cpp index a413b291c1c88..bc5982a6970f8 100644 --- a/sycl/unittests/Extensions/CommandGraph.cpp +++ b/sycl/unittests/Extensions/CommandGraph.cpp @@ -474,7 +474,7 @@ TEST_F(CommandGraphTest, Finalize) { sycl::buffer Buf(1); auto Node1 = Graph.add([&](sycl::handler &cgh) { sycl::accessor A(Buf, cgh, sycl::write_only, sycl::no_init); - cgh.single_task([=]() { A[0] = 1; }); + cgh.single_task>([]() {}); }); // Add independent node @@ -484,7 +484,7 @@ TEST_F(CommandGraphTest, Finalize) { // Add a node that depends on Node1 due to the accessor auto Node3 = Graph.add([&](sycl::handler &cgh) { sycl::accessor A(Buf, cgh, sycl::write_only, sycl::no_init); - cgh.single_task([=]() { A[0] = 3; }); + cgh.single_task>([]() {}); }); // Guarantee order of independent nodes 1 and 2 diff --git a/sycl/unittests/helpers/PiMockPlugin.hpp b/sycl/unittests/helpers/PiMockPlugin.hpp index 23d1f8f24daae..dac3fce0fddc9 100644 --- a/sycl/unittests/helpers/PiMockPlugin.hpp +++ b/sycl/unittests/helpers/PiMockPlugin.hpp @@ -163,7 +163,7 @@ inline pi_result mock_piDeviceGetInfo(pi_device device, size_t *param_value_size_ret) { constexpr char MockDeviceName[] = "Mock device"; constexpr char MockSupportedExtensions[] = - "cl_khr_fp64 cl_khr_fp16 cl_khr_il_program"; + "cl_khr_fp64 cl_khr_fp16 cl_khr_il_program ur_exp_command_buffer"; switch (param_name) { case PI_DEVICE_INFO_TYPE: { // Act like any device is a GPU. From 2d155961e94491dc99abd63395332f03a7c0fd21 Mon Sep 17 00:00:00 2001 From: Ben Tracy Date: Wed, 9 Aug 2023 15:47:55 +0100 Subject: [PATCH 8/8] [SYCL][Graph] Error on invalid buffer behaviour (#287) * [SYCL][Graph] Makes command graph functions thread-safe (bugfix) Removes the test-e2e dependency to graph_impl.hpp by changing the e2e test to an unitests. * [SYCL][Graph] Error on invalid buffer behaviour - Error for buffers with write back enabled. - Error for host accessors to buffers in use by a graph. - Unit tests added for these cases - Fixed E2E tests which were not compliant with these restrictions. * [SYCL][Graph] Implement assume_data_outlives_buffer property - Add assume_data_outlives_buffer property - Add check to disallow buffers with host data without this property - Add a generic test for this. - Update E2E tests which now require the property * Update symbol dumps * [SYCL][Graph] Implement assume_buffer_outlives_graph property and checks * [SYCL][Graph] Add E2E test for assume_buffer_outlives_graph property * [SYCL][Graph] Fix circular reference between queue and graph_impl - Use weak_ptrs in both classes to prevent circular references and unnecessary lifetime extensions - Update unit tests for new properties * [SYCL][Graph] Remove unnecessary test line * [SYCL][Graph] Make graph use count in memobjects atomic * [SYCL][Graph] Make markNoLongerBeingUsedInGraph() thread safe --------- Co-authored-by: Maxime France-Pillois Co-authored-by: Ewan Crawford --- sycl/include/sycl/accessor.hpp | 25 +++ sycl/include/sycl/detail/property_helper.hpp | 4 +- .../sycl/ext/oneapi/experimental/graph.hpp | 24 ++- sycl/source/accessor.cpp | 5 + sycl/source/detail/graph_impl.cpp | 34 +++- sycl/source/detail/graph_impl.hpp | 35 +++- sycl/source/detail/queue_impl.cpp | 4 +- sycl/source/detail/queue_impl.hpp | 5 +- sycl/source/detail/sycl_mem_obj_t.hpp | 29 ++++ sycl/source/handler.cpp | 7 + .../assume_buffer_outlives_graph_property.cpp | 11 ++ .../assume_data_outlives_buffer_property.cpp | 11 ++ .../assume_buffer_outlives_graph_property.cpp | 52 ++++++ .../assume_data_outlives_buffer_property.cpp | 84 ++++++++++ sycl/test-e2e/Graph/Inputs/basic_buffer.cpp | 36 +++-- sycl/test-e2e/Graph/Inputs/buffer_copy.cpp | 152 +++++++++--------- sycl/test-e2e/Graph/Inputs/buffer_copy_2d.cpp | 141 ++++++++-------- .../Graph/Inputs/buffer_copy_host2target.cpp | 22 ++- .../Inputs/buffer_copy_host2target_2d.cpp | 21 ++- .../Inputs/buffer_copy_host2target_offset.cpp | 26 +-- .../Graph/Inputs/buffer_copy_offsets.cpp | 32 ++-- .../Graph/Inputs/buffer_copy_target2host.cpp | 22 ++- .../Inputs/buffer_copy_target2host_2d.cpp | 22 ++- .../Inputs/buffer_copy_target2host_offset.cpp | 26 +-- .../test-e2e/Graph/Inputs/buffer_ordering.cpp | 5 +- .../Graph/Inputs/dotp_buffer_reduction.cpp | 31 ++-- .../Graph/Inputs/dotp_usm_reduction.cpp | 6 +- .../Graph/Inputs/event_status_querying.cpp | 125 +++++++------- .../Graph/Inputs/sub_graph_reduction.cpp | 12 +- .../Graph/Inputs/temp_buffer_reinterpret.cpp | 19 ++- .../assume_buffer_outlives_graph_property.cpp | 11 ++ .../assume_data_outlives_buffer_property.cpp | 11 ++ sycl/test-e2e/format.py | 2 +- sycl/test/abi/sycl_symbols_linux.dump | 1 + sycl/test/abi/sycl_symbols_windows.dump | 51 +++--- sycl/unittests/Extensions/CommandGraph.cpp | 62 ++++++- 36 files changed, 825 insertions(+), 341 deletions(-) create mode 100644 sycl/test-e2e/Graph/Explicit/assume_buffer_outlives_graph_property.cpp create mode 100644 sycl/test-e2e/Graph/Explicit/assume_data_outlives_buffer_property.cpp create mode 100644 sycl/test-e2e/Graph/Inputs/assume_buffer_outlives_graph_property.cpp create mode 100644 sycl/test-e2e/Graph/Inputs/assume_data_outlives_buffer_property.cpp create mode 100644 sycl/test-e2e/Graph/RecordReplay/assume_buffer_outlives_graph_property.cpp create mode 100644 sycl/test-e2e/Graph/RecordReplay/assume_data_outlives_buffer_property.cpp diff --git a/sycl/include/sycl/accessor.hpp b/sycl/include/sycl/accessor.hpp index 9a673a3278ec1..dda66e19b01af 100644 --- a/sycl/include/sycl/accessor.hpp +++ b/sycl/include/sycl/accessor.hpp @@ -540,6 +540,7 @@ class __SYCL_EXPORT AccessorBaseHost { const range<3> &getMemoryRange() const; void *getPtr() const noexcept; bool isPlaceholder() const; + bool isMemoryObjectUsedByGraph() const; detail::AccHostDataT &getAccData(); @@ -1454,6 +1455,18 @@ class __SYCL_EBO __SYCL_SPECIAL_CLASS __SYCL_TYPE(accessor) accessor : typename std::iterator_traits::difference_type; using size_type = std::size_t; + /// If creating a host_accessor this checks to see if the underlying memory + /// object is currently in use by a command_graph, and throws if it is. + void throwIfUsedByGraph() const { +#ifndef __SYCL_DEVICE_ONLY__ + if (IsHostBuf && AccessorBaseHost::isMemoryObjectUsedByGraph()) { + throw sycl::exception(make_error_code(errc::invalid), + "Host accessors cannot be created for buffers " + "which are currently in use by a command graph."); + } +#endif + } + // The list of accessor constructors with their arguments // -------+---------+-------+----+-----+-------------- // Dimensions = 0 @@ -1533,6 +1546,7 @@ class __SYCL_EBO __SYCL_SPECIAL_CLASS __SYCL_TYPE(accessor) accessor : detail::getSyclObjImpl(BufferRef).get(), AdjustedDim, sizeof(DataT), IsPlaceH, BufferRef.OffsetInBytes, BufferRef.IsSubBuffer, PropertyList) { + throwIfUsedByGraph(); preScreenAccessor(PropertyList); if (!AccessorBaseHost::isPlaceholder()) addHostAccessorAndWait(AccessorBaseHost::impl.get()); @@ -1572,6 +1586,7 @@ class __SYCL_EBO __SYCL_SPECIAL_CLASS __SYCL_TYPE(accessor) accessor : detail::getSyclObjImpl(BufferRef).get(), AdjustedDim, sizeof(DataT), IsPlaceH, BufferRef.OffsetInBytes, BufferRef.IsSubBuffer, PropertyList) { + throwIfUsedByGraph(); preScreenAccessor(PropertyList); if (!AccessorBaseHost::isPlaceholder()) addHostAccessorAndWait(AccessorBaseHost::impl.get()); @@ -1607,6 +1622,7 @@ class __SYCL_EBO __SYCL_SPECIAL_CLASS __SYCL_TYPE(accessor) accessor : getAdjustedMode(PropertyList), detail::getSyclObjImpl(BufferRef).get(), Dimensions, sizeof(DataT), BufferRef.OffsetInBytes, BufferRef.IsSubBuffer, PropertyList) { + throwIfUsedByGraph(); preScreenAccessor(PropertyList); detail::associateWithHandler(CommandGroupHandler, this, AccessTarget); initHostAcc(); @@ -1643,6 +1659,7 @@ class __SYCL_EBO __SYCL_SPECIAL_CLASS __SYCL_TYPE(accessor) accessor : getAdjustedMode(PropertyList), detail::getSyclObjImpl(BufferRef).get(), Dimensions, sizeof(DataT), BufferRef.OffsetInBytes, BufferRef.IsSubBuffer, PropertyList) { + throwIfUsedByGraph(); preScreenAccessor(PropertyList); detail::associateWithHandler(CommandGroupHandler, this, AccessTarget); initHostAcc(); @@ -1675,6 +1692,7 @@ class __SYCL_EBO __SYCL_SPECIAL_CLASS __SYCL_TYPE(accessor) accessor : detail::getSyclObjImpl(BufferRef).get(), Dimensions, sizeof(DataT), IsPlaceH, BufferRef.OffsetInBytes, BufferRef.IsSubBuffer, PropertyList) { + throwIfUsedByGraph(); preScreenAccessor(PropertyList); if (!AccessorBaseHost::isPlaceholder()) addHostAccessorAndWait(AccessorBaseHost::impl.get()); @@ -1710,6 +1728,7 @@ class __SYCL_EBO __SYCL_SPECIAL_CLASS __SYCL_TYPE(accessor) accessor : detail::getSyclObjImpl(BufferRef).get(), Dimensions, sizeof(DataT), IsPlaceH, BufferRef.OffsetInBytes, BufferRef.IsSubBuffer, PropertyList) { + throwIfUsedByGraph(); preScreenAccessor(PropertyList); if (!AccessorBaseHost::isPlaceholder()) addHostAccessorAndWait(AccessorBaseHost::impl.get()); @@ -1772,6 +1791,7 @@ class __SYCL_EBO __SYCL_SPECIAL_CLASS __SYCL_TYPE(accessor) accessor : getAdjustedMode(PropertyList), detail::getSyclObjImpl(BufferRef).get(), Dimensions, sizeof(DataT), BufferRef.OffsetInBytes, BufferRef.IsSubBuffer, PropertyList) { + throwIfUsedByGraph(); preScreenAccessor(PropertyList); detail::associateWithHandler(CommandGroupHandler, this, AccessTarget); initHostAcc(); @@ -1806,6 +1826,7 @@ class __SYCL_EBO __SYCL_SPECIAL_CLASS __SYCL_TYPE(accessor) accessor : getAdjustedMode(PropertyList), detail::getSyclObjImpl(BufferRef).get(), Dimensions, sizeof(DataT), BufferRef.OffsetInBytes, BufferRef.IsSubBuffer, PropertyList) { + throwIfUsedByGraph(); preScreenAccessor(PropertyList); initHostAcc(); detail::associateWithHandler(CommandGroupHandler, this, AccessTarget); @@ -1981,6 +2002,7 @@ class __SYCL_EBO __SYCL_SPECIAL_CLASS __SYCL_TYPE(accessor) accessor : detail::getSyclObjImpl(BufferRef).get(), Dimensions, sizeof(DataT), IsPlaceH, BufferRef.OffsetInBytes, BufferRef.IsSubBuffer, PropertyList) { + throwIfUsedByGraph(); preScreenAccessor(PropertyList); if (!AccessorBaseHost::isPlaceholder()) addHostAccessorAndWait(AccessorBaseHost::impl.get()); @@ -2023,6 +2045,7 @@ class __SYCL_EBO __SYCL_SPECIAL_CLASS __SYCL_TYPE(accessor) accessor : detail::getSyclObjImpl(BufferRef).get(), Dimensions, sizeof(DataT), IsPlaceH, BufferRef.OffsetInBytes, BufferRef.IsSubBuffer, PropertyList) { + throwIfUsedByGraph(); preScreenAccessor(PropertyList); if (!AccessorBaseHost::isPlaceholder()) addHostAccessorAndWait(AccessorBaseHost::impl.get()); @@ -2094,6 +2117,7 @@ class __SYCL_EBO __SYCL_SPECIAL_CLASS __SYCL_TYPE(accessor) accessor : detail::getSyclObjImpl(BufferRef).get(), Dimensions, sizeof(DataT), BufferRef.OffsetInBytes, BufferRef.IsSubBuffer, PropertyList) { + throwIfUsedByGraph(); preScreenAccessor(PropertyList); if (BufferRef.isOutOfBounds(AccessOffset, AccessRange, BufferRef.get_range())) @@ -2136,6 +2160,7 @@ class __SYCL_EBO __SYCL_SPECIAL_CLASS __SYCL_TYPE(accessor) accessor : detail::getSyclObjImpl(BufferRef).get(), Dimensions, sizeof(DataT), BufferRef.OffsetInBytes, BufferRef.IsSubBuffer, PropertyList) { + throwIfUsedByGraph(); preScreenAccessor(PropertyList); if (BufferRef.isOutOfBounds(AccessOffset, AccessRange, BufferRef.get_range())) diff --git a/sycl/include/sycl/detail/property_helper.hpp b/sycl/include/sycl/detail/property_helper.hpp index 3dfadf746d272..e7f47547e446b 100644 --- a/sycl/include/sycl/detail/property_helper.hpp +++ b/sycl/include/sycl/detail/property_helper.hpp @@ -46,8 +46,10 @@ enum DataLessPropKind { GraphNoCycleCheck = 19, QueueSubmissionBatched = 20, QueueSubmissionImmediate = 21, + GraphAssumeDataOutlivesBuffer = 22, + GraphAssumeBufferOutlivesGraph = 23, // Indicates the last known dataless property. - LastKnownDataLessPropKind = 21, + LastKnownDataLessPropKind = 23, // Exceeding 32 may cause ABI breaking change on some of OSes. DataLessPropKindSize = 32 }; diff --git a/sycl/include/sycl/ext/oneapi/experimental/graph.hpp b/sycl/include/sycl/ext/oneapi/experimental/graph.hpp index be5210922fb08..b8e5efdad958d 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/graph.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/graph.hpp @@ -59,13 +59,35 @@ namespace graph { /// Property passed to command_graph constructor to disable checking for cycles. /// -/// \todo Cycle check not yet implemented. class no_cycle_check : public ::sycl::detail::DataLessProperty< ::sycl::detail::GraphNoCycleCheck> { public: no_cycle_check() = default; }; +/// Property passed to command_graph constructor to allow buffers to be used +/// with graphs. Passing this property represents a promise from the user that +/// the buffer will outlive any graph that it is used in. +/// +class assume_buffer_outlives_graph + : public ::sycl::detail::DataLessProperty< + ::sycl::detail::GraphAssumeBufferOutlivesGraph> { +public: + assume_buffer_outlives_graph() = default; +}; + +/// Property passed to command_graph constructor to allow buffers created with +/// host pointers. Passing this property represents a promise from the user that +/// the host data will outlive the buffer and by extension any graph that it is +/// used in. +/// +class assume_data_outlives_buffer + : public ::sycl::detail::DataLessProperty< + ::sycl::detail::GraphAssumeDataOutlivesBuffer> { +public: + assume_data_outlives_buffer() = default; +}; + } // namespace graph namespace node { diff --git a/sycl/source/accessor.cpp b/sycl/source/accessor.cpp index b4fb635de7cae..6f3b822026029 100644 --- a/sycl/source/accessor.cpp +++ b/sycl/source/accessor.cpp @@ -7,6 +7,7 @@ //===----------------------------------------------------------------------===// #include +#include #include namespace sycl { @@ -68,6 +69,10 @@ void *AccessorBaseHost::getMemoryObject() const { return impl->MSYCLMemObj; } bool AccessorBaseHost::isPlaceholder() const { return impl->MIsPlaceH; } +bool AccessorBaseHost::isMemoryObjectUsedByGraph() const { + return static_cast(impl->MSYCLMemObj)->isUsedInGraph(); +} + LocalAccessorBaseHost::LocalAccessorBaseHost( sycl::range<3> Size, int Dims, int ElemSize, const property_list &PropertyList) { diff --git a/sycl/source/detail/graph_impl.cpp b/sycl/source/detail/graph_impl.cpp index 63e009804fa0a..9cf116ed65f74 100644 --- a/sycl/source/detail/graph_impl.cpp +++ b/sycl/source/detail/graph_impl.cpp @@ -12,6 +12,7 @@ #include #include #include +#include #include #include @@ -117,6 +118,13 @@ void exec_graph_impl::schedule() { } } +graph_impl::~graph_impl() { + clearQueues(); + for (auto &MemObj : MMemObjs) { + MemObj->markNoLongerBeingUsedInGraph(); + } +} + std::shared_ptr graph_impl::addSubgraphNodes( const std::list> &NodeList) { // Find all input and output nodes from the node list @@ -214,7 +222,27 @@ graph_impl::add(sycl::detail::CG::CGTYPE CGType, // A unique set of dependencies obtained by checking requirements and events std::set> UniqueDeps; const auto &Requirements = CommandGroup->getRequirements(); + if (!MAllowBuffers && Requirements.size()) { + throw sycl::exception(make_error_code(errc::invalid), + "Cannot use buffers in a graph without passing the " + "assume_buffer_outlives_graph property on " + "Graph construction."); + } + for (auto &Req : Requirements) { + // Track and mark the memory objects being used by the graph. + auto MemObj = static_cast(Req->MSYCLMemObj); + if (MemObj->getUserPtr() && !MAllowBuffersHostPointers) { + throw sycl::exception( + make_error_code(errc::invalid), + "Cannot use a buffer which was created with a host pointer in a " + "graph without passing the assume_data_outlives_buffer property on " + "Graph construction."); + } + bool WasInserted = MMemObjs.insert(MemObj).second; + if (WasInserted) { + MemObj->markBeingUsedInGraph(); + } // Look through the graph for nodes which share this requirement for (auto NodePtr : MRoots) { checkForRequirement(Req, NodePtr, UniqueDeps); @@ -253,8 +281,10 @@ graph_impl::add(sycl::detail::CG::CGTYPE CGType, bool graph_impl::clearQueues() { bool AnyQueuesCleared = false; for (auto &Queue : MRecordingQueues) { - Queue->setCommandGraph(nullptr); - AnyQueuesCleared = true; + if (auto ValidQueue = Queue.lock(); ValidQueue) { + ValidQueue->setCommandGraph(nullptr); + AnyQueuesCleared = true; + } } MRecordingQueues.clear(); diff --git a/sycl/source/detail/graph_impl.hpp b/sycl/source/detail/graph_impl.hpp index a154ae8f8b18f..111494da4b36d 100644 --- a/sycl/source/detail/graph_impl.hpp +++ b/sycl/source/detail/graph_impl.hpp @@ -26,6 +26,10 @@ namespace sycl { inline namespace _V1 { +namespace detail { +class SYCLMemObjT; +} + namespace ext { namespace oneapi { namespace experimental { @@ -373,6 +377,14 @@ class graph_impl { if (PropList.has_property()) { MSkipCycleChecks = true; } + if (PropList.has_property()) { + MAllowBuffersHostPointers = true; + } + if (PropList + .has_property()) { + MAllowBuffers = true; + } + if (SyclDevice.get_info< ext::oneapi::experimental::info::device::graph_support>() == info::graph_support_level::unsupported) { @@ -385,6 +397,8 @@ class graph_impl { } } + ~graph_impl(); + /// Remove node from list of root nodes. /// @param Root Node to remove from list of root nodes. void removeRoot(const std::shared_ptr &Root); @@ -636,13 +650,19 @@ class graph_impl { /// @return True if a cycle is detected, false if not. bool checkForCycles(); + /// Insert node into list of root nodes. + /// @param Root Node to add to list of root nodes. + void addRoot(const std::shared_ptr &Root); + /// Context associated with this graph. sycl::context MContext; /// Device associated with this graph. All graph nodes will execute on this /// device. sycl::device MDevice; /// Unique set of queues which are currently recording to this graph. - std::set> MRecordingQueues; + std::set, + std::owner_less>> + MRecordingQueues; /// Map of events to their associated recorded nodes. std::unordered_map, std::shared_ptr> @@ -656,10 +676,17 @@ class graph_impl { /// Controls whether we skip the cycle checks in makeEdge, set by the presence /// of the no_cycle_check property on construction. bool MSkipCycleChecks = false; + /// Unique set of SYCL Memory Objects which are currently in use in the graph. + std::set MMemObjs; - /// Insert node into list of root nodes. - /// @param Root Node to add to list of root nodes. - void addRoot(const std::shared_ptr &Root); + /// Controls whether we allow buffers that are created with host pointers to + /// be used in the graph. Set by the presence of the + /// assume_data_outlives_buffer property. + bool MAllowBuffersHostPointers = false; + + /// Controls whether we allow buffers to be used in the graph. Set by the + /// presence of the assume_buffer_outlives_graph property. + bool MAllowBuffers = false; }; /// Class representing the implementation of command_graph. diff --git a/sycl/source/detail/queue_impl.cpp b/sycl/source/detail/queue_impl.cpp index 8760904ed6724..672ecc4e1e8f9 100644 --- a/sycl/source/detail/queue_impl.cpp +++ b/sycl/source/detail/queue_impl.cpp @@ -147,7 +147,7 @@ event queue_impl::memcpy(const std::shared_ptr &Self, #endif // If we have a command graph set we need to capture the copy through normal // queue submission rather than execute the copy directly. - if (MGraph) { + if (MGraph.lock()) { return submit( [&](handler &CGH) { CGH.depends_on(DepEvents); @@ -476,7 +476,7 @@ void queue_impl::wait(const detail::code_location &CodeLoc) { TelemetryEvent = instrumentationProlog(CodeLoc, Name, StreamID, IId); #endif - if (MGraph) { + if (MGraph.lock()) { throw sycl::exception(make_error_code(errc::invalid), "wait cannot be called for a queue which is " "recording to a command graph."); diff --git a/sycl/source/detail/queue_impl.hpp b/sycl/source/detail/queue_impl.hpp index 803bafeb2333a..08ae8307fa2d9 100644 --- a/sycl/source/detail/queue_impl.hpp +++ b/sycl/source/detail/queue_impl.hpp @@ -689,7 +689,7 @@ class queue_impl { std::shared_ptr getCommandGraph() const { - return MGraph; + return MGraph.lock(); } protected: @@ -868,8 +868,7 @@ class queue_impl { // Command graph which is associated with this queue for the purposes of // recording commands to it. - std::shared_ptr MGraph = - nullptr; + std::weak_ptr MGraph{}; friend class sycl::ext::oneapi::experimental::detail::node_impl; }; diff --git a/sycl/source/detail/sycl_mem_obj_t.hpp b/sycl/source/detail/sycl_mem_obj_t.hpp index fd74b1bc99dc5..967149230bc74 100644 --- a/sycl/source/detail/sycl_mem_obj_t.hpp +++ b/sycl/source/detail/sycl_mem_obj_t.hpp @@ -19,6 +19,7 @@ #include #include +#include #include #include #include @@ -276,6 +277,32 @@ class __SYCL_EXPORT SYCLMemObjT : public SYCLMemObjI { void markAsInternal() { MIsInternal = true; } + /// Returns true if this memory object requires a write_back on destruction. + bool needsWriteBack() const { return MNeedWriteBack && MUploadDataFunctor; } + + /// Increment an internal counter for how many graphs are currently using this + /// memory object. + void markBeingUsedInGraph() { MGraphUseCount += 1; } + + /// Decrement an internal counter for how many graphs are currently using this + /// memory object. + void markNoLongerBeingUsedInGraph() { + // Compare exchange loop to safely decrement MGraphUseCount + while (true) { + size_t CurrentVal = MGraphUseCount; + if (CurrentVal == 0) { + break; + } + if (MGraphUseCount.compare_exchange_strong(CurrentVal, CurrentVal - 1) == + false) { + continue; + } + } + } + + /// Returns true if any graphs are currently using this memory object. + bool isUsedInGraph() const { return MGraphUseCount > 0; } + protected: // An allocateMem helper that determines which host ptr to use void determineHostPtr(const ContextImplPtr &Context, bool InitFromUserData, @@ -320,6 +347,8 @@ class __SYCL_EXPORT SYCLMemObjT : public SYCLMemObjI { // objects can be released in a deferred manner regardless of whether a host // pointer was provided or not. bool MIsInternal = false; + // The number of graphs which are currently using this memory object. + std::atomic MGraphUseCount = 0; }; } // namespace detail } // namespace _V1 diff --git a/sycl/source/handler.cpp b/sycl/source/handler.cpp index c5bbfd8818509..82cd056b0cb22 100644 --- a/sycl/source/handler.cpp +++ b/sycl/source/handler.cpp @@ -502,6 +502,13 @@ void handler::addReduction(const std::shared_ptr &ReduObj) { void handler::associateWithHandlerCommon(detail::AccessorImplPtr AccImpl, int AccTarget) { + if (getCommandGraph() && + static_cast(AccImpl->MSYCLMemObj) + ->needsWriteBack()) { + throw sycl::exception(make_error_code(errc::invalid), + "Accessors to buffers which have write_back enabled " + "are not allowed to be used in command graphs."); + } detail::Requirement *Req = AccImpl.get(); // Add accessor to the list of requirements. CGData.MRequirements.push_back(Req); diff --git a/sycl/test-e2e/Graph/Explicit/assume_buffer_outlives_graph_property.cpp b/sycl/test-e2e/Graph/Explicit/assume_buffer_outlives_graph_property.cpp new file mode 100644 index 0000000000000..a7ea5b10974a4 --- /dev/null +++ b/sycl/test-e2e/Graph/Explicit/assume_buffer_outlives_graph_property.cpp @@ -0,0 +1,11 @@ +// REQUIRES: level_zero, gpu +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using ZE_DEBUG +// RUN: %if ext_oneapi_level_zero %{env ZE_DEBUG=4 %{run} %t.out 2>&1 | FileCheck %s %} +// +// CHECK-NOT: LEAK + +#define GRAPH_E2E_EXPLICIT + +#include "../Inputs/assume_buffer_outlives_graph_property.cpp" diff --git a/sycl/test-e2e/Graph/Explicit/assume_data_outlives_buffer_property.cpp b/sycl/test-e2e/Graph/Explicit/assume_data_outlives_buffer_property.cpp new file mode 100644 index 0000000000000..00c13cf87834f --- /dev/null +++ b/sycl/test-e2e/Graph/Explicit/assume_data_outlives_buffer_property.cpp @@ -0,0 +1,11 @@ +// REQUIRES: level_zero, gpu +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using ZE_DEBUG +// RUN: %if ext_oneapi_level_zero %{env ZE_DEBUG=4 %{run} %t.out 2>&1 | FileCheck %s %} +// +// CHECK-NOT: LEAK + +#define GRAPH_E2E_EXPLICIT + +#include "../Inputs/assume_data_outlives_buffer_property.cpp" diff --git a/sycl/test-e2e/Graph/Inputs/assume_buffer_outlives_graph_property.cpp b/sycl/test-e2e/Graph/Inputs/assume_buffer_outlives_graph_property.cpp new file mode 100644 index 0000000000000..415767c7888d9 --- /dev/null +++ b/sycl/test-e2e/Graph/Inputs/assume_buffer_outlives_graph_property.cpp @@ -0,0 +1,52 @@ +// Tests that using a buffer in a graph will throw, unless the +// assume_buffer_outlives_graph property is passed on graph creation. + +#include "../graph_common.hpp" + +int main() { + queue Queue; + + using T = unsigned short; + + buffer Buffer{range<1>{1}}; + Buffer.set_write_back(false); + + // Test with the property + { + exp_ext::command_graph Graph{ + Queue.get_context(), + Queue.get_device(), + {exp_ext::property::graph::assume_buffer_outlives_graph{}}}; + + std::error_code ErrorCode = make_error_code(sycl::errc::success); + // This should not throw because we have passed the property + try { + add_node(Graph, Queue, [&](handler &CGH) { + auto acc = Buffer.get_access(CGH); + CGH.single_task([=]() {}); + }); + } catch (const sycl::exception &e) { + ErrorCode = e.code(); + } + assert(ErrorCode == sycl::errc::success); + } + + // Test without the property + { + exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()}; + + std::error_code ErrorCode = make_error_code(sycl::errc::success); + // This should throw because we have not passed the property + try { + add_node(Graph, Queue, [&](handler &CGH) { + auto acc = Buffer.get_access(CGH); + CGH.single_task([=]() {}); + }); + } catch (const sycl::exception &e) { + ErrorCode = e.code(); + } + assert(ErrorCode == sycl::errc::invalid); + } + + return 0; +} diff --git a/sycl/test-e2e/Graph/Inputs/assume_data_outlives_buffer_property.cpp b/sycl/test-e2e/Graph/Inputs/assume_data_outlives_buffer_property.cpp new file mode 100644 index 0000000000000..b16b595127590 --- /dev/null +++ b/sycl/test-e2e/Graph/Inputs/assume_data_outlives_buffer_property.cpp @@ -0,0 +1,84 @@ +// Tests that using a buffer which is created with a host pointer in a graph +// will throw, unless the assume_data_outlives_buffer property is passed on +// graph creation. + +#include "../graph_common.hpp" + +int main() { + queue Queue; + + using T = unsigned short; + + T Data = 0; + + buffer BufferHost{&Data, range<1>{1}}; + BufferHost.set_write_back(false); + buffer BufferNoHost{range<1>{1}}; + BufferNoHost.set_write_back(false); + + // Test with the property + { + exp_ext::command_graph Graph{ + Queue.get_context(), + Queue.get_device(), + {exp_ext::property::graph::assume_buffer_outlives_graph{}, + exp_ext::property::graph::assume_data_outlives_buffer{}}}; + + std::error_code ErrorCode = make_error_code(sycl::errc::success); + // This should not throw because we have passed the property + try { + add_node(Graph, Queue, [&](handler &CGH) { + auto acc = BufferHost.get_access(CGH); + CGH.single_task([=]() {}); + }); + } catch (const sycl::exception &e) { + ErrorCode = e.code(); + } + assert(ErrorCode == sycl::errc::success); + + // This should not throw regardless of property use + try { + add_node(Graph, Queue, [&](handler &CGH) { + auto acc = BufferNoHost.get_access(CGH); + CGH.single_task([=]() {}); + }); + } catch (const sycl::exception &e) { + ErrorCode = e.code(); + } + assert(ErrorCode == sycl::errc::success); + } + + // Test without the property + { + exp_ext::command_graph Graph{ + Queue.get_context(), + Queue.get_device(), + {exp_ext::property::graph::assume_buffer_outlives_graph{}}}; + + std::error_code ErrorCode = make_error_code(sycl::errc::success); + // This should throw because we haven't used the property + try { + add_node(Graph, Queue, [&](handler &CGH) { + auto acc = BufferHost.get_access(CGH); + CGH.single_task([=]() {}); + }); + } catch (const sycl::exception &e) { + ErrorCode = e.code(); + } + assert(ErrorCode == sycl::errc::invalid); + + ErrorCode = sycl::errc::success; + // This should not throw regardless of property use + try { + add_node(Graph, Queue, [&](handler &CGH) { + auto acc = BufferNoHost.get_access(CGH); + CGH.single_task([=]() {}); + }); + } catch (const sycl::exception &e) { + ErrorCode = e.code(); + } + assert(ErrorCode == sycl::errc::success); + } + + return 0; +} diff --git a/sycl/test-e2e/Graph/Inputs/basic_buffer.cpp b/sycl/test-e2e/Graph/Inputs/basic_buffer.cpp index 0d34e3f51a822..5a2c553faa55b 100644 --- a/sycl/test-e2e/Graph/Inputs/basic_buffer.cpp +++ b/sycl/test-e2e/Graph/Inputs/basic_buffer.cpp @@ -18,28 +18,33 @@ int main() { calculate_reference_data(Iterations, Size, ReferenceA, ReferenceB, ReferenceC); - { - exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()}; buffer BufferA{DataA.data(), range<1>{DataA.size()}}; BufferA.set_write_back(false); buffer BufferB{DataB.data(), range<1>{DataB.size()}}; BufferB.set_write_back(false); buffer BufferC{DataC.data(), range<1>{DataC.size()}}; BufferC.set_write_back(false); - - // Add commands to graph - add_nodes(Graph, Queue, Size, BufferA, BufferB, BufferC); - - auto GraphExec = Graph.finalize(); - - event Event; - for (unsigned n = 0; n < Iterations; n++) { - Event = Queue.submit([&](handler &CGH) { - CGH.depends_on(Event); - CGH.ext_oneapi_graph(GraphExec); - }); + { + exp_ext::command_graph Graph{ + Queue.get_context(), + Queue.get_device(), + {exp_ext::property::graph::assume_buffer_outlives_graph{}, + exp_ext::property::graph::assume_data_outlives_buffer{}}}; + + // Add commands to graph + add_nodes(Graph, Queue, Size, BufferA, BufferB, BufferC); + + auto GraphExec = Graph.finalize(); + + event Event; + for (unsigned n = 0; n < Iterations; n++) { + Event = Queue.submit([&](handler &CGH) { + CGH.depends_on(Event); + CGH.ext_oneapi_graph(GraphExec); + }); + } + Queue.wait_and_throw(); } - Queue.wait_and_throw(); host_accessor HostAccA(BufferA); host_accessor HostAccB(BufferB); @@ -50,7 +55,6 @@ int main() { assert(ReferenceB[i] == HostAccB[i]); assert(ReferenceC[i] == HostAccC[i]); } - } return 0; } diff --git a/sycl/test-e2e/Graph/Inputs/buffer_copy.cpp b/sycl/test-e2e/Graph/Inputs/buffer_copy.cpp index 56623b53b2d36..539d6fb49dd9b 100644 --- a/sycl/test-e2e/Graph/Inputs/buffer_copy.cpp +++ b/sycl/test-e2e/Graph/Inputs/buffer_copy.cpp @@ -26,8 +26,6 @@ int main() { } } - exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()}; - buffer BufferA{DataA}; BufferA.set_write_back(false); buffer BufferB{DataB}; @@ -35,79 +33,87 @@ int main() { buffer BufferC{DataC}; BufferC.set_write_back(false); - // Copy from B to A - auto NodeA = add_node(Graph, Queue, [&](handler &CGH) { - auto AccA = BufferA.get_access(CGH); - auto AccB = BufferB.get_access(CGH); - CGH.copy(AccB, AccA); - }); - - // Read & write A - auto NodeB = add_node( - Graph, Queue, - [&](handler &CGH) { - auto AccA = BufferA.get_access(CGH); - CGH.parallel_for(range<1>(Size), [=](item<1> id) { - auto LinID = id.get_linear_id(); - AccA[LinID] += ModValue; - }); - }, - NodeA); - - // Read & write B - auto NodeModB = add_node( - Graph, Queue, - [&](handler &CGH) { - auto AccB = BufferB.get_access(CGH); - CGH.parallel_for(range<1>(Size), [=](item<1> id) { - auto LinID = id.get_linear_id(); - AccB[LinID] += ModValue; - }); - }, - NodeA); - - // memcpy from A to B - auto NodeC = add_node( - Graph, Queue, - [&](handler &CGH) { - auto AccA = BufferA.get_access(CGH); - auto AccB = BufferB.get_access(CGH); - CGH.copy(AccA, AccB); - }, - NodeB, NodeModB); - - // Read and write B - auto NodeD = add_node( - Graph, Queue, - [&](handler &CGH) { - auto AccB = BufferB.get_access(CGH); - CGH.parallel_for(range<1>(Size), [=](item<1> id) { - auto LinID = id.get_linear_id(); - AccB[LinID] += ModValue; - }); - }, - NodeC); - - // Copy from B to C - add_node( - Graph, Queue, - [&](handler &CGH) { - auto AccB = BufferB.get_access(CGH); - auto AccC = BufferC.get_access(CGH); - CGH.copy(AccB, AccC); - }, - NodeD); - - auto GraphExec = Graph.finalize(); - - event Event; - for (unsigned n = 0; n < Iterations; n++) { - Event = Queue.submit([&](handler &CGH) { - CGH.depends_on(Event); - CGH.ext_oneapi_graph(GraphExec); + { + exp_ext::command_graph Graph{ + Queue.get_context(), + Queue.get_device(), + {exp_ext::property::graph::assume_buffer_outlives_graph{}, + exp_ext::property::graph::assume_data_outlives_buffer{}}}; + + // Copy from B to A + auto NodeA = add_node(Graph, Queue, [&](handler &CGH) { + auto AccA = BufferA.get_access(CGH); + auto AccB = BufferB.get_access(CGH); + CGH.copy(AccB, AccA); }); + + // Read & write A + auto NodeB = add_node( + Graph, Queue, + [&](handler &CGH) { + auto AccA = BufferA.get_access(CGH); + CGH.parallel_for(range<1>(Size), [=](item<1> id) { + auto LinID = id.get_linear_id(); + AccA[LinID] += ModValue; + }); + }, + NodeA); + + // Read & write B + auto NodeModB = add_node( + Graph, Queue, + [&](handler &CGH) { + auto AccB = BufferB.get_access(CGH); + CGH.parallel_for(range<1>(Size), [=](item<1> id) { + auto LinID = id.get_linear_id(); + AccB[LinID] += ModValue; + }); + }, + NodeA); + + // memcpy from A to B + auto NodeC = add_node( + Graph, Queue, + [&](handler &CGH) { + auto AccA = BufferA.get_access(CGH); + auto AccB = BufferB.get_access(CGH); + CGH.copy(AccA, AccB); + }, + NodeB, NodeModB); + + // Read and write B + auto NodeD = add_node( + Graph, Queue, + [&](handler &CGH) { + auto AccB = BufferB.get_access(CGH); + CGH.parallel_for(range<1>(Size), [=](item<1> id) { + auto LinID = id.get_linear_id(); + AccB[LinID] += ModValue; + }); + }, + NodeC); + + // Copy from B to C + add_node( + Graph, Queue, + [&](handler &CGH) { + auto AccB = BufferB.get_access(CGH); + auto AccC = BufferC.get_access(CGH); + CGH.copy(AccB, AccC); + }, + NodeD); + + auto GraphExec = Graph.finalize(); + + event Event; + for (unsigned n = 0; n < Iterations; n++) { + Event = Queue.submit([&](handler &CGH) { + CGH.depends_on(Event); + CGH.ext_oneapi_graph(GraphExec); + }); + } + Queue.wait_and_throw(); } - Queue.wait_and_throw(); host_accessor HostAccA(BufferA); host_accessor HostAccB(BufferB); diff --git a/sycl/test-e2e/Graph/Inputs/buffer_copy_2d.cpp b/sycl/test-e2e/Graph/Inputs/buffer_copy_2d.cpp index 574cf9c84981c..179d624254fd9 100644 --- a/sycl/test-e2e/Graph/Inputs/buffer_copy_2d.cpp +++ b/sycl/test-e2e/Graph/Inputs/buffer_copy_2d.cpp @@ -27,8 +27,6 @@ int main() { } } - exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()}; - // Make the buffers 2D so we can test the rect copy path buffer BufferA{DataA.data(), range<2>(Size, Size)}; BufferA.set_write_back(false); @@ -36,74 +34,81 @@ int main() { BufferB.set_write_back(false); buffer BufferC{DataC.data(), range<2>(Size, Size)}; BufferC.set_write_back(false); - - // Copy from B to A - auto NodeA = add_node(Graph, Queue, [&](handler &CGH) { - auto AccA = BufferA.get_access(CGH); - auto AccB = BufferB.get_access(CGH); - CGH.copy(AccB, AccA); - }); - - // Read & write A - auto NodeB = add_node( - Graph, Queue, - [&](handler &CGH) { - auto AccA = BufferA.get_access(CGH); - CGH.parallel_for(range<2>(Size, Size), - [=](item<2> id) { AccA[id] += ModValue; }); - }, - NodeA); - - // Read & write B - auto NodeModB = add_node( - Graph, Queue, - [&](handler &CGH) { - auto AccB = BufferB.get_access(CGH); - CGH.parallel_for(range<2>(Size, Size), - [=](item<2> id) { AccB[id] += ModValue; }); - }, - NodeA); - - // memcpy from A to B - auto NodeC = add_node( - Graph, Queue, - [&](handler &CGH) { - auto AccA = BufferA.get_access(CGH); - auto AccB = BufferB.get_access(CGH); - CGH.copy(AccA, AccB); - }, - NodeModB); - - // Read and write B - auto NodeD = add_node( - Graph, Queue, - [&](handler &CGH) { - auto AccB = BufferB.get_access(CGH); - CGH.parallel_for(range<2>(Size, Size), - [=](item<2> id) { AccB[id] += ModValue; }); - }, - NodeC); - - // Copy from B to C - add_node( - Graph, Queue, - [&](handler &CGH) { - auto AccB = BufferB.get_access(CGH); - auto AccC = BufferC.get_access(CGH); - CGH.copy(AccB, AccC); - }, - NodeD); - - auto GraphExec = Graph.finalize(); - - event Event; - for (unsigned n = 0; n < Iterations; n++) { - Event = Queue.submit([&](handler &CGH) { - CGH.depends_on(Event); - CGH.ext_oneapi_graph(GraphExec); + { + exp_ext::command_graph Graph{ + Queue.get_context(), + Queue.get_device(), + {exp_ext::property::graph::assume_buffer_outlives_graph{}, + exp_ext::property::graph::assume_data_outlives_buffer{}}}; + + // Copy from B to A + auto NodeA = add_node(Graph, Queue, [&](handler &CGH) { + auto AccA = BufferA.get_access(CGH); + auto AccB = BufferB.get_access(CGH); + CGH.copy(AccB, AccA); }); + + // Read & write A + auto NodeB = add_node( + Graph, Queue, + [&](handler &CGH) { + auto AccA = BufferA.get_access(CGH); + CGH.parallel_for(range<2>(Size, Size), + [=](item<2> id) { AccA[id] += ModValue; }); + }, + NodeA); + + // Read & write B + auto NodeModB = add_node( + Graph, Queue, + [&](handler &CGH) { + auto AccB = BufferB.get_access(CGH); + CGH.parallel_for(range<2>(Size, Size), + [=](item<2> id) { AccB[id] += ModValue; }); + }, + NodeA); + + // memcpy from A to B + auto NodeC = add_node( + Graph, Queue, + [&](handler &CGH) { + auto AccA = BufferA.get_access(CGH); + auto AccB = BufferB.get_access(CGH); + CGH.copy(AccA, AccB); + }, + NodeModB); + + // Read and write B + auto NodeD = add_node( + Graph, Queue, + [&](handler &CGH) { + auto AccB = BufferB.get_access(CGH); + CGH.parallel_for(range<2>(Size, Size), + [=](item<2> id) { AccB[id] += ModValue; }); + }, + NodeC); + + // Copy from B to C + add_node( + Graph, Queue, + [&](handler &CGH) { + auto AccB = BufferB.get_access(CGH); + auto AccC = BufferC.get_access(CGH); + CGH.copy(AccB, AccC); + }, + NodeD); + + auto GraphExec = Graph.finalize(); + + event Event; + for (unsigned n = 0; n < Iterations; n++) { + Event = Queue.submit([&](handler &CGH) { + CGH.depends_on(Event); + CGH.ext_oneapi_graph(GraphExec); + }); + } + Queue.wait_and_throw(); } - Queue.wait_and_throw(); host_accessor HostAccA(BufferA); host_accessor HostAccB(BufferB); diff --git a/sycl/test-e2e/Graph/Inputs/buffer_copy_host2target.cpp b/sycl/test-e2e/Graph/Inputs/buffer_copy_host2target.cpp index 8a25673ea6645..efff98d1fb045 100644 --- a/sycl/test-e2e/Graph/Inputs/buffer_copy_host2target.cpp +++ b/sycl/test-e2e/Graph/Inputs/buffer_copy_host2target.cpp @@ -17,18 +17,24 @@ int main() { ReferenceA[i] = DataB[i]; } - exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()}; - buffer BufferA(DataA.data(), range<1>(Size)); BufferA.set_write_back(false); - auto NodeA = add_node(Graph, Queue, [&](handler &CGH) { - auto AccA = BufferA.get_access(CGH); - CGH.copy(DataB.data(), AccA); - }); + { + exp_ext::command_graph Graph{ + Queue.get_context(), + Queue.get_device(), + {exp_ext::property::graph::assume_buffer_outlives_graph{}, + exp_ext::property::graph::assume_data_outlives_buffer{}}}; + + auto NodeA = add_node(Graph, Queue, [&](handler &CGH) { + auto AccA = BufferA.get_access(CGH); + CGH.copy(DataB.data(), AccA); + }); - auto GraphExec = Graph.finalize(); - Queue.submit([&](handler &CGH) { CGH.ext_oneapi_graph(GraphExec); }).wait(); + auto GraphExec = Graph.finalize(); + Queue.submit([&](handler &CGH) { CGH.ext_oneapi_graph(GraphExec); }).wait(); + } host_accessor HostAccA(BufferA); diff --git a/sycl/test-e2e/Graph/Inputs/buffer_copy_host2target_2d.cpp b/sycl/test-e2e/Graph/Inputs/buffer_copy_host2target_2d.cpp index 9d581a9dbe76e..d81e5f7928fe0 100644 --- a/sycl/test-e2e/Graph/Inputs/buffer_copy_host2target_2d.cpp +++ b/sycl/test-e2e/Graph/Inputs/buffer_copy_host2target_2d.cpp @@ -17,20 +17,25 @@ int main() { ReferenceA[i] = DataB[i]; } - exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()}; - // Make the buffers 2D so we can test the rect write path buffer BufferA{DataA.data(), range<2>(Size, Size)}; BufferA.set_write_back(false); - auto NodeA = add_node(Graph, Queue, [&](handler &CGH) { - auto AccA = BufferA.get_access(CGH); - CGH.copy(DataB.data(), AccA); - }); + { + exp_ext::command_graph Graph{ + Queue.get_context(), + Queue.get_device(), + {exp_ext::property::graph::assume_buffer_outlives_graph{}, + exp_ext::property::graph::assume_data_outlives_buffer{}}}; - auto GraphExec = Graph.finalize(); - Queue.submit([&](handler &CGH) { CGH.ext_oneapi_graph(GraphExec); }).wait(); + auto NodeA = add_node(Graph, Queue, [&](handler &CGH) { + auto AccA = BufferA.get_access(CGH); + CGH.copy(DataB.data(), AccA); + }); + auto GraphExec = Graph.finalize(); + Queue.submit([&](handler &CGH) { CGH.ext_oneapi_graph(GraphExec); }).wait(); + } host_accessor HostAccA(BufferA); for (size_t i = 0; i < Size; i++) { diff --git a/sycl/test-e2e/Graph/Inputs/buffer_copy_host2target_offset.cpp b/sycl/test-e2e/Graph/Inputs/buffer_copy_host2target_offset.cpp index 4bde5d8a2fa55..e109a7bdad80f 100644 --- a/sycl/test-e2e/Graph/Inputs/buffer_copy_host2target_offset.cpp +++ b/sycl/test-e2e/Graph/Inputs/buffer_copy_host2target_offset.cpp @@ -20,19 +20,25 @@ int main() { ReferenceA[i] = DataB[i - Offset]; } - exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()}; - buffer BufferA(DataA.data(), range<1>(Size + Offset)); BufferA.set_write_back(false); - auto NodeA = add_node(Graph, Queue, [&](handler &CGH) { - auto AccA = BufferA.get_access(CGH, range<1>(Size), - id<1>(Offset)); - CGH.copy(DataB.data(), AccA); - }); - - auto GraphExec = Graph.finalize(); - Queue.submit([&](handler &CGH) { CGH.ext_oneapi_graph(GraphExec); }).wait(); + { + exp_ext::command_graph Graph{ + Queue.get_context(), + Queue.get_device(), + {exp_ext::property::graph::assume_buffer_outlives_graph{}, + exp_ext::property::graph::assume_data_outlives_buffer{}}}; + + auto NodeA = add_node(Graph, Queue, [&](handler &CGH) { + auto AccA = BufferA.get_access(CGH, range<1>(Size), + id<1>(Offset)); + CGH.copy(DataB.data(), AccA); + }); + + auto GraphExec = Graph.finalize(); + Queue.submit([&](handler &CGH) { CGH.ext_oneapi_graph(GraphExec); }).wait(); + } host_accessor HostAccA(BufferA); diff --git a/sycl/test-e2e/Graph/Inputs/buffer_copy_offsets.cpp b/sycl/test-e2e/Graph/Inputs/buffer_copy_offsets.cpp index a52aae9220617..8626b3acc09cd 100644 --- a/sycl/test-e2e/Graph/Inputs/buffer_copy_offsets.cpp +++ b/sycl/test-e2e/Graph/Inputs/buffer_copy_offsets.cpp @@ -25,24 +25,30 @@ int main() { ReferenceB[j] = DataA[(j - OffsetDst) + OffsetSrc]; } - exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()}; - buffer BufferA{DataA}; BufferA.set_write_back(false); buffer BufferB{DataB}; BufferB.set_write_back(false); - // Copy from A to B - auto NodeA = add_node(Graph, Queue, [&](handler &CGH) { - auto AccA = BufferA.get_access( - CGH, range<1>(Size - OffsetSrc), id<1>(OffsetSrc)); - auto AccB = BufferB.get_access( - CGH, range<1>(Size - OffsetDst), id<1>(OffsetDst)); - CGH.copy(AccA, AccB); - }); - - auto GraphExec = Graph.finalize(); - Queue.submit([&](handler &CGH) { CGH.ext_oneapi_graph(GraphExec); }).wait(); + { + exp_ext::command_graph Graph{ + Queue.get_context(), + Queue.get_device(), + {exp_ext::property::graph::assume_buffer_outlives_graph{}, + exp_ext::property::graph::assume_data_outlives_buffer{}}}; + + // Copy from A to B + auto NodeA = add_node(Graph, Queue, [&](handler &CGH) { + auto AccA = BufferA.get_access( + CGH, range<1>(Size - OffsetSrc), id<1>(OffsetSrc)); + auto AccB = BufferB.get_access( + CGH, range<1>(Size - OffsetDst), id<1>(OffsetDst)); + CGH.copy(AccA, AccB); + }); + + auto GraphExec = Graph.finalize(); + Queue.submit([&](handler &CGH) { CGH.ext_oneapi_graph(GraphExec); }).wait(); + } host_accessor HostAccA(BufferA); host_accessor HostAccB(BufferB); diff --git a/sycl/test-e2e/Graph/Inputs/buffer_copy_target2host.cpp b/sycl/test-e2e/Graph/Inputs/buffer_copy_target2host.cpp index 460ecd4ee945f..916d54bb8825c 100644 --- a/sycl/test-e2e/Graph/Inputs/buffer_copy_target2host.cpp +++ b/sycl/test-e2e/Graph/Inputs/buffer_copy_target2host.cpp @@ -18,18 +18,24 @@ int main() { ReferenceB[i] = DataA[i]; } - exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()}; - buffer BufferA(DataA.data(), range<1>(Size)); BufferA.set_write_back(false); - auto NodeA = add_node(Graph, Queue, [&](handler &CGH) { - auto AccA = BufferA.get_access(CGH); - CGH.copy(AccA, DataB.data()); - }); + { + exp_ext::command_graph Graph{ + Queue.get_context(), + Queue.get_device(), + {exp_ext::property::graph::assume_buffer_outlives_graph{}, + exp_ext::property::graph::assume_data_outlives_buffer{}}}; + + auto NodeA = add_node(Graph, Queue, [&](handler &CGH) { + auto AccA = BufferA.get_access(CGH); + CGH.copy(AccA, DataB.data()); + }); - auto GraphExec = Graph.finalize(); - Queue.submit([&](handler &CGH) { CGH.ext_oneapi_graph(GraphExec); }).wait(); + auto GraphExec = Graph.finalize(); + Queue.submit([&](handler &CGH) { CGH.ext_oneapi_graph(GraphExec); }).wait(); + } for (size_t i = 0; i < Size; i++) { assert(ReferenceA[i] == DataA[i]); diff --git a/sycl/test-e2e/Graph/Inputs/buffer_copy_target2host_2d.cpp b/sycl/test-e2e/Graph/Inputs/buffer_copy_target2host_2d.cpp index 69050d2a8a1c6..ccf0fa62770c6 100644 --- a/sycl/test-e2e/Graph/Inputs/buffer_copy_target2host_2d.cpp +++ b/sycl/test-e2e/Graph/Inputs/buffer_copy_target2host_2d.cpp @@ -18,19 +18,25 @@ int main() { ReferenceB[i] = DataA[i]; } - exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()}; - // Make the buffers 2D so we can test the rect read path buffer BufferA{DataA.data(), range<2>(Size, Size)}; BufferA.set_write_back(false); - auto NodeA = add_node(Graph, Queue, [&](handler &CGH) { - auto AccA = BufferA.get_access(CGH); - CGH.copy(AccA, DataB.data()); - }); + { + exp_ext::command_graph Graph{ + Queue.get_context(), + Queue.get_device(), + {exp_ext::property::graph::assume_buffer_outlives_graph{}, + exp_ext::property::graph::assume_data_outlives_buffer{}}}; + + auto NodeA = add_node(Graph, Queue, [&](handler &CGH) { + auto AccA = BufferA.get_access(CGH); + CGH.copy(AccA, DataB.data()); + }); - auto GraphExec = Graph.finalize(); - Queue.submit([&](handler &CGH) { CGH.ext_oneapi_graph(GraphExec); }).wait(); + auto GraphExec = Graph.finalize(); + Queue.submit([&](handler &CGH) { CGH.ext_oneapi_graph(GraphExec); }).wait(); + } host_accessor HostAccA(BufferA); diff --git a/sycl/test-e2e/Graph/Inputs/buffer_copy_target2host_offset.cpp b/sycl/test-e2e/Graph/Inputs/buffer_copy_target2host_offset.cpp index dcb6d290b1205..de6e44f44a7fa 100644 --- a/sycl/test-e2e/Graph/Inputs/buffer_copy_target2host_offset.cpp +++ b/sycl/test-e2e/Graph/Inputs/buffer_copy_target2host_offset.cpp @@ -21,19 +21,25 @@ int main() { ReferenceB[i] = DataB[i]; } - exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()}; - buffer BufferA(DataA.data(), range<1>(Size)); BufferA.set_write_back(false); - auto NodeA = add_node(Graph, Queue, [&](handler &CGH) { - auto AccA = BufferA.get_access( - CGH, range<1>(Size - Offset), id<1>(Offset)); - CGH.copy(AccA, DataB.data()); - }); - - auto GraphExec = Graph.finalize(); - Queue.submit([&](handler &CGH) { CGH.ext_oneapi_graph(GraphExec); }).wait(); + { + exp_ext::command_graph Graph{ + Queue.get_context(), + Queue.get_device(), + {exp_ext::property::graph::assume_buffer_outlives_graph{}, + exp_ext::property::graph::assume_data_outlives_buffer{}}}; + + auto NodeA = add_node(Graph, Queue, [&](handler &CGH) { + auto AccA = BufferA.get_access( + CGH, range<1>(Size - Offset), id<1>(Offset)); + CGH.copy(AccA, DataB.data()); + }); + + auto GraphExec = Graph.finalize(); + Queue.submit([&](handler &CGH) { CGH.ext_oneapi_graph(GraphExec); }).wait(); + } for (size_t i = 0; i < Size; i++) { assert(ReferenceA[i] == DataA[i]); diff --git a/sycl/test-e2e/Graph/Inputs/buffer_ordering.cpp b/sycl/test-e2e/Graph/Inputs/buffer_ordering.cpp index 79305c69db52c..a51295583699d 100644 --- a/sycl/test-e2e/Graph/Inputs/buffer_ordering.cpp +++ b/sycl/test-e2e/Graph/Inputs/buffer_ordering.cpp @@ -15,7 +15,10 @@ int main() { queue Queue; - exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()}; + exp_ext::command_graph Graph{ + Queue.get_context(), + Queue.get_device(), + {exp_ext::property::graph::assume_buffer_outlives_graph{}}}; const size_t N = 10; std::vector Arr(N, 0.0f); diff --git a/sycl/test-e2e/Graph/Inputs/dotp_buffer_reduction.cpp b/sycl/test-e2e/Graph/Inputs/dotp_buffer_reduction.cpp index 9e64a0bdae5d8..832b61129f23a 100644 --- a/sycl/test-e2e/Graph/Inputs/dotp_buffer_reduction.cpp +++ b/sycl/test-e2e/Graph/Inputs/dotp_buffer_reduction.cpp @@ -6,8 +6,6 @@ int main() { queue Queue; - exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()}; - float DotpData = 0.f; const size_t N = 10; @@ -15,16 +13,21 @@ int main() { std::vector YData(N); std::vector ZData(N); - { - buffer DotpBuf(&DotpData, range<1>(1)); - DotpBuf.set_write_back(false); + buffer DotpBuf(&DotpData, range<1>(1)); + DotpBuf.set_write_back(false); - buffer XBuf(XData); - XBuf.set_write_back(false); - buffer YBuf(YData); - YBuf.set_write_back(false); - buffer ZBuf(ZData); - ZBuf.set_write_back(false); + buffer XBuf(XData); + XBuf.set_write_back(false); + buffer YBuf(YData); + YBuf.set_write_back(false); + buffer ZBuf(ZData); + ZBuf.set_write_back(false); + { + exp_ext::command_graph Graph{ + Queue.get_context(), + Queue.get_device(), + {exp_ext::property::graph::assume_buffer_outlives_graph{}, + exp_ext::property::graph::assume_data_outlives_buffer{}}}; auto NodeI = add_node(Graph, Queue, [&](handler &CGH) { auto X = XBuf.get_access(CGH); @@ -75,10 +78,10 @@ int main() { // Using shortcut for executing a graph of commands Queue.ext_oneapi_graph(ExecGraph).wait(); - - host_accessor HostAcc(DotpBuf); - assert(HostAcc[0] == dotp_reference_result(N)); } + host_accessor HostAcc(DotpBuf); + assert(HostAcc[0] == dotp_reference_result(N)); + return 0; } diff --git a/sycl/test-e2e/Graph/Inputs/dotp_usm_reduction.cpp b/sycl/test-e2e/Graph/Inputs/dotp_usm_reduction.cpp index 50918b0fe9e8f..4618ae5da2b2a 100644 --- a/sycl/test-e2e/Graph/Inputs/dotp_usm_reduction.cpp +++ b/sycl/test-e2e/Graph/Inputs/dotp_usm_reduction.cpp @@ -6,7 +6,11 @@ int main() { queue Queue; - exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()}; + exp_ext::command_graph Graph{ + Queue.get_context(), + Queue.get_device(), + {exp_ext::property::graph::assume_buffer_outlives_graph{}, + exp_ext::property::graph::assume_data_outlives_buffer{}}}; float *Dotp = malloc_device(1, Queue); diff --git a/sycl/test-e2e/Graph/Inputs/event_status_querying.cpp b/sycl/test-e2e/Graph/Inputs/event_status_querying.cpp index 04b5820a895b7..0f5819ee361cf 100644 --- a/sycl/test-e2e/Graph/Inputs/event_status_querying.cpp +++ b/sycl/test-e2e/Graph/Inputs/event_status_querying.cpp @@ -55,8 +55,6 @@ int main() { ReferenceC[j] = ReferenceB[j]; } - exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()}; - buffer BufferA{DataA}; BufferA.set_write_back(false); buffer BufferB{DataB}; @@ -64,67 +62,76 @@ int main() { buffer BufferC{DataC}; BufferC.set_write_back(false); - // Copy from B to A - auto Init = add_node(Graph, Queue, [&](handler &CGH) { - auto AccA = BufferA.get_access(CGH); - auto AccB = BufferB.get_access(CGH); - CGH.copy(AccB, AccA); - }); - - // Read & write A - auto Node1 = add_node(Graph, Queue, [&](handler &CGH) { - auto AccA = BufferA.get_access(CGH); - CGH.parallel_for(range<1>(Size), [=](item<1> id) { - auto LinID = id.get_linear_id(); - AccA[LinID] += ModValue; + { + exp_ext::command_graph Graph{ + Queue.get_context(), + Queue.get_device(), + {exp_ext::property::graph::assume_buffer_outlives_graph{}, + exp_ext::property::graph::assume_data_outlives_buffer{}}}; + + // Copy from B to A + auto Init = add_node(Graph, Queue, [&](handler &CGH) { + auto AccA = BufferA.get_access(CGH); + auto AccB = BufferB.get_access(CGH); + CGH.copy(AccB, AccA); }); - }); - - // Read & write B - auto Node2 = add_node(Graph, Queue, [&](handler &CGH) { - auto AccB = BufferB.get_access(CGH); - CGH.parallel_for(range<1>(Size), [=](item<1> id) { - auto LinID = id.get_linear_id(); - AccB[LinID] += ModValue; + + // Read & write A + auto Node1 = add_node(Graph, Queue, [&](handler &CGH) { + auto AccA = BufferA.get_access(CGH); + CGH.parallel_for(range<1>(Size), [=](item<1> id) { + auto LinID = id.get_linear_id(); + AccA[LinID] += ModValue; + }); }); - }); - - // memcpy from A to B - auto Node3 = add_node(Graph, Queue, [&](handler &CGH) { - auto AccA = BufferA.get_access(CGH); - auto AccB = BufferB.get_access(CGH); - CGH.copy(AccA, AccB); - }); - - // Read and write B - auto Node4 = add_node(Graph, Queue, [&](handler &CGH) { - auto AccB = BufferB.get_access(CGH); - CGH.parallel_for(range<1>(Size), [=](item<1> id) { - auto LinID = id.get_linear_id(); - AccB[LinID] += ModValue; + + // Read & write B + auto Node2 = add_node(Graph, Queue, [&](handler &CGH) { + auto AccB = BufferB.get_access(CGH); + CGH.parallel_for(range<1>(Size), [=](item<1> id) { + auto LinID = id.get_linear_id(); + AccB[LinID] += ModValue; + }); }); - }); - - // Copy from B to C - auto Node5 = add_node(Graph, Queue, [&](handler &CGH) { - auto AccB = BufferB.get_access(CGH); - auto AccC = BufferC.get_access(CGH); - CGH.copy(AccB, AccC); - }); - - auto GraphExec = Graph.finalize(); - - sycl::event Event = - Queue.submit([&](handler &CGH) { CGH.ext_oneapi_graph(GraphExec); }); - auto Info = Event.get_info(); - std::cout << event_status_name(Info) << std::endl; - while ( - (Info = Event.get_info()) != - sycl::info::event_command_status::complete) { - } - std::cout << event_status_name(Info) << std::endl; - Queue.wait_and_throw(); + // memcpy from A to B + auto Node3 = add_node(Graph, Queue, [&](handler &CGH) { + auto AccA = BufferA.get_access(CGH); + auto AccB = BufferB.get_access(CGH); + CGH.copy(AccA, AccB); + }); + + // Read and write B + auto Node4 = add_node(Graph, Queue, [&](handler &CGH) { + auto AccB = BufferB.get_access(CGH); + CGH.parallel_for(range<1>(Size), [=](item<1> id) { + auto LinID = id.get_linear_id(); + AccB[LinID] += ModValue; + }); + }); + + // Copy from B to C + auto Node5 = add_node(Graph, Queue, [&](handler &CGH) { + auto AccB = BufferB.get_access(CGH); + auto AccC = BufferC.get_access(CGH); + CGH.copy(AccB, AccC); + }); + + auto GraphExec = Graph.finalize(); + + sycl::event Event = + Queue.submit([&](handler &CGH) { CGH.ext_oneapi_graph(GraphExec); }); + auto Info = Event.get_info(); + std::cout << event_status_name(Info) << std::endl; + while ( + (Info = + Event.get_info()) != + sycl::info::event_command_status::complete) { + } + std::cout << event_status_name(Info) << std::endl; + + Queue.wait_and_throw(); + } host_accessor HostAccA(BufferA); host_accessor HostAccB(BufferB); diff --git a/sycl/test-e2e/Graph/Inputs/sub_graph_reduction.cpp b/sycl/test-e2e/Graph/Inputs/sub_graph_reduction.cpp index ccb168e28f805..5ea3b6fd78a03 100644 --- a/sycl/test-e2e/Graph/Inputs/sub_graph_reduction.cpp +++ b/sycl/test-e2e/Graph/Inputs/sub_graph_reduction.cpp @@ -6,8 +6,16 @@ int main() { queue Queue; - exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()}; - exp_ext::command_graph SubGraph{Queue.get_context(), Queue.get_device()}; + exp_ext::command_graph Graph{ + Queue.get_context(), + Queue.get_device(), + {exp_ext::property::graph::assume_buffer_outlives_graph{}, + exp_ext::property::graph::assume_data_outlives_buffer{}}}; + exp_ext::command_graph SubGraph{ + Queue.get_context(), + Queue.get_device(), + {exp_ext::property::graph::assume_buffer_outlives_graph{}, + exp_ext::property::graph::assume_data_outlives_buffer{}}}; float *Dotp = malloc_device(1, Queue); diff --git a/sycl/test-e2e/Graph/Inputs/temp_buffer_reinterpret.cpp b/sycl/test-e2e/Graph/Inputs/temp_buffer_reinterpret.cpp index 97098dd22e191..bbed84e3df709 100644 --- a/sycl/test-e2e/Graph/Inputs/temp_buffer_reinterpret.cpp +++ b/sycl/test-e2e/Graph/Inputs/temp_buffer_reinterpret.cpp @@ -19,12 +19,19 @@ int main() { std::vector ReferenceA(DataA), ReferenceB(DataB), ReferenceC(DataC); calculate_reference_data(Iterations, Size, ReferenceA, ReferenceB, ReferenceC); + buffer BufferA{DataA.data(), range<1>{DataA.size()}}; + BufferA.set_write_back(false); + buffer BufferB{DataB.data(), range<1>{DataB.size()}}; + BufferB.set_write_back(false); + buffer BufferC{DataC.data(), range<1>{DataC.size()}}; + BufferC.set_write_back(false); { - exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()}; - buffer BufferA{DataA.data(), range<1>{DataA.size()}}; - buffer BufferB{DataB.data(), range<1>{DataB.size()}}; - buffer BufferC{DataC.data(), range<1>{DataC.size()}}; + exp_ext::command_graph Graph{ + Queue.get_context(), + Queue.get_device(), + {exp_ext::property::graph::assume_buffer_outlives_graph{}, + exp_ext::property::graph::assume_data_outlives_buffer{}}}; { // Create some temporary buffers only for adding nodes @@ -43,6 +50,10 @@ int main() { CGH.ext_oneapi_graph(GraphExec); }); } + + Queue.copy(BufferA.get_access(), DataA.data()); + Queue.copy(BufferB.get_access(), DataB.data()); + Queue.copy(BufferC.get_access(), DataC.data()); // Perform a wait on all graph submissions. Queue.wait_and_throw(); } diff --git a/sycl/test-e2e/Graph/RecordReplay/assume_buffer_outlives_graph_property.cpp b/sycl/test-e2e/Graph/RecordReplay/assume_buffer_outlives_graph_property.cpp new file mode 100644 index 0000000000000..1a9f955d95739 --- /dev/null +++ b/sycl/test-e2e/Graph/RecordReplay/assume_buffer_outlives_graph_property.cpp @@ -0,0 +1,11 @@ +// REQUIRES: level_zero, gpu +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using ZE_DEBUG +// RUN: %if ext_oneapi_level_zero %{env ZE_DEBUG=4 %{run} %t.out 2>&1 | FileCheck %s %} +// +// CHECK-NOT: LEAK + +#define GRAPH_E2E_RECORD_REPLAY + +#include "../Inputs/assume_buffer_outlives_graph_property.cpp" diff --git a/sycl/test-e2e/Graph/RecordReplay/assume_data_outlives_buffer_property.cpp b/sycl/test-e2e/Graph/RecordReplay/assume_data_outlives_buffer_property.cpp new file mode 100644 index 0000000000000..cd517364665b0 --- /dev/null +++ b/sycl/test-e2e/Graph/RecordReplay/assume_data_outlives_buffer_property.cpp @@ -0,0 +1,11 @@ +// REQUIRES: level_zero, gpu +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using ZE_DEBUG +// RUN: %if ext_oneapi_level_zero %{env ZE_DEBUG=4 %{run} %t.out 2>&1 | FileCheck %s %} +// +// CHECK-NOT: LEAK + +#define GRAPH_E2E_RECORD_REPLAY + +#include "../Inputs/assume_data_outlives_buffer_property.cpp" diff --git a/sycl/test-e2e/format.py b/sycl/test-e2e/format.py index 2a90ce4f8cd3f..7136b4adb510b 100644 --- a/sycl/test-e2e/format.py +++ b/sycl/test-e2e/format.py @@ -100,7 +100,7 @@ def execute(self, test, litConfig): # -that new tests by default would runnable there (unless they have # -other restrictions). substitutions.append(('%{build}', '%clangxx -fsycl -fsycl-targets=%{sycl_triple} %s')) - + compilation_cmd_pthread = "%clangxx -pthread -fsycl -fsycl-targets=%{sycl_triple} %s" substitutions.append(('%{build_pthread_inc}', compilation_cmd_pthread)) diff --git a/sycl/test/abi/sycl_symbols_linux.dump b/sycl/test/abi/sycl_symbols_linux.dump index d617ffbc1633e..bc682bb948d32 100644 --- a/sycl/test/abi/sycl_symbols_linux.dump +++ b/sycl/test/abi/sycl_symbols_linux.dump @@ -4373,6 +4373,7 @@ _ZNK4sycl3_V16detail16AccessorBaseHost13isPlaceholderEv _ZNK4sycl3_V16detail16AccessorBaseHost14getAccessRangeEv _ZNK4sycl3_V16detail16AccessorBaseHost14getMemoryRangeEv _ZNK4sycl3_V16detail16AccessorBaseHost15getMemoryObjectEv +_ZNK4sycl3_V16detail16AccessorBaseHost25isMemoryObjectUsedByGraphEv _ZNK4sycl3_V16detail16AccessorBaseHost6getPtrEv _ZNK4sycl3_V16detail16AccessorBaseHost9getOffsetEv _ZNK4sycl3_V16detail18device_image_plain10has_kernelERKNS0_9kernel_idE diff --git a/sycl/test/abi/sycl_symbols_windows.dump b/sycl/test/abi/sycl_symbols_windows.dump index b9182a5f7c8c7..d9b1d73c0041a 100644 --- a/sycl/test/abi/sycl_symbols_windows.dump +++ b/sycl/test/abi/sycl_symbols_windows.dump @@ -1289,16 +1289,18 @@ ?isBackendSupportedFillSize@handler@_V1@sycl@@CA_N_K@Z ?isConstOrGlobal@handler@_V1@sycl@@CA_NW4target@access@23@@Z ?isDeviceGlobalUsedInKernel@detail@_V1@sycl@@YA_NPEBX@Z -?isHostPointerReadOnly@SYCLMemObjT@detail@_V1@sycl@@UEBA_NXZ -?isImageOrImageArray@handler@_V1@sycl@@CA_NW4target@access@23@@Z -?isInterop@SYCLMemObjT@detail@_V1@sycl@@UEBA_NXZ -?isOutOfRange@detail@_V1@sycl@@YA_NV?$vec@H$03@23@W4addressing_mode@23@V?$range@$02@23@@Z -?isPathPresent@OSUtil@detail@_V1@sycl@@SA_NAEBV?$basic_string@DU?$char_traits@D@std@@V?$allocator@D@2@@std@@@Z -?isPlaceholder@AccessorBaseHost@detail@_V1@sycl@@QEBA_NXZ -?isStateExplicitKernelBundle@handler@_V1@sycl@@AEBA_NXZ -?isValidModeForDestinationAccessor@handler@_V1@sycl@@CA_NW4mode@access@23@@Z -?isValidModeForSourceAccessor@handler@_V1@sycl@@CA_NW4mode@access@23@@Z -?isValidTargetForExplicitOp@handler@_V1@sycl@@CA_NW4target@access@23@@Z +?isHostPointerReadOnly@SYCLMemObjT@detail@_V1@sycl@@UEBA_NXZ +?isImageOrImageArray@handler@_V1@sycl@@CA_NW4target@access@23@@Z +?isInterop@SYCLMemObjT@detail@_V1@sycl@@UEBA_NXZ +?isMemoryObjectUsedByGraph@AccessorBaseHost@detail@_V1@sycl@@QEBA_NXZ +?isOutOfRange@detail@_V1@sycl@@YA_NV?$vec@H$03@23@W4addressing_mode@23@V?$range@$02@23@@Z +?isPathPresent@OSUtil@detail@_V1@sycl@@SA_NAEBV?$basic_string@DU?$char_traits@D@std@@V?$allocator@D@2@@std@@@Z +?isPlaceholder@AccessorBaseHost@detail@_V1@sycl@@QEBA_NXZ +?isStateExplicitKernelBundle@handler@_V1@sycl@@AEBA_NXZ +?isUsedInGraph@SYCLMemObjT@detail@_V1@sycl@@QEBA_NXZ +?isValidModeForDestinationAccessor@handler@_V1@sycl@@CA_NW4mode@access@23@@Z +?isValidModeForSourceAccessor@handler@_V1@sycl@@CA_NW4mode@access@23@@Z +?isValidTargetForExplicitOp@handler@_V1@sycl@@CA_NW4target@access@23@@Z ?is_accelerator@device@_V1@sycl@@QEBA_NXZ ?is_compatible@_V1@sycl@@YA_NAEBV?$vector@Vkernel_id@_V1@sycl@@V?$allocator@Vkernel_id@_V1@sycl@@@std@@@std@@AEBVdevice@12@@Z ?is_cpu@device@_V1@sycl@@QEBA_NXZ @@ -1354,13 +1356,15 @@ ?malloc_shared@_V1@sycl@@YAPEAX_KAEBVqueue@12@AEBUcode_location@detail@12@@Z ?malloc_shared@_V1@sycl@@YAPEAX_KAEBVqueue@12@AEBVproperty_list@12@AEBUcode_location@detail@12@@Z ?map@MemoryManager@detail@_V1@sycl@@SAPEAXPEAVSYCLMemObjI@234@PEAXV?$shared_ptr@Vqueue_impl@detail@_V1@sycl@@@std@@W4mode@access@34@IV?$range@$02@34@4V?$id@$02@34@IV?$vector@PEAU_pi_event@@V?$allocator@PEAU_pi_event@@@std@@@7@AEAPEAU_pi_event@@@Z -?map_external_memory_array@experimental@oneapi@ext@_V1@sycl@@YA?AUimage_mem_handle@12345@Uinterop_mem_handle@12345@AEBUimage_descriptor@12345@AEBVdevice@45@AEBVcontext@45@@Z -?map_external_memory_array@experimental@oneapi@ext@_V1@sycl@@YA?AUimage_mem_handle@12345@Uinterop_mem_handle@12345@AEBUimage_descriptor@12345@AEBVqueue@45@@Z -?markAsInternal@SYCLMemObjT@detail@_V1@sycl@@QEAAXXZ -?markBufferAsInternal@detail@_V1@sycl@@YAXAEBV?$shared_ptr@Vbuffer_impl@detail@_V1@sycl@@@std@@@Z -?mem_advise@handler@_V1@sycl@@QEAAXPEBX_KH@Z -?mem_advise@queue@_V1@sycl@@QEAA?AVevent@23@PEBX_KHAEBUcode_location@detail@23@@Z -?mem_advise@queue@_V1@sycl@@QEAA?AVevent@23@PEBX_KHAEBV?$vector@Vevent@_V1@sycl@@V?$allocator@Vevent@_V1@sycl@@@std@@@std@@AEBUcode_location@detail@23@@Z +?map_external_memory_array@experimental@oneapi@ext@_V1@sycl@@YA?AUimage_mem_handle@12345@Uinterop_mem_handle@12345@AEBUimage_descriptor@12345@AEBVdevice@45@AEBVcontext@45@@Z +?map_external_memory_array@experimental@oneapi@ext@_V1@sycl@@YA?AUimage_mem_handle@12345@Uinterop_mem_handle@12345@AEBUimage_descriptor@12345@AEBVqueue@45@@Z +?markAsInternal@SYCLMemObjT@detail@_V1@sycl@@QEAAXXZ +?markBeingUsedInGraph@SYCLMemObjT@detail@_V1@sycl@@QEAAXXZ +?markBufferAsInternal@detail@_V1@sycl@@YAXAEBV?$shared_ptr@Vbuffer_impl@detail@_V1@sycl@@@std@@@Z +?markNoLongerBeingUsedInGraph@SYCLMemObjT@detail@_V1@sycl@@QEAAXXZ +?mem_advise@handler@_V1@sycl@@QEAAXPEBX_KH@Z +?mem_advise@queue@_V1@sycl@@QEAA?AVevent@23@PEBX_KHAEBUcode_location@detail@23@@Z +?mem_advise@queue@_V1@sycl@@QEAA?AVevent@23@PEBX_KHAEBV?$vector@Vevent@_V1@sycl@@V?$allocator@Vevent@_V1@sycl@@@std@@@std@@AEBUcode_location@detail@23@@Z ?mem_advise@queue@_V1@sycl@@QEAA?AVevent@23@PEBX_KHV423@AEBUcode_location@detail@23@@Z ?mem_advise@queue@_V1@sycl@@QEAA?AVevent@23@PEBX_KW4_pi_mem_advice@@AEBUcode_location@detail@23@@Z ?memcpy@handler@_V1@sycl@@QEAAXPEAXPEBX_K@Z @@ -1378,12 +1382,13 @@ ?memset@queue@_V1@sycl@@QEAA?AVevent@23@PEAXH_KAEBV?$vector@Vevent@_V1@sycl@@V?$allocator@Vevent@_V1@sycl@@@std@@@std@@AEBUcode_location@detail@23@@Z ?memset@queue@_V1@sycl@@QEAA?AVevent@23@PEAXH_KV423@AEBUcode_location@detail@23@@Z ?memset_2d_usm@MemoryManager@detail@_V1@sycl@@SAXPEAXV?$shared_ptr@Vqueue_impl@detail@_V1@sycl@@@std@@_K22DV?$vector@PEAU_pi_event@@V?$allocator@PEAU_pi_event@@@std@@@6@PEAPEAU_pi_event@@@Z -?message@SYCLCategory@detail@_V1@sycl@@UEBA?AV?$basic_string@DU?$char_traits@D@std@@V?$allocator@D@2@@std@@H@Z -?name@SYCLCategory@detail@_V1@sycl@@UEBAPEBDXZ -?native_specialization_constant@kernel_bundle_plain@detail@_V1@sycl@@QEBA_NXZ -?parallel_for@handler@_V1@sycl@@QEAAXV?$range@$00@23@Vkernel@23@@Z -?parallel_for@handler@_V1@sycl@@QEAAXV?$range@$01@23@Vkernel@23@@Z -?parallel_for@handler@_V1@sycl@@QEAAXV?$range@$02@23@Vkernel@23@@Z +?message@SYCLCategory@detail@_V1@sycl@@UEBA?AV?$basic_string@DU?$char_traits@D@std@@V?$allocator@D@2@@std@@H@Z +?name@SYCLCategory@detail@_V1@sycl@@UEBAPEBDXZ +?native_specialization_constant@kernel_bundle_plain@detail@_V1@sycl@@QEBA_NXZ +?needsWriteBack@SYCLMemObjT@detail@_V1@sycl@@QEBA_NXZ +?parallel_for@handler@_V1@sycl@@QEAAXV?$range@$00@23@Vkernel@23@@Z +?parallel_for@handler@_V1@sycl@@QEAAXV?$range@$01@23@Vkernel@23@@Z +?parallel_for@handler@_V1@sycl@@QEAAXV?$range@$02@23@Vkernel@23@@Z ?pitched_alloc_device@experimental@oneapi@ext@_V1@sycl@@YAPEAXPEA_KAEBUimage_descriptor@12345@AEBVdevice@45@AEBVcontext@45@@Z ?pitched_alloc_device@experimental@oneapi@ext@_V1@sycl@@YAPEAXPEA_KAEBUimage_descriptor@12345@AEBVqueue@45@@Z ?pitched_alloc_device@experimental@oneapi@ext@_V1@sycl@@YAPEAXPEA_K_K1IAEBVdevice@45@AEBVcontext@45@@Z diff --git a/sycl/unittests/Extensions/CommandGraph.cpp b/sycl/unittests/Extensions/CommandGraph.cpp index bc5982a6970f8..893d914230fde 100644 --- a/sycl/unittests/Extensions/CommandGraph.cpp +++ b/sycl/unittests/Extensions/CommandGraph.cpp @@ -386,7 +386,11 @@ class CommandGraphTest : public ::testing::Test { public: CommandGraphTest() : Mock{}, Plat{Mock.getPlatform()}, Dev{Plat.get_devices()[0]}, - Queue{Dev}, Graph{Queue.get_context(), Dev} {} + Queue{Dev}, + Graph{Queue.get_context(), + Dev, + {experimental::property::graph::assume_buffer_outlives_graph{}}} { + } protected: void SetUp() override {} @@ -1369,3 +1373,59 @@ TEST_F(MultiThreadGraphTest, Finalize) { ASSERT_EQ(checkExecGraphSchedule(GraphExecImpl, GraphExecRefImpl), true); } } + +TEST_F(CommandGraphTest, InvalidBuffer) { + // Check that using a buffer with write_back enabled in a graph will throw. + int Data; + // Create a buffer which does not have write-back disabled. + buffer Buffer{&Data, range<1>{1}}; + + // Use this buffer in the graph, this should throw. + ASSERT_THROW( + { + try { + Graph.add([&](handler &CGH) { + auto Acc = Buffer.get_access(CGH); + }); + } catch (const sycl::exception &e) { + ASSERT_EQ(e.code(), make_error_code(sycl::errc::invalid)); + throw; + } + }, + sycl::exception); +} + +TEST_F(CommandGraphTest, InvalidHostAccessor) { + // Check that creating a host_accessor on a buffer which is in use by a graph + // will throw. + + // Create a buffer which does not have write-back disabled. + buffer Buffer{range<1>{1}}; + + { + // Create a graph in local scope so we can destroy it + ext::oneapi::experimental::command_graph Graph{ + Queue.get_context(), + Queue.get_device(), + {experimental::property::graph::assume_buffer_outlives_graph{}}}; + + // Add the buffer to the graph. + Graph.add([&](handler &CGH) { + auto Acc = Buffer.get_access(CGH); + }); + + // Attempt to create a host_accessor, which should throw. + ASSERT_THROW( + { + try { + host_accessor HostAcc{Buffer}; + } catch (const sycl::exception &e) { + ASSERT_EQ(e.code(), make_error_code(sycl::errc::invalid)); + throw; + } + }, + sycl::exception); + } + // Graph is now out of scope so we should be able to create a host_accessor + ASSERT_NO_THROW({ host_accessor HostAcc{Buffer}; }); +}