From 1c16b5b7a40ee07cf788eae189fef762938a4bb0 Mon Sep 17 00:00:00 2001
From: Ewan Crawford
Date: Mon, 7 Oct 2024 13:01:38 +0100
Subject: [PATCH] [SYCL][Graph] Implement dynamic command-groups

Implement the Dynamic Command-Group feature specified in PR
[[SYCL][Graph] Add specification for kernel binary updates](https://github.com/intel/llvm/pull/14896)

This feature enables updating the `ur_kernel_handle_t` objects in graph nodes
between executions, as well as the parameters and execution range of nodes.
This functionality is currently supported on CUDA & HIP, which are used for
testing in the new E2E tests. Level Zero support will follow shortly, at which
point the `XFAIL` labels will be removed from the E2E tests.

The code for adding nodes to a graph has been refactored to split out
verification of edges, and marking of memory objects used in a node, into
separate helper functions. This allows the path for adding a dynamic
command-group node to perform these functions over each CG in the list before
creating the node itself.

The `dynamic_parameter_impl` code has also been refactored so that the code
for updating a dynamic parameter is shared between regular kernel nodes and
dynamic command-group nodes.

See the addition to the design doc for further details on the implementation.
---
 sycl/doc/design/CommandGraph.md               |  17 +
 .../sycl/ext/oneapi/experimental/graph.hpp    |  24 +
 sycl/include/sycl/handler.hpp                 |   1 +
 sycl/source/detail/graph_impl.cpp             | 438 +++++++++++++++---
 sycl/source/detail/graph_impl.hpp             | 254 +++++-----
 sycl/source/detail/scheduler/commands.cpp     |  67 ++-
 sycl/source/detail/scheduler/commands.hpp     |   5 +-
 .../source/detail/scheduler/graph_builder.cpp |  10 +-
 sycl/source/detail/scheduler/scheduler.cpp    |   7 +-
 sycl/source/detail/scheduler/scheduler.hpp    |  31 +-
 sycl/source/handler.cpp                       |  17 +-
 .../Graph/Update/dyn_cgf_accessor.cpp         |  60 +++
 .../Graph/Update/dyn_cgf_accessor_deps.cpp    |  74 +++
 .../Graph/Update/dyn_cgf_accessor_deps2.cpp   |  85 ++++
 .../Graph/Update/dyn_cgf_accessor_spv.cpp     |  81 ++++
 .../Graph/Update/dyn_cgf_event_deps.cpp       |  73 +++
 .../test-e2e/Graph/Update/dyn_cgf_ndrange.cpp |  71 +++
 .../Graph/Update/dyn_cgf_overwrite_range.cpp  |  59 +++
 .../Graph/Update/dyn_cgf_parameters.cpp       |  70 +++
 .../Graph/Update/dyn_cgf_shared_nodes.cpp     |  74 +++
 .../Update/dyn_cgf_update_before_finalize.cpp |  48 ++
 sycl/test-e2e/Graph/Update/dyn_cgf_usm.cpp    |  55 +++
 .../Graph/Update/dyn_cgf_with_dyn_param.cpp   | 121 +++++
 sycl/test/abi/sycl_symbols_linux.dump         |   5 +
 sycl/test/abi/sycl_symbols_windows.dump       |   9 +
 .../Extensions/CommandGraph/Exceptions.cpp    | 192 ++++++++
 26 files changed, 1719 insertions(+), 229 deletions(-)
 create mode 100644 sycl/test-e2e/Graph/Update/dyn_cgf_accessor.cpp
 create mode 100644 sycl/test-e2e/Graph/Update/dyn_cgf_accessor_deps.cpp
 create mode 100644 sycl/test-e2e/Graph/Update/dyn_cgf_accessor_deps2.cpp
 create mode 100644 sycl/test-e2e/Graph/Update/dyn_cgf_accessor_spv.cpp
 create mode 100644 sycl/test-e2e/Graph/Update/dyn_cgf_event_deps.cpp
 create mode 100644 sycl/test-e2e/Graph/Update/dyn_cgf_ndrange.cpp
 create mode 100644 sycl/test-e2e/Graph/Update/dyn_cgf_overwrite_range.cpp
 create mode 100644 sycl/test-e2e/Graph/Update/dyn_cgf_parameters.cpp
 create mode 100644 sycl/test-e2e/Graph/Update/dyn_cgf_shared_nodes.cpp
 create mode 100644 sycl/test-e2e/Graph/Update/dyn_cgf_update_before_finalize.cpp
 create mode 100644 sycl/test-e2e/Graph/Update/dyn_cgf_usm.cpp
 create mode 100644 sycl/test-e2e/Graph/Update/dyn_cgf_with_dyn_param.cpp

diff --git a/sycl/doc/design/CommandGraph.md b/sycl/doc/design/CommandGraph.md index
2f83d42a3c57c..24629e5d39976 100644 --- a/sycl/doc/design/CommandGraph.md +++ b/sycl/doc/design/CommandGraph.md @@ -282,6 +282,23 @@ requirements for these new accessors to correctly trigger allocations before updating. This is similar to how individual graph commands are enqueued when accessors are used in a graph node. +### Dynamic Command-Group + +To implement the `dynamic_command_group` class for updating the command-groups (CG) +associated with nodes, the CG member of the node implementation class changes +from a `std::unique_ptr` to a `std::shared_ptr` so that multiple nodes and the +`dynamic_command_group_impl` object can share the same CG object. This avoids +the overhead of having to allocate and free copies of the CG when a new active +CG is selected. + +The `dynamic_command_group_impl` class contains weak pointers to the nodes which +have been created with it, so that when a new active CG is selected it can +propagate the change to those nodes. The `node_impl` class also contains a +reference to the dynamic command-group that created it, so that when the graph +is finalized each node can use the list of kernels in its dynamic command-group +as part of the `urCommandBufferAppendKernelLaunchExp` call to pass the possible +alternative kernels. + ## Optimizations ### Interactions with Profiling diff --git a/sycl/include/sycl/ext/oneapi/experimental/graph.hpp b/sycl/include/sycl/ext/oneapi/experimental/graph.hpp index e15d5ed5a6b7a..44a89ce821821 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/graph.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/graph.hpp @@ -93,6 +93,7 @@ class node_impl; class graph_impl; class exec_graph_impl; class dynamic_parameter_impl; +class dynamic_command_group_impl; } // namespace detail enum class node_type { @@ -213,6 +214,23 @@ class depends_on_all_leaves : public ::sycl::detail::DataLessProperty< } // namespace node } // namespace property +class __SYCL_EXPORT dynamic_command_group { +public: + dynamic_command_group( + const command_graph &Graph, + const std::vector> &CGFList); + + size_t get_active_cgf() const; + void set_active_cgf(size_t Index); + +private: + template + friend const decltype(Obj::impl) & + sycl::detail::getSyclObjImpl(const Obj &SyclObject); + + std::shared_ptr impl; +}; + namespace detail { // Templateless modifiable command-graph base class. class __SYCL_EXPORT modifiable_command_graph { @@ -328,6 +346,12 @@ class __SYCL_EXPORT modifiable_command_graph { modifiable_command_graph(const std::shared_ptr &Impl) : impl(Impl) {} + /// Template-less implementation of add() for dynamic command-group nodes. + /// @param DynCGF Dynamic Command-group function object to add. + /// @param Dep List of predecessor nodes. + /// @return Node added to the graph. + node addImpl(dynamic_command_group &DynCGF, const std::vector &Dep); + /// Template-less implementation of add() for CGF nodes. /// @param CGF Command-group function to add. /// @param Dep List of predecessor nodes. 
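As a rough usage sketch of the `dynamic_command_group` API declared above, modelled on the E2E tests added later in this patch; the kernel bodies, sizes, and values here are illustrative placeholders, not part of the change:

```cpp
// Minimal sketch: create a node from a dynamic command-group, then switch
// the active CGF and update the executable graph between submissions.
#include <sycl/sycl.hpp>

namespace exp_ext = sycl::ext::oneapi::experimental;

int main() {
  sycl::queue Queue{};
  exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()};

  constexpr size_t N = 1024;
  int *Ptr = sycl::malloc_device<int>(N, Queue);

  // Two alternative command-group functions; their dependencies must match.
  auto CGFA = [&](sycl::handler &CGH) {
    CGH.parallel_for(N, [=](sycl::item<1> Item) { Ptr[Item.get_id()] = 1; });
  };
  auto CGFB = [&](sycl::handler &CGH) {
    CGH.parallel_for(N, [=](sycl::item<1> Item) { Ptr[Item.get_id()] = 2; });
  };

  // The CGF at index 0 is active by default and defines the node when added.
  auto DynamicCG = exp_ext::dynamic_command_group(Graph, {CGFA, CGFB});
  auto DynamicCGNode = Graph.add(DynamicCG);
  auto ExecGraph = Graph.finalize(exp_ext::property::graph::updatable{});

  Queue.ext_oneapi_graph(ExecGraph).wait(); // Executes CGFA.

  // Select a different active CGF, then update the executable graph node.
  DynamicCG.set_active_cgf(1);
  ExecGraph.update(DynamicCGNode);
  Queue.ext_oneapi_graph(ExecGraph).wait(); // Executes CGFB.

  sycl::free(Ptr, Queue);
  return 0;
}
```

Calling `set_active_cgf()` followed by `update()` on the executable graph is what exercises the kernel-update path described in the design-doc addition above.
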
diff --git a/sycl/include/sycl/handler.hpp b/sycl/include/sycl/handler.hpp index cda66914dcb06..98979cd010348 100644 --- a/sycl/include/sycl/handler.hpp +++ b/sycl/include/sycl/handler.hpp @@ -3341,6 +3341,7 @@ class __SYCL_EXPORT handler { size_t Size, bool Block = false); friend class ext::oneapi::experimental::detail::graph_impl; friend class ext::oneapi::experimental::detail::dynamic_parameter_impl; + friend class ext::oneapi::experimental::detail::dynamic_command_group_impl; bool DisableRangeRounding(); diff --git a/sycl/source/detail/graph_impl.cpp b/sycl/source/detail/graph_impl.cpp index 4df70fc78c1b1..0bc58f9e86e95 100644 --- a/sycl/source/detail/graph_impl.cpp +++ b/sycl/source/detail/graph_impl.cpp @@ -349,11 +349,76 @@ void graph_impl::removeRoot(const std::shared_ptr &Root) { MRoots.erase(Root); } -std::shared_ptr -graph_impl::add(const std::vector> &Dep) { - // Copy deps so we can modify them - auto Deps = Dep; +std::set> graph_impl::getCGEdges( + const std::shared_ptr &CommandGroup) const { + const auto &Requirements = CommandGroup->getRequirements(); + if (!MAllowBuffers && Requirements.size()) { + throw sycl::exception(make_error_code(errc::invalid), + "Cannot use buffers in a graph without passing the " + "assume_buffer_outlives_graph property on " + "Graph construction."); + } + + if (CommandGroup->getType() == sycl::detail::CGType::Kernel) { + auto CGKernel = + static_cast(CommandGroup.get()); + if (CGKernel->hasStreams()) { + throw sycl::exception( + make_error_code(errc::invalid), + "Using sycl streams in a graph node is unsupported."); + } + } + + // Add any nodes specified by event dependencies into the dependency list + std::set> UniqueDeps; + for (auto &Dep : CommandGroup->getEvents()) { + if (auto NodeImpl = MEventsMap.find(Dep); NodeImpl == MEventsMap.end()) { + throw sycl::exception(sycl::make_error_code(errc::invalid), + "Event dependency from handler::depends_on does " + "not correspond to a node within the graph"); + } else { + UniqueDeps.insert(NodeImpl->second); + } + } + + // A unique set of dependencies obtained by checking requirements and events + for (auto &Req : Requirements) { + // Look through the graph for nodes which share this requirement + for (auto &Node : MNodeStorage) { + if (Node->hasRequirementDependency(Req)) { + bool ShouldAddDep = true; + // If any of this node's successors have this requirement then we skip + // adding the current node as a dependency. 
+ for (auto &Succ : Node->MSuccessors) { + if (Succ.lock()->hasRequirementDependency(Req)) { + ShouldAddDep = false; + break; + } + } + if (ShouldAddDep) { + UniqueDeps.insert(Node); + } + } + } + } + + return std::move(UniqueDeps); +} + +void graph_impl::markCGMemObjs( + const std::shared_ptr &CommandGroup) { + const auto &Requirements = CommandGroup->getRequirements(); + for (auto &Req : Requirements) { + auto MemObj = static_cast(Req->MSYCLMemObj); + bool WasInserted = MMemObjs.insert(MemObj).second; + if (WasInserted) { + MemObj->markBeingUsedInGraph(); + } + } +} +std::shared_ptr +graph_impl::add(std::vector> &Deps) { const std::shared_ptr &NodeImpl = std::make_shared(); MNodeStorage.push_back(NodeImpl); @@ -367,7 +432,7 @@ graph_impl::add(const std::vector> &Dep) { std::shared_ptr graph_impl::add(std::function CGF, const std::vector &Args, - const std::vector> &Dep) { + std::vector> &Deps) { (void)Args; sycl::handler Handler{shared_from_this()}; @@ -398,7 +463,7 @@ graph_impl::add(std::function CGF, Handler.getType()); auto NodeImpl = - this->add(NodeType, std::move(Handler.impl->MGraphNodeCG), Dep); + this->add(NodeType, std::move(Handler.impl->MGraphNodeCG), Deps); NodeImpl->MNDRangeUsed = Handler.impl->MNDRangeUsed; // Add an event associated with this explicit node for mixed usage addEventForNode(std::make_shared(), NodeImpl); @@ -441,67 +506,15 @@ graph_impl::add(const std::vector Events) { std::shared_ptr graph_impl::add(node_type NodeType, - std::unique_ptr CommandGroup, - const std::vector> &Dep) { - // Copy deps so we can modify them - auto Deps = Dep; + std::shared_ptr CommandGroup, + std::vector> &Deps) { // A unique set of dependencies obtained by checking requirements and events - std::set> UniqueDeps; - const auto &Requirements = CommandGroup->getRequirements(); - if (!MAllowBuffers && Requirements.size()) { - throw sycl::exception(make_error_code(errc::invalid), - "Cannot use buffers in a graph without passing the " - "assume_buffer_outlives_graph property on " - "Graph construction."); - } + std::set> UniqueDeps = getCGEdges(CommandGroup); - if (CommandGroup->getType() == sycl::detail::CGType::Kernel) { - auto CGKernel = - static_cast(CommandGroup.get()); - if (CGKernel->hasStreams()) { - throw sycl::exception( - make_error_code(errc::invalid), - "Using sycl streams in a graph node is unsupported."); - } - } - - for (auto &Req : Requirements) { - // Track and mark the memory objects being used by the graph. - auto MemObj = static_cast(Req->MSYCLMemObj); - bool WasInserted = MMemObjs.insert(MemObj).second; - if (WasInserted) { - MemObj->markBeingUsedInGraph(); - } - // Look through the graph for nodes which share this requirement - for (auto &Node : MNodeStorage) { - if (Node->hasRequirementDependency(Req)) { - bool ShouldAddDep = true; - // If any of this node's successors have this requirement then we skip - // adding the current node as a dependency. - for (auto &Succ : Node->MSuccessors) { - if (Succ.lock()->hasRequirementDependency(Req)) { - ShouldAddDep = false; - break; - } - } - if (ShouldAddDep) { - UniqueDeps.insert(Node); - } - } - } - } + // Track and mark the memory objects being used by the graph. 
+ markCGMemObjs(CommandGroup); - // Add any nodes specified by event dependencies into the dependency list - for (auto &Dep : CommandGroup->getEvents()) { - if (auto NodeImpl = MEventsMap.find(Dep); NodeImpl != MEventsMap.end()) { - UniqueDeps.insert(NodeImpl->second); - } else { - throw sycl::exception(sycl::make_error_code(errc::invalid), - "Event dependency from handler::depends_on does " - "not correspond to a node within the graph"); - } - } // Add any deps determined from requirements and events into the dependency // list Deps.insert(Deps.end(), UniqueDeps.begin(), UniqueDeps.end()); @@ -515,6 +528,45 @@ graph_impl::add(node_type NodeType, return NodeImpl; } +std::shared_ptr +graph_impl::add(std::shared_ptr &DynCGImpl, + std::vector> &Deps) { + // Set of Dependent nodes based on CG event and accessor dependencies. + std::set> DynCGDeps = + getCGEdges(DynCGImpl->MKernels[0]); + for (unsigned i = 1; i < DynCGImpl->getNumCGs(); i++) { + auto &CG = DynCGImpl->MKernels[i]; + auto CGEdges = getCGEdges(CG); + if (CGEdges != DynCGDeps) { + throw sycl::exception(make_error_code(sycl::errc::invalid), + "Command-groups in dynamic command-group don't have" + "equivalent dependencies to other graph nodes."); + } + } + + // Track and mark the memory objects being used by the graph. + for (auto &CG : DynCGImpl->MKernels) { + markCGMemObjs(CG); + } + + // Get active dynamic command-group CG and use to create a node object + const auto &ActiveKernel = DynCGImpl->getActiveKernel(); + std::shared_ptr NodeImpl = + add(node_type::kernel, ActiveKernel, Deps); + + // Track whether sycl::nd-range or sycl::range was used. + NodeImpl->MNDRangeUsed = DynCGImpl->MNDRangeUsed; + + // Add an event associated with this explicit node for mixed usage + addEventForNode(std::make_shared(), NodeImpl); + + // Track the dynamic command-group used inside the node object + DynCGImpl->MNodes.push_back(NodeImpl); + NodeImpl->MDynCG = DynCGImpl; + + return NodeImpl; +} + bool graph_impl::clearQueues() { bool AnyQueuesCleared = false; for (auto &Queue : MRecordingQueues) { @@ -700,10 +752,19 @@ exec_graph_impl::enqueueNodeDirect(sycl::context Ctx, StreamID, InstanceID, CmdTraceEvent, xpti::trace_task_begin, nullptr); #endif + // For dynamic command-group nodes, get list of possible kernels + std::vector KernelAlternatives{}; + if (auto &DynCG = Node->MDynCG) { + for (auto &CG : DynCG->MKernels) { + KernelAlternatives.push_back( + static_cast(CG.get())); + } + } + ur_result_t Res = sycl::detail::enqueueImpCommandBufferKernel( Ctx, DeviceImpl, CommandBuffer, *static_cast((Node->MCommandGroup.get())), - Deps, &NewSyncPoint, &NewCommand, nullptr); + KernelAlternatives, Deps, &NewSyncPoint, &NewCommand, nullptr); MCommandMap[Node] = NewCommand; @@ -736,10 +797,19 @@ ur_exp_command_buffer_sync_point_t exec_graph_impl::enqueueNode( findRealDeps(Deps, N.lock(), MPartitionNodes[Node]); } + // For dynamic command-group nodes, get list of possible kernels + std::vector KernelAlternatives{}; + if (auto &DynCG = Node->MDynCG) { + for (auto &CG : DynCG->MKernels) { + KernelAlternatives.push_back( + static_cast(CG.get())); + } + } + sycl::detail::EventImplPtr Event = sycl::detail::Scheduler::getInstance().addCG( Node->getCGCopy(), AllocaQueue, /*EventNeeded=*/true, CommandBuffer, - Deps); + Deps, KernelAlternatives); MCommandMap[Node] = Event->getCommandBufferCommand(); return Event->getSyncPoint(); @@ -1378,6 +1448,7 @@ void exec_graph_impl::updateImpl(std::shared_ptr Node) { // Gather arg information from Node auto &ExecCG = 
*(static_cast(Node->MCommandGroup.get())); + // Copy args because we may modify them std::vector NodeArgs = ExecCG.getArguments(); // Copy NDR desc since we need to modify it @@ -1560,6 +1631,27 @@ modifiable_command_graph::modifiable_command_graph( : impl(std::make_shared( SyclQueue.get_context(), SyclQueue.get_device(), PropList)) {} +node modifiable_command_graph::addImpl(dynamic_command_group &DynCGF, + const std::vector &Deps) { + impl->throwIfGraphRecordingQueue("Explicit API \"Add()\" function"); + auto DynCGFImpl = sycl::detail::getSyclObjImpl(DynCGF); + + if (DynCGFImpl->MGraph != impl) { + throw sycl::exception(make_error_code(sycl::errc::invalid), + "Graph does not match the graph associated with " + "dynamic command-group."); + } + + std::vector> DepImpls; + for (auto &D : Deps) { + DepImpls.push_back(sycl::detail::getSyclObjImpl(D)); + } + + graph_impl::WriteLock Lock(impl->MMutex); + std::shared_ptr NodeImpl = impl->add(DynCGFImpl, DepImpls); + return sycl::detail::createSyclObjFromImpl(NodeImpl); +} + node modifiable_command_graph::addImpl(const std::vector &Deps) { impl->throwIfGraphRecordingQueue("Explicit API \"Add()\" function"); std::vector> DepImpls; @@ -1760,6 +1852,203 @@ void dynamic_parameter_base::updateAccessor( impl->updateAccessor(Acc); } +void dynamic_parameter_impl::updateValue(const raw_kernel_arg *NewRawValue, + size_t Size) { + // Number of bytes is taken from member of raw_kernel_arg object rather + // than using the size parameter which represents sizeof(raw_kernel_arg). + std::ignore = Size; + size_t RawArgSize = NewRawValue->MArgSize; + const void *RawArgData = NewRawValue->MArgData; + + for (auto &[NodeWeak, ArgIndex] : MNodes) { + auto NodeShared = NodeWeak.lock(); + if (NodeShared) { + dynamic_parameter_impl::updateCGArgValue( + NodeShared->MCommandGroup, ArgIndex, RawArgData, RawArgSize); + } + } + + for (auto &DynCGInfo : MDynCGs) { + auto DynCG = DynCGInfo.DynCG.lock(); + if (DynCG) { + auto &CG = DynCG->MKernels[DynCGInfo.CGIndex]; + dynamic_parameter_impl::updateCGArgValue(CG, DynCGInfo.ArgIndex, + RawArgData, RawArgSize); + } + } + + std::memcpy(MValueStorage.data(), RawArgData, RawArgSize); +} + +void dynamic_parameter_impl::updateValue(const void *NewValue, size_t Size) { + for (auto &[NodeWeak, ArgIndex] : MNodes) { + auto NodeShared = NodeWeak.lock(); + if (NodeShared) { + dynamic_parameter_impl::updateCGArgValue(NodeShared->MCommandGroup, + ArgIndex, NewValue, Size); + } + } + + for (auto &DynCGInfo : MDynCGs) { + auto DynCG = DynCGInfo.DynCG.lock(); + if (DynCG) { + auto &CG = DynCG->MKernels[DynCGInfo.CGIndex]; + dynamic_parameter_impl::updateCGArgValue(CG, DynCGInfo.ArgIndex, NewValue, + Size); + } + } + + std::memcpy(MValueStorage.data(), NewValue, Size); +} + +void dynamic_parameter_impl::updateAccessor( + const sycl::detail::AccessorBaseHost *Acc) { + for (auto &[NodeWeak, ArgIndex] : MNodes) { + auto NodeShared = NodeWeak.lock(); + // Should we fail here if the node isn't alive anymore? 
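+    // An expired node is currently skipped silently rather than reported as an error.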
+ if (NodeShared) { + dynamic_parameter_impl::updateCGAccessor(NodeShared->MCommandGroup, + ArgIndex, Acc); + } + } + + for (auto &DynCGInfo : MDynCGs) { + auto DynCG = DynCGInfo.DynCG.lock(); + if (DynCG) { + auto &CG = DynCG->MKernels[DynCGInfo.CGIndex]; + dynamic_parameter_impl::updateCGAccessor(CG, DynCGInfo.ArgIndex, Acc); + } + } + + std::memcpy(MValueStorage.data(), Acc, + sizeof(sycl::detail::AccessorBaseHost)); +} + +void dynamic_parameter_impl::updateCGArgValue( + std::shared_ptr &CG, int ArgIndex, const void *NewValue, + size_t Size) { + auto &Args = static_cast(CG.get())->MArgs; + for (auto &Arg : Args) { + if (Arg.MIndex != ArgIndex) { + continue; + } + assert(Arg.MSize == static_cast(Size)); + // MPtr may be a pointer into arg storage so we memcpy the contents of + // NewValue rather than assign it directly + std::memcpy(Arg.MPtr, NewValue, Size); + break; + } +} + +void dynamic_parameter_impl::updateCGAccessor( + std::shared_ptr &CG, int ArgIndex, + const sycl::detail::AccessorBaseHost *Acc) { + auto &Args = static_cast(CG.get())->MArgs; + + auto NewAccImpl = sycl::detail::getSyclObjImpl(*Acc); + for (auto &Arg : Args) { + if (Arg.MIndex != ArgIndex) { + continue; + } + assert(Arg.MType == sycl::detail::kernel_param_kind_t::kind_accessor); + + // Find old accessor in accessor storage and replace with new one + if (static_cast(NewAccImpl->MSYCLMemObj) + ->needsWriteBack()) { + throw sycl::exception( + make_error_code(errc::invalid), + "Accessors to buffers which have write_back enabled " + "are not allowed to be used in command graphs."); + } + + // All accessors passed to this function will be placeholders, so we must + // perform steps similar to what happens when handler::require() is + // called here. + sycl::detail::Requirement *NewReq = NewAccImpl.get(); + if (NewReq->MAccessMode != sycl::access_mode::read) { + auto SYCLMemObj = + static_cast(NewReq->MSYCLMemObj); + SYCLMemObj->handleWriteAccessorCreation(); + } + + for (auto &Acc : CG->getAccStorage()) { + if (auto OldAcc = static_cast(Arg.MPtr); + Acc.get() == OldAcc) { + Acc = NewAccImpl; + } + } + + for (auto &Req : CG->getRequirements()) { + if (auto OldReq = static_cast(Arg.MPtr); + Req == OldReq) { + Req = NewReq; + } + } + Arg.MPtr = NewAccImpl.get(); + break; + } +} + +dynamic_command_group_impl::dynamic_command_group_impl( + const command_graph &Graph) + : MGraph{sycl::detail::getSyclObjImpl(Graph)}, MActiveCGF(0), + MNDRangeUsed(false) {} + +void dynamic_command_group_impl::finalizeCGFList( + const std::vector> &CGFList) { + // True if kernels use sycl::nd_range, and false if using sycl::range + bool NDRangeUsed = false; + for (size_t CGFIndex = 0; CGFIndex < CGFList.size(); CGFIndex++) { + const auto &CGF = CGFList[CGFIndex]; + // Handler defined inside the loop so it doesn't appear to the runtime + // as a single command-group with multiple commands inside. 
+ sycl::handler Handler{MGraph}; + CGF(Handler); + + if (Handler.getType() != sycl::detail::CGType::Kernel) { + throw sycl::exception( + make_error_code(errc::invalid), + "The only type of command-groups that can be used in " + "dynamic command-groups is kernels."); + } + + Handler.finalize(); + + // Verify range/nd_range is consistent across CGs in list + if (CGFIndex == 0) { + NDRangeUsed = Handler.impl->MNDRangeUsed; + } else if (NDRangeUsed != Handler.impl->MNDRangeUsed) { + throw sycl::exception( + make_error_code(errc::invalid), + "Cannot mix sycl::range and sycl::nd_range kenels."); + } + + // Take detail::CG object from handler + MKernels.push_back(std::move(Handler.impl->MGraphNodeCG)); + + // Track dynamic_parameter usage in command-list + auto &DynamicParams = Handler.impl->MDynamicParameters; + for (auto &[DynamicParam, ArgIndex] : DynamicParams) { + DynamicParam->registerDynCG(shared_from_this(), CGFIndex, ArgIndex); + } + } + MNDRangeUsed = NDRangeUsed; +} + +void dynamic_command_group_impl::setActiveIndex(size_t Index) { + if (Index >= getNumCGs()) { + throw sycl::exception(sycl::make_error_code(errc::invalid), + "Index is out of range."); + } + MActiveCGF = Index; + + // Update nodes using the dynamic command-group to use the new active CG + for (auto &Node : MNodes) { + if (auto NodeSP = Node.lock()) { + NodeSP->MCommandGroup = getActiveKernel(); + } + } +} } // namespace detail node_type node::get_type() const { return impl->MNodeType; } @@ -1798,6 +2087,25 @@ template <> __SYCL_EXPORT void node::update_range<2>(range<2> Range) { template <> __SYCL_EXPORT void node::update_range<3>(range<3> Range) { impl->updateRange(Range); } + +dynamic_command_group::dynamic_command_group( + const command_graph &Graph, + const std::vector> &CGFList) + : impl(std::make_shared(Graph)) { + if (CGFList.empty()) { + throw sycl::exception(sycl::make_error_code(errc::invalid), + "Dynamic command-group cannot be created with an " + "empty CGF list."); + } + impl->finalizeCGFList(CGFList); +} + +size_t dynamic_command_group::get_active_cgf() const { + return impl->getActiveIndex(); +} +void dynamic_command_group::set_active_cgf(size_t Index) { + return impl->setActiveIndex(Index); +} } // namespace experimental } // namespace oneapi } // namespace ext diff --git a/sycl/source/detail/graph_impl.hpp b/sycl/source/detail/graph_impl.hpp index 4ee34830f39a2..a99e90894eb98 100644 --- a/sycl/source/detail/graph_impl.hpp +++ b/sycl/source/detail/graph_impl.hpp @@ -95,10 +95,12 @@ class node_impl : public std::enable_shared_from_this { /// User facing type of the node. node_type MNodeType = node_type::empty; /// Command group object which stores all args etc needed to enqueue the node - std::unique_ptr MCommandGroup; + std::shared_ptr MCommandGroup; /// Stores the executable graph impl associated with this node if it is a /// subgraph node. std::shared_ptr MSubGraphImpl; + /// Dynamic command-group object used in node, if any. + std::shared_ptr MDynCG; /// Used for tracking visited status during cycle checks. bool MVisited = false; @@ -143,10 +145,9 @@ class node_impl : public std::enable_shared_from_this { /// @param NodeType Type of the command-group. /// @param CommandGroup The CG which stores the command information for this /// node. 
- node_impl(node_type NodeType, - std::unique_ptr &&CommandGroup) + node_impl(node_type NodeType, std::shared_ptr CommandGroup) : MCGType(CommandGroup->getType()), MNodeType(NodeType), - MCommandGroup(std::move(CommandGroup)) { + MCommandGroup(CommandGroup) { if (NodeType == node_type::subgraph) { MSubGraphImpl = static_cast(MCommandGroup.get()) @@ -160,7 +161,7 @@ class node_impl : public std::enable_shared_from_this { : enable_shared_from_this(Other), MSuccessors(Other.MSuccessors), MPredecessors(Other.MPredecessors), MCGType(Other.MCGType), MNodeType(Other.MNodeType), MCommandGroup(Other.getCGCopy()), - MSubGraphImpl(Other.MSubGraphImpl) {} + MSubGraphImpl(Other.MSubGraphImpl), MDynCG(Other.MDynCG) {} /// Copy-assignment operator. This will perform a deep-copy of the /// command group object associated with this node. @@ -172,6 +173,7 @@ class node_impl : public std::enable_shared_from_this { MNodeType = Other.MNodeType; MCommandGroup = Other.getCGCopy(); MSubGraphImpl = Other.MSubGraphImpl; + MDynCG = Other.MDynCG; } return *this; } @@ -405,75 +407,6 @@ class node_impl : public std::enable_shared_from_this { return (ReqSrc->MDims > 1) || (ReqDst->MDims > 1); } - /// Update the value of an accessor inside this node. Accessors must be - /// handled specifically compared to other argument values. - /// @param ArgIndex The index of the accessor arg to be updated - /// @param Acc Pointer to the new accessor value - void updateAccessor(int ArgIndex, const sycl::detail::AccessorBaseHost *Acc) { - auto &Args = - static_cast(MCommandGroup.get())->MArgs; - auto NewAccImpl = sycl::detail::getSyclObjImpl(*Acc); - for (auto &Arg : Args) { - if (Arg.MIndex != ArgIndex) { - continue; - } - assert(Arg.MType == sycl::detail::kernel_param_kind_t::kind_accessor); - - // Find old accessor in accessor storage and replace with new one - if (static_cast(NewAccImpl->MSYCLMemObj) - ->needsWriteBack()) { - throw sycl::exception( - make_error_code(errc::invalid), - "Accessors to buffers which have write_back enabled " - "are not allowed to be used in command graphs."); - } - - // All accessors passed to this function will be placeholders, so we must - // perform steps similar to what happens when handler::require() is - // called here. - sycl::detail::Requirement *NewReq = NewAccImpl.get(); - if (NewReq->MAccessMode != sycl::access_mode::read) { - auto SYCLMemObj = - static_cast(NewReq->MSYCLMemObj); - SYCLMemObj->handleWriteAccessorCreation(); - } - - for (auto &Acc : MCommandGroup->getAccStorage()) { - if (auto OldAcc = - static_cast(Arg.MPtr); - Acc.get() == OldAcc) { - Acc = NewAccImpl; - } - } - - for (auto &Req : MCommandGroup->getRequirements()) { - if (auto OldReq = - static_cast(Arg.MPtr); - Req == OldReq) { - Req = NewReq; - } - } - Arg.MPtr = NewAccImpl.get(); - break; - } - } - - void updateArgValue(int ArgIndex, const void *NewValue, size_t Size) { - - auto &Args = - static_cast(MCommandGroup.get())->MArgs; - for (auto &Arg : Args) { - if (Arg.MIndex != ArgIndex) { - continue; - } - assert(Arg.MSize == static_cast(Size)); - // MPtr may be a pointer into arg storage so we memcpy the contents of - // NewValue rather than assign it directly - std::memcpy(Arg.MPtr, NewValue, Size); - break; - } - } - template void updateNDRange(nd_range ExecutionRange) { if (MCGType != sycl::detail::CGType::Kernel) { @@ -888,30 +821,40 @@ class graph_impl : public std::enable_shared_from_this { /// @param Root Node to remove from list of root nodes. 
void removeRoot(const std::shared_ptr &Root); + /// Verifies the CG is valid to add to the graph and returns set of + /// dependent nodes if so. + /// @param CommandGroup The command group to verify and retrieve edges for. + /// @return Set of dependent nodes in the graph. + std::set> + getCGEdges(const std::shared_ptr &CommandGroup) const; + + /// Identifies the sycl buffers used in the command-group and marks them + /// as used in the graph. + /// @param CommandGroup The command-group to check for buffer usage in. + void markCGMemObjs(const std::shared_ptr &CommandGroup); + /// Create a kernel node in the graph. /// @param NodeType User facing type of the node. /// @param CommandGroup The CG which stores all information for this node. - /// @param Dep Dependencies of the created node. + /// @param Deps Dependencies of the created node. /// @return Created node in the graph. - std::shared_ptr - add(node_type NodeType, std::unique_ptr CommandGroup, - const std::vector> &Dep = {}); + std::shared_ptr add(node_type NodeType, + std::shared_ptr CommandGroup, + std::vector> &Deps); /// Create a CGF node in the graph. /// @param CGF Command-group function to create node with. /// @param Args Node arguments. - /// @param Dep Dependencies of the created node. + /// @param Deps Dependencies of the created node. /// @return Created node in the graph. - std::shared_ptr - add(std::function CGF, - const std::vector &Args, - const std::vector> &Dep = {}); + std::shared_ptr add(std::function CGF, + const std::vector &Args, + std::vector> &Deps); /// Create an empty node in the graph. - /// @param Dep List of predecessor nodes. + /// @param Deps List of predecessor nodes. /// @return Created node in the graph. - std::shared_ptr - add(const std::vector> &Dep = {}); + std::shared_ptr add(std::vector> &Deps); /// Create an empty node in the graph. /// @param Events List of events associated to this node. @@ -919,6 +862,14 @@ class graph_impl : public std::enable_shared_from_this { std::shared_ptr add(const std::vector Events); + /// Create a dynamic command-group node in the graph. + /// @param DynCGImpl Dynamic command-group used to create node. + /// @param Deps List of predecessor nodes. + /// @return Created node in the graph. + std::shared_ptr + add(std::shared_ptr &DynCGImpl, + std::vector> &Deps); + /// Add a queue to the set of queues which are currently recording to this /// graph. /// @param RecordingQueue Queue to add to set. @@ -1236,7 +1187,12 @@ class graph_impl : public std::enable_shared_from_this { /// @param Node The node to add deps for /// @param Deps List of dependent nodes void addDepsToNode(std::shared_ptr Node, - const std::vector> &Deps) { + std::vector> &Deps) { + // Remove empty shared pointers from the list + auto EmptyElementIter = + std::remove(Deps.begin(), Deps.end(), std::shared_ptr()); + Deps.erase(EmptyElementIter, Deps.end()); + if (!Deps.empty()) { for (auto &N : Deps) { N->registerSuccessor(Node); @@ -1520,65 +1476,113 @@ class dynamic_parameter_impl { MNodes.emplace_back(NodeImpl, ArgIndex); } + /// Struct detailing an instance of the usage of the dynamic parameter in a + /// dynamic CG. + struct DynamicCGInfo { + /// Dynamic command-group that uses this dynamic parameter. + std::weak_ptr DynCG; + /// Index of the CG in the Dynamic CG that uses this dynamic parameter. + size_t CGIndex; + /// The arg index in the kernel the dynamic parameter is used. + int ArgIndex; + }; + + /// Registers a dynamic command-group with this dynamic parameter. 
+ /// @param DynCG The dynamic command-group to register. + /// @param CGIndex Index of the CG in DynCG using this dynamic parameter. + /// @param ArgIndex The arg index in the kernel the dynamic parameter is used. + void registerDynCG(std::shared_ptr DynCG, + size_t CGIndex, int ArgIndex) { + MDynCGs.emplace_back(DynamicCGInfo{DynCG, CGIndex, ArgIndex}); + } + /// Get a pointer to the internal value of this dynamic parameter void *getValue() { return MValueStorage.data(); } /// Update sycl_ext_oneapi_raw_kernel_arg parameter /// @param NewRawValue Pointer to a raw_kernel_arg object. /// @param Size Parameter is ignored. - void updateValue(const raw_kernel_arg *NewRawValue, size_t Size) { - // Number of bytes is taken from member of raw_kernel_arg object rather - // than using the size parameter which represents sizeof(raw_kernel_arg). - std::ignore = Size; - size_t RawArgSize = NewRawValue->MArgSize; - const void *RawArgData = NewRawValue->MArgData; - - for (auto &[NodeWeak, ArgIndex] : MNodes) { - auto NodeShared = NodeWeak.lock(); - if (NodeShared) { - NodeShared->updateArgValue(ArgIndex, RawArgData, RawArgSize); - } - } - std::memcpy(MValueStorage.data(), RawArgData, RawArgSize); - } + void updateValue(const raw_kernel_arg *NewRawValue, size_t Size); /// Update the internal value of this dynamic parameter as well as the value - /// of this parameter in all registered nodes. + /// of this parameter in all registered nodes and dynamic CGs. /// @param NewValue Pointer to the new value /// @param Size Size of the data pointer to by NewValue - void updateValue(const void *NewValue, size_t Size) { - for (auto &[NodeWeak, ArgIndex] : MNodes) { - auto NodeShared = NodeWeak.lock(); - if (NodeShared) { - NodeShared->updateArgValue(ArgIndex, NewValue, Size); - } - } - std::memcpy(MValueStorage.data(), NewValue, Size); - } + void updateValue(const void *NewValue, size_t Size); /// Update the internal value of this dynamic parameter as well as the value - /// of this parameter in all registered nodes. Should only be called for - /// accessor dynamic_parameters. + /// of this parameter in all registered nodes and dynamic CGs. Should only be + /// called for accessor dynamic_parameters. /// @param Acc The new accessor value - void updateAccessor(const sycl::detail::AccessorBaseHost *Acc) { - for (auto &[NodeWeak, ArgIndex] : MNodes) { - auto NodeShared = NodeWeak.lock(); - // Should we fail here if the node isn't alive anymore? - if (NodeShared) { - NodeShared->updateAccessor(ArgIndex, Acc); - } - } - std::memcpy(MValueStorage.data(), Acc, - sizeof(sycl::detail::AccessorBaseHost)); - } + void updateAccessor(const sycl::detail::AccessorBaseHost *Acc); + + /// Static helper function for updating command-group value arguments. + /// @param CG The command-group to update the argument information for. + /// @param ArgIndex The argument index to update. + /// @param NewValue Pointer to the new value. + /// @param Size Size of the data pointer to by NewValue + static void updateCGArgValue(std::shared_ptr &CG, + int ArgIndex, const void *NewValue, size_t Size); + + /// Static helper function for updating command-group accessor arguments. + /// @param CG The command-group to update the argument information for. + /// @param ArgIndex The argument index to update. 
+ /// @param Acc The new accessor value + static void updateCGAccessor(std::shared_ptr &CG, + int ArgIndex, + const sycl::detail::AccessorBaseHost *Acc); // Weak ptrs to node_impls which will be updated std::vector, int>> MNodes; + // Dynamic command-groups which will be updated + std::vector MDynCGs; std::shared_ptr MGraph; std::vector MValueStorage; }; +class dynamic_command_group_impl + : public std::enable_shared_from_this { +public: + dynamic_command_group_impl( + const command_graph &Graph); + + /// Returns the index of the active command-group + size_t getActiveIndex() const { return MActiveCGF; } + + /// Returns the number of CGs in the dynamic command-group. + size_t getNumCGs() const { return MKernels.size(); } + + /// Set the index of the active command-group. + /// @param Index The new index. + void setActiveIndex(size_t Index); + + /// Instantiates a command-group object for each CGF in the list. + /// @param CGFList List of CGFs to finalize with a handler into CG objects. + void + finalizeCGFList(const std::vector> &CGFList); + + /// Retrieve CG at the currently active index + /// @param Shared pointer to the active CG object. + std::shared_ptr getActiveKernel() const { + return MKernels[MActiveCGF]; + } + + /// Graph this dynamic command-group is associated with. + std::shared_ptr MGraph; + + /// Index of active command-group + std::atomic MActiveCGF; + + /// True if CG functions use sycl::nd_range, false if using sycl::range. + bool MNDRangeUsed; + + /// List of kernel command-groups for dynamic command-group nodes + std::vector> MKernels; + + /// List of nodes using this dynamic command-group. + std::vector> MNodes; +}; } // namespace detail } // namespace experimental } // namespace oneapi diff --git a/sycl/source/detail/scheduler/commands.cpp b/sycl/source/detail/scheduler/commands.cpp index 6556b8008b30a..7e49852381b51 100644 --- a/sycl/source/detail/scheduler/commands.cpp +++ b/sycl/source/detail/scheduler/commands.cpp @@ -1923,10 +1923,12 @@ static std::string_view cgTypeToString(detail::CGType Type) { ExecCGCommand::ExecCGCommand( std::unique_ptr CommandGroup, QueueImplPtr Queue, bool EventNeeded, ur_exp_command_buffer_handle_t CommandBuffer, - const std::vector &Dependencies) + const std::vector &Dependencies, + const std::vector &AlternativeKernels) : Command(CommandType::RUN_CG, std::move(Queue), CommandBuffer, Dependencies), - MEventNeeded(EventNeeded), MCommandGroup(std::move(CommandGroup)) { + MEventNeeded(EventNeeded), MCommandGroup(std::move(CommandGroup)), + MAlternativeKernels(AlternativeKernels) { if (MCommandGroup->getType() == detail::CGType::CodeplayHostTask) { MEvent->setSubmittedQueue( static_cast(MCommandGroup.get())->MQueue); @@ -2479,17 +2481,26 @@ ur_result_t enqueueImpCommandBufferKernel( context Ctx, DeviceImplPtr DeviceImpl, ur_exp_command_buffer_handle_t CommandBuffer, const CGExecKernel &CommandGroup, + const std::vector &AlternativeKernels, std::vector &SyncPoints, ur_exp_command_buffer_sync_point_t *OutSyncPoint, ur_exp_command_buffer_command_handle_t *OutCommand, const std::function &getMemAllocationFunc) { auto ContextImpl = sycl::detail::getSyclObjImpl(Ctx); const sycl::detail::AdapterPtr &Adapter = ContextImpl->getAdapter(); + + // UR kernel and program for 'CommandGroup' ur_kernel_handle_t UrKernel = nullptr; ur_program_handle_t UrProgram = nullptr; + + // Impl objects created when 'CommandGroup' is from a kernel bundle std::shared_ptr SyclKernelImpl = nullptr; std::shared_ptr DeviceImageImpl = nullptr; + // List of ur objects to be 
released after UR call + std::vector UrKernelsToRelease; + std::vector UrProgramsToRelease; + auto Kernel = CommandGroup.MSyclKernel; auto KernelBundleImplPtr = CommandGroup.MKernelBundle; const KernelArgMask *EliminatedArgMask = nullptr; @@ -2518,6 +2529,39 @@ ur_result_t enqueueImpCommandBufferKernel( std::tie(UrKernel, std::ignore, EliminatedArgMask, UrProgram) = sycl::detail::ProgramManager::getInstance().getOrCreateKernel( ContextImpl, DeviceImpl, CommandGroup.MKernelName); + UrKernelsToRelease.push_back(UrKernel); + UrProgramsToRelease.push_back(UrProgram); + } + + // Build up the list of UR kernel handles that the UR command could be + // updated to use. + std::vector AltUrKernels; + for (const auto &AltCGKernel : AlternativeKernels) { + ur_kernel_handle_t AltUrKernel = nullptr; + if (auto KernelBundleImplPtr = AltCGKernel->MKernelBundle; + KernelBundleImplPtr && !KernelBundleImplPtr->isInterop()) { + auto KernelName = AltCGKernel->MKernelName; + kernel_id KernelID = + detail::ProgramManager::getInstance().getSYCLKernelID(KernelName); + kernel SyclKernel = + KernelBundleImplPtr->get_kernel(KernelID, KernelBundleImplPtr); + AltUrKernel = detail::getSyclObjImpl(SyclKernel)->getHandleRef(); + } else if (AltCGKernel->MSyclKernel != nullptr) { + AltUrKernel = Kernel->getHandleRef(); + } else { + ur_program_handle_t UrProgram = nullptr; + std::tie(AltUrKernel, std::ignore, std::ignore, UrProgram) = + sycl::detail::ProgramManager::getInstance().getOrCreateKernel( + ContextImpl, DeviceImpl, AltCGKernel->MKernelName); + UrKernelsToRelease.push_back(AltUrKernel); + UrProgramsToRelease.push_back(UrProgram); + } + + if (AltUrKernel != UrKernel) { + // Don't include command-group 'CommandGroup' in the list to pass to UR, + // as this will be used for the primary ur kernel parameter. + AltUrKernels.push_back(AltUrKernel); + } } auto SetFunc = [&Adapter, &UrKernel, &DeviceImageImpl, &Ctx, @@ -2561,13 +2605,16 @@ ur_result_t enqueueImpCommandBufferKernel( ur_result_t Res = Adapter->call_nocheck( CommandBuffer, UrKernel, NDRDesc.Dims, &NDRDesc.GlobalOffset[0], - &NDRDesc.GlobalSize[0], LocalSize, 0, nullptr, SyncPoints.size(), - SyncPoints.size() ? SyncPoints.data() : nullptr, 0, nullptr, - OutSyncPoint, nullptr, OutCommand); + &NDRDesc.GlobalSize[0], LocalSize, AltUrKernels.size(), + AltUrKernels.size() ? AltUrKernels.data() : nullptr, + SyncPoints.size(), SyncPoints.size() ? 
SyncPoints.data() : nullptr, 0, + nullptr, OutSyncPoint, nullptr, OutCommand); - if (!SyclKernelImpl && !Kernel) { - Adapter->call(UrKernel); - Adapter->call(UrProgram); + for (auto &Kernel : UrKernelsToRelease) { + Adapter->call(Kernel); + } + for (auto &Program : UrProgramsToRelease) { + Adapter->call(Program); } if (Res != UR_RESULT_SUCCESS) { @@ -2779,8 +2826,8 @@ ur_result_t ExecCGCommand::enqueueImpCommandBuffer() { auto result = enqueueImpCommandBufferKernel( MQueue->get_context(), MQueue->getDeviceImplPtr(), MCommandBuffer, - *ExecKernel, MSyncPointDeps, &OutSyncPoint, &OutCommand, - getMemAllocationFunc); + *ExecKernel, MAlternativeKernels, MSyncPointDeps, &OutSyncPoint, + &OutCommand, getMemAllocationFunc); MEvent->setSyncPoint(OutSyncPoint); MEvent->setCommandBufferCommand(OutCommand); return result; diff --git a/sycl/source/detail/scheduler/commands.hpp b/sycl/source/detail/scheduler/commands.hpp index 1aecf5ed4eabb..dc61bf422b186 100644 --- a/sycl/source/detail/scheduler/commands.hpp +++ b/sycl/source/detail/scheduler/commands.hpp @@ -637,7 +637,8 @@ class ExecCGCommand : public Command { ExecCGCommand( std::unique_ptr CommandGroup, QueueImplPtr Queue, bool EventNeeded, ur_exp_command_buffer_handle_t CommandBuffer = nullptr, - const std::vector &Dependencies = {}); + const std::vector &Dependencies = {}, + const std::vector &AlternativeKernels = {}); std::vector> getAuxiliaryResources() const; @@ -674,6 +675,7 @@ class ExecCGCommand : public Command { AllocaCommandBase *getAllocaForReq(Requirement *Req); std::unique_ptr MCommandGroup; + std::vector MAlternativeKernels; friend class Command; }; @@ -733,6 +735,7 @@ ur_result_t enqueueImpCommandBufferKernel( context Ctx, DeviceImplPtr DeviceImpl, ur_exp_command_buffer_handle_t CommandBuffer, const CGExecKernel &CommandGroup, + const std::vector &AlternativeKernels, std::vector &SyncPoints, ur_exp_command_buffer_sync_point_t *OutSyncPoint, ur_exp_command_buffer_command_handle_t *OutCommand, diff --git a/sycl/source/detail/scheduler/graph_builder.cpp b/sycl/source/detail/scheduler/graph_builder.cpp index 5f95995e279d7..ecb4c558ac69e 100644 --- a/sycl/source/detail/scheduler/graph_builder.cpp +++ b/sycl/source/detail/scheduler/graph_builder.cpp @@ -921,13 +921,15 @@ Command *Scheduler::GraphBuilder::addCG( std::unique_ptr CommandGroup, const QueueImplPtr &Queue, std::vector &ToEnqueue, bool EventNeeded, ur_exp_command_buffer_handle_t CommandBuffer, - const std::vector &Dependencies) { + const std::vector &Dependencies, + const std::vector &AlternativeKernels) { + std::vector &Reqs = CommandGroup->getRequirements(); std::vector &Events = CommandGroup->getEvents(); - auto NewCmd = std::make_unique(std::move(CommandGroup), Queue, - EventNeeded, CommandBuffer, - std::move(Dependencies)); + auto NewCmd = std::make_unique( + std::move(CommandGroup), Queue, EventNeeded, CommandBuffer, + std::move(Dependencies), std::move(AlternativeKernels)); if (!NewCmd) throw exception(make_error_code(errc::memory_allocation), diff --git a/sycl/source/detail/scheduler/scheduler.cpp b/sycl/source/detail/scheduler/scheduler.cpp index ac1d8ca44c5dc..92358a2d5f1df 100644 --- a/sycl/source/detail/scheduler/scheduler.cpp +++ b/sycl/source/detail/scheduler/scheduler.cpp @@ -97,7 +97,9 @@ void Scheduler::waitForRecordToFinish(MemObjRecord *Record, EventImplPtr Scheduler::addCG( std::unique_ptr CommandGroup, const QueueImplPtr &Queue, bool EventNeeded, ur_exp_command_buffer_handle_t CommandBuffer, - const std::vector &Dependencies) { + const std::vector 
&Dependencies, + const std::vector &AlternativeKernels) { + EventImplPtr NewEvent = nullptr; const CGType Type = CommandGroup->getType(); std::vector AuxiliaryCmds; @@ -122,7 +124,8 @@ EventImplPtr Scheduler::addCG( default: NewCmd = MGraphBuilder.addCG(std::move(CommandGroup), std::move(Queue), AuxiliaryCmds, EventNeeded, CommandBuffer, - std::move(Dependencies)); + std::move(Dependencies), + std::move(AlternativeKernels)); } NewEvent = NewCmd->getEvent(); NewEvent->setSubmissionTime(); diff --git a/sycl/source/detail/scheduler/scheduler.hpp b/sycl/source/detail/scheduler/scheduler.hpp index c6d2d07600d12..fcd681aa0940c 100644 --- a/sycl/source/detail/scheduler/scheduler.hpp +++ b/sycl/source/detail/scheduler/scheduler.hpp @@ -372,13 +372,18 @@ class Scheduler { /// \param EventNeeded Specifies whether an event is explicitly required. /// \param CommandBuffer Optional command buffer to enqueue to instead of /// directly to the queue. - /// \param Dependencies Optional list of dependency - /// sync points when enqueuing to a command buffer. + /// \param Dependencies Optional list of dependencies to other command-buffer + /// sync points when enqueuing to a command buffer. Only valid to pass when + /// \p CommandBuffer is not null. + /// \param AlternativeKernels Optional list of kernels that the command can + /// be dynamically updated to. Only valid to pass when \p CommandBuffer is + /// not null. /// \return an event object to wait on for command group completion. EventImplPtr addCG( std::unique_ptr CommandGroup, const QueueImplPtr &Queue, bool EventNeeded, ur_exp_command_buffer_handle_t CommandBuffer = nullptr, - const std::vector &Dependencies = {}); + const std::vector &Dependencies = {}, + const std::vector &AlternativeKernels = {}); /// Registers a command group, that copies most recent memory to the memory /// pointed by the requirement. @@ -548,18 +553,22 @@ class Scheduler { /// \sa queue::submit, Scheduler::addCG /// \param CommandBuffer Optional command buffer to enqueue to instead of /// directly to the queue. - /// \param Dependencies Optional list of dependency - /// sync points when enqueuing to a command buffer. + /// \param Dependencies Optional list of dependency sync points when + /// enqueuing to a command buffer. + /// \param AlternativeKernels Optional list of kernels that the command can + /// dynamically be updated to. Only valid to pass when CommandBuffer is not + /// null. /// /// \return a command that represents command group execution and a bool /// indicating whether this command should be enqueued to the graph /// processor right away or not. - Command *addCG(std::unique_ptr CommandGroup, - const QueueImplPtr &Queue, std::vector &ToEnqueue, - bool EventNeeded, - ur_exp_command_buffer_handle_t CommandBuffer = nullptr, - const std::vector - &Dependencies = {}); + Command * + addCG(std::unique_ptr CommandGroup, const QueueImplPtr &Queue, + std::vector &ToEnqueue, bool EventNeeded, + ur_exp_command_buffer_handle_t CommandBuffer = nullptr, + const std::vector &Dependencies = + {}, + const std::vector &AlternativeKernels = {}); /// Registers a \ref CG "command group" that updates host memory to the /// latest state. diff --git a/sycl/source/handler.cpp b/sycl/source/handler.cpp index 12f38de0c1c8e..218c05efeb951 100644 --- a/sycl/source/handler.cpp +++ b/sycl/source/handler.cpp @@ -554,11 +554,9 @@ event handler::finalize() { // Find the last node added to the graph from this queue, so our new // node can set it as a predecessor. 
auto DependentNode = GraphImpl->getLastInorderNode(MQueue); - - NodeImpl = DependentNode - ? GraphImpl->add(NodeType, std::move(CommandGroup), - {DependentNode}) - : GraphImpl->add(NodeType, std::move(CommandGroup)); + std::vector> + Deps = {DependentNode}; + NodeImpl = GraphImpl->add(NodeType, std::move(CommandGroup), Deps); // If we are recording an in-order queue remember the new node, so it // can be used as a dependency for any more nodes recorded from this @@ -566,12 +564,9 @@ event handler::finalize() { GraphImpl->setLastInorderNode(MQueue, NodeImpl); } else { auto LastBarrierRecordedFromQueue = GraphImpl->getBarrierDep(MQueue); - if (LastBarrierRecordedFromQueue) { - NodeImpl = GraphImpl->add(NodeType, std::move(CommandGroup), - {LastBarrierRecordedFromQueue}); - } else { - NodeImpl = GraphImpl->add(NodeType, std::move(CommandGroup)); - } + std::vector> + Deps = {LastBarrierRecordedFromQueue}; + NodeImpl = GraphImpl->add(NodeType, std::move(CommandGroup), Deps); if (NodeImpl->MCGType == sycl::detail::CGType::Barrier) { GraphImpl->setBarrierDep(MQueue, NodeImpl); diff --git a/sycl/test-e2e/Graph/Update/dyn_cgf_accessor.cpp b/sycl/test-e2e/Graph/Update/dyn_cgf_accessor.cpp new file mode 100644 index 0000000000000..2b5f378d8bed7 --- /dev/null +++ b/sycl/test-e2e/Graph/Update/dyn_cgf_accessor.cpp @@ -0,0 +1,60 @@ +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} +// Extra run to check for immediate-command-list in Level Zero +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} + +// XFAIL: level_zero +// XFAIL-TRACKER: OFNAAO-307 + +// Tests using dynamic command-group objects with buffer accessors + +#include "../graph_common.hpp" + +int main() { + queue Queue{}; + const size_t N = 1024; + std::vector HostData(N, 0); + buffer Buf{HostData}; + Buf.set_write_back(false); + auto Acc = Buf.get_access(); + + exp_ext::command_graph Graph{ + Queue.get_context(), + Queue.get_device(), + {exp_ext::property::graph::assume_buffer_outlives_graph{}}}; + + int PatternA = 42; + auto CGFA = [&](handler &CGH) { + CGH.require(Acc); + CGH.parallel_for(N, [=](item<1> Item) { Acc[Item.get_id()] = PatternA; }); + }; + + int PatternB = 0xA; + auto CGFB = [&](handler &CGH) { + CGH.require(Acc); + CGH.parallel_for(N, [=](item<1> Item) { Acc[Item.get_id()] = PatternB; }); + }; + + auto DynamicCG = exp_ext::dynamic_command_group(Graph, {CGFA, CGFB}); + auto DynamicCGNode = Graph.add(DynamicCG); + auto ExecGraph = Graph.finalize(exp_ext::property::graph::updatable{}); + + Queue.ext_oneapi_graph(ExecGraph).wait(); + Queue.copy(Acc, HostData.data()).wait(); + for (size_t i = 0; i < N; i++) { + assert(HostData[i] == PatternA); + } + + DynamicCG.set_active_cgf(1); + ExecGraph.update(DynamicCGNode); + Queue.ext_oneapi_graph(ExecGraph).wait(); + + Queue.copy(Acc, HostData.data()).wait(); + for (size_t i = 0; i < N; i++) { + assert(HostData[i] == PatternB); + } + + return 0; +} diff --git a/sycl/test-e2e/Graph/Update/dyn_cgf_accessor_deps.cpp b/sycl/test-e2e/Graph/Update/dyn_cgf_accessor_deps.cpp new file mode 100644 index 0000000000000..5ce7a4bf40df1 --- /dev/null +++ b/sycl/test-e2e/Graph/Update/dyn_cgf_accessor_deps.cpp @@ -0,0 +1,74 @@ +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out 
+// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} +// Extra run to check for immediate-command-list in Level Zero +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} + +// XFAIL: level_zero +// XFAIL-TRACKER: OFNAAO-307 + +// Tests adding a dynamic command-group node to a graph using buffer +// accessors for the node edges. + +#include "../graph_common.hpp" + +int main() { + queue Queue{}; + const size_t N = 1024; + int *Ptr = (int *)sycl::malloc_device(N, Queue); + std::vector HostData(N, 0); + buffer Buf{HostData}; + Buf.set_write_back(false); + + exp_ext::command_graph Graph{ + Queue.get_context(), + Queue.get_device(), + {exp_ext::property::graph::assume_buffer_outlives_graph{}}}; + + auto RootNode = Graph.add([&](handler &CGH) { + auto Acc = Buf.get_access(CGH); + CGH.parallel_for(N, [=](item<1> Item) { Acc[Item.get_id()] = 1; }); + }); + + int PatternA = 42; + auto CGFA = [&](handler &CGH) { + auto Acc = Buf.get_access(CGH); + CGH.parallel_for(N, [=](item<1> Item) { Acc[Item.get_id()] += PatternA; }); + }; + + int PatternB = 0xA; + auto CGFB = [&](handler &CGH) { + auto Acc = Buf.get_access(CGH); + CGH.parallel_for(N, [=](item<1> Item) { Acc[Item.get_id()] += PatternB; }); + }; + + auto DynamicCG = exp_ext::dynamic_command_group(Graph, {CGFA, CGFB}); + auto DynamicCGNode = Graph.add(DynamicCG); + + auto LeafNode = Graph.add([&](handler &CGH) { + auto Acc = Buf.get_access(CGH); + CGH.parallel_for( + N, [=](item<1> Item) { Ptr[Item.get_id()] = Acc[Item.get_id()]; }); + }); + auto ExecGraph = Graph.finalize(exp_ext::property::graph::updatable{}); + + Queue.ext_oneapi_graph(ExecGraph).wait(); + Queue.copy(Ptr, HostData.data(), N).wait(); + for (size_t i = 0; i < N; i++) { + assert(HostData[i] == (PatternA + 1)); + } + + DynamicCG.set_active_cgf(1); + ExecGraph.update(DynamicCGNode); + + Queue.ext_oneapi_graph(ExecGraph).wait(); + Queue.copy(Ptr, HostData.data(), N).wait(); + for (size_t i = 0; i < N; i++) { + assert(HostData[i] == (PatternB + 1)); + } + + sycl::free(Ptr, Queue); + + return 0; +} diff --git a/sycl/test-e2e/Graph/Update/dyn_cgf_accessor_deps2.cpp b/sycl/test-e2e/Graph/Update/dyn_cgf_accessor_deps2.cpp new file mode 100644 index 0000000000000..8d50b8b26e0c2 --- /dev/null +++ b/sycl/test-e2e/Graph/Update/dyn_cgf_accessor_deps2.cpp @@ -0,0 +1,85 @@ +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} +// Extra run to check for immediate-command-list in Level Zero +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} + +// XFAIL: level_zero +// XFAIL-TRACKER: OFNAAO-307 + +// Tests adding a dynamic command-group node to a graph using buffer +// accessors for the node edges, but where different command-groups +// use different buffers that create identical edges. 
+ +#include "../graph_common.hpp" + +int main() { + queue Queue{}; + const size_t N = 1024; + int *Ptr = (int *)sycl::malloc_device(N, Queue); + std::vector HostData(N, 0); + buffer BufA{sycl::range<1>(N)}; + buffer BufB{sycl::range<1>(N)}; + BufA.set_write_back(false); + BufB.set_write_back(false); + + exp_ext::command_graph Graph{ + Queue.get_context(), + Queue.get_device(), + {exp_ext::property::graph::assume_buffer_outlives_graph{}}}; + + int InitA = 4; + int InitB = -4; + auto RootNode = Graph.add([&](handler &CGH) { + auto AccA = BufA.get_access(CGH); + auto AccB = BufB.get_access(CGH); + CGH.parallel_for(N, [=](item<1> Item) { + AccA[Item.get_id()] = InitA; + AccB[Item.get_id()] = InitB; + }); + }); + + int PatternA = 42; + auto CGFA = [&](handler &CGH) { + auto AccA = BufA.get_access(CGH); + CGH.parallel_for(N, [=](item<1> Item) { AccA[Item.get_id()] += PatternA; }); + }; + + int PatternB = 0xA; + auto CGFB = [&](handler &CGH) { + auto AccB = BufB.get_access(CGH); + CGH.parallel_for(N, [=](item<1> Item) { AccB[Item.get_id()] += PatternB; }); + }; + + auto DynamicCG = exp_ext::dynamic_command_group(Graph, {CGFA, CGFB}); + auto DynamicCGNode = Graph.add(DynamicCG); + + auto LeafNode = Graph.add([&](handler &CGH) { + auto AccA = BufA.get_access(CGH); + auto AccB = BufB.get_access(CGH); + CGH.parallel_for(N, [=](item<1> Item) { + Ptr[Item.get_id()] = AccA[Item.get_id()] + AccB[Item.get_id()]; + }); + }); + auto ExecGraph = Graph.finalize(exp_ext::property::graph::updatable{}); + + Queue.ext_oneapi_graph(ExecGraph).wait(); + Queue.copy(Ptr, HostData.data(), N).wait(); + for (size_t i = 0; i < N; i++) { + assert(HostData[i] == (InitA + InitB + PatternA)); + } + + DynamicCG.set_active_cgf(1); + ExecGraph.update(DynamicCGNode); + + Queue.ext_oneapi_graph(ExecGraph).wait(); + Queue.copy(Ptr, HostData.data(), N).wait(); + for (size_t i = 0; i < N; i++) { + assert(HostData[i] == (InitA + InitB + PatternB)); + } + + sycl::free(Ptr, Queue); + + return 0; +} diff --git a/sycl/test-e2e/Graph/Update/dyn_cgf_accessor_spv.cpp b/sycl/test-e2e/Graph/Update/dyn_cgf_accessor_spv.cpp new file mode 100644 index 0000000000000..80556f60fc75f --- /dev/null +++ b/sycl/test-e2e/Graph/Update/dyn_cgf_accessor_spv.cpp @@ -0,0 +1,81 @@ +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out %S/../Inputs/Kernels/update_with_indices_accessor.spv +// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 %{l0_leak_check} %{run} %t.out %S/../Inputs/Kernels/update_with_indices_accessor.spv 2>&1 | FileCheck %s --implicit-check-not=LEAK %} +// Extra run to check for immediate-command-list in Level Zero +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out %S/../Inputs/Kernels/update_with_indices_accessor.spv 2>&1 | FileCheck %s --implicit-check-not=LEAK %} + +// REQUIRES: level_zero +// XFAIL: level_zero +// XFAIL-TRACKER: OFNAAO-307 + +// Tests updating an accessor argument to a graph node created from SPIR-V +// using dynamic command-groups. 
+
+#include "../graph_common.hpp"
+
+int main(int, char **argv) {
+  queue Queue{};
+  sycl::kernel_bundle<sycl::bundle_state::executable> KernelBundle = loadKernelsFromFile(Queue, argv[1]);
+  const auto getKernel =
+      [](sycl::kernel_bundle<sycl::bundle_state::executable> &bundle,
+         const std::string &name) {
+        return bundle.ext_oneapi_get_kernel(name);
+      };
+
+  kernel Kernel = getKernel(
+      KernelBundle, "_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE_clES2_EUlvE_");
+
+  const size_t N = 1024;
+
+  exp_ext::command_graph Graph{
+      Queue.get_context(),
+      Queue.get_device(),
+      {exp_ext::property::graph::assume_buffer_outlives_graph{}}};
+  std::vector<int> HostDataA(N, 0);
+  std::vector<int> HostDataB(N, 0);
+
+  buffer<int> BufA{HostDataA};
+  buffer<int> BufB{HostDataB};
+  BufA.set_write_back(false);
+  BufB.set_write_back(false);
+
+  auto AccA = BufA.get_access();
+  auto AccB = BufB.get_access();
+
+  auto CGFA = [&](handler &CGH) {
+    CGH.require(AccA);
+    CGH.set_arg(0, AccA);
+    CGH.single_task(Kernel);
+  };
+
+  auto CGFB = [&](handler &CGH) {
+    CGH.require(AccB);
+    CGH.set_arg(0, AccB);
+    CGH.single_task(Kernel);
+  };
+
+  auto DynamicCG = exp_ext::dynamic_command_group(Graph, {CGFA, CGFB});
+  auto DynamicCGNode = Graph.add(DynamicCG);
+  auto ExecGraph = Graph.finalize(exp_ext::property::graph::updatable{});
+
+  Queue.ext_oneapi_graph(ExecGraph).wait();
+  Queue.copy(BufA.get_access(), HostDataA.data()).wait();
+  Queue.copy(BufB.get_access(), HostDataB.data()).wait();
+  for (size_t i = 0; i < N; i++) {
+    assert(HostDataA[i] == i);
+    assert(HostDataB[i] == 0);
+  }
+
+  DynamicCG.set_active_cgf(1);
+  ExecGraph.update(DynamicCGNode);
+
+  Queue.ext_oneapi_graph(ExecGraph).wait();
+  Queue.copy(BufA.get_access(), HostDataA.data()).wait();
+  Queue.copy(BufB.get_access(), HostDataB.data()).wait();
+  for (size_t i = 0; i < N; i++) {
+    assert(HostDataA[i] == i);
+    assert(HostDataB[i] == i);
+  }
+  return 0;
+}
diff --git a/sycl/test-e2e/Graph/Update/dyn_cgf_event_deps.cpp b/sycl/test-e2e/Graph/Update/dyn_cgf_event_deps.cpp
new file mode 100644
index 0000000000000..11e28a033a4c2
--- /dev/null
+++ b/sycl/test-e2e/Graph/Update/dyn_cgf_event_deps.cpp
@@ -0,0 +1,73 @@
+// RUN: %{build} -o %t.out
+// RUN: %{run} %t.out
+// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG
+// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}
+// Extra run to check for immediate-command-list in Level Zero
+// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}
+
+// XFAIL: level_zero
+// XFAIL-TRACKER: OFNAAO-307
+
+// Tests adding a dynamic command-group node to a graph using graph-limited
+// events for dependencies.
+
+#include "../graph_common.hpp"
+
+int main() {
+  queue Queue{};
+  exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()};
+
+  const size_t N = 1024;
+  int *PtrA = malloc_device<int>(N, Queue);
+  int *PtrB = malloc_device<int>(N, Queue);
+  int *PtrC = malloc_device<int>(N, Queue);
+  std::vector<int> HostData(N);
+
+  Graph.begin_recording(Queue);
+  int PatternA = 42;
+  auto EventA = Queue.fill(PtrA, PatternA, N);
+  int PatternB = 0xA;
+  auto EventB = Queue.fill(PtrB, PatternB, N);
+  Graph.end_recording(Queue);
+
+  auto CGFA = [&](handler &CGH) {
+    CGH.depends_on({EventA, EventB});
+    CGH.parallel_for(N, [=](item<1> Item) {
+      auto I = Item.get_id();
+      PtrC[I] = PtrA[I] * PtrB[I];
+    });
+  };
+
+  auto CGFB = [&](handler &CGH) {
+    CGH.depends_on({EventA, EventB});
+    CGH.parallel_for(N, [=](item<1> Item) {
+      auto I = Item.get_id();
+      PtrC[I] = PtrA[I] + PtrB[I];
+    });
+  };
+
+  auto DynamicCG = exp_ext::dynamic_command_group(Graph, {CGFA, CGFB});
+  auto DynamicCGNode = Graph.add(DynamicCG);
+  auto ExecGraph = Graph.finalize(exp_ext::property::graph::updatable{});
+
+  Queue.ext_oneapi_graph(ExecGraph).wait();
+  Queue.copy(PtrC, HostData.data(), N).wait();
+  for (size_t i = 0; i < N; i++) {
+    assert(HostData[i] == PatternA * PatternB);
+  }
+
+  DynamicCG.set_active_cgf(1);
+  ExecGraph.update(DynamicCGNode);
+
+  Queue.ext_oneapi_graph(ExecGraph).wait();
+  Queue.copy(PtrC, HostData.data(), N).wait();
+  for (size_t i = 0; i < N; i++) {
+    assert(HostData[i] == PatternA + PatternB);
+  }
+
+  sycl::free(PtrA, Queue);
+  sycl::free(PtrB, Queue);
+  sycl::free(PtrC, Queue);
+
+  return 0;
+}
diff --git a/sycl/test-e2e/Graph/Update/dyn_cgf_ndrange.cpp b/sycl/test-e2e/Graph/Update/dyn_cgf_ndrange.cpp
new file mode 100644
index 0000000000000..3b45f7dd3d58d
--- /dev/null
+++ b/sycl/test-e2e/Graph/Update/dyn_cgf_ndrange.cpp
@@ -0,0 +1,71 @@
+// RUN: %{build} -o %t.out
+// RUN: %{run} %t.out
+// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG
+// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}
+// Extra run to check for immediate-command-list in Level Zero
+// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}
+
+// XFAIL: level_zero
+// XFAIL-TRACKER: OFNAAO-307
+
+// Tests updating a dynamic command-group node where the dynamic command-groups
+// have different nd-ranges.
+
+#include "../graph_common.hpp"
+
+int main() {
+  queue Queue{};
+  exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()};
+
+  const size_t N = 1024;
+  int *Ptr = malloc_device<int>(N, Queue);
+  std::vector<int> HostData(N);
+
+  auto RootNode =
+      Graph.add([&](handler &cgh) { cgh.memset(Ptr, 0, N * sizeof(int)); });
+
+  int PatternA = 42;
+  sycl::range<1> RangeA{512};
+  auto CGFA = [&](handler &CGH) {
+    CGH.parallel_for(RangeA,
+                     [=](item<1> Item) { Ptr[Item.get_id()] = PatternA; });
+  };
+
+  int PatternB = 0xA;
+  sycl::range<1> RangeB{256};
+  auto CGFB = [&](handler &CGH) {
+    CGH.parallel_for(RangeB,
+                     [=](item<1> Item) { Ptr[Item.get_id()] = PatternB; });
+  };
+
+  auto DynamicCG = exp_ext::dynamic_command_group(Graph, {CGFA, CGFB});
+  auto DynamicCGNode =
+      Graph.add(DynamicCG, exp_ext::property::node::depends_on(RootNode));
+  auto ExecGraph = Graph.finalize(exp_ext::property::graph::updatable{});
+
+  Queue.ext_oneapi_graph(ExecGraph).wait();
+  Queue.copy(Ptr, HostData.data(), N).wait();
+  for (size_t i = 0; i < N; i++) {
+    if (i < RangeA.get(0)) {
+      assert(HostData[i] == PatternA);
+    } else {
+      assert(HostData[i] == 0);
+    }
+  }
+
+  DynamicCG.set_active_cgf(1);
+  ExecGraph.update(DynamicCGNode);
+
+  Queue.ext_oneapi_graph(ExecGraph).wait();
+  Queue.copy(Ptr, HostData.data(), N).wait();
+  for (size_t i = 0; i < N; i++) {
+    if (i < RangeB.get(0)) {
+      assert(HostData[i] == PatternB);
+    } else {
+      assert(HostData[i] == 0);
+    }
+  }
+
+  sycl::free(Ptr, Queue);
+  return 0;
+}
diff --git a/sycl/test-e2e/Graph/Update/dyn_cgf_overwrite_range.cpp b/sycl/test-e2e/Graph/Update/dyn_cgf_overwrite_range.cpp
new file mode 100644
index 0000000000000..3ba2500cd6189
--- /dev/null
+++ b/sycl/test-e2e/Graph/Update/dyn_cgf_overwrite_range.cpp
@@ -0,0 +1,59 @@
+// RUN: %{build} -o %t.out
+// RUN: %{run} %t.out
+// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG
+// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}
+// Extra run to check for immediate-command-list in Level Zero
+// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}
+
+// XFAIL: level_zero
+// XFAIL-TRACKER: OFNAAO-307
+
+// Tests how the nd-range of a node is overwritten by the active command-group.
+
+#include "../graph_common.hpp"
+
+int main() {
+  queue Queue{};
+  exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()};
+
+  const size_t N = 1024;
+  std::vector<int> HostData(N);
+  int *Ptr = malloc_device<int>(N, Queue);
+  Queue.memset(Ptr, 0, N * sizeof(int)).wait();
+
+  int PatternA = 42;
+  auto CGFA = [&](handler &CGH) {
+    CGH.parallel_for(N, [=](item<1> Item) { Ptr[Item.get_id()] = PatternA; });
+  };
+
+  int PatternB = 0xA;
+  auto CGFB = [&](handler &CGH) {
+    CGH.parallel_for(N, [=](item<1> Item) { Ptr[Item.get_id()] = PatternB; });
+  };
+
+  auto DynamicCG = exp_ext::dynamic_command_group(Graph, {CGFA, CGFB});
+  auto DynamicCGNode = Graph.add(DynamicCG);
+
+  size_t NewRange = 512;
+  sycl::range<1> UpdateRange(NewRange);
+  DynamicCGNode.update_range(UpdateRange);
+
+  DynamicCG.set_active_cgf(1);
+
+  // Check that the UpdateRange set while CGF 0 was active is preserved after
+  // switching the active CGF to 1 and back to 0.
+  DynamicCG.set_active_cgf(0);
+  auto ExecGraph = Graph.finalize(exp_ext::property::graph::updatable{});
+
+  Queue.ext_oneapi_graph(ExecGraph).wait();
+  Queue.copy(Ptr, HostData.data(), N).wait();
+  for (size_t i = 0; i < N; i++) {
+    if (i < NewRange) {
+      assert(HostData[i] == PatternA);
+    } else {
+      assert(HostData[i] == 0);
+    }
+  }
+
+  sycl::free(Ptr, Queue);
+  return 0;
+}
diff --git a/sycl/test-e2e/Graph/Update/dyn_cgf_parameters.cpp b/sycl/test-e2e/Graph/Update/dyn_cgf_parameters.cpp
new file mode 100644
index 0000000000000..0c46672869c7d
--- /dev/null
+++ b/sycl/test-e2e/Graph/Update/dyn_cgf_parameters.cpp
@@ -0,0 +1,70 @@
+// RUN: %{build} -o %t.out
+// RUN: %{run} %t.out
+// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG
+// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}
+// Extra run to check for immediate-command-list in Level Zero
+// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}
+
+// XFAIL: level_zero
+// XFAIL-TRACKER: OFNAAO-307
+
+// Tests updating kernel code using dynamic command-groups that have different
+// parameters in each command-group.
+
+#include "../graph_common.hpp"
+
+int main() {
+  queue Queue{};
+  exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()};
+
+  const size_t N = 1024;
+  int *PtrA = malloc_device<int>(N, Queue);
+  int *PtrB = malloc_device<int>(N, Queue);
+
+  std::vector<int> HostDataA(N);
+  std::vector<int> HostDataB(N);
+
+  Queue.memset(PtrA, 0, N * sizeof(int));
+  Queue.memset(PtrB, 0, N * sizeof(int));
+  Queue.wait();
+
+  int PatternA = 0xA;
+  auto CGFA = [&](handler &CGH) {
+    CGH.parallel_for(N, [=](item<1> Item) { PtrA[Item.get_id()] = PatternA; });
+  };
+
+  int PatternB = 42;
+  auto CGFB = [&](handler &CGH) {
+    CGH.parallel_for(N, [=](item<1> Item) { PtrB[Item.get_id()] = PatternB; });
+  };
+
+  auto DynamicCG = exp_ext::dynamic_command_group(Graph, {CGFA, CGFB});
+  auto DynamicCGNode = Graph.add(DynamicCG);
+  auto ExecGraph = Graph.finalize(exp_ext::property::graph::updatable{});
+
+  Queue.ext_oneapi_graph(ExecGraph).wait();
+  Queue.copy(PtrA, HostDataA.data(), N);
+  Queue.copy(PtrB, HostDataB.data(), N);
+  Queue.wait();
+  for (size_t i = 0; i < N; i++) {
+    assert(HostDataA[i] == PatternA);
+    assert(HostDataB[i] == 0);
+  }
+
+  DynamicCG.set_active_cgf(1);
+  ExecGraph.update(DynamicCGNode);
+
+  Queue.ext_oneapi_graph(ExecGraph).wait();
+  Queue.copy(PtrA, HostDataA.data(), N);
+  Queue.copy(PtrB, HostDataB.data(), N);
+  Queue.wait();
+  for (size_t i = 0; i < N; i++) {
+    assert(HostDataA[i] == PatternA);
+    assert(HostDataB[i] == PatternB);
+  }
+
+  sycl::free(PtrA, Queue);
+  sycl::free(PtrB, Queue);
+
+  return 0;
+}
diff --git a/sycl/test-e2e/Graph/Update/dyn_cgf_shared_nodes.cpp b/sycl/test-e2e/Graph/Update/dyn_cgf_shared_nodes.cpp
new file mode 100644
index 0000000000000..f9b0728d8ea67
--- /dev/null
+++ b/sycl/test-e2e/Graph/Update/dyn_cgf_shared_nodes.cpp
@@ -0,0 +1,74 @@
+// RUN: %{build} -o %t.out
+// RUN: %{run} %t.out
+// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG
+// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}
+// Extra run to check for immediate-command-list in Level Zero
+// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}
+
+// XFAIL: level_zero
+// XFAIL-TRACKER: OFNAAO-307
+
+// Tests using the same dynamic command-group in more than one graph node.
+
+#include "../graph_common.hpp"
+
+int main() {
+  queue Queue{};
+  exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()};
+
+  const size_t N = 1024;
+  std::vector<int> HostData(N);
+  int *Ptr = malloc_device<int>(N, Queue);
+
+  auto RootNode =
+      Graph.add([&](handler &CGH) { CGH.memset(Ptr, 0, N * sizeof(int)); });
+
+  int PatternA = 42;
+  auto CGFA = [&](handler &CGH) {
+    CGH.parallel_for(N, [=](item<1> Item) { Ptr[Item.get_id()] += PatternA; });
+  };
+
+  int PatternB = 0xA;
+  auto CGFB = [&](handler &CGH) {
+    CGH.parallel_for(N, [=](item<1> Item) { Ptr[Item.get_id()] += PatternB; });
+  };
+
+  auto DynamicCG = exp_ext::dynamic_command_group(Graph, {CGFA, CGFB});
+  auto Node1 =
+      Graph.add(DynamicCG, exp_ext::property::node::depends_on(RootNode));
+
+  auto Node2 = Graph.add(
+      [&](handler &cgh) {
+        cgh.parallel_for(N, [=](item<1> Item) { Ptr[Item.get_id()] *= 2; });
+      },
+      exp_ext::property::node::depends_on(Node1));
+
+  auto Node3 = Graph.add(DynamicCG, exp_ext::property::node::depends_on(Node2));
+
+  // This range update also affects Node1, as the range is stored in the command-group shared by both nodes.
+  sycl::range<1> Node3Range(512);
+  Node3.update_range(Node3Range);
+  auto ExecGraph = Graph.finalize(exp_ext::property::graph::updatable{});
+
+  Queue.ext_oneapi_graph(ExecGraph).wait();
+  Queue.copy(Ptr, HostData.data(), N).wait();
+  for (size_t i = 0; i < N; i++) {
+    int Ref = (i < Node3Range.get(0)) ? (PatternA * 3) : 0;
+    assert(HostData[i] == Ref);
+  }
+
+  DynamicCG.set_active_cgf(1);
+  ExecGraph.update(Node1);
+  ExecGraph.update(Node3);
+
+  Queue.ext_oneapi_graph(ExecGraph).wait();
+  Queue.copy(Ptr, HostData.data(), N).wait();
+  for (size_t i = 0; i < N; i++) {
+    int Ref = (PatternB * 3);
+    assert(HostData[i] == Ref);
+  }
+
+  sycl::free(Ptr, Queue);
+
+  return 0;
+}
diff --git a/sycl/test-e2e/Graph/Update/dyn_cgf_update_before_finalize.cpp b/sycl/test-e2e/Graph/Update/dyn_cgf_update_before_finalize.cpp
new file mode 100644
index 0000000000000..a9109d000eb17
--- /dev/null
+++ b/sycl/test-e2e/Graph/Update/dyn_cgf_update_before_finalize.cpp
@@ -0,0 +1,48 @@
+// RUN: %{build} -o %t.out
+// RUN: %{run} %t.out
+// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG
+// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}
+// Extra run to check for immediate-command-list in Level Zero
+// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}
+
+// XFAIL: level_zero
+// XFAIL-TRACKER: OFNAAO-307
+
+// Tests updating a dynamic command-group node after it has been added to
+// a graph but before the graph has been finalized.
+
+#include "../graph_common.hpp"
+
+int main() {
+  queue Queue{};
+  exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()};
+
+  const size_t N = 1024;
+  int *Ptr = malloc_device<int>(N, Queue);
+  std::vector<int> HostData(N);
+
+  int PatternA = 42;
+  auto CGFA = [&](handler &CGH) {
+    CGH.parallel_for(N, [=](item<1> Item) { Ptr[Item.get_id()] = PatternA; });
+  };
+
+  int PatternB = 0xA;
+  auto CGFB = [&](handler &CGH) {
+    CGH.parallel_for(N, [=](item<1> Item) { Ptr[Item.get_id()] = PatternB; });
+  };
+
+  auto DynamicCG = exp_ext::dynamic_command_group(Graph, {CGFA, CGFB});
+  auto DynamicCGNode = Graph.add(DynamicCG);
+  DynamicCG.set_active_cgf(1);
+  auto ExecGraph = Graph.finalize();
+
+  Queue.ext_oneapi_graph(ExecGraph).wait();
+  Queue.copy(Ptr, HostData.data(), N).wait();
+  for (size_t i = 0; i < N; i++) {
+    assert(HostData[i] == PatternB);
+  }
+
+  sycl::free(Ptr, Queue);
+
+  return 0;
+}
diff --git a/sycl/test-e2e/Graph/Update/dyn_cgf_usm.cpp b/sycl/test-e2e/Graph/Update/dyn_cgf_usm.cpp
new file mode 100644
index 0000000000000..79db8ebe67c57
--- /dev/null
+++ b/sycl/test-e2e/Graph/Update/dyn_cgf_usm.cpp
@@ -0,0 +1,55 @@
+// RUN: %{build} -o %t.out
+// RUN: %{run} %t.out
+// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG
+// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}
+// Extra run to check for immediate-command-list in Level Zero
+// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}
+
+// XFAIL: level_zero
+// XFAIL-TRACKER: OFNAAO-307
+
+// Tests updating USM kernel code using dynamic command-groups.
+
+#include "../graph_common.hpp"
+
+int main() {
+  queue Queue{};
+  exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()};
+
+  const size_t N = 1024;
+  int *Ptr = malloc_device<int>(N, Queue);
+  std::vector<int> HostData(N);
+
+  int PatternA = 42;
+  auto CGFA = [&](handler &CGH) {
+    CGH.parallel_for(N, [=](item<1> Item) { Ptr[Item.get_id()] = PatternA; });
+  };
+
+  int PatternB = 0xA;
+  auto CGFB = [&](handler &CGH) {
+    CGH.parallel_for(N, [=](item<1> Item) { Ptr[Item.get_id()] = PatternB; });
+  };
+
+  auto DynamicCG = exp_ext::dynamic_command_group(Graph, {CGFA, CGFB});
+  auto DynamicCGNode = Graph.add(DynamicCG);
+  auto ExecGraph = Graph.finalize(exp_ext::property::graph::updatable{});
+
+  Queue.ext_oneapi_graph(ExecGraph).wait();
+  Queue.copy(Ptr, HostData.data(), N).wait();
+  for (size_t i = 0; i < N; i++) {
+    assert(HostData[i] == PatternA);
+  }
+
+  DynamicCG.set_active_cgf(1);
+  ExecGraph.update(DynamicCGNode);
+
+  Queue.ext_oneapi_graph(ExecGraph).wait();
+  Queue.copy(Ptr, HostData.data(), N).wait();
+  for (size_t i = 0; i < N; i++) {
+    assert(HostData[i] == PatternB);
+  }
+
+  sycl::free(Ptr, Queue);
+
+  return 0;
+}
diff --git a/sycl/test-e2e/Graph/Update/dyn_cgf_with_dyn_param.cpp b/sycl/test-e2e/Graph/Update/dyn_cgf_with_dyn_param.cpp
new file mode 100644
index 0000000000000..cb9bdf15f76b8
--- /dev/null
+++ b/sycl/test-e2e/Graph/Update/dyn_cgf_with_dyn_param.cpp
@@ -0,0 +1,121 @@
+// RUN: %{build} -o %t.out
+// RUN: %{run} %t.out
+// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG
+// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}
+// Extra run to check for immediate-command-list in Level Zero
+// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}
+
+// XFAIL: level_zero
+// XFAIL-TRACKER: OFNAAO-307
+
+// Tests using a dynamic command-group object with dynamic parameters inside it.
+
+#include "../graph_common.hpp"
+
+int main() {
+  queue Queue{};
+  exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()};
+
+  const size_t N = 1024;
+  int *PtrA = malloc_device<int>(N, Queue);
+  int *PtrB = malloc_device<int>(N, Queue);
+  int *PtrC = malloc_device<int>(N, Queue);
+
+  std::vector<int> HostDataA(N);
+  std::vector<int> HostDataB(N);
+  std::vector<int> HostDataC(N);
+
+  exp_ext::dynamic_parameter DynParam1(Graph, PtrA);
+  exp_ext::dynamic_parameter DynParam2(Graph, PtrC);
+
+  auto CGFA = [&](handler &CGH) {
+    CGH.set_arg(0, DynParam1);
+    // TODO: Use the free function kernel extension instead of regular kernels
+    // when available.
+    CGH.single_task([=]() {
+      for (size_t i = 0; i < N; i++) {
+        PtrA[i] = i;
+      }
+    });
+  };
+
+  auto CGFB = [&](handler &CGH) {
+    CGH.set_arg(0, DynParam1);
+    // TODO: Use the free function kernel extension instead of regular kernels
+    // when available.
+    CGH.single_task([=]() {
+      for (size_t i = 0; i < N; i++) {
+        PtrA[i] = i;
+      }
+    });
+  };
+
+  auto CGFC = [&](handler &CGH) {
+    CGH.set_arg(0, DynParam2);
+    // TODO: Use the free function kernel extension instead of regular kernels
+    // when available.
+    CGH.single_task([=]() {
+      for (size_t i = 0; i < N; i++) {
+        PtrC[i] = i;
+      }
+    });
+  };
+
+  auto DynamicCG = exp_ext::dynamic_command_group(Graph, {CGFA, CGFB, CGFC});
+  auto DynamicCGNode = Graph.add(DynamicCG);
+  auto KernelNode = Graph.add(
+      [&](handler &CGH) {
+        CGH.set_arg(0, DynParam2);
+        // TODO: Use the free function kernel extension instead of regular
+        // kernels when available.
+ CGH.single_task([=]() { + for (size_t i = 0; i < N; i++) { + PtrC[i] += i; + } + }); + }, + exp_ext::property::node::depends_on(DynamicCGNode)); + + auto ExecGraph = Graph.finalize(exp_ext::property::graph::updatable{}); + + auto ExecuteGraphAndVerifyResults = [&](bool A, bool B, bool C) { + Queue.memset(PtrA, 0, N * sizeof(int)); + Queue.memset(PtrB, 0, N * sizeof(int)); + Queue.memset(PtrC, 0, N * sizeof(int)); + Queue.wait(); + + Queue.ext_oneapi_graph(ExecGraph).wait(); + + Queue.copy(PtrA, HostDataA.data(), N); + Queue.copy(PtrB, HostDataB.data(), N); + Queue.copy(PtrC, HostDataC.data(), N); + Queue.wait(); + + for (size_t i = 0; i < N; i++) { + assert(HostDataA[i] == (A ? i : 0)); + assert(HostDataB[i] == (B ? i : 0)); + assert(HostDataC[i] == (C ? (2 * i) : i)); + } + }; + ExecuteGraphAndVerifyResults(true, false, false); + + DynParam1.update(PtrB); + ExecGraph.update(DynamicCGNode); + ExecuteGraphAndVerifyResults(false, true, false); + + DynamicCG.set_active_cgf(1); + ExecGraph.update(DynamicCGNode); + ExecuteGraphAndVerifyResults(false, true, false); + + DynamicCG.set_active_cgf(2); + // Should be ignored as DynParam1 not used in active node + DynParam1.update(PtrA); + ExecGraph.update(DynamicCGNode); + ExecuteGraphAndVerifyResults(false, false, true); + + sycl::free(PtrA, Queue); + sycl::free(PtrB, Queue); + sycl::free(PtrC, Queue); + + return 0; +} diff --git a/sycl/test/abi/sycl_symbols_linux.dump b/sycl/test/abi/sycl_symbols_linux.dump index 2d1dcce2b68e2..990676baacd48 100644 --- a/sycl/test/abi/sycl_symbols_linux.dump +++ b/sycl/test/abi/sycl_symbols_linux.dump @@ -3023,6 +3023,9 @@ _ZN4sycl3_V13ext6oneapi12experimental20pitched_alloc_deviceEPmRKNS3_16image_desc _ZN4sycl3_V13ext6oneapi12experimental20pitched_alloc_deviceEPmRKNS3_16image_descriptorERKNS0_6deviceERKNS0_7contextE _ZN4sycl3_V13ext6oneapi12experimental20pitched_alloc_deviceEPmmmjRKNS0_5queueE _ZN4sycl3_V13ext6oneapi12experimental20pitched_alloc_deviceEPmmmjRKNS0_6deviceERKNS0_7contextE +_ZN4sycl3_V13ext6oneapi12experimental21dynamic_command_group14set_active_cgfEm +_ZN4sycl3_V13ext6oneapi12experimental21dynamic_command_groupC1ERKNS3_13command_graphILNS3_11graph_stateE0EEERKSt6vectorISt8functionIFvRNS0_7handlerEEESaISF_EE +_ZN4sycl3_V13ext6oneapi12experimental21dynamic_command_groupC2ERKNS3_13command_graphILNS3_11graph_stateE0EEERKSt6vectorISt8functionIFvRNS0_7handlerEEESaISF_EE _ZN4sycl3_V13ext6oneapi12experimental21get_composite_devicesEv _ZN4sycl3_V13ext6oneapi12experimental22get_image_channel_typeENS3_16image_mem_handleERKNS0_5queueE _ZN4sycl3_V13ext6oneapi12experimental22get_image_channel_typeENS3_16image_mem_handleERKNS0_6deviceERKNS0_7contextE @@ -3081,6 +3084,7 @@ _ZN4sycl3_V13ext6oneapi12experimental6detail24modifiable_command_graph15begin_re _ZN4sycl3_V13ext6oneapi12experimental6detail24modifiable_command_graph15begin_recordingERNS0_5queueERKNS0_13property_listE _ZN4sycl3_V13ext6oneapi12experimental6detail24modifiable_command_graph24addGraphLeafDependenciesENS3_4nodeE _ZN4sycl3_V13ext6oneapi12experimental6detail24modifiable_command_graph7addImplERKSt6vectorINS3_4nodeESaIS7_EE +_ZN4sycl3_V13ext6oneapi12experimental6detail24modifiable_command_graph7addImplERNS3_21dynamic_command_groupERKSt6vectorINS3_4nodeESaIS9_EE _ZN4sycl3_V13ext6oneapi12experimental6detail24modifiable_command_graph7addImplESt8functionIFvRNS0_7handlerEEERKSt6vectorINS3_4nodeESaISC_EE _ZN4sycl3_V13ext6oneapi12experimental6detail24modifiable_command_graph9make_edgeERNS3_4nodeES7_ 
_ZN4sycl3_V13ext6oneapi12experimental6detail24modifiable_command_graphC1ERKNS0_5queueERKNS0_13property_listE @@ -3596,6 +3600,7 @@ _ZNK4sycl3_V13ext6oneapi12experimental12physical_mem10get_deviceEv _ZNK4sycl3_V13ext6oneapi12experimental12physical_mem11get_contextEv _ZNK4sycl3_V13ext6oneapi12experimental12physical_mem3mapEmmNS3_19address_access_modeEm _ZNK4sycl3_V13ext6oneapi12experimental12physical_mem4sizeEv +_ZNK4sycl3_V13ext6oneapi12experimental21dynamic_command_group14get_active_cgfEv _ZNK4sycl3_V13ext6oneapi12experimental4node14get_successorsEv _ZNK4sycl3_V13ext6oneapi12experimental4node16get_predecessorsEv _ZNK4sycl3_V13ext6oneapi12experimental4node8get_typeEv diff --git a/sycl/test/abi/sycl_symbols_windows.dump b/sycl/test/abi/sycl_symbols_windows.dump index 33092af8477f5..0bf485466ec1d 100644 --- a/sycl/test/abi/sycl_symbols_windows.dump +++ b/sycl/test/abi/sycl_symbols_windows.dump @@ -326,6 +326,15 @@ ??0dynamic_parameter_base@detail@experimental@oneapi@ext@_V1@sycl@@QEAA@$$QEAV0123456@@Z ??0dynamic_parameter_base@detail@experimental@oneapi@ext@_V1@sycl@@QEAA@AEBV0123456@@Z ??0dynamic_parameter_base@detail@experimental@oneapi@ext@_V1@sycl@@QEAA@V?$command_graph@$0A@@23456@_KPEBX@Z +?get_active_cgf@dynamic_command_group@experimental@oneapi@ext@_V1@sycl@@QEBA_KXZ +??4dynamic_command_group@experimental@oneapi@ext@_V1@sycl@@QEAAAEAV012345@$$QEAV012345@@Z +?addImpl@modifiable_command_graph@detail@experimental@oneapi@ext@_V1@sycl@@IEAA?AVnode@34567@AEAVdynamic_command_group@34567@AEBV?$vector@Vnode@experimental@oneapi@ext@_V1@sycl@@V?$allocator@Vnode@experimental@oneapi@ext@_V1@sycl@@@std@@@std@@@Z +??0dynamic_command_group@experimental@oneapi@ext@_V1@sycl@@QEAA@AEBV?$command_graph@$0A@@12345@AEBV?$vector@V?$function@$$A6AXAEAVhandler@_V1@sycl@@@Z@std@@V?$allocator@V?$function@$$A6AXAEAVhandler@_V1@sycl@@@Z@std@@@2@@std@@@Z +??0dynamic_command_group@experimental@oneapi@ext@_V1@sycl@@QEAA@$$QEAV012345@@Z +??0dynamic_command_group@experimental@oneapi@ext@_V1@sycl@@QEAA@AEBV012345@@Z +?set_active_cgf@dynamic_command_group@experimental@oneapi@ext@_V1@sycl@@QEAAX_K@Z +??1dynamic_command_group@experimental@oneapi@ext@_V1@sycl@@QEAA@XZ +??4dynamic_command_group@experimental@oneapi@ext@_V1@sycl@@QEAAAEAV012345@AEBV012345@@Z ??0event@_V1@sycl@@AEAA@V?$shared_ptr@Vevent_impl@detail@_V1@sycl@@@std@@@Z ??0event@_V1@sycl@@QEAA@$$QEAV012@@Z ??0event@_V1@sycl@@QEAA@AEBV012@@Z diff --git a/sycl/unittests/Extensions/CommandGraph/Exceptions.cpp b/sycl/unittests/Extensions/CommandGraph/Exceptions.cpp index 1542b5f34d7dc..ca33711024e3c 100644 --- a/sycl/unittests/Extensions/CommandGraph/Exceptions.cpp +++ b/sycl/unittests/Extensions/CommandGraph/Exceptions.cpp @@ -700,3 +700,195 @@ TEST_F(CommandGraphTest, RecordingWrongGraphDep) { }), sycl::exception); } + +// Error when a dynamic command-group is used with a graph belonging to a +// different graph. +TEST_F(CommandGraphTest, DynamicCommandGroupWrongGraph) { + experimental::command_graph Graph1{Queue.get_context(), Queue.get_device()}; + experimental::command_graph Graph2{Queue.get_context(), Queue.get_device()}; + auto CGF = [&](sycl::handler &CGH) { + CGH.single_task>([]() {}); + }; + + experimental::dynamic_command_group DynCG(Graph2, {CGF}); + ASSERT_THROW(Graph1.add(DynCG), sycl::exception); +} + +// Error when mixing ND-range and range across the dynamic CG. Also check error +// when using incorrect update function for dynamic CG. 
+TEST_F(CommandGraphTest, DynamicCommandGroupMismatchNDRange) { + size_t N = 32; + experimental::command_graph Graph{Queue.get_context(), Queue.get_device()}; + + auto CGFA = [&](handler &CGH) { + CGH.parallel_for>(sycl::nd_range<1>({N * 2}, {N}), + [&](sycl::nd_item<1> it) {}); + }; + + auto CGFB = [&](handler &CGH) { + CGH.parallel_for>(sycl::range<1>(N), [&](sycl::item<1> it) {}); + }; + + auto CGFC = [&](handler &CGH) { + CGH.parallel_for>(sycl::range<1>(N), [&](sycl::item<1> it) {}); + }; + + ASSERT_THROW(experimental::dynamic_command_group DynCG(Graph, {CGFA, CGFB}), + sycl::exception); + + experimental::dynamic_command_group DynCG(Graph, {CGFC, CGFB}); + auto DynCGNode = Graph.add(DynCG); + + ASSERT_THROW(DynCGNode.update_nd_range(sycl::nd_range<1>({N * 2}, {N})), + sycl::exception); +} + +// Error when a non-kernel command-group is included in a dynamic command-group +TEST_F(CommandGraphTest, DynamicCommandGroupNotKernel) { + int *Ptr = malloc_device(1, Queue); + auto CGF = [&](sycl::handler &CGH) { CGH.memset(Ptr, 1, 0); }; + + experimental::command_graph Graph{Queue}; + ASSERT_THROW(experimental::dynamic_command_group DynCG(Graph, {CGF}), + sycl::exception); + sycl::free(Ptr, Queue); +} + +// Error if edges are not the same for all command-groups in dynamic command +// group, test using graph limited events to create edges +TEST_F(CommandGraphTest, DynamicCommandGroupMismatchEventEdges) { + size_t N = 32; + int *PtrA = malloc_device(N, Queue); + int *PtrB = malloc_device(N, Queue); + + experimental::command_graph Graph{Queue.get_context(), Queue.get_device()}; + + Graph.begin_recording(Queue); + + auto EventA = Queue.submit([&](handler &CGH) { + CGH.parallel_for(N, [=](item<1> Item) { PtrA[Item.get_id()] = 1; }); + }); + + auto EventB = Queue.submit([&](handler &CGH) { + CGH.parallel_for(N, [=](item<1> Item) { PtrB[Item.get_id()] = 4; }); + }); + + Graph.end_recording(); + + auto CGFA = [&](handler &CGH) { + CGH.depends_on(EventA); + CGH.parallel_for(N, [=](item<1> Item) { PtrA[Item.get_id()] += 2; }); + }; + + auto CGFB = [&](handler &CGH) { + CGH.depends_on(EventB); + CGH.parallel_for(N, [=](item<1> Item) { PtrB[Item.get_id()] += 0xA; }); + }; + + experimental::dynamic_command_group DynCG(Graph, {CGFA, CGFB}); + ASSERT_THROW(Graph.add(DynCG), sycl::exception); + + sycl::free(PtrA, Queue); + sycl::free(PtrB, Queue); +} + +// Test that an exception is thrown when a graph isn't created with buffer +// property, but buffers are used. +TEST_F(CommandGraphTest, DynamicCommandGroupBufferThrows) { + size_t N = 32; + std::vector HostData(N, 0); + buffer Buf{HostData}; + Buf.set_write_back(false); + + experimental::command_graph Graph{Queue.get_context(), Queue.get_device()}; + + auto CGFA = [&](handler &CGH) { + auto Acc = Buf.get_access(CGH); + CGH.parallel_for(N, [=](item<1> Item) { Acc[Item.get_id()] = 2; }); + }; + + auto CGFB = [&](handler &CGH) { + auto Acc = Buf.get_access(CGH); + CGH.parallel_for(N, [=](item<1> Item) { Acc[Item.get_id()] = 0xA; }); + }; + + experimental::dynamic_command_group DynCG(Graph, {CGFA, CGFB}); + ASSERT_THROW(Graph.add(DynCG), sycl::exception); +} + +// Test and exception is thrown when using a host-accessor to a buffer +// used in a non active CGF node in the graph. 
+TEST_F(CommandGraphTest, DynamicCommandGroupBufferHostAccThrows) { + size_t N = 32; + std::vector HostData(N, 0); + buffer Buf{HostData}; + Buf.set_write_back(false); + + int *Ptr = malloc_device(N, Queue); + + { + ext::oneapi::experimental::command_graph Graph{ + Queue.get_context(), + Queue.get_device(), + {experimental::property::graph::assume_buffer_outlives_graph{}}}; + + auto CGFA = [&](handler &CGH) { + CGH.parallel_for(N, [=](item<1> Item) { Ptr[Item.get_id()] = 2; }); + }; + + auto CGFB = [&](handler &CGH) { + auto Acc = Buf.get_access(CGH); + CGH.parallel_for(N, [=](item<1> Item) { Acc[Item.get_id()] = 0xA; }); + }; + + experimental::dynamic_command_group DynCG(Graph, {CGFA, CGFB}); + ASSERT_NO_THROW(Graph.add(DynCG)); + + ASSERT_THROW({ host_accessor HostAcc{Buf}; }, sycl::exception); + } + + sycl::free(Ptr, Queue); +} + +// Error if edges are not the same for all command-groups in dynamic command +// group, test using accessors to create edges +TEST_F(CommandGraphTest, DynamicCommandGroupMismatchAccessorEdges) { + size_t N = 32; + std::vector HostData(N, 0); + buffer BufA{HostData}; + buffer BufB{HostData}; + BufA.set_write_back(false); + BufB.set_write_back(false); + + experimental::command_graph Graph{ + Queue.get_context(), + Queue.get_device(), + {experimental::property::graph::assume_buffer_outlives_graph{}}}; + + Graph.begin_recording(Queue); + + Queue.submit([&](handler &CGH) { + auto AccA = BufA.get_access(CGH); + CGH.parallel_for(N, [=](item<1> Item) { AccA[Item.get_id()] = 1; }); + }); + + Queue.submit([&](handler &CGH) { + auto AccB = BufB.get_access(CGH); + CGH.parallel_for(N, [=](item<1> Item) { AccB[Item.get_id()] = 4; }); + }); + + Graph.end_recording(); + + auto CGFA = [&](handler &CGH) { + auto AccA = BufA.get_access(CGH); + CGH.parallel_for(N, [=](item<1> Item) { AccA[Item.get_id()] += 2; }); + }; + + auto CGFB = [&](handler &CGH) { + auto AccB = BufB.get_access(CGH); + CGH.parallel_for(N, [=](item<1> Item) { AccB[Item.get_id()] += 0xA; }); + }; + + experimental::dynamic_command_group DynCG(Graph, {CGFA, CGFB}); + ASSERT_THROW(Graph.add(DynCG), sycl::exception); +}