From b8bbdd6860aebb5fcf42a41d627a322b48d6bbfe Mon Sep 17 00:00:00 2001 From: Ewan Crawford Date: Mon, 7 Oct 2024 13:01:38 +0100 Subject: [PATCH] [SYCL][Graph] Implement dynamic command-groups Implement Dynamic Command-Group feature specified in PR [[SYCL][Graph] Add specification for kernel binary updates](https://github.com/intel/llvm/pull/14896) This feature enables updating `ur_kernel_handle_t` objects in graph nodes between executions as well as parameters and execution range of nodes. This functionality is currently supported on CUDA & HIP which are used for testing in the new E2E tests. Level Zero support will follow shortly, resulting in the removal of the `XFAIL` labels from the E2E tests. The code for adding nodes to a graph has been refactored to split out verification of edges, and marking memory objects used in a node, as separate helper functions. This allows path for adding a command-group node to do this functions over each CG in the list before creating the node itself. The `dynamic_parameter_impl` code has also been refactored so the code is shared for updating a dynamic parameter used in both a regular kernel node and a dynamic command-group node. See the addition to the design doc for further details on the implementation. --- sycl/doc/design/CommandGraph.md | 24 + .../sycl/ext/oneapi/experimental/graph.hpp | 24 + sycl/include/sycl/handler.hpp | 25 ++ sycl/source/detail/cg.hpp | 5 +- sycl/source/detail/graph_impl.cpp | 419 +++++++++++++++--- sycl/source/detail/graph_impl.hpp | 265 ++++++----- sycl/source/detail/handler_impl.hpp | 2 + sycl/source/detail/scheduler/commands.cpp | 62 ++- sycl/source/handler.cpp | 21 +- .../Graph/Update/dyn_cgf_accessor.cpp | 60 +++ .../Graph/Update/dyn_cgf_accessor_deps.cpp | 74 ++++ .../Graph/Update/dyn_cgf_accessor_deps2.cpp | 85 ++++ .../Graph/Update/dyn_cgf_accessor_spv.cpp | 81 ++++ .../Update/dyn_cgf_different_arg_nums.cpp | 148 +++++++ .../Graph/Update/dyn_cgf_event_deps.cpp | 73 +++ .../test-e2e/Graph/Update/dyn_cgf_ndrange.cpp | 72 +++ .../Graph/Update/dyn_cgf_ndrange_3D.cpp | 80 ++++ .../Graph/Update/dyn_cgf_overwrite_range.cpp | 59 +++ .../Graph/Update/dyn_cgf_parameters.cpp | 70 +++ .../Graph/Update/dyn_cgf_shared_nodes.cpp | 74 ++++ .../Update/dyn_cgf_update_before_finalize.cpp | 48 ++ sycl/test-e2e/Graph/Update/dyn_cgf_usm.cpp | 55 +++ .../Update/dyn_cgf_with_all_dyn_params.cpp | 121 +++++ ...dyn_cgf_with_different_type_dyn_params.cpp | 137 ++++++ .../Update/dyn_cgf_with_some_dyn_params.cpp | 107 +++++ .../Graph/Update/update_ndrange_to_range.cpp | 55 +++ .../Graph/Update/update_range_to_ndrange.cpp | 56 +++ .../Graph/Update/whole_update_dynamic_cgf.cpp | 75 ++++ .../Update/whole_update_dynamic_param.cpp | 2 - sycl/test/abi/sycl_symbols_linux.dump | 5 + sycl/test/abi/sycl_symbols_windows.dump | 9 + .../Extensions/CommandGraph/Exceptions.cpp | 163 +++++++ .../Extensions/CommandGraph/Update.cpp | 29 -- 33 files changed, 2332 insertions(+), 253 deletions(-) create mode 100644 sycl/test-e2e/Graph/Update/dyn_cgf_accessor.cpp create mode 100644 sycl/test-e2e/Graph/Update/dyn_cgf_accessor_deps.cpp create mode 100644 sycl/test-e2e/Graph/Update/dyn_cgf_accessor_deps2.cpp create mode 100644 sycl/test-e2e/Graph/Update/dyn_cgf_accessor_spv.cpp create mode 100644 sycl/test-e2e/Graph/Update/dyn_cgf_different_arg_nums.cpp create mode 100644 sycl/test-e2e/Graph/Update/dyn_cgf_event_deps.cpp create mode 100644 sycl/test-e2e/Graph/Update/dyn_cgf_ndrange.cpp create mode 100644 sycl/test-e2e/Graph/Update/dyn_cgf_ndrange_3D.cpp create mode 100644 sycl/test-e2e/Graph/Update/dyn_cgf_overwrite_range.cpp create mode 100644 sycl/test-e2e/Graph/Update/dyn_cgf_parameters.cpp create mode 100644 sycl/test-e2e/Graph/Update/dyn_cgf_shared_nodes.cpp create mode 100644 sycl/test-e2e/Graph/Update/dyn_cgf_update_before_finalize.cpp create mode 100644 sycl/test-e2e/Graph/Update/dyn_cgf_usm.cpp create mode 100644 sycl/test-e2e/Graph/Update/dyn_cgf_with_all_dyn_params.cpp create mode 100644 sycl/test-e2e/Graph/Update/dyn_cgf_with_different_type_dyn_params.cpp create mode 100644 sycl/test-e2e/Graph/Update/dyn_cgf_with_some_dyn_params.cpp create mode 100644 sycl/test-e2e/Graph/Update/update_ndrange_to_range.cpp create mode 100644 sycl/test-e2e/Graph/Update/update_range_to_ndrange.cpp create mode 100644 sycl/test-e2e/Graph/Update/whole_update_dynamic_cgf.cpp diff --git a/sycl/doc/design/CommandGraph.md b/sycl/doc/design/CommandGraph.md index 2f83d42a3c57c..d7587113a4615 100644 --- a/sycl/doc/design/CommandGraph.md +++ b/sycl/doc/design/CommandGraph.md @@ -282,6 +282,30 @@ requirements for these new accessors to correctly trigger allocations before updating. This is similar to how individual graph commands are enqueued when accessors are used in a graph node. +### Dynamic Command-Group + +To implement the `dynamic_command_group` class for updating the command-groups (CG) +associated with nodes, the CG member of the node implementation class changes +from a `std::unique_ptr` to a `std::shared_ptr` so that multiple nodes and the +`dynamic_command_group_impl` object can share the same CG object. This avoids +the overhead of having to allocate and free copies of the CG when a new active +CG is selected. + +The `dynamic_command_group_impl` class contains weak pointers to the nodes which +have been created with it, so that when a new active CG is selected it can +propagate the change to those nodes. The `node_impl` class also contains a +reference to the dynamic command-group that created it, so that when the graph +is finalized each node can use the list of kernels in its dynamic command-group +as part of the `urCommandBufferAppendKernelLaunchExp` call to pass the possible +alternative kernels. + +The `sycl::detail::CGExecKernel` class has been added to, so that if the +object was created from an element in the dynamic command-group list, the class +stores a vector of weak pointers to the other alternative command-groups created +from the same dynamic command-group object. This allows the DPC++ scheduler to +access the list of alternative kernels when calling the UR API to append a +kernel command to a command-buffer. + ## Optimizations ### Interactions with Profiling diff --git a/sycl/include/sycl/ext/oneapi/experimental/graph.hpp b/sycl/include/sycl/ext/oneapi/experimental/graph.hpp index d18cf3ebc4b3d..2bc3ef1d921ab 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/graph.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/graph.hpp @@ -96,6 +96,7 @@ class node_impl; class graph_impl; class exec_graph_impl; class dynamic_parameter_impl; +class dynamic_command_group_impl; } // namespace detail enum class node_type { @@ -216,6 +217,23 @@ class depends_on_all_leaves : public ::sycl::detail::DataLessProperty< } // namespace node } // namespace property +class __SYCL_EXPORT dynamic_command_group { +public: + dynamic_command_group( + const command_graph &Graph, + const std::vector> &CGFList); + + size_t get_active_cgf() const; + void set_active_cgf(size_t Index); + +private: + template + friend const decltype(Obj::impl) & + sycl::detail::getSyclObjImpl(const Obj &SyclObject); + + std::shared_ptr impl; +}; + namespace detail { // Templateless modifiable command-graph base class. class __SYCL_EXPORT modifiable_command_graph { @@ -337,6 +355,12 @@ class __SYCL_EXPORT modifiable_command_graph { modifiable_command_graph(const std::shared_ptr &Impl) : impl(Impl) {} + /// Template-less implementation of add() for dynamic command-group nodes. + /// @param DynCGF Dynamic Command-group function object to add. + /// @param Dep List of predecessor nodes. + /// @return Node added to the graph. + node addImpl(dynamic_command_group &DynCGF, const std::vector &Dep); + /// Template-less implementation of add() for CGF nodes. /// @param CGF Command-group function to add. /// @param Dep List of predecessor nodes. diff --git a/sycl/include/sycl/handler.hpp b/sycl/include/sycl/handler.hpp index cda66914dcb06..d1d8936eca677 100644 --- a/sycl/include/sycl/handler.hpp +++ b/sycl/include/sycl/handler.hpp @@ -1165,7 +1165,9 @@ class __SYCL_EXPORT handler { StoreLambda( std::move(Wrapper)); setType(detail::CGType::Kernel); +#ifndef __INTEL_PREVIEW_BREAKING_CHANGES setNDRangeUsed(false); +#endif #endif } else #endif // !__SYCL_DISABLE_PARALLEL_FOR_RANGE_ROUNDING__ && @@ -1188,8 +1190,10 @@ class __SYCL_EXPORT handler { StoreLambda( std::move(KernelFunc)); setType(detail::CGType::Kernel); +#ifndef __INTEL_PREVIEW_BREAKING_CHANGES setNDRangeUsed(false); #endif +#endif #else (void)KernelFunc; #endif // __SYCL_FORCE_PARALLEL_FOR_RANGE_ROUNDING__ @@ -1239,7 +1243,9 @@ class __SYCL_EXPORT handler { StoreLambda( std::move(KernelFunc)); setType(detail::CGType::Kernel); +#ifndef __INTEL_PREVIEW_BREAKING_CHANGES setNDRangeUsed(true); +#endif #endif } @@ -1262,7 +1268,9 @@ class __SYCL_EXPORT handler { setNDRangeDescriptor(std::move(NumWorkItems)); processLaunchProperties(Props); setType(detail::CGType::Kernel); +#ifndef __INTEL_PREVIEW_BREAKING_CHANGES setNDRangeUsed(false); +#endif extractArgsAndReqs(); MKernelName = getKernelName(); #endif @@ -1288,7 +1296,9 @@ class __SYCL_EXPORT handler { setNDRangeDescriptor(std::move(NDRange)); processLaunchProperties(Props); setType(detail::CGType::Kernel); +#ifndef __INTEL_PREVIEW_BREAKING_CHANGES setNDRangeUsed(true); +#endif extractArgsAndReqs(); MKernelName = getKernelName(); #endif @@ -1329,7 +1339,9 @@ class __SYCL_EXPORT handler { setNDRangeDescriptor(NumWorkGroups, /*SetNumWorkGroups=*/true); StoreLambda(std::move(KernelFunc)); setType(detail::CGType::Kernel); +#ifndef __INTEL_PREVIEW_BREAKING_CHANGES setNDRangeUsed(false); +#endif #endif // __SYCL_DEVICE_ONLY__ } @@ -1954,7 +1966,9 @@ class __SYCL_EXPORT handler { StoreLambda( std::move(KernelFunc)); setType(detail::CGType::Kernel); +#ifndef __INTEL_PREVIEW_BREAKING_CHANGES setNDRangeUsed(false); +#endif #endif } @@ -2052,7 +2066,9 @@ class __SYCL_EXPORT handler { detail::checkValueRange(NumWorkItems, WorkItemOffset); setNDRangeDescriptor(std::move(NumWorkItems), std::move(WorkItemOffset)); setType(detail::CGType::Kernel); +#ifndef __INTEL_PREVIEW_BREAKING_CHANGES setNDRangeUsed(false); +#endif extractArgsAndReqs(); MKernelName = getKernelName(); #endif @@ -2131,7 +2147,9 @@ class __SYCL_EXPORT handler { setNDRangeDescriptor(std::move(NumWorkItems)); MKernel = detail::getSyclObjImpl(std::move(Kernel)); setType(detail::CGType::Kernel); +#ifndef __INTEL_PREVIEW_BREAKING_CHANGES setNDRangeUsed(false); +#endif if (!lambdaAndKernelHaveEqualName()) { extractArgsAndReqs(); MKernelName = getKernelName(); @@ -2172,7 +2190,9 @@ class __SYCL_EXPORT handler { setNDRangeDescriptor(std::move(NumWorkItems), std::move(WorkItemOffset)); MKernel = detail::getSyclObjImpl(std::move(Kernel)); setType(detail::CGType::Kernel); +#ifndef __INTEL_PREVIEW_BREAKING_CHANGES setNDRangeUsed(false); +#endif if (!lambdaAndKernelHaveEqualName()) { extractArgsAndReqs(); MKernelName = getKernelName(); @@ -2212,7 +2232,9 @@ class __SYCL_EXPORT handler { setNDRangeDescriptor(std::move(NDRange)); MKernel = detail::getSyclObjImpl(std::move(Kernel)); setType(detail::CGType::Kernel); +#ifndef __INTEL_PREVIEW_BREAKING_CHANGES setNDRangeUsed(true); +#endif if (!lambdaAndKernelHaveEqualName()) { extractArgsAndReqs(); MKernelName = getKernelName(); @@ -3341,6 +3363,7 @@ class __SYCL_EXPORT handler { size_t Size, bool Block = false); friend class ext::oneapi::experimental::detail::graph_impl; friend class ext::oneapi::experimental::detail::dynamic_parameter_impl; + friend class ext::oneapi::experimental::detail::dynamic_command_group_impl; bool DisableRangeRounding(); @@ -3604,8 +3627,10 @@ class __SYCL_EXPORT handler { } #endif +#ifndef __INTEL_PREVIEW_BREAKING_CHANGES // Set that an ND Range was used during a call to parallel_for void setNDRangeUsed(bool Value); +#endif inline void internalProfilingTagImpl() { throwIfActionIsCreated(); diff --git a/sycl/source/detail/cg.hpp b/sycl/source/detail/cg.hpp index 1799bbedd4903..f0dadad99dac5 100644 --- a/sycl/source/detail/cg.hpp +++ b/sycl/source/detail/cg.hpp @@ -257,6 +257,9 @@ class CGExecKernel : public CG { std::string MKernelName; std::vector> MStreams; std::vector> MAuxiliaryResources; + /// Used to implement ext_oneapi_graph dynamic_command_group. Stores the list + /// of command-groups that a kernel command can be updated to. + std::vector> MAlternativeKernels; ur_kernel_cache_config_t MKernelCacheConfig; bool MKernelIsCooperative = false; bool MKernelUsesClusterLaunch = false; @@ -277,7 +280,7 @@ class CGExecKernel : public CG { MKernelBundle(std::move(KernelBundle)), MArgs(std::move(Args)), MKernelName(std::move(KernelName)), MStreams(std::move(Streams)), MAuxiliaryResources(std::move(AuxiliaryResources)), - MKernelCacheConfig(std::move(KernelCacheConfig)), + MAlternativeKernels{}, MKernelCacheConfig(std::move(KernelCacheConfig)), MKernelIsCooperative(KernelIsCooperative), MKernelUsesClusterLaunch(MKernelUsesClusterLaunch) { assert(getType() == CGType::Kernel && "Wrong type of exec kernel CG."); diff --git a/sycl/source/detail/graph_impl.cpp b/sycl/source/detail/graph_impl.cpp index ec847888357f2..ebdeb8fd60d63 100644 --- a/sycl/source/detail/graph_impl.cpp +++ b/sycl/source/detail/graph_impl.cpp @@ -352,11 +352,76 @@ void graph_impl::removeRoot(const std::shared_ptr &Root) { MRoots.erase(Root); } -std::shared_ptr -graph_impl::add(const std::vector> &Dep) { - // Copy deps so we can modify them - auto Deps = Dep; +std::set> graph_impl::getCGEdges( + const std::shared_ptr &CommandGroup) const { + const auto &Requirements = CommandGroup->getRequirements(); + if (!MAllowBuffers && Requirements.size()) { + throw sycl::exception(make_error_code(errc::invalid), + "Cannot use buffers in a graph without passing the " + "assume_buffer_outlives_graph property on " + "Graph construction."); + } + + if (CommandGroup->getType() == sycl::detail::CGType::Kernel) { + auto CGKernel = + static_cast(CommandGroup.get()); + if (CGKernel->hasStreams()) { + throw sycl::exception( + make_error_code(errc::invalid), + "Using sycl streams in a graph node is unsupported."); + } + } + + // Add any nodes specified by event dependencies into the dependency list + std::set> UniqueDeps; + for (auto &Dep : CommandGroup->getEvents()) { + if (auto NodeImpl = MEventsMap.find(Dep); NodeImpl == MEventsMap.end()) { + throw sycl::exception(sycl::make_error_code(errc::invalid), + "Event dependency from handler::depends_on does " + "not correspond to a node within the graph"); + } else { + UniqueDeps.insert(NodeImpl->second); + } + } + + // A unique set of dependencies obtained by checking requirements and events + for (auto &Req : Requirements) { + // Look through the graph for nodes which share this requirement + for (auto &Node : MNodeStorage) { + if (Node->hasRequirementDependency(Req)) { + bool ShouldAddDep = true; + // If any of this node's successors have this requirement then we skip + // adding the current node as a dependency. + for (auto &Succ : Node->MSuccessors) { + if (Succ.lock()->hasRequirementDependency(Req)) { + ShouldAddDep = false; + break; + } + } + if (ShouldAddDep) { + UniqueDeps.insert(Node); + } + } + } + } + + return std::move(UniqueDeps); +} + +void graph_impl::markCGMemObjs( + const std::shared_ptr &CommandGroup) { + const auto &Requirements = CommandGroup->getRequirements(); + for (auto &Req : Requirements) { + auto MemObj = static_cast(Req->MSYCLMemObj); + bool WasInserted = MMemObjs.insert(MemObj).second; + if (WasInserted) { + MemObj->markBeingUsedInGraph(); + } + } +} +std::shared_ptr +graph_impl::add(std::vector> &Deps) { const std::shared_ptr &NodeImpl = std::make_shared(); MNodeStorage.push_back(NodeImpl); @@ -370,7 +435,7 @@ graph_impl::add(const std::vector> &Dep) { std::shared_ptr graph_impl::add(std::function CGF, const std::vector &Args, - const std::vector> &Dep) { + std::vector> &Deps) { (void)Args; sycl::handler Handler{shared_from_this()}; @@ -401,8 +466,8 @@ graph_impl::add(std::function CGF, Handler.getType()); auto NodeImpl = - this->add(NodeType, std::move(Handler.impl->MGraphNodeCG), Dep); - NodeImpl->MNDRangeUsed = Handler.impl->MNDRangeUsed; + this->add(NodeType, std::move(Handler.impl->MGraphNodeCG), Deps); + // Add an event associated with this explicit node for mixed usage addEventForNode(std::make_shared(), NodeImpl); @@ -444,67 +509,15 @@ graph_impl::add(const std::vector Events) { std::shared_ptr graph_impl::add(node_type NodeType, - std::unique_ptr CommandGroup, - const std::vector> &Dep) { - // Copy deps so we can modify them - auto Deps = Dep; + std::shared_ptr CommandGroup, + std::vector> &Deps) { // A unique set of dependencies obtained by checking requirements and events - std::set> UniqueDeps; - const auto &Requirements = CommandGroup->getRequirements(); - if (!MAllowBuffers && Requirements.size()) { - throw sycl::exception(make_error_code(errc::invalid), - "Cannot use buffers in a graph without passing the " - "assume_buffer_outlives_graph property on " - "Graph construction."); - } - - if (CommandGroup->getType() == sycl::detail::CGType::Kernel) { - auto CGKernel = - static_cast(CommandGroup.get()); - if (CGKernel->hasStreams()) { - throw sycl::exception( - make_error_code(errc::invalid), - "Using sycl streams in a graph node is unsupported."); - } - } + std::set> UniqueDeps = getCGEdges(CommandGroup); - for (auto &Req : Requirements) { - // Track and mark the memory objects being used by the graph. - auto MemObj = static_cast(Req->MSYCLMemObj); - bool WasInserted = MMemObjs.insert(MemObj).second; - if (WasInserted) { - MemObj->markBeingUsedInGraph(); - } - // Look through the graph for nodes which share this requirement - for (auto &Node : MNodeStorage) { - if (Node->hasRequirementDependency(Req)) { - bool ShouldAddDep = true; - // If any of this node's successors have this requirement then we skip - // adding the current node as a dependency. - for (auto &Succ : Node->MSuccessors) { - if (Succ.lock()->hasRequirementDependency(Req)) { - ShouldAddDep = false; - break; - } - } - if (ShouldAddDep) { - UniqueDeps.insert(Node); - } - } - } - } + // Track and mark the memory objects being used by the graph. + markCGMemObjs(CommandGroup); - // Add any nodes specified by event dependencies into the dependency list - for (auto &Dep : CommandGroup->getEvents()) { - if (auto NodeImpl = MEventsMap.find(Dep); NodeImpl != MEventsMap.end()) { - UniqueDeps.insert(NodeImpl->second); - } else { - throw sycl::exception(sycl::make_error_code(errc::invalid), - "Event dependency from handler::depends_on does " - "not correspond to a node within the graph"); - } - } // Add any deps determined from requirements and events into the dependency // list Deps.insert(Deps.end(), UniqueDeps.begin(), UniqueDeps.end()); @@ -518,6 +531,42 @@ graph_impl::add(node_type NodeType, return NodeImpl; } +std::shared_ptr +graph_impl::add(std::shared_ptr &DynCGImpl, + std::vector> &Deps) { + // Set of Dependent nodes based on CG event and accessor dependencies. + std::set> DynCGDeps = + getCGEdges(DynCGImpl->MKernels[0]); + for (unsigned i = 1; i < DynCGImpl->getNumCGs(); i++) { + auto &CG = DynCGImpl->MKernels[i]; + auto CGEdges = getCGEdges(CG); + if (CGEdges != DynCGDeps) { + throw sycl::exception(make_error_code(sycl::errc::invalid), + "Command-groups in dynamic command-group don't have" + "equivalent dependencies to other graph nodes."); + } + } + + // Track and mark the memory objects being used by the graph. + for (auto &CG : DynCGImpl->MKernels) { + markCGMemObjs(CG); + } + + // Get active dynamic command-group CG and use to create a node object + const auto &ActiveKernel = DynCGImpl->getActiveKernel(); + std::shared_ptr NodeImpl = + add(node_type::kernel, ActiveKernel, Deps); + + // Add an event associated with this explicit node for mixed usage + addEventForNode(std::make_shared(), NodeImpl); + + // Track the dynamic command-group used inside the node object + DynCGImpl->MNodes.push_back(NodeImpl); + NodeImpl->MDynCG = DynCGImpl; + + return NodeImpl; +} + bool graph_impl::clearQueues() { bool AnyQueuesCleared = false; for (auto &Queue : MRecordingQueues) { @@ -1568,6 +1617,27 @@ modifiable_command_graph::modifiable_command_graph( : impl(std::make_shared( SyclQueue.get_context(), SyclQueue.get_device(), PropList)) {} +node modifiable_command_graph::addImpl(dynamic_command_group &DynCGF, + const std::vector &Deps) { + impl->throwIfGraphRecordingQueue("Explicit API \"Add()\" function"); + auto DynCGFImpl = sycl::detail::getSyclObjImpl(DynCGF); + + if (DynCGFImpl->MGraph != impl) { + throw sycl::exception(make_error_code(sycl::errc::invalid), + "Graph does not match the graph associated with " + "dynamic command-group."); + } + + std::vector> DepImpls; + for (auto &D : Deps) { + DepImpls.push_back(sycl::detail::getSyclObjImpl(D)); + } + + graph_impl::WriteLock Lock(impl->MMutex); + std::shared_ptr NodeImpl = impl->add(DynCGFImpl, DepImpls); + return sycl::detail::createSyclObjFromImpl(NodeImpl); +} + node modifiable_command_graph::addImpl(const std::vector &Deps) { impl->throwIfGraphRecordingQueue("Explicit API \"Add()\" function"); std::vector> DepImpls; @@ -1775,6 +1845,208 @@ void dynamic_parameter_base::updateAccessor( impl->updateAccessor(Acc); } +void dynamic_parameter_impl::updateValue(const raw_kernel_arg *NewRawValue, + size_t Size) { + // Number of bytes is taken from member of raw_kernel_arg object rather + // than using the size parameter which represents sizeof(raw_kernel_arg). + std::ignore = Size; + size_t RawArgSize = NewRawValue->MArgSize; + const void *RawArgData = NewRawValue->MArgData; + + for (auto &[NodeWeak, ArgIndex] : MNodes) { + auto NodeShared = NodeWeak.lock(); + if (NodeShared) { + dynamic_parameter_impl::updateCGArgValue( + NodeShared->MCommandGroup, ArgIndex, RawArgData, RawArgSize); + } + } + + for (auto &DynCGInfo : MDynCGs) { + auto DynCG = DynCGInfo.DynCG.lock(); + if (DynCG) { + auto &CG = DynCG->MKernels[DynCGInfo.CGIndex]; + dynamic_parameter_impl::updateCGArgValue(CG, DynCGInfo.ArgIndex, + RawArgData, RawArgSize); + } + } + + std::memcpy(MValueStorage.data(), RawArgData, RawArgSize); +} + +void dynamic_parameter_impl::updateValue(const void *NewValue, size_t Size) { + for (auto &[NodeWeak, ArgIndex] : MNodes) { + auto NodeShared = NodeWeak.lock(); + if (NodeShared) { + dynamic_parameter_impl::updateCGArgValue(NodeShared->MCommandGroup, + ArgIndex, NewValue, Size); + } + } + + for (auto &DynCGInfo : MDynCGs) { + auto DynCG = DynCGInfo.DynCG.lock(); + if (DynCG) { + auto &CG = DynCG->MKernels[DynCGInfo.CGIndex]; + dynamic_parameter_impl::updateCGArgValue(CG, DynCGInfo.ArgIndex, NewValue, + Size); + } + } + + std::memcpy(MValueStorage.data(), NewValue, Size); +} + +void dynamic_parameter_impl::updateAccessor( + const sycl::detail::AccessorBaseHost *Acc) { + for (auto &[NodeWeak, ArgIndex] : MNodes) { + auto NodeShared = NodeWeak.lock(); + // Should we fail here if the node isn't alive anymore? + if (NodeShared) { + dynamic_parameter_impl::updateCGAccessor(NodeShared->MCommandGroup, + ArgIndex, Acc); + } + } + + for (auto &DynCGInfo : MDynCGs) { + auto DynCG = DynCGInfo.DynCG.lock(); + if (DynCG) { + auto &CG = DynCG->MKernels[DynCGInfo.CGIndex]; + dynamic_parameter_impl::updateCGAccessor(CG, DynCGInfo.ArgIndex, Acc); + } + } + + std::memcpy(MValueStorage.data(), Acc, + sizeof(sycl::detail::AccessorBaseHost)); +} + +void dynamic_parameter_impl::updateCGArgValue( + std::shared_ptr CG, int ArgIndex, const void *NewValue, + size_t Size) { + auto &Args = static_cast(CG.get())->MArgs; + for (auto &Arg : Args) { + if (Arg.MIndex != ArgIndex) { + continue; + } + assert(Arg.MSize == static_cast(Size)); + // MPtr may be a pointer into arg storage so we memcpy the contents of + // NewValue rather than assign it directly + std::memcpy(Arg.MPtr, NewValue, Size); + break; + } +} + +void dynamic_parameter_impl::updateCGAccessor( + std::shared_ptr CG, int ArgIndex, + const sycl::detail::AccessorBaseHost *Acc) { + auto &Args = static_cast(CG.get())->MArgs; + + auto NewAccImpl = sycl::detail::getSyclObjImpl(*Acc); + for (auto &Arg : Args) { + if (Arg.MIndex != ArgIndex) { + continue; + } + assert(Arg.MType == sycl::detail::kernel_param_kind_t::kind_accessor); + + // Find old accessor in accessor storage and replace with new one + if (static_cast(NewAccImpl->MSYCLMemObj) + ->needsWriteBack()) { + throw sycl::exception( + make_error_code(errc::invalid), + "Accessors to buffers which have write_back enabled " + "are not allowed to be used in command graphs."); + } + + // All accessors passed to this function will be placeholders, so we must + // perform steps similar to what happens when handler::require() is + // called here. + sycl::detail::Requirement *NewReq = NewAccImpl.get(); + if (NewReq->MAccessMode != sycl::access_mode::read) { + auto SYCLMemObj = + static_cast(NewReq->MSYCLMemObj); + SYCLMemObj->handleWriteAccessorCreation(); + } + + for (auto &Acc : CG->getAccStorage()) { + if (auto OldAcc = static_cast(Arg.MPtr); + Acc.get() == OldAcc) { + Acc = NewAccImpl; + } + } + + for (auto &Req : CG->getRequirements()) { + if (auto OldReq = static_cast(Arg.MPtr); + Req == OldReq) { + Req = NewReq; + } + } + Arg.MPtr = NewAccImpl.get(); + break; + } +} + +dynamic_command_group_impl::dynamic_command_group_impl( + const command_graph &Graph) + : MGraph{sycl::detail::getSyclObjImpl(Graph)}, MActiveCGF(0) {} + +void dynamic_command_group_impl::finalizeCGFList( + const std::vector> &CGFList) { + // True if kernels use sycl::nd_range, and false if using sycl::range + for (size_t CGFIndex = 0; CGFIndex < CGFList.size(); CGFIndex++) { + const auto &CGF = CGFList[CGFIndex]; + // Handler defined inside the loop so it doesn't appear to the runtime + // as a single command-group with multiple commands inside. + sycl::handler Handler{MGraph}; + CGF(Handler); + + if (Handler.getType() != sycl::detail::CGType::Kernel) { + throw sycl::exception( + make_error_code(errc::invalid), + "The only type of command-groups that can be used in " + "dynamic command-groups is kernels."); + } + + Handler.finalize(); + + // Take unique_ptr object from handler and convert to + // shared_ptr to store + sycl::detail::CG *RawCGPtr = Handler.impl->MGraphNodeCG.release(); + auto RawCGExecPtr = static_cast(RawCGPtr); + auto CGExecSP = std::shared_ptr(RawCGExecPtr); + MKernels.push_back(CGExecSP); + + // Track dynamic_parameter usage in command-list + auto &DynamicParams = Handler.impl->MDynamicParameters; + for (auto &[DynamicParam, ArgIndex] : DynamicParams) { + DynamicParam->registerDynCG(shared_from_this(), CGFIndex, ArgIndex); + } + } + + // For each CGExecKernel store the list of alternative kernels, not + // including itself. + using CGExecKernelSP = std::shared_ptr; + using CGExecKernelWP = std::weak_ptr; + for (auto KernelCG : MKernels) { + std::vector Alternatives; + std::copy_if( + MKernels.begin(), MKernels.end(), std::back_inserter(Alternatives), + [&KernelCG](const CGExecKernelSP &K) { return K != KernelCG; }); + + KernelCG->MAlternativeKernels = std::move(Alternatives); + } +} + +void dynamic_command_group_impl::setActiveIndex(size_t Index) { + if (Index >= getNumCGs()) { + throw sycl::exception(sycl::make_error_code(errc::invalid), + "Index is out of range."); + } + MActiveCGF = Index; + + // Update nodes using the dynamic command-group to use the new active CG + for (auto &Node : MNodes) { + if (auto NodeSP = Node.lock()) { + NodeSP->MCommandGroup = getActiveKernel(); + } + } +} } // namespace detail node_type node::get_type() const { return impl->MNodeType; } @@ -1813,6 +2085,25 @@ template <> __SYCL_EXPORT void node::update_range<2>(range<2> Range) { template <> __SYCL_EXPORT void node::update_range<3>(range<3> Range) { impl->updateRange(Range); } + +dynamic_command_group::dynamic_command_group( + const command_graph &Graph, + const std::vector> &CGFList) + : impl(std::make_shared(Graph)) { + if (CGFList.empty()) { + throw sycl::exception(sycl::make_error_code(errc::invalid), + "Dynamic command-group cannot be created with an " + "empty CGF list."); + } + impl->finalizeCGFList(CGFList); +} + +size_t dynamic_command_group::get_active_cgf() const { + return impl->getActiveIndex(); +} +void dynamic_command_group::set_active_cgf(size_t Index) { + return impl->setActiveIndex(Index); +} } // namespace experimental } // namespace oneapi } // namespace ext diff --git a/sycl/source/detail/graph_impl.hpp b/sycl/source/detail/graph_impl.hpp index 4ee34830f39a2..3b1fc3fa01641 100644 --- a/sycl/source/detail/graph_impl.hpp +++ b/sycl/source/detail/graph_impl.hpp @@ -95,10 +95,12 @@ class node_impl : public std::enable_shared_from_this { /// User facing type of the node. node_type MNodeType = node_type::empty; /// Command group object which stores all args etc needed to enqueue the node - std::unique_ptr MCommandGroup; + std::shared_ptr MCommandGroup; /// Stores the executable graph impl associated with this node if it is a /// subgraph node. std::shared_ptr MSubGraphImpl; + /// Dynamic command-group object used in node, if any. + std::shared_ptr MDynCG; /// Used for tracking visited status during cycle checks. bool MVisited = false; @@ -108,9 +110,6 @@ class node_impl : public std::enable_shared_from_this { /// cannot be used to find out the partion of a node outside of this process. int MPartitionNum = -1; - /// Track whether an ND-Range was used for kernel nodes - bool MNDRangeUsed = false; - /// Add successor to the node. /// @param Node Node to add as a successor. void registerSuccessor(const std::shared_ptr &Node) { @@ -143,10 +142,9 @@ class node_impl : public std::enable_shared_from_this { /// @param NodeType Type of the command-group. /// @param CommandGroup The CG which stores the command information for this /// node. - node_impl(node_type NodeType, - std::unique_ptr &&CommandGroup) + node_impl(node_type NodeType, std::shared_ptr CommandGroup) : MCGType(CommandGroup->getType()), MNodeType(NodeType), - MCommandGroup(std::move(CommandGroup)) { + MCommandGroup(CommandGroup) { if (NodeType == node_type::subgraph) { MSubGraphImpl = static_cast(MCommandGroup.get()) @@ -160,7 +158,7 @@ class node_impl : public std::enable_shared_from_this { : enable_shared_from_this(Other), MSuccessors(Other.MSuccessors), MPredecessors(Other.MPredecessors), MCGType(Other.MCGType), MNodeType(Other.MNodeType), MCommandGroup(Other.getCGCopy()), - MSubGraphImpl(Other.MSubGraphImpl) {} + MSubGraphImpl(Other.MSubGraphImpl), MDynCG(Other.MDynCG) {} /// Copy-assignment operator. This will perform a deep-copy of the /// command group object associated with this node. @@ -172,6 +170,7 @@ class node_impl : public std::enable_shared_from_this { MNodeType = Other.MNodeType; MCommandGroup = Other.getCGCopy(); MSubGraphImpl = Other.MSubGraphImpl; + MDynCG = Other.MDynCG; } return *this; } @@ -405,75 +404,6 @@ class node_impl : public std::enable_shared_from_this { return (ReqSrc->MDims > 1) || (ReqDst->MDims > 1); } - /// Update the value of an accessor inside this node. Accessors must be - /// handled specifically compared to other argument values. - /// @param ArgIndex The index of the accessor arg to be updated - /// @param Acc Pointer to the new accessor value - void updateAccessor(int ArgIndex, const sycl::detail::AccessorBaseHost *Acc) { - auto &Args = - static_cast(MCommandGroup.get())->MArgs; - auto NewAccImpl = sycl::detail::getSyclObjImpl(*Acc); - for (auto &Arg : Args) { - if (Arg.MIndex != ArgIndex) { - continue; - } - assert(Arg.MType == sycl::detail::kernel_param_kind_t::kind_accessor); - - // Find old accessor in accessor storage and replace with new one - if (static_cast(NewAccImpl->MSYCLMemObj) - ->needsWriteBack()) { - throw sycl::exception( - make_error_code(errc::invalid), - "Accessors to buffers which have write_back enabled " - "are not allowed to be used in command graphs."); - } - - // All accessors passed to this function will be placeholders, so we must - // perform steps similar to what happens when handler::require() is - // called here. - sycl::detail::Requirement *NewReq = NewAccImpl.get(); - if (NewReq->MAccessMode != sycl::access_mode::read) { - auto SYCLMemObj = - static_cast(NewReq->MSYCLMemObj); - SYCLMemObj->handleWriteAccessorCreation(); - } - - for (auto &Acc : MCommandGroup->getAccStorage()) { - if (auto OldAcc = - static_cast(Arg.MPtr); - Acc.get() == OldAcc) { - Acc = NewAccImpl; - } - } - - for (auto &Req : MCommandGroup->getRequirements()) { - if (auto OldReq = - static_cast(Arg.MPtr); - Req == OldReq) { - Req = NewReq; - } - } - Arg.MPtr = NewAccImpl.get(); - break; - } - } - - void updateArgValue(int ArgIndex, const void *NewValue, size_t Size) { - - auto &Args = - static_cast(MCommandGroup.get())->MArgs; - for (auto &Arg : Args) { - if (Arg.MIndex != ArgIndex) { - continue; - } - assert(Arg.MSize == static_cast(Size)); - // MPtr may be a pointer into arg storage so we memcpy the contents of - // NewValue rather than assign it directly - std::memcpy(Arg.MPtr, NewValue, Size); - break; - } - } - template void updateNDRange(nd_range ExecutionRange) { if (MCGType != sycl::detail::CGType::Kernel) { @@ -481,11 +411,6 @@ class node_impl : public std::enable_shared_from_this { sycl::errc::invalid, "Cannot update execution range of nodes which are not kernel nodes"); } - if (!MNDRangeUsed) { - throw sycl::exception(sycl::errc::invalid, - "Cannot update node which was created with a " - "sycl::range with a sycl::nd_range"); - } auto &NDRDesc = static_cast(MCommandGroup.get()) @@ -507,11 +432,6 @@ class node_impl : public std::enable_shared_from_this { sycl::errc::invalid, "Cannot update execution range of nodes which are not kernel nodes"); } - if (MNDRangeUsed) { - throw sycl::exception(sycl::errc::invalid, - "Cannot update node which was created with a " - "sycl::nd_range with a sycl::range"); - } auto &NDRDesc = static_cast(MCommandGroup.get()) @@ -535,6 +455,7 @@ class node_impl : public std::enable_shared_from_this { ExecCG->MArgs = OtherExecCG->MArgs; ExecCG->MNDRDesc = OtherExecCG->MNDRDesc; + ExecCG->MKernelName = OtherExecCG->MKernelName; ExecCG->getAccStorage() = OtherExecCG->getAccStorage(); ExecCG->getRequirements() = OtherExecCG->getRequirements(); @@ -888,30 +809,40 @@ class graph_impl : public std::enable_shared_from_this { /// @param Root Node to remove from list of root nodes. void removeRoot(const std::shared_ptr &Root); + /// Verifies the CG is valid to add to the graph and returns set of + /// dependent nodes if so. + /// @param CommandGroup The command group to verify and retrieve edges for. + /// @return Set of dependent nodes in the graph. + std::set> + getCGEdges(const std::shared_ptr &CommandGroup) const; + + /// Identifies the sycl buffers used in the command-group and marks them + /// as used in the graph. + /// @param CommandGroup The command-group to check for buffer usage in. + void markCGMemObjs(const std::shared_ptr &CommandGroup); + /// Create a kernel node in the graph. /// @param NodeType User facing type of the node. /// @param CommandGroup The CG which stores all information for this node. - /// @param Dep Dependencies of the created node. + /// @param Deps Dependencies of the created node. /// @return Created node in the graph. - std::shared_ptr - add(node_type NodeType, std::unique_ptr CommandGroup, - const std::vector> &Dep = {}); + std::shared_ptr add(node_type NodeType, + std::shared_ptr CommandGroup, + std::vector> &Deps); /// Create a CGF node in the graph. /// @param CGF Command-group function to create node with. /// @param Args Node arguments. - /// @param Dep Dependencies of the created node. + /// @param Deps Dependencies of the created node. /// @return Created node in the graph. - std::shared_ptr - add(std::function CGF, - const std::vector &Args, - const std::vector> &Dep = {}); + std::shared_ptr add(std::function CGF, + const std::vector &Args, + std::vector> &Deps); /// Create an empty node in the graph. - /// @param Dep List of predecessor nodes. + /// @param Deps List of predecessor nodes. /// @return Created node in the graph. - std::shared_ptr - add(const std::vector> &Dep = {}); + std::shared_ptr add(std::vector> &Deps); /// Create an empty node in the graph. /// @param Events List of events associated to this node. @@ -919,6 +850,14 @@ class graph_impl : public std::enable_shared_from_this { std::shared_ptr add(const std::vector Events); + /// Create a dynamic command-group node in the graph. + /// @param DynCGImpl Dynamic command-group used to create node. + /// @param Deps List of predecessor nodes. + /// @return Created node in the graph. + std::shared_ptr + add(std::shared_ptr &DynCGImpl, + std::vector> &Deps); + /// Add a queue to the set of queues which are currently recording to this /// graph. /// @param RecordingQueue Queue to add to set. @@ -1236,7 +1175,12 @@ class graph_impl : public std::enable_shared_from_this { /// @param Node The node to add deps for /// @param Deps List of dependent nodes void addDepsToNode(std::shared_ptr Node, - const std::vector> &Deps) { + std::vector> &Deps) { + // Remove empty shared pointers from the list + auto EmptyElementIter = + std::remove(Deps.begin(), Deps.end(), std::shared_ptr()); + Deps.erase(EmptyElementIter, Deps.end()); + if (!Deps.empty()) { for (auto &N : Deps) { N->registerSuccessor(Node); @@ -1520,65 +1464,110 @@ class dynamic_parameter_impl { MNodes.emplace_back(NodeImpl, ArgIndex); } + /// Struct detailing an instance of the usage of the dynamic parameter in a + /// dynamic CG. + struct DynamicCGInfo { + /// Dynamic command-group that uses this dynamic parameter. + std::weak_ptr DynCG; + /// Index of the CG in the Dynamic CG that uses this dynamic parameter. + size_t CGIndex; + /// The arg index in the kernel the dynamic parameter is used. + int ArgIndex; + }; + + /// Registers a dynamic command-group with this dynamic parameter. + /// @param DynCG The dynamic command-group to register. + /// @param CGIndex Index of the CG in DynCG using this dynamic parameter. + /// @param ArgIndex The arg index in the kernel the dynamic parameter is used. + void registerDynCG(std::shared_ptr DynCG, + size_t CGIndex, int ArgIndex) { + MDynCGs.emplace_back(DynamicCGInfo{DynCG, CGIndex, ArgIndex}); + } + /// Get a pointer to the internal value of this dynamic parameter void *getValue() { return MValueStorage.data(); } /// Update sycl_ext_oneapi_raw_kernel_arg parameter /// @param NewRawValue Pointer to a raw_kernel_arg object. /// @param Size Parameter is ignored. - void updateValue(const raw_kernel_arg *NewRawValue, size_t Size) { - // Number of bytes is taken from member of raw_kernel_arg object rather - // than using the size parameter which represents sizeof(raw_kernel_arg). - std::ignore = Size; - size_t RawArgSize = NewRawValue->MArgSize; - const void *RawArgData = NewRawValue->MArgData; - - for (auto &[NodeWeak, ArgIndex] : MNodes) { - auto NodeShared = NodeWeak.lock(); - if (NodeShared) { - NodeShared->updateArgValue(ArgIndex, RawArgData, RawArgSize); - } - } - std::memcpy(MValueStorage.data(), RawArgData, RawArgSize); - } + void updateValue(const raw_kernel_arg *NewRawValue, size_t Size); /// Update the internal value of this dynamic parameter as well as the value - /// of this parameter in all registered nodes. + /// of this parameter in all registered nodes and dynamic CGs. /// @param NewValue Pointer to the new value /// @param Size Size of the data pointer to by NewValue - void updateValue(const void *NewValue, size_t Size) { - for (auto &[NodeWeak, ArgIndex] : MNodes) { - auto NodeShared = NodeWeak.lock(); - if (NodeShared) { - NodeShared->updateArgValue(ArgIndex, NewValue, Size); - } - } - std::memcpy(MValueStorage.data(), NewValue, Size); - } + void updateValue(const void *NewValue, size_t Size); /// Update the internal value of this dynamic parameter as well as the value - /// of this parameter in all registered nodes. Should only be called for - /// accessor dynamic_parameters. + /// of this parameter in all registered nodes and dynamic CGs. Should only be + /// called for accessor dynamic_parameters. /// @param Acc The new accessor value - void updateAccessor(const sycl::detail::AccessorBaseHost *Acc) { - for (auto &[NodeWeak, ArgIndex] : MNodes) { - auto NodeShared = NodeWeak.lock(); - // Should we fail here if the node isn't alive anymore? - if (NodeShared) { - NodeShared->updateAccessor(ArgIndex, Acc); - } - } - std::memcpy(MValueStorage.data(), Acc, - sizeof(sycl::detail::AccessorBaseHost)); - } + void updateAccessor(const sycl::detail::AccessorBaseHost *Acc); + + /// Static helper function for updating command-group value arguments. + /// @param CG The command-group to update the argument information for. + /// @param ArgIndex The argument index to update. + /// @param NewValue Pointer to the new value. + /// @param Size Size of the data pointer to by NewValue + static void updateCGArgValue(std::shared_ptr CG, + int ArgIndex, const void *NewValue, size_t Size); + + /// Static helper function for updating command-group accessor arguments. + /// @param CG The command-group to update the argument information for. + /// @param ArgIndex The argument index to update. + /// @param Acc The new accessor value + static void updateCGAccessor(std::shared_ptr CG, + int ArgIndex, + const sycl::detail::AccessorBaseHost *Acc); // Weak ptrs to node_impls which will be updated std::vector, int>> MNodes; + // Dynamic command-groups which will be updated + std::vector MDynCGs; std::shared_ptr MGraph; std::vector MValueStorage; }; +class dynamic_command_group_impl + : public std::enable_shared_from_this { +public: + dynamic_command_group_impl( + const command_graph &Graph); + + /// Returns the index of the active command-group + size_t getActiveIndex() const { return MActiveCGF; } + + /// Returns the number of CGs in the dynamic command-group. + size_t getNumCGs() const { return MKernels.size(); } + + /// Set the index of the active command-group. + /// @param Index The new index. + void setActiveIndex(size_t Index); + + /// Instantiates a command-group object for each CGF in the list. + /// @param CGFList List of CGFs to finalize with a handler into CG objects. + void + finalizeCGFList(const std::vector> &CGFList); + + /// Retrieve CG at the currently active index + /// @param Shared pointer to the active CG object. + std::shared_ptr getActiveKernel() const { + return MKernels[MActiveCGF]; + } + + /// Graph this dynamic command-group is associated with. + std::shared_ptr MGraph; + + /// Index of active command-group + std::atomic MActiveCGF; + + /// List of kernel command-groups for dynamic command-group nodes + std::vector> MKernels; + + /// List of nodes using this dynamic command-group. + std::vector> MNodes; +}; } // namespace detail } // namespace experimental } // namespace oneapi diff --git a/sycl/source/detail/handler_impl.hpp b/sycl/source/detail/handler_impl.hpp index b34c22ad2777e..e17a8c7187191 100644 --- a/sycl/source/detail/handler_impl.hpp +++ b/sycl/source/detail/handler_impl.hpp @@ -152,9 +152,11 @@ class handler_impl { ext::oneapi::experimental::detail::dynamic_parameter_impl *, int>> MDynamicParameters; +#ifndef __INTEL_PREVIEW_BREAKING_CHANGES // Track whether an NDRange was used when submitting a kernel (as opposed to a // range), needed for graph update bool MNDRangeUsed = false; +#endif /// The storage for the arguments passed. /// We need to store a copy of values that are passed explicitly through diff --git a/sycl/source/detail/scheduler/commands.cpp b/sycl/source/detail/scheduler/commands.cpp index 1d24a2bb41c40..560862f727c88 100644 --- a/sycl/source/detail/scheduler/commands.cpp +++ b/sycl/source/detail/scheduler/commands.cpp @@ -2485,11 +2485,22 @@ ur_result_t enqueueImpCommandBufferKernel( const std::function &getMemAllocationFunc) { auto ContextImpl = sycl::detail::getSyclObjImpl(Ctx); const sycl::detail::AdapterPtr &Adapter = ContextImpl->getAdapter(); + + const std::vector> + &AlternativeKernels = CommandGroup.MAlternativeKernels; + + // UR kernel and program for 'CommandGroup' ur_kernel_handle_t UrKernel = nullptr; ur_program_handle_t UrProgram = nullptr; + + // Impl objects created when 'CommandGroup' is from a kernel bundle std::shared_ptr SyclKernelImpl = nullptr; std::shared_ptr DeviceImageImpl = nullptr; + // List of ur objects to be released after UR call + std::vector UrKernelsToRelease; + std::vector UrProgramsToRelease; + auto Kernel = CommandGroup.MSyclKernel; auto KernelBundleImplPtr = CommandGroup.MKernelBundle; const KernelArgMask *EliminatedArgMask = nullptr; @@ -2518,6 +2529,42 @@ ur_result_t enqueueImpCommandBufferKernel( std::tie(UrKernel, std::ignore, EliminatedArgMask, UrProgram) = sycl::detail::ProgramManager::getInstance().getOrCreateKernel( ContextImpl, DeviceImpl, CommandGroup.MKernelName); + UrKernelsToRelease.push_back(UrKernel); + UrProgramsToRelease.push_back(UrProgram); + } + + // Build up the list of UR kernel handles that the UR command could be + // updated to use. + std::vector AltUrKernels; + for (const auto &AltCGKernelWP : AlternativeKernels) { + auto AltCGKernel = AltCGKernelWP.lock(); + assert(AltCGKernel != nullptr); + + ur_kernel_handle_t AltUrKernel = nullptr; + if (auto KernelBundleImplPtr = AltCGKernel->MKernelBundle; + KernelBundleImplPtr && !KernelBundleImplPtr->isInterop()) { + auto KernelName = AltCGKernel->MKernelName; + kernel_id KernelID = + detail::ProgramManager::getInstance().getSYCLKernelID(KernelName); + kernel SyclKernel = + KernelBundleImplPtr->get_kernel(KernelID, KernelBundleImplPtr); + AltUrKernel = detail::getSyclObjImpl(SyclKernel)->getHandleRef(); + } else if (AltCGKernel->MSyclKernel != nullptr) { + AltUrKernel = Kernel->getHandleRef(); + } else { + ur_program_handle_t UrProgram = nullptr; + std::tie(AltUrKernel, std::ignore, std::ignore, UrProgram) = + sycl::detail::ProgramManager::getInstance().getOrCreateKernel( + ContextImpl, DeviceImpl, AltCGKernel->MKernelName); + UrKernelsToRelease.push_back(AltUrKernel); + UrProgramsToRelease.push_back(UrProgram); + } + + if (AltUrKernel != UrKernel) { + // Don't include command-group 'CommandGroup' in the list to pass to UR, + // as this will be used for the primary ur kernel parameter. + AltUrKernels.push_back(AltUrKernel); + } } auto SetFunc = [&Adapter, &UrKernel, &DeviceImageImpl, &Ctx, @@ -2570,14 +2617,17 @@ ur_result_t enqueueImpCommandBufferKernel( ur_result_t Res = Adapter->call_nocheck( CommandBuffer, UrKernel, NDRDesc.Dims, &NDRDesc.GlobalOffset[0], - &NDRDesc.GlobalSize[0], LocalSize, 0, nullptr, SyncPoints.size(), - SyncPoints.size() ? SyncPoints.data() : nullptr, 0, nullptr, - OutSyncPoint, nullptr, + &NDRDesc.GlobalSize[0], LocalSize, AltUrKernels.size(), + AltUrKernels.size() ? AltUrKernels.data() : nullptr, + SyncPoints.size(), SyncPoints.size() ? SyncPoints.data() : nullptr, 0, + nullptr, OutSyncPoint, nullptr, CommandBufferDesc.isUpdatable ? OutCommand : nullptr); - if (!SyclKernelImpl && !Kernel) { - Adapter->call(UrKernel); - Adapter->call(UrProgram); + for (auto &Kernel : UrKernelsToRelease) { + Adapter->call(Kernel); + } + for (auto &Program : UrProgramsToRelease) { + Adapter->call(Program); } if (Res != UR_RESULT_SUCCESS) { diff --git a/sycl/source/handler.cpp b/sycl/source/handler.cpp index 12f38de0c1c8e..c0c5944d6ed2e 100644 --- a/sycl/source/handler.cpp +++ b/sycl/source/handler.cpp @@ -554,11 +554,9 @@ event handler::finalize() { // Find the last node added to the graph from this queue, so our new // node can set it as a predecessor. auto DependentNode = GraphImpl->getLastInorderNode(MQueue); - - NodeImpl = DependentNode - ? GraphImpl->add(NodeType, std::move(CommandGroup), - {DependentNode}) - : GraphImpl->add(NodeType, std::move(CommandGroup)); + std::vector> + Deps = {DependentNode}; + NodeImpl = GraphImpl->add(NodeType, std::move(CommandGroup), Deps); // If we are recording an in-order queue remember the new node, so it // can be used as a dependency for any more nodes recorded from this @@ -566,12 +564,9 @@ event handler::finalize() { GraphImpl->setLastInorderNode(MQueue, NodeImpl); } else { auto LastBarrierRecordedFromQueue = GraphImpl->getBarrierDep(MQueue); - if (LastBarrierRecordedFromQueue) { - NodeImpl = GraphImpl->add(NodeType, std::move(CommandGroup), - {LastBarrierRecordedFromQueue}); - } else { - NodeImpl = GraphImpl->add(NodeType, std::move(CommandGroup)); - } + std::vector> + Deps = {LastBarrierRecordedFromQueue}; + NodeImpl = GraphImpl->add(NodeType, std::move(CommandGroup), Deps); if (NodeImpl->MCGType == sycl::detail::CGType::Barrier) { GraphImpl->setBarrierDep(MQueue, NodeImpl); @@ -581,8 +576,6 @@ event handler::finalize() { // Associate an event with this new node and return the event. GraphImpl->addEventForNode(EventImpl, NodeImpl); - NodeImpl->MNDRangeUsed = impl->MNDRangeUsed; - return detail::createSyclObjFromImpl(EventImpl); } @@ -1994,7 +1987,9 @@ std::tuple, bool> handler::getMaxWorkGroups_v2() { return {std::array{0, 0, 0}, false}; } +#ifndef __INTEL_PREVIEW_BREAKING_CHANGES void handler::setNDRangeUsed(bool Value) { impl->MNDRangeUsed = Value; } +#endif void handler::registerDynamicParameter( ext::oneapi::experimental::detail::dynamic_parameter_base &DynamicParamBase, diff --git a/sycl/test-e2e/Graph/Update/dyn_cgf_accessor.cpp b/sycl/test-e2e/Graph/Update/dyn_cgf_accessor.cpp new file mode 100644 index 0000000000000..2b5f378d8bed7 --- /dev/null +++ b/sycl/test-e2e/Graph/Update/dyn_cgf_accessor.cpp @@ -0,0 +1,60 @@ +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} +// Extra run to check for immediate-command-list in Level Zero +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} + +// XFAIL: level_zero +// XFAIL-TRACKER: OFNAAO-307 + +// Tests using dynamic command-group objects with buffer accessors + +#include "../graph_common.hpp" + +int main() { + queue Queue{}; + const size_t N = 1024; + std::vector HostData(N, 0); + buffer Buf{HostData}; + Buf.set_write_back(false); + auto Acc = Buf.get_access(); + + exp_ext::command_graph Graph{ + Queue.get_context(), + Queue.get_device(), + {exp_ext::property::graph::assume_buffer_outlives_graph{}}}; + + int PatternA = 42; + auto CGFA = [&](handler &CGH) { + CGH.require(Acc); + CGH.parallel_for(N, [=](item<1> Item) { Acc[Item.get_id()] = PatternA; }); + }; + + int PatternB = 0xA; + auto CGFB = [&](handler &CGH) { + CGH.require(Acc); + CGH.parallel_for(N, [=](item<1> Item) { Acc[Item.get_id()] = PatternB; }); + }; + + auto DynamicCG = exp_ext::dynamic_command_group(Graph, {CGFA, CGFB}); + auto DynamicCGNode = Graph.add(DynamicCG); + auto ExecGraph = Graph.finalize(exp_ext::property::graph::updatable{}); + + Queue.ext_oneapi_graph(ExecGraph).wait(); + Queue.copy(Acc, HostData.data()).wait(); + for (size_t i = 0; i < N; i++) { + assert(HostData[i] == PatternA); + } + + DynamicCG.set_active_cgf(1); + ExecGraph.update(DynamicCGNode); + Queue.ext_oneapi_graph(ExecGraph).wait(); + + Queue.copy(Acc, HostData.data()).wait(); + for (size_t i = 0; i < N; i++) { + assert(HostData[i] == PatternB); + } + + return 0; +} diff --git a/sycl/test-e2e/Graph/Update/dyn_cgf_accessor_deps.cpp b/sycl/test-e2e/Graph/Update/dyn_cgf_accessor_deps.cpp new file mode 100644 index 0000000000000..5ce7a4bf40df1 --- /dev/null +++ b/sycl/test-e2e/Graph/Update/dyn_cgf_accessor_deps.cpp @@ -0,0 +1,74 @@ +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} +// Extra run to check for immediate-command-list in Level Zero +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} + +// XFAIL: level_zero +// XFAIL-TRACKER: OFNAAO-307 + +// Tests adding a dynamic command-group node to a graph using buffer +// accessors for the node edges. + +#include "../graph_common.hpp" + +int main() { + queue Queue{}; + const size_t N = 1024; + int *Ptr = (int *)sycl::malloc_device(N, Queue); + std::vector HostData(N, 0); + buffer Buf{HostData}; + Buf.set_write_back(false); + + exp_ext::command_graph Graph{ + Queue.get_context(), + Queue.get_device(), + {exp_ext::property::graph::assume_buffer_outlives_graph{}}}; + + auto RootNode = Graph.add([&](handler &CGH) { + auto Acc = Buf.get_access(CGH); + CGH.parallel_for(N, [=](item<1> Item) { Acc[Item.get_id()] = 1; }); + }); + + int PatternA = 42; + auto CGFA = [&](handler &CGH) { + auto Acc = Buf.get_access(CGH); + CGH.parallel_for(N, [=](item<1> Item) { Acc[Item.get_id()] += PatternA; }); + }; + + int PatternB = 0xA; + auto CGFB = [&](handler &CGH) { + auto Acc = Buf.get_access(CGH); + CGH.parallel_for(N, [=](item<1> Item) { Acc[Item.get_id()] += PatternB; }); + }; + + auto DynamicCG = exp_ext::dynamic_command_group(Graph, {CGFA, CGFB}); + auto DynamicCGNode = Graph.add(DynamicCG); + + auto LeafNode = Graph.add([&](handler &CGH) { + auto Acc = Buf.get_access(CGH); + CGH.parallel_for( + N, [=](item<1> Item) { Ptr[Item.get_id()] = Acc[Item.get_id()]; }); + }); + auto ExecGraph = Graph.finalize(exp_ext::property::graph::updatable{}); + + Queue.ext_oneapi_graph(ExecGraph).wait(); + Queue.copy(Ptr, HostData.data(), N).wait(); + for (size_t i = 0; i < N; i++) { + assert(HostData[i] == (PatternA + 1)); + } + + DynamicCG.set_active_cgf(1); + ExecGraph.update(DynamicCGNode); + + Queue.ext_oneapi_graph(ExecGraph).wait(); + Queue.copy(Ptr, HostData.data(), N).wait(); + for (size_t i = 0; i < N; i++) { + assert(HostData[i] == (PatternB + 1)); + } + + sycl::free(Ptr, Queue); + + return 0; +} diff --git a/sycl/test-e2e/Graph/Update/dyn_cgf_accessor_deps2.cpp b/sycl/test-e2e/Graph/Update/dyn_cgf_accessor_deps2.cpp new file mode 100644 index 0000000000000..8d50b8b26e0c2 --- /dev/null +++ b/sycl/test-e2e/Graph/Update/dyn_cgf_accessor_deps2.cpp @@ -0,0 +1,85 @@ +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} +// Extra run to check for immediate-command-list in Level Zero +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} + +// XFAIL: level_zero +// XFAIL-TRACKER: OFNAAO-307 + +// Tests adding a dynamic command-group node to a graph using buffer +// accessors for the node edges, but where different command-groups +// use different buffers that create identical edges. + +#include "../graph_common.hpp" + +int main() { + queue Queue{}; + const size_t N = 1024; + int *Ptr = (int *)sycl::malloc_device(N, Queue); + std::vector HostData(N, 0); + buffer BufA{sycl::range<1>(N)}; + buffer BufB{sycl::range<1>(N)}; + BufA.set_write_back(false); + BufB.set_write_back(false); + + exp_ext::command_graph Graph{ + Queue.get_context(), + Queue.get_device(), + {exp_ext::property::graph::assume_buffer_outlives_graph{}}}; + + int InitA = 4; + int InitB = -4; + auto RootNode = Graph.add([&](handler &CGH) { + auto AccA = BufA.get_access(CGH); + auto AccB = BufB.get_access(CGH); + CGH.parallel_for(N, [=](item<1> Item) { + AccA[Item.get_id()] = InitA; + AccB[Item.get_id()] = InitB; + }); + }); + + int PatternA = 42; + auto CGFA = [&](handler &CGH) { + auto AccA = BufA.get_access(CGH); + CGH.parallel_for(N, [=](item<1> Item) { AccA[Item.get_id()] += PatternA; }); + }; + + int PatternB = 0xA; + auto CGFB = [&](handler &CGH) { + auto AccB = BufB.get_access(CGH); + CGH.parallel_for(N, [=](item<1> Item) { AccB[Item.get_id()] += PatternB; }); + }; + + auto DynamicCG = exp_ext::dynamic_command_group(Graph, {CGFA, CGFB}); + auto DynamicCGNode = Graph.add(DynamicCG); + + auto LeafNode = Graph.add([&](handler &CGH) { + auto AccA = BufA.get_access(CGH); + auto AccB = BufB.get_access(CGH); + CGH.parallel_for(N, [=](item<1> Item) { + Ptr[Item.get_id()] = AccA[Item.get_id()] + AccB[Item.get_id()]; + }); + }); + auto ExecGraph = Graph.finalize(exp_ext::property::graph::updatable{}); + + Queue.ext_oneapi_graph(ExecGraph).wait(); + Queue.copy(Ptr, HostData.data(), N).wait(); + for (size_t i = 0; i < N; i++) { + assert(HostData[i] == (InitA + InitB + PatternA)); + } + + DynamicCG.set_active_cgf(1); + ExecGraph.update(DynamicCGNode); + + Queue.ext_oneapi_graph(ExecGraph).wait(); + Queue.copy(Ptr, HostData.data(), N).wait(); + for (size_t i = 0; i < N; i++) { + assert(HostData[i] == (InitA + InitB + PatternB)); + } + + sycl::free(Ptr, Queue); + + return 0; +} diff --git a/sycl/test-e2e/Graph/Update/dyn_cgf_accessor_spv.cpp b/sycl/test-e2e/Graph/Update/dyn_cgf_accessor_spv.cpp new file mode 100644 index 0000000000000..80556f60fc75f --- /dev/null +++ b/sycl/test-e2e/Graph/Update/dyn_cgf_accessor_spv.cpp @@ -0,0 +1,81 @@ +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out %S/../Inputs/Kernels/update_with_indices_accessor.spv +// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 %{l0_leak_check} %{run} %t.out %S/../Inputs/Kernels/update_with_indices_accessor.spv 2>&1 | FileCheck %s --implicit-check-not=LEAK %} +// Extra run to check for immediate-command-list in Level Zero +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out %S/../Inputs/Kernels/update_with_indices_accessor.spv 2>&1 | FileCheck %s --implicit-check-not=LEAK %} + +// REQUIRES: level_zero +// XFAIL: level_zero +// XFAIL-TRACKER: OFNAAO-307 + +// Tests updating an accessor argument to a graph node created from SPIR-V +// using dynamic command-groups. + +#include "../graph_common.hpp" + +int main(int, char **argv) { + queue Queue{}; + sycl::kernel_bundle KernelBundle = loadKernelsFromFile(Queue, argv[1]); + const auto getKernel = + [](sycl::kernel_bundle &bundle, + const std::string &name) { + return bundle.ext_oneapi_get_kernel(name); + }; + + kernel kernel = getKernel( + KernelBundle, "_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE_clES2_EUlvE_"); + + const size_t N = 1024; + + exp_ext::command_graph Graph{ + Queue.get_context(), + Queue.get_device(), + {exp_ext::property::graph::assume_buffer_outlives_graph{}}}; + std::vector HostDataA(N, 0); + std::vector HostDataB(N, 0); + + buffer BufA{HostDataA}; + buffer BufB{HostDataB}; + BufA.set_write_back(false); + BufB.set_write_back(false); + + auto AccA = BufA.get_access(); + auto AccB = BufB.get_access(); + + auto CGFA = [&](handler &CGH) { + CGH.require(AccA); + CGH.set_arg(0, AccA); + CGH.single_task(kernel); + }; + + auto CGFB = [&](handler &CGH) { + CGH.require(AccB); + CGH.set_arg(0, AccB); + CGH.single_task(kernel); + }; + + auto DynamicCG = exp_ext::dynamic_command_group(Queue, {CGFA, CGFB}); + auto DynamicCGNode = Graph.add(DynamicCG); + auto ExecGraph = Graph.finalize(exp_ext::property::graph::updatable{}); + + Queue.ext_oneapi_graph(ExecGraph).wait(); + Queue.copy(BufA.get_access(), HostDataA.data()).wait(); + Queue.copy(BufB.get_access(), HostDataB.data()).wait(); + for (size_t i = 0; i < N; i++) { + assert(HostDataA[i] == i); + assert(HostDataB[i] == 0); + } + + DynamicCG.set_active_cgf(1); + ExecGraph.update(DynamicCGNode); + + Queue.ext_oneapi_graph(ExecGraph).wait(); + Queue.copy(BufA.get_access(), HostDataA.data()).wait(); + Queue.copy(BufB.get_access(), HostDataB.data()).wait(); + for (size_t i = 0; i < N; i++) { + assert(HostDataA[i] == i); + assert(HostDataB[i] == i); + } + return 0; +} diff --git a/sycl/test-e2e/Graph/Update/dyn_cgf_different_arg_nums.cpp b/sycl/test-e2e/Graph/Update/dyn_cgf_different_arg_nums.cpp new file mode 100644 index 0000000000000..e1602864b44a0 --- /dev/null +++ b/sycl/test-e2e/Graph/Update/dyn_cgf_different_arg_nums.cpp @@ -0,0 +1,148 @@ +// RUN: %{build} -o %t.out +// RUN: env SYCL_UR_TRACE=2 %{run} %t.out | FileCheck %s +// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} +// Extra run to check for immediate-command-list in Level Zero +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} + +// XFAIL: level_zero +// XFAIL-TRACKER: OFNAAO-307 + +// Tests updating a dynamic command-group with command-groups containing a +// different number of arguments. + +#include "../graph_common.hpp" + +int main() { + queue Queue{}; + exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()}; + + const size_t N = 1024; + int *Ptr = malloc_device(N, Queue); + std::vector HostData(N); + + // 3 kernel arguments: Ptr, PatternA, PatternB + int PatternA = 42; + int PatternB = 0xA; + auto CGFA = [&](handler &CGH) { + CGH.parallel_for( + N, [=](item<1> Item) { Ptr[Item.get_id()] = PatternA + PatternB; }); + }; + + // 2 kernel arguments: Ptr, MyPatternStruct + struct PatternStruct { + int PatternA; + int PatternB; + }; + PatternStruct MyPatternStruct{PatternA + 1, PatternB + 1}; + auto CGFB = [&](handler &CGH) { + CGH.parallel_for(N, [=](item<1> Item) { + Ptr[Item.get_id()] = MyPatternStruct.PatternA + MyPatternStruct.PatternB; + }); + }; + + // 1 kernel argument: Ptr + auto CGFC = [&](handler &CGH) { + CGH.parallel_for(N, [=](item<1> Item) { Ptr[Item.get_id()] = 42 - 0xA; }); + }; + + // 4 kernel argument: Ptr + int PatternC = -12; + auto CGFD = [&](handler &CGH) { + CGH.parallel_for(N, [=](item<1> Item) { + Ptr[Item.get_id()] = PatternA + PatternB + PatternC; + }); + }; + + // CHECK: <--- urKernelSetArgPointer( + // CHECK-SAME: .hKernel = [[KERNEL_HANDLE1:[0-9a-fA-Fx]+]] + // CHECL-SAME: .argIndex = 0 + + // CHECK: <--- urKernelSetArgValue + // CHECK-SAME: .hKernel = [[KERNEL_HANDLE1]] + // CHECK-SAME: .argIndex = 1 + + // CHECK: <--- urKernelSetArgValue + // CHECK-SAME: .hKernel = [[KERNEL_HANDLE1]] + // CHECK-SAME: .argIndex = 2 + + // CHECK: <--- urCommandBufferAppendKernelLaunchExp + // CHECK-SAME: .hKernel = [[KERNEL_HANDLE1]] + // CHECK-SAME: .numKernelAlternatives = 3 + // CHECK-SAME: .phKernelAlternatives = {[[KERNEL_HANDLE2:[0-9a-fA-Fx]+]], [[KERNEL_HANDLE3:[0-9a-fA-Fx]+]], [[KERNEL_HANDLE4:[0-9a-fA-Fx]+]]} + auto DynamicCG = + exp_ext::dynamic_command_group(Graph, {CGFA, CGFB, CGFC, CGFD}); + auto DynamicCGNode = Graph.add(DynamicCG); + auto ExecGraph = Graph.finalize(exp_ext::property::graph::updatable{}); + + // Verify CGFA works with 3 arguments + Queue.ext_oneapi_graph(ExecGraph).wait(); + Queue.copy(Ptr, HostData.data(), N).wait(); + int Ref = PatternA + PatternB; + for (size_t i = 0; i < N; i++) { + assert(HostData[i] == Ref); + } + + // Verify CGFB works with 2 arguments + // CHECK: <--- urCommandBufferUpdateKernelLaunchExp + // CHECK-SAME: .hNewKernel = [[KERNEL_HANDLE2]] + // CHECK-SAME: .numNewMemObjArgs = 0 + // CHECK-SAME: .numNewPointerArgs = 1 + // CHECK-SAME: .numNewValueArgs = 1 + // CHECK-SAME: .stype = UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_POINTER_ARG_DESC + // CHECK-SAME: .argIndex = 0 + // CHECK-SAME: .stype = UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_VALUE_ARG_DESC + // CHECK-SAME: .argIndex = 1 + DynamicCG.set_active_cgf(1); + ExecGraph.update(DynamicCGNode); + Queue.ext_oneapi_graph(ExecGraph).wait(); + Queue.copy(Ptr, HostData.data(), N).wait(); + Ref = (PatternA + 1) + (PatternB + 1); + for (size_t i = 0; i < N; i++) { + assert(HostData[i] == Ref); + } + + // Verify CGFC works with 1 argument + // CHECK: <--- urCommandBufferUpdateKernelLaunchExp + // CHECK-SAME: .hNewKernel = [[KERNEL_HANDLE3]] + // CHECK-SAME: .numNewMemObjArgs = 0 + // CHECK-SAME: .numNewPointerArgs = 1 + // CHECK-SAME: .numNewValueArgs = 0 + // CHECK-SAME: .stype = UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_POINTER_ARG_DESC + // CHECK-SAME: .argIndex = 0 + DynamicCG.set_active_cgf(2); + ExecGraph.update(DynamicCGNode); + Queue.ext_oneapi_graph(ExecGraph).wait(); + Queue.copy(Ptr, HostData.data(), N).wait(); + Ref = PatternA - PatternB; + for (size_t i = 0; i < N; i++) { + assert(HostData[i] == Ref); + } + + // Verify CGFD works with 4 arguments + // CHECK: <--- urCommandBufferUpdateKernelLaunchExp + // CHECK-SAME: .hNewKernel = [[KERNEL_HANDLE4]] + // CHECK-SAME: .numNewMemObjArgs = 0 + // CHECK-SAME: .numNewPointerArgs = 1 + // CHECK-SAME: .numNewValueArgs = 3 + // CHECK-SAME: .stype = UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_POINTER_ARG_DESC + // CHECK-SAME: .argIndex = 0 + // CHECK-SAME: .stype = UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_VALUE_ARG_DESC + // CHECK-SAME: .argIndex = 1 + // CHECK-SAME: .stype = UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_VALUE_ARG_DESC + // CHECK-SAME: .argIndex = 2 + // CHECK-SAME: .stype = UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_VALUE_ARG_DESC + // CHECK-SAME: .argIndex = 3 + DynamicCG.set_active_cgf(3); + ExecGraph.update(DynamicCGNode); + Queue.ext_oneapi_graph(ExecGraph).wait(); + Queue.copy(Ptr, HostData.data(), N).wait(); + Ref = PatternA + PatternB + PatternC; + for (size_t i = 0; i < N; i++) { + assert(HostData[i] == Ref); + } + + sycl::free(Ptr, Queue); + + return 0; +} diff --git a/sycl/test-e2e/Graph/Update/dyn_cgf_event_deps.cpp b/sycl/test-e2e/Graph/Update/dyn_cgf_event_deps.cpp new file mode 100644 index 0000000000000..11e28a033a4c2 --- /dev/null +++ b/sycl/test-e2e/Graph/Update/dyn_cgf_event_deps.cpp @@ -0,0 +1,73 @@ +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} +// Extra run to check for immediate-command-list in Level Zero +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} + +// XFAIL: level_zero +// XFAIL-TRACKER: OFNAAO-307 + +// Tests adding a dynamic command-group node to a graph using graph limited +// events for dependencies. + +#include "../graph_common.hpp" + +int main() { + queue Queue{}; + exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()}; + + const size_t N = 1024; + int *PtrA = malloc_device(N, Queue); + int *PtrB = malloc_device(N, Queue); + int *PtrC = malloc_device(N, Queue); + std::vector HostData(N); + + Graph.begin_recording(Queue); + int PatternA = 42; + auto EventA = Queue.fill(PtrA, PatternA, N); + int PatternB = 0xA; + auto EventB = Queue.fill(PtrB, PatternB, N); + Graph.end_recording(Queue); + + auto CGFA = [&](handler &CGH) { + CGH.depends_on({EventA, EventB}); + CGH.parallel_for(N, [=](item<1> Item) { + auto I = Item.get_id(); + PtrC[I] = PtrA[I] * PtrB[I]; + }); + }; + + auto CGFB = [&](handler &CGH) { + CGH.depends_on({EventA, EventB}); + CGH.parallel_for(N, [=](item<1> Item) { + auto I = Item.get_id(); + PtrC[I] = PtrA[I] + PtrB[I]; + }); + }; + + auto DynamicCG = exp_ext::dynamic_command_group(Graph, {CGFA, CGFB}); + auto DynamicCGNode = Graph.add(DynamicCG); + auto ExecGraph = Graph.finalize(exp_ext::property::graph::updatable{}); + + Queue.ext_oneapi_graph(ExecGraph).wait(); + Queue.copy(PtrC, HostData.data(), N).wait(); + for (size_t i = 0; i < N; i++) { + assert(HostData[i] == PatternA * PatternB); + } + + DynamicCG.set_active_cgf(1); + ExecGraph.update(DynamicCGNode); + + Queue.ext_oneapi_graph(ExecGraph).wait(); + Queue.copy(PtrC, HostData.data(), N).wait(); + for (size_t i = 0; i < N; i++) { + assert(HostData[i] == PatternA + PatternB); + } + + sycl::free(PtrA, Queue); + sycl::free(PtrB, Queue); + sycl::free(PtrC, Queue); + + return 0; +} diff --git a/sycl/test-e2e/Graph/Update/dyn_cgf_ndrange.cpp b/sycl/test-e2e/Graph/Update/dyn_cgf_ndrange.cpp new file mode 100644 index 0000000000000..f4717210bb35e --- /dev/null +++ b/sycl/test-e2e/Graph/Update/dyn_cgf_ndrange.cpp @@ -0,0 +1,72 @@ +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} +// Extra run to check for immediate-command-list in Level Zero +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} + +// XFAIL: level_zero +// XFAIL-TRACKER: OFNAAO-307 + +// Tests updating a dynamic command-group node where the dynamic command-groups +// have different ranges/nd-ranges + +#include "../graph_common.hpp" + +int main() { + queue Queue{}; + exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()}; + + const size_t N = 1024; + int *Ptr = malloc_device(N, Queue); + std::vector HostData(N); + + auto RootNode = + Graph.add([&](handler &cgh) { cgh.memset(Ptr, 0, N * sizeof(int)); }); + + int PatternA = 42; + sycl::range<1> RangeA{512}; + auto CGFA = [&](handler &CGH) { + CGH.parallel_for(RangeA, + [=](item<1> Item) { Ptr[Item.get_id()] = PatternA; }); + }; + + int PatternB = 0xA; + size_t UpdatedN = 256; + sycl::nd_range<1> RangeB{sycl::range{UpdatedN}, sycl::range{16}}; + auto CGFB = [&](handler &CGH) { + CGH.parallel_for( + RangeB, [=](nd_item<1> Item) { Ptr[Item.get_global_id()] = PatternB; }); + }; + + auto DynamicCG = exp_ext::dynamic_command_group(Graph, {CGFA, CGFB}); + auto DynamicCGNode = + Graph.add(DynamicCG, exp_ext::property::node::depends_on(RootNode)); + auto ExecGraph = Graph.finalize(exp_ext::property::graph::updatable{}); + + Queue.ext_oneapi_graph(ExecGraph).wait(); + Queue.copy(Ptr, HostData.data(), N).wait(); + for (size_t i = 0; i < N; i++) { + if (i < RangeA.get(0)) { + assert(HostData[i] == PatternA); + } else { + assert(HostData[i] == 0); + } + } + + DynamicCG.set_active_cgf(1); + ExecGraph.update(DynamicCGNode); + + Queue.ext_oneapi_graph(ExecGraph).wait(); + Queue.copy(Ptr, HostData.data(), N).wait(); + for (size_t i = 0; i < N; i++) { + if (i < UpdatedN) { + assert(HostData[i] == PatternB); + } else { + assert(HostData[i] == 0); + } + } + + sycl::free(Ptr, Queue); + return 0; +} diff --git a/sycl/test-e2e/Graph/Update/dyn_cgf_ndrange_3D.cpp b/sycl/test-e2e/Graph/Update/dyn_cgf_ndrange_3D.cpp new file mode 100644 index 0000000000000..f6390df64303a --- /dev/null +++ b/sycl/test-e2e/Graph/Update/dyn_cgf_ndrange_3D.cpp @@ -0,0 +1,80 @@ +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} +// Extra run to check for immediate-command-list in Level Zero +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} + +// XFAIL: level_zero +// XFAIL-TRACKER: OFNAAO-307 + +// Tests updating a dynamic command-group node where the dynamic command-groups +// have different ranges/nd-ranges + +#include "../graph_common.hpp" + +int main() { + queue Queue{}; + exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()}; + + const size_t N = 64; + int *Ptr = malloc_device(N, Queue); + std::vector HostData(N); + + auto RootNode = + Graph.add([&](handler &cgh) { cgh.memset(Ptr, 0, N * sizeof(int)); }); + + int PatternA = 42; + sycl::range<1> RangeA{N}; + auto CGFA = [&](handler &CGH) { + CGH.parallel_for(RangeA, + [=](item<1> Item) { Ptr[Item.get_id()] = PatternA; }); + }; + + int PatternB = 0xA; + sycl::nd_range<3> RangeB{sycl::range{4, 4, 4}, sycl::range{2, 2, 2}}; + auto CGFB = [&](handler &CGH) { + CGH.parallel_for(RangeB, [=](nd_item<3> Item) { + Ptr[Item.get_global_linear_id()] = PatternB; + }); + }; + + int PatternC = 7; + sycl::range<2> RangeC{8, 8}; + auto CGFC = [&](handler &CGH) { + CGH.parallel_for( + RangeC, [=](item<2> Item) { Ptr[Item.get_linear_id()] = PatternC; }); + }; + + auto DynamicCG = exp_ext::dynamic_command_group(Graph, {CGFA, CGFB, CGFC}); + auto DynamicCGNode = + Graph.add(DynamicCG, exp_ext::property::node::depends_on(RootNode)); + auto ExecGraph = Graph.finalize(exp_ext::property::graph::updatable{}); + + Queue.ext_oneapi_graph(ExecGraph).wait(); + Queue.copy(Ptr, HostData.data(), N).wait(); + for (size_t i = 0; i < N; i++) { + assert(HostData[i] == PatternA); + } + + DynamicCG.set_active_cgf(1); + ExecGraph.update(DynamicCGNode); + + Queue.ext_oneapi_graph(ExecGraph).wait(); + Queue.copy(Ptr, HostData.data(), N).wait(); + for (size_t i = 0; i < N; i++) { + assert(HostData[i] == PatternB); + } + + DynamicCG.set_active_cgf(2); + ExecGraph.update(DynamicCGNode); + + Queue.ext_oneapi_graph(ExecGraph).wait(); + Queue.copy(Ptr, HostData.data(), N).wait(); + for (size_t i = 0; i < N; i++) { + assert(HostData[i] == PatternC); + } + + sycl::free(Ptr, Queue); + return 0; +} diff --git a/sycl/test-e2e/Graph/Update/dyn_cgf_overwrite_range.cpp b/sycl/test-e2e/Graph/Update/dyn_cgf_overwrite_range.cpp new file mode 100644 index 0000000000000..3ba2500cd6189 --- /dev/null +++ b/sycl/test-e2e/Graph/Update/dyn_cgf_overwrite_range.cpp @@ -0,0 +1,59 @@ +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} +// Extra run to check for immediate-command-list in Level Zero +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} + +// XFAIL: level_zero +// XFAIL-TRACKER: OFNAAO-307 + +// Tests how the nd-range of a node is overwritten by the active command-group + +#include "../graph_common.hpp" + +int main() { + queue Queue{}; + exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()}; + + const size_t N = 1024; + std::vector HostData(N); + int *Ptr = malloc_device(N, Queue); + Queue.memset(Ptr, 0, N * sizeof(int)).wait(); + + int PatternA = 42; + auto CGFA = [&](handler &CGH) { + CGH.parallel_for(N, [=](item<1> Item) { Ptr[Item.get_id()] = PatternA; }); + }; + + int PatternB = 0xA; + auto CGFB = [&](handler &CGH) { + CGH.parallel_for(N, [=](item<1> Item) { Ptr[Item.get_id()] = PatternB; }); + }; + + auto DynamicCG = exp_ext::dynamic_command_group(Graph, {CGFA, CGFB}); + auto DynamicCGNode = Graph.add(DynamicCG); + + size_t NewRange = 512; + sycl::range<1> UpdateRange(NewRange); + DynamicCGNode.update_range(UpdateRange); + + DynamicCG.set_active_cgf(1); + + // Check that the UpdateRange from active CGF 0 is preserved + DynamicCG.set_active_cgf(0); + auto ExecGraph = Graph.finalize(exp_ext::property::graph::updatable{}); + + Queue.ext_oneapi_graph(ExecGraph).wait(); + Queue.copy(Ptr, HostData.data(), N).wait(); + for (size_t i = 0; i < N; i++) { + if (i < NewRange) { + assert(HostData[i] == PatternA); + } else { + assert(HostData[i] == 0); + } + } + + sycl::free(Ptr, Queue); + return 0; +} diff --git a/sycl/test-e2e/Graph/Update/dyn_cgf_parameters.cpp b/sycl/test-e2e/Graph/Update/dyn_cgf_parameters.cpp new file mode 100644 index 0000000000000..0c46672869c7d --- /dev/null +++ b/sycl/test-e2e/Graph/Update/dyn_cgf_parameters.cpp @@ -0,0 +1,70 @@ +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} +// Extra run to check for immediate-command-list in Level Zero +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} + +// XFAIL: level_zero +// XFAIL-TRACKER: OFNAAO-307 + +// Tests updating kernel code using dynamic command-groups that have different +// parameters in each command-group. + +#include "../graph_common.hpp" + +int main() { + queue Queue{}; + exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()}; + + const size_t N = 1024; + int *PtrA = malloc_device(N, Queue); + int *PtrB = malloc_device(N, Queue); + + std::vector HostDataA(N); + std::vector HostDataB(N); + + Queue.memset(PtrA, 0, N * sizeof(int)); + Queue.memset(PtrB, 0, N * sizeof(int)); + Queue.wait(); + + int PatternA = 0xA; + auto CGFA = [&](handler &CGH) { + CGH.parallel_for(N, [=](item<1> Item) { PtrA[Item.get_id()] = PatternA; }); + }; + + int PatternB = 42; + auto CGFB = [&](handler &CGH) { + CGH.parallel_for(N, [=](item<1> Item) { PtrB[Item.get_id()] = PatternB; }); + }; + + auto DynamicCG = exp_ext::dynamic_command_group(Graph, {CGFA, CGFB}); + auto DynamicCGNode = Graph.add(DynamicCG); + auto ExecGraph = Graph.finalize(exp_ext::property::graph::updatable{}); + + Queue.ext_oneapi_graph(ExecGraph).wait(); + Queue.copy(PtrA, HostDataA.data(), N); + Queue.copy(PtrB, HostDataB.data(), N); + Queue.wait(); + for (size_t i = 0; i < N; i++) { + assert(HostDataA[i] == PatternA); + assert(HostDataB[i] == 0); + } + + DynamicCG.set_active_cgf(1); + ExecGraph.update(DynamicCGNode); + + Queue.ext_oneapi_graph(ExecGraph).wait(); + Queue.copy(PtrA, HostDataA.data(), N); + Queue.copy(PtrB, HostDataB.data(), N); + Queue.wait(); + for (size_t i = 0; i < N; i++) { + assert(HostDataA[i] == PatternA); + assert(HostDataB[i] == PatternB); + } + + sycl::free(PtrA, Queue); + sycl::free(PtrB, Queue); + + return 0; +} diff --git a/sycl/test-e2e/Graph/Update/dyn_cgf_shared_nodes.cpp b/sycl/test-e2e/Graph/Update/dyn_cgf_shared_nodes.cpp new file mode 100644 index 0000000000000..f9b0728d8ea67 --- /dev/null +++ b/sycl/test-e2e/Graph/Update/dyn_cgf_shared_nodes.cpp @@ -0,0 +1,74 @@ +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} +// Extra run to check for immediate-command-list in Level Zero +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} + +// XFAIL: level_zero +// XFAIL-TRACKER: OFNAAO-307 + +// Tests using the same dynamic command-group in more than one graph node. + +#include "../graph_common.hpp" + +int main() { + queue Queue{}; + exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()}; + + const size_t N = 1024; + std::vector HostData(N); + int *Ptr = malloc_device(N, Queue); + + auto RootNode = + Graph.add([&](handler &CGH) { CGH.memset(Ptr, 0, N * sizeof(int)); }); + + int PatternA = 42; + auto CGFA = [&](handler &CGH) { + CGH.parallel_for(N, [=](item<1> Item) { Ptr[Item.get_id()] += PatternA; }); + }; + + int PatternB = 0xA; + auto CGFB = [&](handler &CGH) { + CGH.parallel_for(N, [=](item<1> Item) { Ptr[Item.get_id()] += PatternB; }); + }; + + auto DynamicCG = exp_ext::dynamic_command_group(Graph, {CGFA, CGFB}); + auto Node1 = + Graph.add(DynamicCG, exp_ext::property::node::depends_on(RootNode)); + + auto Node2 = Graph.add( + [&](handler &cgh) { + cgh.parallel_for(N, [=](item<1> Item) { Ptr[Item.get_id()] *= 2; }); + }, + exp_ext::property::node::depends_on(Node1)); + + auto Node3 = Graph.add(DynamicCG, exp_ext::property::node::depends_on(Node2)); + + // This ND-Range affects Node 1 as well, as the range is tied to the node. + sycl::range<1> Node3Range(512); + Node3.update_range(Node3Range); + auto ExecGraph = Graph.finalize(exp_ext::property::graph::updatable{}); + + Queue.ext_oneapi_graph(ExecGraph).wait(); + Queue.copy(Ptr, HostData.data(), N).wait(); + for (size_t i = 0; i < N; i++) { + int Ref = (i < Node3Range.get(0)) ? (PatternA * 3) : 0; + assert(HostData[i] == Ref); + } + + DynamicCG.set_active_cgf(1); + ExecGraph.update(Node1); + ExecGraph.update(Node3); + + Queue.ext_oneapi_graph(ExecGraph).wait(); + Queue.copy(Ptr, HostData.data(), N).wait(); + for (size_t i = 0; i < N; i++) { + int Ref = (PatternB * 3); + assert(HostData[i] == Ref); + } + + sycl::free(Ptr, Queue); + + return 0; +} diff --git a/sycl/test-e2e/Graph/Update/dyn_cgf_update_before_finalize.cpp b/sycl/test-e2e/Graph/Update/dyn_cgf_update_before_finalize.cpp new file mode 100644 index 0000000000000..a9109d000eb17 --- /dev/null +++ b/sycl/test-e2e/Graph/Update/dyn_cgf_update_before_finalize.cpp @@ -0,0 +1,48 @@ +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} +// Extra run to check for immediate-command-list in Level Zero +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} + +// XFAIL: level_zero +// XFAIL-TRACKER: OFNAAO-307 + +// Tests updating a dynamic command-group node after it has been added to +// a graph but before the graph has been finalized + +#include "../graph_common.hpp" + +int main() { + queue Queue{}; + exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()}; + + const size_t N = 1024; + int *Ptr = malloc_device(N, Queue); + std::vector HostData(N); + + int PatternA = 42; + auto CGFA = [&](handler &CGH) { + CGH.parallel_for(N, [=](item<1> Item) { Ptr[Item.get_id()] = PatternA; }); + }; + + int PatternB = 0xA; + auto CGFB = [&](handler &CGH) { + CGH.parallel_for(N, [=](item<1> Item) { Ptr[Item.get_id()] = PatternB; }); + }; + + auto DynamicCG = exp_ext::dynamic_command_group(Graph, {CGFA, CGFB}); + auto DynamicCGNode = Graph.add(DynamicCG); + DynamicCG.set_active_cgf(1); + auto ExecGraph = Graph.finalize(); + + Queue.ext_oneapi_graph(ExecGraph).wait(); + Queue.copy(Ptr, HostData.data(), N).wait(); + for (size_t i = 0; i < N; i++) { + assert(HostData[i] == PatternB); + } + + sycl::free(Ptr, Queue); + + return 0; +} diff --git a/sycl/test-e2e/Graph/Update/dyn_cgf_usm.cpp b/sycl/test-e2e/Graph/Update/dyn_cgf_usm.cpp new file mode 100644 index 0000000000000..79db8ebe67c57 --- /dev/null +++ b/sycl/test-e2e/Graph/Update/dyn_cgf_usm.cpp @@ -0,0 +1,55 @@ +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} +// Extra run to check for immediate-command-list in Level Zero +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} + +// XFAIL: level_zero +// XFAIL-TRACKER: OFNAAO-307 + +// Tests updating usm kernel code using dynamic command-groups + +#include "../graph_common.hpp" + +int main() { + queue Queue{}; + exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()}; + + const size_t N = 1024; + int *Ptr = malloc_device(N, Queue); + std::vector HostData(N); + + int PatternA = 42; + auto CGFA = [&](handler &CGH) { + CGH.parallel_for(N, [=](item<1> Item) { Ptr[Item.get_id()] = PatternA; }); + }; + + int PatternB = 0xA; + auto CGFB = [&](handler &CGH) { + CGH.parallel_for(N, [=](item<1> Item) { Ptr[Item.get_id()] = PatternB; }); + }; + + auto DynamicCG = exp_ext::dynamic_command_group(Graph, {CGFA, CGFB}); + auto DynamicCGNode = Graph.add(DynamicCG); + auto ExecGraph = Graph.finalize(exp_ext::property::graph::updatable{}); + + Queue.ext_oneapi_graph(ExecGraph).wait(); + Queue.copy(Ptr, HostData.data(), N).wait(); + for (size_t i = 0; i < N; i++) { + assert(HostData[i] == PatternA); + } + + DynamicCG.set_active_cgf(1); + ExecGraph.update(DynamicCGNode); + + Queue.ext_oneapi_graph(ExecGraph).wait(); + Queue.copy(Ptr, HostData.data(), N).wait(); + for (size_t i = 0; i < N; i++) { + assert(HostData[i] == PatternB); + } + + sycl::free(Ptr, Queue); + + return 0; +} diff --git a/sycl/test-e2e/Graph/Update/dyn_cgf_with_all_dyn_params.cpp b/sycl/test-e2e/Graph/Update/dyn_cgf_with_all_dyn_params.cpp new file mode 100644 index 0000000000000..cb9bdf15f76b8 --- /dev/null +++ b/sycl/test-e2e/Graph/Update/dyn_cgf_with_all_dyn_params.cpp @@ -0,0 +1,121 @@ +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} +// Extra run to check for immediate-command-list in Level Zero +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} + +// XFAIL: level_zero +// XFAIL-TRACKER: OFNAAO-307 + +// Tests using a dynamic command-group object with dynamic parameters inside it + +#include "../graph_common.hpp" + +int main() { + queue Queue{}; + exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()}; + + const size_t N = 1024; + int *PtrA = malloc_device(N, Queue); + int *PtrB = malloc_device(N, Queue); + int *PtrC = malloc_device(N, Queue); + + std::vector HostDataA(N); + std::vector HostDataB(N); + std::vector HostDataC(N); + + exp_ext::dynamic_parameter DynParam1(Graph, PtrA); + exp_ext::dynamic_parameter DynParam2(Graph, PtrC); + + auto CGFA = [&](handler &CGH) { + CGH.set_arg(0, DynParam1); + // TODO: Use the free function kernel extension instead of regular kernels + // when available. + CGH.single_task([=]() { + for (size_t i = 0; i < N; i++) { + PtrA[i] = i; + } + }); + }; + + auto CGFB = [&](handler &CGH) { + CGH.set_arg(0, DynParam1); + // TODO: Use the free function kernel extension instead of regular kernels + // when available. + CGH.single_task([=]() { + for (size_t i = 0; i < N; i++) { + PtrA[i] = i; + } + }); + }; + + auto CGFC = [&](handler &CGH) { + CGH.set_arg(0, DynParam2); + // TODO: Use the free function kernel extension instead of regular kernels + // when available. + CGH.single_task([=]() { + for (size_t i = 0; i < N; i++) { + PtrC[i] = i; + } + }); + }; + + auto DynamicCG = exp_ext::dynamic_command_group(Graph, {CGFA, CGFB, CGFC}); + auto DynamicCGNode = Graph.add(DynamicCG); + auto KernelNode = Graph.add( + [&](handler &CGH) { + CGH.set_arg(0, DynParam2); + // TODO: Use the free function kernel extension instead of regular + // kernels when available. + CGH.single_task([=]() { + for (size_t i = 0; i < N; i++) { + PtrC[i] += i; + } + }); + }, + exp_ext::property::node::depends_on(DynamicCGNode)); + + auto ExecGraph = Graph.finalize(exp_ext::property::graph::updatable{}); + + auto ExecuteGraphAndVerifyResults = [&](bool A, bool B, bool C) { + Queue.memset(PtrA, 0, N * sizeof(int)); + Queue.memset(PtrB, 0, N * sizeof(int)); + Queue.memset(PtrC, 0, N * sizeof(int)); + Queue.wait(); + + Queue.ext_oneapi_graph(ExecGraph).wait(); + + Queue.copy(PtrA, HostDataA.data(), N); + Queue.copy(PtrB, HostDataB.data(), N); + Queue.copy(PtrC, HostDataC.data(), N); + Queue.wait(); + + for (size_t i = 0; i < N; i++) { + assert(HostDataA[i] == (A ? i : 0)); + assert(HostDataB[i] == (B ? i : 0)); + assert(HostDataC[i] == (C ? (2 * i) : i)); + } + }; + ExecuteGraphAndVerifyResults(true, false, false); + + DynParam1.update(PtrB); + ExecGraph.update(DynamicCGNode); + ExecuteGraphAndVerifyResults(false, true, false); + + DynamicCG.set_active_cgf(1); + ExecGraph.update(DynamicCGNode); + ExecuteGraphAndVerifyResults(false, true, false); + + DynamicCG.set_active_cgf(2); + // Should be ignored as DynParam1 not used in active node + DynParam1.update(PtrA); + ExecGraph.update(DynamicCGNode); + ExecuteGraphAndVerifyResults(false, false, true); + + sycl::free(PtrA, Queue); + sycl::free(PtrB, Queue); + sycl::free(PtrC, Queue); + + return 0; +} diff --git a/sycl/test-e2e/Graph/Update/dyn_cgf_with_different_type_dyn_params.cpp b/sycl/test-e2e/Graph/Update/dyn_cgf_with_different_type_dyn_params.cpp new file mode 100644 index 0000000000000..15f815664a740 --- /dev/null +++ b/sycl/test-e2e/Graph/Update/dyn_cgf_with_different_type_dyn_params.cpp @@ -0,0 +1,137 @@ +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} +// Extra run to check for immediate-command-list in Level Zero +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} + +// XFAIL: level_zero +// XFAIL-TRACKER: OFNAAO-307 + +// Tests using a dynamic command-group object with dynamic parameters of +// different types + +#include "../graph_common.hpp" + +int main() { + queue Queue{}; + exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()}; + + const size_t N = 1024; + int *PtrA = malloc_device(N, Queue); + int *PtrB = malloc_device(N, Queue); + int *PtrC = malloc_device(N, Queue); + + std::vector HostDataA(N); + std::vector HostDataB(N); + std::vector HostDataC(N); + + int ScalarValue = 17; + exp_ext::dynamic_parameter DynParamScalar(Graph, ScalarValue); + exp_ext::dynamic_parameter DynParamPtr(Graph, PtrA); + + // Kernel has 2 dynamic parameters, one of scalar type & one of ptr type + auto CGFA = [&](handler &CGH) { + CGH.set_arg(0, DynParamPtr); + CGH.set_arg(1, DynParamScalar); + // TODO: Use the free function kernel extension instead of regular kernels + // when available. + CGH.single_task([=]() { + for (size_t i = 0; i < N; i++) { + PtrA[i] = ScalarValue; + } + }); + }; + + // Kernel has a single argument, a dynamic parameter of ptr type + auto CGFB = [&](handler &CGH) { + CGH.set_arg(0, DynParamPtr); + // TODO: Use the free function kernel extension instead of regular kernels + // when available. + CGH.single_task([=]() { + for (size_t i = 0; i < N; i++) { + PtrA[i] = ScalarValue; + } + }); + }; + + // Kernel has a two arguments, an immutable ptr type argument and a + // dynamic parameter of scalar type. + auto CGFC = [&](handler &CGH) { + CGH.set_arg(1, DynParamScalar); + // TODO: Use the free function kernel extension instead of regular kernels + // when available. + CGH.single_task([=]() { + for (size_t i = 0; i < N; i++) { + PtrC[i] = ScalarValue; + } + }); + }; + + // Kernel has a single argument, of immutable pointer type + auto CGFD = [&](handler &CGH) { + // TODO: Use the free function kernel extension instead of regular kernels + // when available. + CGH.single_task([=]() { + for (size_t i = 0; i < N; i++) { + PtrA[i] = ScalarValue; + } + }); + }; + + auto DynamicCG = + exp_ext::dynamic_command_group(Graph, {CGFA, CGFB, CGFC, CGFD}); + auto DynamicCGNode = Graph.add(DynamicCG); + + auto ExecGraph = Graph.finalize(exp_ext::property::graph::updatable{}); + + auto ExecuteGraphAndVerifyResults = [&](int A, int B, int C) { + Queue.memset(PtrA, 0, N * sizeof(int)); + Queue.memset(PtrB, 0, N * sizeof(int)); + Queue.memset(PtrC, 0, N * sizeof(int)); + Queue.wait(); + + Queue.ext_oneapi_graph(ExecGraph).wait(); + + Queue.copy(PtrA, HostDataA.data(), N); + Queue.copy(PtrB, HostDataB.data(), N); + Queue.copy(PtrC, HostDataC.data(), N); + Queue.wait(); + + for (size_t i = 0; i < N; i++) { + assert(HostDataA[i] == A); + assert(HostDataB[i] == B); + assert(HostDataC[i] == C); + } + }; + // CGFA using PtrA and ScalarValue in its dynamic parameters + ExecuteGraphAndVerifyResults(ScalarValue, 0, 0); + + // CGFA using PtrB and UpdatedScalarValue in its dynamic parameters + DynParamPtr.update(PtrB); + int UpdatedScalarValue = 42; + DynParamScalar.update(UpdatedScalarValue); + ExecGraph.update(DynamicCGNode); + ExecuteGraphAndVerifyResults(0, UpdatedScalarValue, 0); + + // CGFB using PtrB in its dynamic parameter and immutable ScalarValue + DynamicCG.set_active_cgf(1); + ExecGraph.update(DynamicCGNode); + ExecuteGraphAndVerifyResults(0, ScalarValue, false); + + // CGFC using immutable PtrC and UpdatedScalarValue in its dynamic parameter + DynamicCG.set_active_cgf(2); + ExecGraph.update(DynamicCGNode); + ExecuteGraphAndVerifyResults(0, 0, UpdatedScalarValue); + + // CGFD using immutable PtrA and immutable ScalarValue for arguments + DynamicCG.set_active_cgf(3); + ExecGraph.update(DynamicCGNode); + ExecuteGraphAndVerifyResults(ScalarValue, 0, 0); + + sycl::free(PtrA, Queue); + sycl::free(PtrB, Queue); + sycl::free(PtrC, Queue); + + return 0; +} diff --git a/sycl/test-e2e/Graph/Update/dyn_cgf_with_some_dyn_params.cpp b/sycl/test-e2e/Graph/Update/dyn_cgf_with_some_dyn_params.cpp new file mode 100644 index 0000000000000..264c1b6849689 --- /dev/null +++ b/sycl/test-e2e/Graph/Update/dyn_cgf_with_some_dyn_params.cpp @@ -0,0 +1,107 @@ +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} +// Extra run to check for immediate-command-list in Level Zero +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} + +// XFAIL: level_zero +// XFAIL-TRACKER: OFNAAO-307 + +// Tests using a dynamic command-group object where some but not all the +// command-groups use dynamic parameters. + +#include "../graph_common.hpp" + +int main() { + queue Queue{}; + exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()}; + + const size_t N = 1024; + int *PtrA = malloc_device(N, Queue); + int *PtrB = malloc_device(N, Queue); + int *PtrC = malloc_device(N, Queue); + + std::vector HostDataA(N); + std::vector HostDataB(N); + std::vector HostDataC(N); + + exp_ext::dynamic_parameter DynParam(Graph, PtrA); + + auto CGFA = [&](handler &CGH) { + CGH.set_arg(0, DynParam); + // TODO: Use the free function kernel extension instead of regular kernels + // when available. + CGH.single_task([=]() { + for (size_t i = 0; i < N; i++) { + PtrA[i] = i; + } + }); + }; + + auto CGFB = [&](handler &CGH) { + CGH.set_arg(0, DynParam); + // TODO: Use the free function kernel extension instead of regular kernels + // when available. + CGH.single_task([=]() { + for (size_t i = 0; i < N; i++) { + PtrA[i] = i; + } + }); + }; + + auto CGFC = [&](handler &CGH) { + CGH.single_task([=]() { + for (size_t i = 0; i < N; i++) { + PtrC[i] = i; + } + }); + }; + + auto DynamicCG = exp_ext::dynamic_command_group(Graph, {CGFA, CGFB, CGFC}); + auto DynamicCGNode = Graph.add(DynamicCG); + auto ExecGraph = Graph.finalize(exp_ext::property::graph::updatable{}); + + auto ExecuteGraphAndVerifyResults = [&](bool A, bool B, bool C) { + Queue.memset(PtrA, 0, N * sizeof(int)); + Queue.memset(PtrB, 0, N * sizeof(int)); + Queue.memset(PtrC, 0, N * sizeof(int)); + Queue.wait(); + + Queue.ext_oneapi_graph(ExecGraph).wait(); + + Queue.copy(PtrA, HostDataA.data(), N); + Queue.copy(PtrB, HostDataB.data(), N); + Queue.copy(PtrC, HostDataC.data(), N); + Queue.wait(); + + for (size_t i = 0; i < N; i++) { + assert(HostDataA[i] == (A ? i : 0)); + assert(HostDataB[i] == (B ? i : 0)); + assert(HostDataC[i] == (C ? i : 0)); + } + }; + // CGFA with DynParam using PtrA + ExecuteGraphAndVerifyResults(true, false, false); + + // CGFA with DynParam using PtrB + DynParam.update(PtrB); + ExecGraph.update(DynamicCGNode); + ExecuteGraphAndVerifyResults(false, true, false); + + // CGFB with DynParam using PtrB + DynamicCG.set_active_cgf(1); + ExecGraph.update(DynamicCGNode); + ExecuteGraphAndVerifyResults(false, true, false); + + // CGFC unconditionally using PtrC + DynamicCG.set_active_cgf(2); + ExecGraph.update(DynamicCGNode); + ExecuteGraphAndVerifyResults(false, false, true); + + sycl::free(PtrA, Queue); + sycl::free(PtrB, Queue); + sycl::free(PtrC, Queue); + + return 0; +} diff --git a/sycl/test-e2e/Graph/Update/update_ndrange_to_range.cpp b/sycl/test-e2e/Graph/Update/update_ndrange_to_range.cpp new file mode 100644 index 0000000000000..0f1c10e5142bf --- /dev/null +++ b/sycl/test-e2e/Graph/Update/update_ndrange_to_range.cpp @@ -0,0 +1,55 @@ +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} +// Extra run to check for immediate-command-list in Level Zero +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} +// + +// Tests updating a graph node from sycl::nd_range to sycl::range + +#include "../graph_common.hpp" + +int main() { + queue Queue{}; + + const size_t N = 1024; + + exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()}; + + int *PtrA = malloc_device(N, Queue); + + std::vector HostDataA(N); + + Queue.memset(PtrA, 0, N * sizeof(int)).wait(); + + nd_range<1> NDRange{range{N}, range{32}}; + + auto KernelNode = Graph.add([&](handler &cgh) { + cgh.parallel_for(NDRange, [=](nd_item<1> Item) { + size_t GlobalID = Item.get_global_id(); + PtrA[GlobalID] += GlobalID; + }); + }); + + auto ExecGraph = Graph.finalize(exp_ext::property::graph::updatable{}); + + // first half of PtrA should be filled with values + Queue.ext_oneapi_graph(ExecGraph).wait(); + + Queue.copy(PtrA, HostDataA.data(), N).wait(); + for (size_t i = 0; i < N; i++) { + assert(HostDataA[i] == i); + } + + // Update NDRange to target first half only + KernelNode.update_range(range<1>{512}); + ExecGraph.update(KernelNode); + Queue.ext_oneapi_graph(ExecGraph).wait(); + + Queue.copy(PtrA, HostDataA.data(), N).wait(); + for (size_t i = 0; i < N; i++) { + assert(HostDataA[i] == (i >= 512 ? i : i * 2)); + } + return 0; +} diff --git a/sycl/test-e2e/Graph/Update/update_range_to_ndrange.cpp b/sycl/test-e2e/Graph/Update/update_range_to_ndrange.cpp new file mode 100644 index 0000000000000..9489d20c6a916 --- /dev/null +++ b/sycl/test-e2e/Graph/Update/update_range_to_ndrange.cpp @@ -0,0 +1,56 @@ +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} +// Extra run to check for immediate-command-list in Level Zero +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} +// + +// Tests updating a graph node from using a sycl::range to a sycl::nd_range + +#include "../graph_common.hpp" + +int main() { + queue Queue{}; + + const size_t N = 1024; + + exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()}; + + int *PtrA = malloc_device(N, Queue); + + std::vector HostDataA(N); + + Queue.memset(PtrA, 0, N * sizeof(int)).wait(); + + range<1> Range{1024}; + + auto KernelNode = Graph.add([&](handler &cgh) { + cgh.parallel_for(Range, [=](item<1> Item) { + size_t GlobalID = Item.get_id(); + PtrA[GlobalID] += GlobalID; + }); + }); + + auto ExecGraph = Graph.finalize(exp_ext::property::graph::updatable{}); + + // first half of PtrA should be filled with values + Queue.ext_oneapi_graph(ExecGraph).wait(); + + Queue.copy(PtrA, HostDataA.data(), N).wait(); + for (size_t i = 0; i < N; i++) { + assert(HostDataA[i] == i); + } + + // Update NDRange to target first half only + nd_range<1> NDRange{range{512}, range{32}}; + KernelNode.update_nd_range(NDRange); + ExecGraph.update(KernelNode); + Queue.ext_oneapi_graph(ExecGraph).wait(); + + Queue.copy(PtrA, HostDataA.data(), N).wait(); + for (size_t i = 0; i < N; i++) { + assert(HostDataA[i] == (i >= 512 ? i : i * 2)); + } + return 0; +} diff --git a/sycl/test-e2e/Graph/Update/whole_update_dynamic_cgf.cpp b/sycl/test-e2e/Graph/Update/whole_update_dynamic_cgf.cpp new file mode 100644 index 0000000000000..03a0e19f8c51e --- /dev/null +++ b/sycl/test-e2e/Graph/Update/whole_update_dynamic_cgf.cpp @@ -0,0 +1,75 @@ +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} +// Extra run to check for immediate-command-list in Level Zero +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} + +// XFAIL: level_zero +// XFAIL-TRACKER: OFNAAO-307 + +// Tests interaction of whole graph update and dynamic command-groups + +#include "../graph_common.hpp" + +int main() { + queue Queue{}; + exp_ext::command_graph GraphA{Queue.get_context(), Queue.get_device()}; + exp_ext::command_graph GraphB{Queue.get_context(), Queue.get_device()}; + + const size_t N = 1024; + int *Ptr = malloc_device(N, Queue); + std::vector HostData(N); + + int PatternA = 42; + auto CGFA = [&](handler &CGH) { + CGH.parallel_for(N, [=](item<1> Item) { Ptr[Item.get_id()] = PatternA; }); + }; + + int PatternB = 0xA; + auto CGFB = [&](handler &CGH) { + CGH.parallel_for(N, [=](item<1> Item) { Ptr[Item.get_id()] = PatternB; }); + }; + + auto DynamicCGA = exp_ext::dynamic_command_group(GraphA, {CGFA, CGFB}); + auto DynamicCGNodeA = GraphA.add(DynamicCGA); + + auto DynamicCGB = exp_ext::dynamic_command_group(GraphB, {CGFA, CGFB}); + auto DynamicCGNodeB = GraphB.add(DynamicCGB); + DynamicCGB.set_active_cgf(1); // Check if doesn't affect GraphA + + auto ExecGraph = GraphA.finalize(exp_ext::property::graph::updatable{}); + + Queue.ext_oneapi_graph(ExecGraph).wait(); + Queue.copy(Ptr, HostData.data(), N).wait(); + for (size_t i = 0; i < N; i++) { + assert(HostData[i] == PatternA); + } + + // Graph B has CGF B as active, while Graph A has CGF A as active. + // Different command-groups should error due to being different + // kernels. + std::error_code ErrorCode = make_error_code(sycl::errc::success); + try { + ExecGraph.update(GraphB); + } catch (const sycl::exception &e) { + ErrorCode = e.code(); + } + assert(ErrorCode == sycl::errc::invalid); + + // Both ExecGraph and Graph B have CGFB as active, so + // whole graph update should be valid as graphs match. + DynamicCGA.set_active_cgf(1); + ExecGraph.update(DynamicCGNodeA); + ExecGraph.update(GraphB); + + Queue.ext_oneapi_graph(ExecGraph).wait(); + Queue.copy(Ptr, HostData.data(), N).wait(); + for (size_t i = 0; i < N; i++) { + assert(HostData[i] == PatternB); + } + + sycl::free(Ptr, Queue); + + return 0; +} diff --git a/sycl/test-e2e/Graph/Update/whole_update_dynamic_param.cpp b/sycl/test-e2e/Graph/Update/whole_update_dynamic_param.cpp index c9a4922e7fd46..b894685a8bd87 100644 --- a/sycl/test-e2e/Graph/Update/whole_update_dynamic_param.cpp +++ b/sycl/test-e2e/Graph/Update/whole_update_dynamic_param.cpp @@ -5,8 +5,6 @@ // Extra run to check for immediate-command-list in Level Zero // RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} -// REQUIRES: aspect-usm_shared_allocations - // Tests that whole graph update works when using dynamic parameters. #include "../graph_common.hpp" diff --git a/sycl/test/abi/sycl_symbols_linux.dump b/sycl/test/abi/sycl_symbols_linux.dump index 2d1dcce2b68e2..990676baacd48 100644 --- a/sycl/test/abi/sycl_symbols_linux.dump +++ b/sycl/test/abi/sycl_symbols_linux.dump @@ -3023,6 +3023,9 @@ _ZN4sycl3_V13ext6oneapi12experimental20pitched_alloc_deviceEPmRKNS3_16image_desc _ZN4sycl3_V13ext6oneapi12experimental20pitched_alloc_deviceEPmRKNS3_16image_descriptorERKNS0_6deviceERKNS0_7contextE _ZN4sycl3_V13ext6oneapi12experimental20pitched_alloc_deviceEPmmmjRKNS0_5queueE _ZN4sycl3_V13ext6oneapi12experimental20pitched_alloc_deviceEPmmmjRKNS0_6deviceERKNS0_7contextE +_ZN4sycl3_V13ext6oneapi12experimental21dynamic_command_group14set_active_cgfEm +_ZN4sycl3_V13ext6oneapi12experimental21dynamic_command_groupC1ERKNS3_13command_graphILNS3_11graph_stateE0EEERKSt6vectorISt8functionIFvRNS0_7handlerEEESaISF_EE +_ZN4sycl3_V13ext6oneapi12experimental21dynamic_command_groupC2ERKNS3_13command_graphILNS3_11graph_stateE0EEERKSt6vectorISt8functionIFvRNS0_7handlerEEESaISF_EE _ZN4sycl3_V13ext6oneapi12experimental21get_composite_devicesEv _ZN4sycl3_V13ext6oneapi12experimental22get_image_channel_typeENS3_16image_mem_handleERKNS0_5queueE _ZN4sycl3_V13ext6oneapi12experimental22get_image_channel_typeENS3_16image_mem_handleERKNS0_6deviceERKNS0_7contextE @@ -3081,6 +3084,7 @@ _ZN4sycl3_V13ext6oneapi12experimental6detail24modifiable_command_graph15begin_re _ZN4sycl3_V13ext6oneapi12experimental6detail24modifiable_command_graph15begin_recordingERNS0_5queueERKNS0_13property_listE _ZN4sycl3_V13ext6oneapi12experimental6detail24modifiable_command_graph24addGraphLeafDependenciesENS3_4nodeE _ZN4sycl3_V13ext6oneapi12experimental6detail24modifiable_command_graph7addImplERKSt6vectorINS3_4nodeESaIS7_EE +_ZN4sycl3_V13ext6oneapi12experimental6detail24modifiable_command_graph7addImplERNS3_21dynamic_command_groupERKSt6vectorINS3_4nodeESaIS9_EE _ZN4sycl3_V13ext6oneapi12experimental6detail24modifiable_command_graph7addImplESt8functionIFvRNS0_7handlerEEERKSt6vectorINS3_4nodeESaISC_EE _ZN4sycl3_V13ext6oneapi12experimental6detail24modifiable_command_graph9make_edgeERNS3_4nodeES7_ _ZN4sycl3_V13ext6oneapi12experimental6detail24modifiable_command_graphC1ERKNS0_5queueERKNS0_13property_listE @@ -3596,6 +3600,7 @@ _ZNK4sycl3_V13ext6oneapi12experimental12physical_mem10get_deviceEv _ZNK4sycl3_V13ext6oneapi12experimental12physical_mem11get_contextEv _ZNK4sycl3_V13ext6oneapi12experimental12physical_mem3mapEmmNS3_19address_access_modeEm _ZNK4sycl3_V13ext6oneapi12experimental12physical_mem4sizeEv +_ZNK4sycl3_V13ext6oneapi12experimental21dynamic_command_group14get_active_cgfEv _ZNK4sycl3_V13ext6oneapi12experimental4node14get_successorsEv _ZNK4sycl3_V13ext6oneapi12experimental4node16get_predecessorsEv _ZNK4sycl3_V13ext6oneapi12experimental4node8get_typeEv diff --git a/sycl/test/abi/sycl_symbols_windows.dump b/sycl/test/abi/sycl_symbols_windows.dump index 33092af8477f5..0bf485466ec1d 100644 --- a/sycl/test/abi/sycl_symbols_windows.dump +++ b/sycl/test/abi/sycl_symbols_windows.dump @@ -326,6 +326,15 @@ ??0dynamic_parameter_base@detail@experimental@oneapi@ext@_V1@sycl@@QEAA@$$QEAV0123456@@Z ??0dynamic_parameter_base@detail@experimental@oneapi@ext@_V1@sycl@@QEAA@AEBV0123456@@Z ??0dynamic_parameter_base@detail@experimental@oneapi@ext@_V1@sycl@@QEAA@V?$command_graph@$0A@@23456@_KPEBX@Z +?get_active_cgf@dynamic_command_group@experimental@oneapi@ext@_V1@sycl@@QEBA_KXZ +??4dynamic_command_group@experimental@oneapi@ext@_V1@sycl@@QEAAAEAV012345@$$QEAV012345@@Z +?addImpl@modifiable_command_graph@detail@experimental@oneapi@ext@_V1@sycl@@IEAA?AVnode@34567@AEAVdynamic_command_group@34567@AEBV?$vector@Vnode@experimental@oneapi@ext@_V1@sycl@@V?$allocator@Vnode@experimental@oneapi@ext@_V1@sycl@@@std@@@std@@@Z +??0dynamic_command_group@experimental@oneapi@ext@_V1@sycl@@QEAA@AEBV?$command_graph@$0A@@12345@AEBV?$vector@V?$function@$$A6AXAEAVhandler@_V1@sycl@@@Z@std@@V?$allocator@V?$function@$$A6AXAEAVhandler@_V1@sycl@@@Z@std@@@2@@std@@@Z +??0dynamic_command_group@experimental@oneapi@ext@_V1@sycl@@QEAA@$$QEAV012345@@Z +??0dynamic_command_group@experimental@oneapi@ext@_V1@sycl@@QEAA@AEBV012345@@Z +?set_active_cgf@dynamic_command_group@experimental@oneapi@ext@_V1@sycl@@QEAAX_K@Z +??1dynamic_command_group@experimental@oneapi@ext@_V1@sycl@@QEAA@XZ +??4dynamic_command_group@experimental@oneapi@ext@_V1@sycl@@QEAAAEAV012345@AEBV012345@@Z ??0event@_V1@sycl@@AEAA@V?$shared_ptr@Vevent_impl@detail@_V1@sycl@@@std@@@Z ??0event@_V1@sycl@@QEAA@$$QEAV012@@Z ??0event@_V1@sycl@@QEAA@AEBV012@@Z diff --git a/sycl/unittests/Extensions/CommandGraph/Exceptions.cpp b/sycl/unittests/Extensions/CommandGraph/Exceptions.cpp index 1542b5f34d7dc..90d95975a0245 100644 --- a/sycl/unittests/Extensions/CommandGraph/Exceptions.cpp +++ b/sycl/unittests/Extensions/CommandGraph/Exceptions.cpp @@ -700,3 +700,166 @@ TEST_F(CommandGraphTest, RecordingWrongGraphDep) { }), sycl::exception); } + +// Error when a dynamic command-group is used with a graph belonging to a +// different graph. +TEST_F(CommandGraphTest, DynamicCommandGroupWrongGraph) { + experimental::command_graph Graph1{Queue.get_context(), Queue.get_device()}; + experimental::command_graph Graph2{Queue.get_context(), Queue.get_device()}; + auto CGF = [&](sycl::handler &CGH) { + CGH.single_task>([]() {}); + }; + + experimental::dynamic_command_group DynCG(Graph2, {CGF}); + ASSERT_THROW(Graph1.add(DynCG), sycl::exception); +} + +// Error when a non-kernel command-group is included in a dynamic command-group +TEST_F(CommandGraphTest, DynamicCommandGroupNotKernel) { + int *Ptr = malloc_device(1, Queue); + auto CGF = [&](sycl::handler &CGH) { CGH.memset(Ptr, 1, 0); }; + + experimental::command_graph Graph{Queue}; + ASSERT_THROW(experimental::dynamic_command_group DynCG(Graph, {CGF}), + sycl::exception); + sycl::free(Ptr, Queue); +} + +// Error if edges are not the same for all command-groups in dynamic command +// group, test using graph limited events to create edges +TEST_F(CommandGraphTest, DynamicCommandGroupMismatchEventEdges) { + size_t N = 32; + int *PtrA = malloc_device(N, Queue); + int *PtrB = malloc_device(N, Queue); + + experimental::command_graph Graph{Queue.get_context(), Queue.get_device()}; + + Graph.begin_recording(Queue); + + auto EventA = Queue.submit([&](handler &CGH) { + CGH.parallel_for(N, [=](item<1> Item) { PtrA[Item.get_id()] = 1; }); + }); + + auto EventB = Queue.submit([&](handler &CGH) { + CGH.parallel_for(N, [=](item<1> Item) { PtrB[Item.get_id()] = 4; }); + }); + + Graph.end_recording(); + + auto CGFA = [&](handler &CGH) { + CGH.depends_on(EventA); + CGH.parallel_for(N, [=](item<1> Item) { PtrA[Item.get_id()] += 2; }); + }; + + auto CGFB = [&](handler &CGH) { + CGH.depends_on(EventB); + CGH.parallel_for(N, [=](item<1> Item) { PtrB[Item.get_id()] += 0xA; }); + }; + + experimental::dynamic_command_group DynCG(Graph, {CGFA, CGFB}); + ASSERT_THROW(Graph.add(DynCG), sycl::exception); + + sycl::free(PtrA, Queue); + sycl::free(PtrB, Queue); +} + +// Test that an exception is thrown when a graph isn't created with buffer +// property, but buffers are used. +TEST_F(CommandGraphTest, DynamicCommandGroupBufferThrows) { + size_t N = 32; + std::vector HostData(N, 0); + buffer Buf{HostData}; + Buf.set_write_back(false); + + experimental::command_graph Graph{Queue.get_context(), Queue.get_device()}; + + auto CGFA = [&](handler &CGH) { + auto Acc = Buf.get_access(CGH); + CGH.parallel_for(N, [=](item<1> Item) { Acc[Item.get_id()] = 2; }); + }; + + auto CGFB = [&](handler &CGH) { + auto Acc = Buf.get_access(CGH); + CGH.parallel_for(N, [=](item<1> Item) { Acc[Item.get_id()] = 0xA; }); + }; + + experimental::dynamic_command_group DynCG(Graph, {CGFA, CGFB}); + ASSERT_THROW(Graph.add(DynCG), sycl::exception); +} + +// Test and exception is thrown when using a host-accessor to a buffer +// used in a non active CGF node in the graph. +TEST_F(CommandGraphTest, DynamicCommandGroupBufferHostAccThrows) { + size_t N = 32; + std::vector HostData(N, 0); + buffer Buf{HostData}; + Buf.set_write_back(false); + + int *Ptr = malloc_device(N, Queue); + + { + ext::oneapi::experimental::command_graph Graph{ + Queue.get_context(), + Queue.get_device(), + {experimental::property::graph::assume_buffer_outlives_graph{}}}; + + auto CGFA = [&](handler &CGH) { + CGH.parallel_for(N, [=](item<1> Item) { Ptr[Item.get_id()] = 2; }); + }; + + auto CGFB = [&](handler &CGH) { + auto Acc = Buf.get_access(CGH); + CGH.parallel_for(N, [=](item<1> Item) { Acc[Item.get_id()] = 0xA; }); + }; + + experimental::dynamic_command_group DynCG(Graph, {CGFA, CGFB}); + ASSERT_NO_THROW(Graph.add(DynCG)); + + ASSERT_THROW({ host_accessor HostAcc{Buf}; }, sycl::exception); + } + + sycl::free(Ptr, Queue); +} + +// Error if edges are not the same for all command-groups in dynamic command +// group, test using accessors to create edges +TEST_F(CommandGraphTest, DynamicCommandGroupMismatchAccessorEdges) { + size_t N = 32; + std::vector HostData(N, 0); + buffer BufA{HostData}; + buffer BufB{HostData}; + BufA.set_write_back(false); + BufB.set_write_back(false); + + experimental::command_graph Graph{ + Queue.get_context(), + Queue.get_device(), + {experimental::property::graph::assume_buffer_outlives_graph{}}}; + + Graph.begin_recording(Queue); + + Queue.submit([&](handler &CGH) { + auto AccA = BufA.get_access(CGH); + CGH.parallel_for(N, [=](item<1> Item) { AccA[Item.get_id()] = 1; }); + }); + + Queue.submit([&](handler &CGH) { + auto AccB = BufB.get_access(CGH); + CGH.parallel_for(N, [=](item<1> Item) { AccB[Item.get_id()] = 4; }); + }); + + Graph.end_recording(); + + auto CGFA = [&](handler &CGH) { + auto AccA = BufA.get_access(CGH); + CGH.parallel_for(N, [=](item<1> Item) { AccA[Item.get_id()] += 2; }); + }; + + auto CGFB = [&](handler &CGH) { + auto AccB = BufB.get_access(CGH); + CGH.parallel_for(N, [=](item<1> Item) { AccB[Item.get_id()] += 0xA; }); + }; + + experimental::dynamic_command_group DynCG(Graph, {CGFA, CGFB}); + ASSERT_THROW(Graph.add(DynCG), sycl::exception); +} diff --git a/sycl/unittests/Extensions/CommandGraph/Update.cpp b/sycl/unittests/Extensions/CommandGraph/Update.cpp index ff66af25d9e83..676e3bead1416 100644 --- a/sycl/unittests/Extensions/CommandGraph/Update.cpp +++ b/sycl/unittests/Extensions/CommandGraph/Update.cpp @@ -134,35 +134,6 @@ TEST_F(CommandGraphTest, UpdateNodeTypeExceptions) { })); } -TEST_F(CommandGraphTest, UpdateRangeErrors) { - // Test that the correct errors are throw when trying to update node ranges - - nd_range<1> NDRange{range{128}, range{32}}; - range<1> Range{128}; - auto NodeNDRange = Graph.add([&](sycl::handler &cgh) { - cgh.parallel_for>(NDRange, [](nd_item<1>) {}); - }); - - // OK - EXPECT_NO_THROW(NodeNDRange.update_nd_range(NDRange)); - // Can't update an nd_range node with a range - EXPECT_ANY_THROW(NodeNDRange.update_range(Range)); - // Can't update with a different number of dimensions - EXPECT_ANY_THROW(NodeNDRange.update_nd_range( - nd_range<2>{range<2>{128, 128}, range<2>{32, 32}})); - - auto NodeRange = Graph.add([&](sycl::handler &cgh) { - cgh.parallel_for>(range<1>{128}, [](item<1>) {}); - }); - - // OK - EXPECT_NO_THROW(NodeRange.update_range(Range)); - // Can't update a range node with an nd_range - EXPECT_ANY_THROW(NodeRange.update_nd_range(NDRange)); - // Can't update with a different number of dimensions - EXPECT_ANY_THROW(NodeRange.update_range(range<2>{128, 128})); -} - class WholeGraphUpdateTest : public CommandGraphTest { protected: