From 528d43a05f08c556bf1c3670c4834abe4f950ac6 Mon Sep 17 00:00:00 2001 From: Steffen Larsen Date: Fri, 15 Nov 2024 14:00:59 +0100 Subject: [PATCH 01/36] [SYCL][NFCI] Unify queue submit paths (#15776) The submit method on queue has caused multiple divergences throughout its lifetime, when additional information was required for new functionality. However, that design means an exponential growth in functions every time a new optional argument is needed. To battle this, this patch adds a new pimpl class with the optional submission info, which will only be initialized if needed. This boils the submit function paths down to one with event and one without. --------- Signed-off-by: Larsen, Steffen --- sycl/include/sycl/detail/optional.hpp | 147 +++++++++++++ sycl/include/sycl/queue.hpp | 206 +++++++++--------- sycl/source/detail/queue_impl.cpp | 10 +- sycl/source/detail/queue_impl.hpp | 76 +++++-- sycl/source/queue.cpp | 75 +++++-- sycl/test/abi/sycl_symbols_linux.dump | 8 + sycl/test/abi/sycl_symbols_windows.dump | 12 + .../include_deps/sycl_detail_core.hpp.cpp | 1 + 8 files changed, 389 insertions(+), 146 deletions(-) create mode 100644 sycl/include/sycl/detail/optional.hpp diff --git a/sycl/include/sycl/detail/optional.hpp b/sycl/include/sycl/detail/optional.hpp new file mode 100644 index 0000000000000..da9ff4d900000 --- /dev/null +++ b/sycl/include/sycl/detail/optional.hpp @@ -0,0 +1,147 @@ +//==-------- optional.hpp - limited variant of std::optional -------- C++ --==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// ===--------------------------------------------------------------------=== // + +#pragma once + +#include +#include + +namespace sycl { +inline namespace _V1 { +namespace detail { + +// ABI-stable implementation of optional to avoid reliance on potentially +// differing implementations of std::optional when crossing the library +// boundary. +template class optional { +public: + constexpr optional() noexcept {} + constexpr optional(std::nullopt_t) noexcept : optional() {} + + template + constexpr optional(const optional &Other) + : ContainsValue{Other.ContainsValue} { + new (Storage) T(Other.Value); + } + template + constexpr optional(optional &&Other) + : ContainsValue{std::move(Other.ContainsValue)} { + new (Storage) T(std::move(Other.Value)); + } + + constexpr optional(T &&Value) : ContainsValue{true} { + new (Storage) T(std::move(Value)); + } + + constexpr optional(const T &Value) : ContainsValue{true} { + new (Storage) T(Value); + } + + template + constexpr optional(const std::optional &Other) : ContainsValue{Other} { + if (Other) + new (Storage) T(*Other); + } + + ~optional() { + if (has_value()) + reinterpret_cast(Storage)->~T(); + } + + optional &operator=(std::nullopt_t) noexcept { + if (has_value()) + reinterpret_cast(Storage)->~T(); + ContainsValue = false; + return *this; + } + + template optional &operator=(const optional &Other) { + if (has_value()) + reinterpret_cast(Storage)->~T(); + ContainsValue = Other; + new (Storage) T(Other.Value); + return *this; + } + template optional &operator=(optional &&Other) noexcept { + if (has_value()) + reinterpret_cast(Storage)->~T(); + ContainsValue = Other; + new (Storage) T(std::move(Other.Value)); + return *this; + } + + optional &operator=(T &&Value) { + if (has_value()) + reinterpret_cast(Storage)->~T(); + ContainsValue = true; + new (Storage) T(std::move(Value)); + return *this; + } + + optional &operator=(const T &Value) { + if 
(has_value()) + reinterpret_cast(Storage)->~T(); + ContainsValue = true; + new (Storage) T(Value); + return *this; + } + + template optional &operator=(const std::optional &Other) { + if (has_value()) + reinterpret_cast(Storage)->~T(); + ContainsValue = Other; + if (Other) + new (Storage) T(*Other); + return *this; + } + + constexpr bool has_value() const noexcept { return ContainsValue; } + constexpr explicit operator bool() const noexcept { return has_value(); } + + constexpr T &value() & { + if (!has_value()) + throw std::bad_optional_access{}; + return *reinterpret_cast(Storage); + } + constexpr const T &value() const & { + if (!has_value()) + throw std::bad_optional_access{}; + return *reinterpret_cast(Storage); + } + constexpr T &&value() && { + if (!has_value()) + throw std::bad_optional_access{}; + return std::move(*reinterpret_cast(Storage)); + } + constexpr const T &&value() const && { + if (!has_value()) + throw std::bad_optional_access{}; + return std::move(*reinterpret_cast(Storage)); + } + + template constexpr T value_or(U &&DefaultVal) { + return has_value() ? value() : static_cast(std::forward(DefaultVal)); + } + template constexpr T value_or(U &&DefaultVal) const { + return has_value() ? std::move(value()) + : static_cast(std::forward(DefaultVal)); + } + + constexpr T &operator*() & { return value(); } + constexpr const T &operator*() const & { return value(); } + constexpr T &&operator*() && { return value(); } + constexpr const T &&operator*() const && { return value(); } + +private: + alignas(alignof(T)) char Storage[sizeof(T)] = {0}; + bool ContainsValue = false; +}; + +} // namespace detail +} // namespace _V1 +} // namespace sycl diff --git a/sycl/include/sycl/queue.hpp b/sycl/include/sycl/queue.hpp index 1ed95e756ca53..9e530604ce84e 100644 --- a/sycl/include/sycl/queue.hpp +++ b/sycl/include/sycl/queue.hpp @@ -22,12 +22,13 @@ #include // for __SYCL_EXPORT #include // for is_queue_info_... 
#include // for KernelInfo -#include // for OwnerLessBase -#include // for device -#include // for device_selector -#include // for event -#include // for make_error_code -#include // for defaultAsyncHa... +#include +#include // for OwnerLessBase +#include // for device +#include // for device_selector +#include // for event +#include // for make_error_code +#include // for defaultAsyncHa... #include // for device_global #include // for device_image_s... #include // for command_graph... @@ -81,6 +82,30 @@ class queue_impl; inline event submitAssertCapture(queue &, event &, queue *, const detail::code_location &); #endif + +// Function to postprocess submitted command +// Arguments: +// bool IsKernel - true if the submitted command was kernel, false otherwise +// bool KernelUsesAssert - true if submitted kernel uses assert, only +// meaningful when IsKernel is true +// event &Event - event after which post processing should be executed +using SubmitPostProcessF = std::function; + +struct SubmissionInfoImpl; + +class __SYCL_EXPORT SubmissionInfo { +public: + SubmissionInfo(); + + sycl::detail::optional &PostProcessorFunc(); + const sycl::detail::optional &PostProcessorFunc() const; + + std::shared_ptr &SecondaryQueue(); + const std::shared_ptr &SecondaryQueue() const; + +private: + std::shared_ptr impl = nullptr; +}; } // namespace detail namespace ext ::oneapi ::experimental { @@ -340,28 +365,7 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { std::enable_if_t, event> submit( T CGF, const detail::code_location &CodeLoc = detail::code_location::current()) { - detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc); -#if __SYCL_USE_FALLBACK_ASSERT - auto PostProcess = [this, &TlsCodeLocCapture]( - bool IsKernel, bool KernelUsesAssert, event &E) { - if (IsKernel && !device_has(aspect::ext_oneapi_native_assert) && - KernelUsesAssert && !device_has(aspect::accelerator)) { - // __devicelib_assert_fail isn't supported by Device-side Runtime - // Linking against 
fallback impl of __devicelib_assert_fail is - // performed by program manager class - // Fallback assert isn't supported for FPGA - submitAssertCapture(*this, E, /* SecondaryQueue = */ nullptr, - TlsCodeLocCapture.query()); - } - }; - - return submit_impl_and_postprocess(CGF, TlsCodeLocCapture.query(), - PostProcess, - TlsCodeLocCapture.isToplevel()); -#else - return submit_impl(CGF, TlsCodeLocCapture.query(), - TlsCodeLocCapture.isToplevel()); -#endif // __SYCL_USE_FALLBACK_ASSERT + return submit_with_event(CGF, /*SecondaryQueuePtr=*/nullptr, CodeLoc); } /// Submits a command group function object to the queue, in order to be @@ -379,30 +383,7 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { std::enable_if_t, event> submit( T CGF, queue &SecondaryQueue, const detail::code_location &CodeLoc = detail::code_location::current()) { - detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc); -#if __SYCL_USE_FALLBACK_ASSERT - auto PostProcess = [this, &SecondaryQueue, &TlsCodeLocCapture]( - bool IsKernel, bool KernelUsesAssert, event &E) { - if (IsKernel && !device_has(aspect::ext_oneapi_native_assert) && - KernelUsesAssert && !device_has(aspect::accelerator)) { - // Only secondary queues on devices need to be added to the assert - // capture. 
- // __devicelib_assert_fail isn't supported by Device-side Runtime - // Linking against fallback impl of __devicelib_assert_fail is - // performed by program manager class - // Fallback assert isn't supported for FPGA - submitAssertCapture(*this, E, &SecondaryQueue, - TlsCodeLocCapture.query()); - } - }; - - return submit_impl_and_postprocess(CGF, SecondaryQueue, - TlsCodeLocCapture.query(), PostProcess, - TlsCodeLocCapture.isToplevel()); -#else - return submit_impl(CGF, SecondaryQueue, TlsCodeLocCapture.query(), - TlsCodeLocCapture.isToplevel()); -#endif // __SYCL_USE_FALLBACK_ASSERT + return submit_with_event(CGF, &SecondaryQueue, CodeLoc); } /// Prevents any commands submitted afterward to this queue from executing @@ -2770,23 +2751,84 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { queue &Q, CommandGroupFunc &&CGF, const sycl::detail::code_location &CodeLoc); - /// A template-free version of submit. +#ifndef __INTEL_PREVIEW_BREAKING_CHANGES + /// TODO: Unused. Remove these when ABI-break window is open. event submit_impl(std::function CGH, const detail::code_location &CodeLoc); event submit_impl(std::function CGH, const detail::code_location &CodeLoc, bool IsTopCodeLoc); - /// A template-free version of submit. event submit_impl(std::function CGH, queue secondQueue, const detail::code_location &CodeLoc); event submit_impl(std::function CGH, queue secondQueue, const detail::code_location &CodeLoc, bool IsTopCodeLoc); - - /// A template-free version of submit_without_event. 
void submit_without_event_impl(std::function CGH, const detail::code_location &CodeLoc); void submit_without_event_impl(std::function CGH, const detail::code_location &CodeLoc, bool IsTopCodeLoc); + event + submit_impl_and_postprocess(std::function CGH, + const detail::code_location &CodeLoc, + const detail::SubmitPostProcessF &PostProcess); + event submit_impl_and_postprocess( + std::function CGH, const detail::code_location &CodeLoc, + const detail::SubmitPostProcessF &PostProcess, bool IsTopCodeLoc); + event + submit_impl_and_postprocess(std::function CGH, + queue secondQueue, + const detail::code_location &CodeLoc, + const detail::SubmitPostProcessF &PostProcess); + event submit_impl_and_postprocess( + std::function CGH, queue secondQueue, + const detail::code_location &CodeLoc, + const detail::SubmitPostProcessF &PostProcess, bool IsTopCodeLoc); +#endif // __INTEL_PREVIEW_BREAKING_CHANGES + + /// A template-free versions of submit. + event submit_with_event_impl(std::function CGH, + const detail::SubmissionInfo &SubmitInfo, + const detail::code_location &CodeLoc, + bool IsTopCodeLoc); + + /// A template-free version of submit_without_event. + void submit_without_event_impl(std::function CGH, + const detail::SubmissionInfo &SubmitInfo, + const detail::code_location &CodeLoc, + bool IsTopCodeLoc); + + /// Submits a command group function object to the queue, in order to be + /// scheduled for execution on the device. + /// + /// \param CGF is a function object containing command group. + /// \param CodeLoc is the code location of the submit call (default argument) + /// \return a SYCL event object for the submitted command group. 
+ template + std::enable_if_t, event> + submit_with_event( + T CGF, queue *SecondaryQueuePtr, + const detail::code_location &CodeLoc = detail::code_location::current()) { + detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc); + detail::SubmissionInfo SI{}; + if (SecondaryQueuePtr) + SI.SecondaryQueue() = detail::getSyclObjImpl(*SecondaryQueuePtr); +#if __SYCL_USE_FALLBACK_ASSERT + SI.PostProcessorFunc() = + [this, &SecondaryQueuePtr, + &TlsCodeLocCapture](bool IsKernel, bool KernelUsesAssert, event &E) { + if (IsKernel && !device_has(aspect::ext_oneapi_native_assert) && + KernelUsesAssert && !device_has(aspect::accelerator)) { + // __devicelib_assert_fail isn't supported by Device-side Runtime + // Linking against fallback impl of __devicelib_assert_fail is + // performed by program manager class + // Fallback assert isn't supported for FPGA + submitAssertCapture(*this, E, SecondaryQueuePtr, + TlsCodeLocCapture.query()); + } + }; +#endif // __SYCL_USE_FALLBACK_ASSERT + return submit_with_event_impl(CGF, SI, TlsCodeLocCapture.query(), + TlsCodeLocCapture.isToplevel()); + } /// Submits a command group function object to the queue, in order to be /// scheduled for execution on the device. @@ -2796,53 +2838,18 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { template std::enable_if_t, void> submit_without_event(T CGF, const detail::code_location &CodeLoc) { - detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc); #if __SYCL_USE_FALLBACK_ASSERT // If post-processing is needed, fall back to the regular submit. // TODO: Revisit whether we can avoid this. 
- submit(CGF, TlsCodeLocCapture.query()); + submit_with_event(CGF, nullptr, CodeLoc); #else - submit_without_event_impl(CGF, TlsCodeLocCapture.query(), + detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc); + detail::SubmissionInfo SI{}; + submit_without_event_impl(CGF, SI, TlsCodeLocCapture.query(), TlsCodeLocCapture.isToplevel()); #endif // __SYCL_USE_FALLBACK_ASSERT } - // Function to postprocess submitted command - // Arguments: - // bool IsKernel - true if the submitted command was kernel, false otherwise - // bool KernelUsesAssert - true if submitted kernel uses assert, only - // meaningful when IsKernel is true - // event &Event - event after which post processing should be executed - using SubmitPostProcessF = std::function; - - /// A template-free version of submit. - /// \param CGH command group function/handler - /// \param CodeLoc code location - /// - /// This method stores additional information within event_impl class instance - event submit_impl_and_postprocess(std::function CGH, - const detail::code_location &CodeLoc, - const SubmitPostProcessF &PostProcess); - event submit_impl_and_postprocess(std::function CGH, - const detail::code_location &CodeLoc, - const SubmitPostProcessF &PostProcess, - bool IsTopCodeLoc); - /// A template-free version of submit. - /// \param CGH command group function/handler - /// \param secondQueue fallback queue - /// \param CodeLoc code location - /// - /// This method stores additional information within event_impl class instance - event submit_impl_and_postprocess(std::function CGH, - queue secondQueue, - const detail::code_location &CodeLoc, - const SubmitPostProcessF &PostProcess); - event submit_impl_and_postprocess(std::function CGH, - queue secondQueue, - const detail::code_location &CodeLoc, - const SubmitPostProcessF &PostProcess, - bool IsTopCodeLoc); - /// parallel_for_impl with a kernel represented as a lambda + range that /// specifies global size only. 
/// @@ -3064,13 +3071,8 @@ event submitAssertCapture(queue &Self, event &Event, queue *SecondaryQueue, }); }; - if (SecondaryQueue) { - CopierEv = Self.submit_impl(CopierCGF, *SecondaryQueue, CodeLoc); - CheckerEv = Self.submit_impl(CheckerCGF, *SecondaryQueue, CodeLoc); - } else { - CopierEv = Self.submit_impl(CopierCGF, CodeLoc); - CheckerEv = Self.submit_impl(CheckerCGF, CodeLoc); - } + CopierEv = Self.submit_with_event(CopierCGF, SecondaryQueue, CodeLoc); + CheckerEv = Self.submit_with_event(CheckerCGF, SecondaryQueue, CodeLoc); return CheckerEv; } diff --git a/sycl/source/detail/queue_impl.cpp b/sycl/source/detail/queue_impl.cpp index 428f06ea0aaa4..ab8348d3aacac 100644 --- a/sycl/source/detail/queue_impl.cpp +++ b/sycl/source/detail/queue_impl.cpp @@ -355,7 +355,7 @@ event queue_impl::submit_impl(const std::function &CGF, bool CallerNeedsEvent, const detail::code_location &Loc, bool IsTopCodeLoc, - const SubmitPostProcessF *PostProcess) { + const SubmissionInfo &SubmitInfo) { handler Handler(Self, PrimaryQueue, SecondaryQueue, CallerNeedsEvent); Handler.saveCodeLoc(Loc, IsTopCodeLoc); @@ -374,7 +374,9 @@ event queue_impl::submit_impl(const std::function &CGF, if (Type == CGType::Kernel) Streams = std::move(Handler.MStreamStorage); - if (PostProcess) { + if (SubmitInfo.PostProcessorFunc()) { + auto &PostProcess = *SubmitInfo.PostProcessorFunc(); + bool IsKernel = Type == CGType::Kernel; bool KernelUsesAssert = false; @@ -385,7 +387,7 @@ event queue_impl::submit_impl(const std::function &CGF, Handler.MKernelName.c_str()); finalizeHandler(Handler, Event); - (*PostProcess)(IsKernel, KernelUsesAssert, Event); + PostProcess(IsKernel, KernelUsesAssert, Event); } else finalizeHandler(Handler, Event); @@ -416,7 +418,7 @@ event queue_impl::submitWithHandler(const std::shared_ptr &Self, CGH.depends_on(DepEvents); HandlerFunc(CGH); }, - Self, /*CodeLoc*/ {}, /*IsTopCodeLoc*/ true); + Self, /*CodeLoc*/ {}, /*SubmissionInfo*/ {}, /*IsTopCodeLoc*/ true); } template diff 
--git a/sycl/source/detail/queue_impl.hpp b/sycl/source/detail/queue_impl.hpp index 42e769bbe2025..2daef04280c05 100644 --- a/sycl/source/detail/queue_impl.hpp +++ b/sycl/source/detail/queue_impl.hpp @@ -67,6 +67,12 @@ constexpr CUDAContextT DefaultContextType = CUDAContextT::custom; enum QueueOrder { Ordered, OOO }; +// Implementation of the submission information storage. +struct SubmissionInfoImpl { + optional MPostProcessorFunc = std::nullopt; + std::shared_ptr MSecondaryQueue = nullptr; +}; + class queue_impl { public: // \return a default context for the platform if it includes the device @@ -319,8 +325,6 @@ class queue_impl { } } - using SubmitPostProcessF = std::function; - /// Submits a command group function object to the queue, in order to be /// scheduled for execution on the device. /// @@ -340,16 +344,11 @@ class queue_impl { const detail::code_location &Loc, bool IsTopCodeLoc, const SubmitPostProcessF *PostProcess = nullptr) { event ResEvent; - try { - ResEvent = submit_impl(CGF, Self, Self, SecondQueue, - /*CallerNeedsEvent=*/true, Loc, IsTopCodeLoc, - PostProcess); - } catch (...) { - ResEvent = SecondQueue->submit_impl(CGF, SecondQueue, Self, SecondQueue, - /*CallerNeedsEvent=*/true, Loc, - IsTopCodeLoc, PostProcess); - } - return discard_or_return(ResEvent); + SubmissionInfo SI{}; + SI.SecondaryQueue() = SecondQueue; + if (PostProcess) + SI.PostProcessorFunc() = *PostProcess; + return submit_with_event(CGF, Self, SI, Loc, IsTopCodeLoc); } /// Submits a command group function object to the queue, in order to be @@ -357,25 +356,55 @@ class queue_impl { /// /// \param CGF is a function object containing command group. /// \param Self is a shared_ptr to this queue. + /// \param SubmitInfo is additional optional information for the submission. 
/// \param Loc is the code location of the submit call (default argument) /// \param StoreAdditionalInfo makes additional info be stored in event_impl /// \return a SYCL event object for the submitted command group. - event submit(const std::function &CGF, - const std::shared_ptr &Self, - const detail::code_location &Loc, bool IsTopCodeLoc, - const SubmitPostProcessF *PostProcess = nullptr) { - auto ResEvent = + event submit_with_event(const std::function &CGF, + const std::shared_ptr &Self, + const SubmissionInfo &SubmitInfo, + const detail::code_location &Loc, bool IsTopCodeLoc) { + if (SubmitInfo.SecondaryQueue()) { + event ResEvent; + const std::shared_ptr SecondQueue = + SubmitInfo.SecondaryQueue(); + try { + ResEvent = submit_impl(CGF, Self, Self, SecondQueue, + /*CallerNeedsEvent=*/true, Loc, IsTopCodeLoc, + SubmitInfo); + } catch (...) { + ResEvent = SecondQueue->submit_impl(CGF, SecondQueue, Self, SecondQueue, + /*CallerNeedsEvent=*/true, Loc, + IsTopCodeLoc, SubmitInfo); + } + return ResEvent; + } + event ResEvent = submit_impl(CGF, Self, Self, nullptr, - /*CallerNeedsEvent=*/true, Loc, IsTopCodeLoc, PostProcess); + /*CallerNeedsEvent=*/true, Loc, IsTopCodeLoc, SubmitInfo); return discard_or_return(ResEvent); } void submit_without_event(const std::function &CGF, const std::shared_ptr &Self, - const detail::code_location &Loc, bool IsTopCodeLoc, - const SubmitPostProcessF *PostProcess = nullptr) { - submit_impl(CGF, Self, Self, nullptr, /*CallerNeedsEvent=*/false, Loc, - IsTopCodeLoc, PostProcess); + const SubmissionInfo &SubmitInfo, + const detail::code_location &Loc, + bool IsTopCodeLoc) { + if (SubmitInfo.SecondaryQueue()) { + const std::shared_ptr SecondQueue = + SubmitInfo.SecondaryQueue(); + try { + submit_impl(CGF, Self, Self, SecondQueue, + /*CallerNeedsEvent=*/false, Loc, IsTopCodeLoc, SubmitInfo); + } catch (...) 
{ + SecondQueue->submit_impl(CGF, SecondQueue, Self, SecondQueue, + /*CallerNeedsEvent=*/false, Loc, IsTopCodeLoc, + SubmitInfo); + } + } else { + submit_impl(CGF, Self, Self, nullptr, /*CallerNeedsEvent=*/false, Loc, + IsTopCodeLoc, SubmitInfo); + } } /// Performs a blocking wait for the completion of all enqueued tasks in the @@ -822,13 +851,14 @@ class queue_impl { /// \param CallerNeedsEvent is a boolean indicating whether the event is /// required by the user after the call. /// \param Loc is the code location of the submit call (default argument) + /// \param SubmitInfo is additional optional information for the submission. /// \return a SYCL event representing submitted command group. event submit_impl(const std::function &CGF, const std::shared_ptr &Self, const std::shared_ptr &PrimaryQueue, const std::shared_ptr &SecondaryQueue, bool CallerNeedsEvent, const detail::code_location &Loc, - bool IsTopCodeLoc, const SubmitPostProcessF *PostProcess); + bool IsTopCodeLoc, const SubmissionInfo &SubmitInfo); /// Helper function for submitting a memory operation with a handler. /// \param Self is a shared_ptr to this queue. 
diff --git a/sycl/source/queue.cpp b/sycl/source/queue.cpp index 43abe91b20014..ac7273081410a 100644 --- a/sycl/source/queue.cpp +++ b/sycl/source/queue.cpp @@ -20,6 +20,28 @@ namespace sycl { inline namespace _V1 { +namespace detail { +SubmissionInfo::SubmissionInfo() + : impl{std::make_shared()} {} + +optional &SubmissionInfo::PostProcessorFunc() { + return impl->MPostProcessorFunc; +} + +const optional &SubmissionInfo::PostProcessorFunc() const { + return impl->MPostProcessorFunc; +} + +std::shared_ptr &SubmissionInfo::SecondaryQueue() { + return impl->MSecondaryQueue; +} + +const std::shared_ptr & +SubmissionInfo::SecondaryQueue() const { + return impl->MSecondaryQueue; +} +} // namespace detail + queue::queue(const context &SyclContext, const device_selector &DeviceSelector, const async_handler &AsyncHandler, const property_list &PropList) { const std::vector Devs = SyclContext.get_devices(); @@ -164,14 +186,16 @@ event queue::mem_advise(const void *Ptr, size_t Length, int Advice, /*CallerNeedsEvent=*/true); } +#ifndef __INTEL_PREVIEW_BREAKING_CHANGES +/// TODO: Unused. Remove these when ABI-break window is open. 
event queue::submit_impl(std::function CGH, const detail::code_location &CodeLoc) { - return impl->submit(CGH, impl, CodeLoc, true); + return submit_with_event_impl(CGH, {}, CodeLoc, true); } event queue::submit_impl(std::function CGH, const detail::code_location &CodeLoc, bool IsTopCodeLoc) { - return impl->submit(CGH, impl, CodeLoc, IsTopCodeLoc); + return submit_with_event_impl(CGH, {}, CodeLoc, IsTopCodeLoc); } event queue::submit_impl(std::function CGH, queue SecondQueue, @@ -186,40 +210,57 @@ event queue::submit_impl(std::function CGH, queue SecondQueue, void queue::submit_without_event_impl(std::function CGH, const detail::code_location &CodeLoc) { - return impl->submit_without_event(CGH, impl, CodeLoc, true); + submit_without_event_impl(CGH, {}, CodeLoc, true); } void queue::submit_without_event_impl(std::function CGH, const detail::code_location &CodeLoc, bool IsTopCodeLoc) { - return impl->submit_without_event(CGH, impl, CodeLoc, IsTopCodeLoc); + submit_without_event_impl(CGH, {}, CodeLoc, IsTopCodeLoc); } event queue::submit_impl_and_postprocess( std::function CGH, const detail::code_location &CodeLoc, - const SubmitPostProcessF &PostProcess) { - return impl->submit(CGH, impl, CodeLoc, true, &PostProcess); + const detail::SubmitPostProcessF &PostProcess) { + detail::SubmissionInfo SI{}; + SI.PostProcessorFunc() = std::move(PostProcess); + return submit_with_event_impl(CGH, SI, CodeLoc, true); } -event queue::submit_impl_and_postprocess(std::function CGH, - const detail::code_location &CodeLoc, - const SubmitPostProcessF &PostProcess, - bool IsTopCodeLoc) { - return impl->submit(CGH, impl, CodeLoc, IsTopCodeLoc, &PostProcess); +event queue::submit_impl_and_postprocess( + std::function CGH, const detail::code_location &CodeLoc, + const detail::SubmitPostProcessF &PostProcess, bool IsTopCodeLoc) { + detail::SubmissionInfo SI{}; + SI.PostProcessorFunc() = std::move(PostProcess); + return submit_with_event_impl(CGH, SI, CodeLoc, IsTopCodeLoc); } event 
queue::submit_impl_and_postprocess( std::function CGH, queue SecondQueue, const detail::code_location &CodeLoc, - const SubmitPostProcessF &PostProcess) { + const detail::SubmitPostProcessF &PostProcess) { return impl->submit(CGH, impl, SecondQueue.impl, CodeLoc, true, &PostProcess); } -event queue::submit_impl_and_postprocess(std::function CGH, - queue SecondQueue, - const detail::code_location &CodeLoc, - const SubmitPostProcessF &PostProcess, - bool IsTopCodeLoc) { +event queue::submit_impl_and_postprocess( + std::function CGH, queue SecondQueue, + const detail::code_location &CodeLoc, + const detail::SubmitPostProcessF &PostProcess, bool IsTopCodeLoc) { return impl->submit(CGH, impl, SecondQueue.impl, CodeLoc, IsTopCodeLoc, &PostProcess); } +#endif // __INTEL_PREVIEW_BREAKING_CHANGES + +event queue::submit_with_event_impl(std::function CGH, + const detail::SubmissionInfo &SubmitInfo, + const detail::code_location &CodeLoc, + bool IsTopCodeLoc) { + return impl->submit_with_event(CGH, impl, SubmitInfo, CodeLoc, IsTopCodeLoc); +} + +void queue::submit_without_event_impl(std::function CGH, + const detail::SubmissionInfo &SubmitInfo, + const detail::code_location &CodeLoc, + bool IsTopCodeLoc) { + impl->submit_without_event(CGH, impl, SubmitInfo, CodeLoc, IsTopCodeLoc); +} void queue::wait_proxy(const detail::code_location &CodeLoc) { impl->wait(CodeLoc); diff --git a/sycl/test/abi/sycl_symbols_linux.dump b/sycl/test/abi/sycl_symbols_linux.dump index 621765568d50c..a5134a7a524ca 100644 --- a/sycl/test/abi/sycl_symbols_linux.dump +++ b/sycl/test/abi/sycl_symbols_linux.dump @@ -3133,10 +3133,12 @@ _ZN4sycl3_V15queue18throw_asynchronousEv _ZN4sycl3_V15queue20memcpyToDeviceGlobalEPvPKvbmmRKSt6vectorINS0_5eventESaIS6_EE _ZN4sycl3_V15queue20wait_and_throw_proxyERKNS0_6detail13code_locationE _ZN4sycl3_V15queue22memcpyFromDeviceGlobalEPvPKvbmmRKSt6vectorINS0_5eventESaIS6_EE 
+_ZN4sycl3_V15queue22submit_with_event_implESt8functionIFvRNS0_7handlerEEERKNS0_6detail14SubmissionInfoERKNS7_13code_locationEb _ZN4sycl3_V15queue25ext_oneapi_submit_barrierERKNS0_6detail13code_locationE _ZN4sycl3_V15queue25ext_oneapi_submit_barrierERKSt6vectorINS0_5eventESaIS3_EERKNS0_6detail13code_locationE _ZN4sycl3_V15queue25submit_without_event_implESt8functionIFvRNS0_7handlerEEERKNS0_6detail13code_locationE _ZN4sycl3_V15queue25submit_without_event_implESt8functionIFvRNS0_7handlerEEERKNS0_6detail13code_locationEb +_ZN4sycl3_V15queue25submit_without_event_implESt8functionIFvRNS0_7handlerEEERKNS0_6detail14SubmissionInfoERKNS7_13code_locationEb _ZN4sycl3_V15queue27submit_impl_and_postprocessESt8functionIFvRNS0_7handlerEEERKNS0_6detail13code_locationERKS2_IFvbbRNS0_5eventEEE _ZN4sycl3_V15queue27submit_impl_and_postprocessESt8functionIFvRNS0_7handlerEEERKNS0_6detail13code_locationERKS2_IFvbbRNS0_5eventEEEb _ZN4sycl3_V15queue27submit_impl_and_postprocessESt8functionIFvRNS0_7handlerEEES1_RKNS0_6detail13code_locationERKS2_IFvbbRNS0_5eventEEE @@ -3249,6 +3251,10 @@ _ZN4sycl3_V16detail14tls_code_loc_tC2ERKNS1_13code_locationE _ZN4sycl3_V16detail14tls_code_loc_tC2Ev _ZN4sycl3_V16detail14tls_code_loc_tD1Ev _ZN4sycl3_V16detail14tls_code_loc_tD2Ev +_ZN4sycl3_V16detail14SubmissionInfo14SecondaryQueueEv +_ZN4sycl3_V16detail14SubmissionInfo17PostProcessorFuncEv +_ZN4sycl3_V16detail14SubmissionInfoC1Ev +_ZN4sycl3_V16detail14SubmissionInfoC2Ev _ZN4sycl3_V16detail16AccessorBaseHost10getAccDataEv _ZN4sycl3_V16detail16AccessorBaseHost14getAccessRangeEv _ZN4sycl3_V16detail16AccessorBaseHost14getMemoryRangeEv @@ -3707,6 +3713,8 @@ _ZNK4sycl3_V16detail12buffer_plain13handleReleaseEv _ZNK4sycl3_V16detail12buffer_plain15getNativeVectorENS0_7backendE _ZNK4sycl3_V16detail12buffer_plain22get_allocator_internalEv _ZNK4sycl3_V16detail12buffer_plain7getSizeEv +_ZNK4sycl3_V16detail14SubmissionInfo14SecondaryQueueEv +_ZNK4sycl3_V16detail14SubmissionInfo17PostProcessorFuncEv 
_ZNK4sycl3_V16detail16AccessorBaseHost11getElemSizeEv _ZNK4sycl3_V16detail16AccessorBaseHost11getPropListEv _ZNK4sycl3_V16detail16AccessorBaseHost13isPlaceholderEv diff --git a/sycl/test/abi/sycl_symbols_windows.dump b/sycl/test/abi/sycl_symbols_windows.dump index b0b7fc3f0112d..a6e6a5e47c137 100644 --- a/sycl/test/abi/sycl_symbols_windows.dump +++ b/sycl/test/abi/sycl_symbols_windows.dump @@ -278,6 +278,9 @@ ??0SampledImageAccessorBaseHost@detail@_V1@sycl@@QEAA@$$QEAV0123@@Z ??0SampledImageAccessorBaseHost@detail@_V1@sycl@@QEAA@AEBV0123@@Z ??0SampledImageAccessorBaseHost@detail@_V1@sycl@@QEAA@V?$range@$02@23@PEAXHHV?$id@$02@23@W4image_channel_type@23@W4image_channel_order@23@Uimage_sampler@23@AEBVproperty_list@23@@Z +??0SubmissionInfo@detail@_V1@sycl@@QEAA@XZ +??0SubmissionInfo@detail@_V1@sycl@@QEAA@AEBV0123@@Z +??0SubmissionInfo@detail@_V1@sycl@@QEAA@$$QEAV0123@@Z ??0UnsampledImageAccessorBaseHost@detail@_V1@sycl@@IEAA@AEBV?$shared_ptr@VUnsampledImageAccessorImplHost@detail@_V1@sycl@@@std@@@Z ??0UnsampledImageAccessorBaseHost@detail@_V1@sycl@@QEAA@$$QEAV0123@@Z ??0UnsampledImageAccessorBaseHost@detail@_V1@sycl@@QEAA@AEBV0123@@Z @@ -462,6 +465,7 @@ ??1LocalAccessorBaseHost@detail@_V1@sycl@@QEAA@XZ ??1SYCLCategory@detail@_V1@sycl@@UEAA@XZ ??1SampledImageAccessorBaseHost@detail@_V1@sycl@@QEAA@XZ +??1SubmissionInfo@detail@_V1@sycl@@QEAA@XZ ??1UnsampledImageAccessorBaseHost@detail@_V1@sycl@@QEAA@XZ ??1accelerator_selector@_V1@sycl@@UEAA@XZ ??1buffer_plain@detail@_V1@sycl@@QEAA@XZ @@ -525,6 +529,8 @@ ??4OSUtil@detail@_V1@sycl@@QEAAAEAV0123@AEBV0123@@Z ??4SampledImageAccessorBaseHost@detail@_V1@sycl@@QEAAAEAV0123@$$QEAV0123@@Z ??4SampledImageAccessorBaseHost@detail@_V1@sycl@@QEAAAEAV0123@AEBV0123@@Z +??4SubmissionInfo@detail@_V1@sycl@@QEAAAEAV0123@$$QEAV0123@@Z +??4SubmissionInfo@detail@_V1@sycl@@QEAAAEAV0123@AEBV0123@@Z ??4UnsampledImageAccessorBaseHost@detail@_V1@sycl@@QEAAAEAV0123@$$QEAV0123@@Z 
??4UnsampledImageAccessorBaseHost@detail@_V1@sycl@@QEAAAEAV0123@AEBV0123@@Z ??4accelerator_selector@_V1@sycl@@QEAAAEAV012@$$QEAV012@@Z @@ -638,9 +644,13 @@ ?GDBMethodsAnchor@UnsampledImageAccessorBaseHost@detail@_V1@sycl@@IEAAXXZ ?GetRangeRoundingSettings@handler@_V1@sycl@@AEAAXAEA_K00@Z ?HasAssociatedAccessor@handler@_V1@sycl@@AEBA_NPEAVAccessorImplHost@detail@23@W4target@access@23@@Z +?PostProcessorFunc@SubmissionInfo@detail@_V1@sycl@@QEBAAEBV?$optional@V?$function@$$A6AX_N0AEAVevent@_V1@sycl@@@Z@std@@@234@XZ +?PostProcessorFunc@SubmissionInfo@detail@_V1@sycl@@QEAAAEAV?$optional@V?$function@$$A6AX_N0AEAVevent@_V1@sycl@@@Z@std@@@234@XZ ?PushBack@exception_list@_V1@sycl@@AEAAX$$QEAVexception_ptr@std@@@Z ?PushBack@exception_list@_V1@sycl@@AEAAXAEBVexception_ptr@std@@@Z ?RangeRoundingTrace@handler@_V1@sycl@@AEAA_NXZ +?SecondaryQueue@SubmissionInfo@detail@_V1@sycl@@QEAAAEAV?$shared_ptr@Vqueue_impl@detail@_V1@sycl@@@std@@XZ +?SecondaryQueue@SubmissionInfo@detail@_V1@sycl@@QEBAAEBV?$shared_ptr@Vqueue_impl@detail@_V1@sycl@@@std@@XZ ?SetHostTask@handler@_V1@sycl@@AEAAX$$QEAV?$function@$$A6AXVinterop_handle@_V1@sycl@@@Z@std@@@Z ?SetHostTask@handler@_V1@sycl@@AEAAX$$QEAV?$function@$$A6AXXZ@std@@@Z ?__abs_diff_impl@_V1@sycl@@YA?AV?$vec@C$00@12@V312@0@Z @@ -4272,8 +4282,10 @@ ?submit_impl_and_postprocess@queue@_V1@sycl@@AEAA?AVevent@23@V?$function@$$A6AXAEAVhandler@_V1@sycl@@@Z@std@@AEBUcode_location@detail@23@AEBV?$function@$$A6AX_N0AEAVevent@_V1@sycl@@@Z@6@_N@Z ?submit_impl_and_postprocess@queue@_V1@sycl@@AEAA?AVevent@23@V?$function@$$A6AXAEAVhandler@_V1@sycl@@@Z@std@@V123@AEBUcode_location@detail@23@AEBV?$function@$$A6AX_N0AEAVevent@_V1@sycl@@@Z@6@@Z ?submit_impl_and_postprocess@queue@_V1@sycl@@AEAA?AVevent@23@V?$function@$$A6AXAEAVhandler@_V1@sycl@@@Z@std@@V123@AEBUcode_location@detail@23@AEBV?$function@$$A6AX_N0AEAVevent@_V1@sycl@@@Z@6@_N@Z 
+?submit_with_event_impl@queue@_V1@sycl@@AEAA?AVevent@23@V?$function@$$A6AXAEAVhandler@_V1@sycl@@@Z@std@@AEBVSubmissionInfo@detail@23@AEBUcode_location@823@_N@Z ?submit_without_event_impl@queue@_V1@sycl@@AEAAXV?$function@$$A6AXAEAVhandler@_V1@sycl@@@Z@std@@AEBUcode_location@detail@23@@Z ?submit_without_event_impl@queue@_V1@sycl@@AEAAXV?$function@$$A6AXAEAVhandler@_V1@sycl@@@Z@std@@AEBUcode_location@detail@23@_N@Z +?submit_without_event_impl@queue@_V1@sycl@@AEAAXV?$function@$$A6AXAEAVhandler@_V1@sycl@@@Z@std@@AEBVSubmissionInfo@detail@23@AEBUcode_location@723@_N@Z ?supportsUSMFill2D@handler@_V1@sycl@@AEAA_NXZ ?supportsUSMMemcpy2D@handler@_V1@sycl@@AEAA_NXZ ?supportsUSMMemset2D@handler@_V1@sycl@@AEAA_NXZ diff --git a/sycl/test/include_deps/sycl_detail_core.hpp.cpp b/sycl/test/include_deps/sycl_detail_core.hpp.cpp index dc959046444af..8b0144fdbf44f 100644 --- a/sycl/test/include_deps/sycl_detail_core.hpp.cpp +++ b/sycl/test/include_deps/sycl_detail_core.hpp.cpp @@ -100,6 +100,7 @@ // CHECK-NEXT: kernel_handler.hpp // CHECK-NEXT: nd_item.hpp // CHECK-NEXT: nd_range.hpp +// CHECK-NEXT: detail/optional.hpp // CHECK-NEXT: device.hpp // CHECK-NEXT: kernel_bundle_enums.hpp // CHECK-NEXT: event.hpp From 1581225ca3eee9729e6af81022a36d50511226f7 Mon Sep 17 00:00:00 2001 From: Alexey Sachkov Date: Fri, 15 Nov 2024 14:10:45 +0100 Subject: [PATCH 02/36] [SYCL][NFC] Refactor `#include`s (#16030) This patch is a collection of various cleanups made in public headers: - Cleaned up many unnecessary includes. 
It doesn't change total amount of header files we use in total by `sycl.hpp`, but makes our code cleaner - Made it so there are no headers (except for `backend/%backend_name%.hpp`) depend on `backend.hpp` and it is (almost) only included by `sycl.hpp`, so that we can make it an opt-in header - Removed `types.hpp` in favor of direct use of `vector.hpp` - Added missing includes and forward-declarations to places where we relied on implicit includes - Moved certain helper function declarations/definitions to better places (common utils to utils headers, library-only declarations to library headers, etc.) --- libdevice/nativecpu_utils.cpp | 2 +- sycl/include/sycl/aliases.hpp | 1 - sycl/include/sycl/backend.hpp | 17 +-------- sycl/include/sycl/builtins_esimd.hpp | 2 +- sycl/include/sycl/builtins_utils_vec.hpp | 3 +- sycl/include/sycl/detail/backend_traits.hpp | 3 +- sycl/include/sycl/detail/device_filter.hpp | 2 +- .../sycl/detail/image_accessor_util.hpp | 2 +- sycl/include/sycl/detail/ur.hpp | 2 +- sycl/include/sycl/detail/util.hpp | 37 +++++++++++++++++++ sycl/include/sycl/device.hpp | 8 ++-- .../ext/intel/esimd/detail/memory_intrin.hpp | 2 +- .../sycl/ext/intel/fpga_device_selector.hpp | 1 + sycl/include/sycl/ext/oneapi/backend/hip.hpp | 2 +- .../sycl/ext/oneapi/backend/level_zero.hpp | 3 +- sycl/include/sycl/ext/oneapi/dot_product.hpp | 2 +- .../ext/oneapi/experimental/bfloat16_math.hpp | 7 ++-- .../sycl/ext/oneapi/experimental/builtins.hpp | 2 +- .../ext/oneapi/experimental/cuda/builtins.hpp | 2 +- .../experimental/non_uniform_groups.hpp | 2 +- .../sycl/ext/oneapi/experimental/prefetch.hpp | 2 +- .../sycl/ext/oneapi/sub_group_mask.hpp | 2 +- sycl/include/sycl/group_algorithm.hpp | 2 +- sycl/include/sycl/image.hpp | 2 +- sycl/include/sycl/interop_handle.hpp | 6 +-- sycl/include/sycl/kernel.hpp | 14 +++---- sycl/include/sycl/known_identity.hpp | 2 +- sycl/include/sycl/platform.hpp | 34 ----------------- sycl/include/sycl/stream.hpp | 11 +++--- 
sycl/include/sycl/sycl.hpp | 13 ++++--- sycl/include/sycl/types.hpp | 27 -------------- sycl/include/sycl/vector.hpp | 3 +- sycl/source/backend/level_zero.cpp | 2 +- sycl/source/backend/opencl.cpp | 1 - sycl/source/detail/adapter.hpp | 2 +- sycl/source/detail/allowlist.cpp | 2 +- sycl/source/detail/buffer_impl.hpp | 1 - .../composite_device/composite_device.cpp | 2 + sycl/source/detail/config.cpp | 1 + sycl/source/detail/config.hpp | 2 +- sycl/source/detail/image_accessor_util.cpp | 1 + sycl/source/detail/kernel_impl.hpp | 1 + sycl/source/detail/platform_impl.cpp | 2 +- sycl/source/detail/platform_impl.hpp | 2 +- sycl/source/detail/sampler_impl.cpp | 1 + sycl/source/detail/ur.hpp | 6 ++- sycl/source/detail/windows_ur.cpp | 1 - sycl/test-e2e/BFloat16/bfloat16_vec.cpp | 1 + sycl/test-e2e/Basic/buffer/buffer.cpp | 2 +- sycl/test-e2e/Basic/group_async_copy.cpp | 2 +- .../Basic/group_async_copy_legacy.cpp | 2 +- sycl/test-e2e/Basic/half_type.cpp | 2 +- sycl/test-e2e/Basic/parallel_for_indexers.cpp | 2 +- .../Basic/parallel_for_range_roundup.cpp | 2 +- sycl/test-e2e/Basic/swizzle_op.cpp | 2 +- .../Basic/sycl_2020_images/common.hpp | 1 + sycl/test-e2e/Basic/vector/bool.cpp | 3 +- sycl/test-e2e/Basic/vector/byte.cpp | 3 +- sycl/test-e2e/Basic/vector/int-convert.cpp | 3 +- sycl/test-e2e/Basic/vector/load_store.cpp | 2 +- sycl/test-e2e/Basic/vector/operators.cpp | 2 +- sycl/test-e2e/Basic/vector/scalar_access.cpp | 2 +- .../Basic/vector/vec_binary_scalar_order.hpp | 2 +- .../Regression/get_spec_const_vec16.cpp | 2 +- sycl/test-e2e/Regression/group.cpp | 2 +- sycl/test-e2e/Regression/local-arg-align.cpp | 2 +- sycl/test-e2e/Regression/swizzle_opassign.cpp | 2 +- .../Regression/vec_rel_swizzle_ops.cpp | 2 +- sycl/test/CMakeLists.txt | 1 - sycl/test/abi/layout_vec.cpp | 2 +- sycl/test/abi/symbol_size_alignment.cpp | 2 +- .../basic_tests/vectors/size_one_checks.cpp | 2 +- .../vector/vector_bf16_builtins.cpp | 1 + .../vector/vector_convert_bfloat.cpp | 3 +- 
sycl/test/include_deps/sycl_accessor.hpp.cpp | 4 +- .../include_deps/sycl_detail_core.hpp.cpp | 4 +- 76 files changed, 142 insertions(+), 167 deletions(-) delete mode 100644 sycl/include/sycl/types.hpp diff --git a/libdevice/nativecpu_utils.cpp b/libdevice/nativecpu_utils.cpp index 65e6c4c1fd0d1..c3e8bb61657a7 100644 --- a/libdevice/nativecpu_utils.cpp +++ b/libdevice/nativecpu_utils.cpp @@ -17,7 +17,7 @@ #include "device.h" #include #include -#include +#include // including state definition from Native CPU UR adapter #include "nativecpu_state.hpp" diff --git a/sycl/include/sycl/aliases.hpp b/sycl/include/sycl/aliases.hpp index 1640c11db6078..fc359b7659802 100644 --- a/sycl/include/sycl/aliases.hpp +++ b/sycl/include/sycl/aliases.hpp @@ -9,7 +9,6 @@ #pragma once #include // for __SYCL2020_DEPRECATED -#include // for half #include // for uint8_t, int16_t, int32_t diff --git a/sycl/include/sycl/backend.hpp b/sycl/include/sycl/backend.hpp index 24750732b08ff..fc477128b57b4 100644 --- a/sycl/include/sycl/backend.hpp +++ b/sycl/include/sycl/backend.hpp @@ -15,7 +15,6 @@ #include // for buffer_allocator #include // for context, get_na... #include // for InteropFeatureS... -#include // for _cl_event #include // for __SYCL_DEPRECATED #include // for __SYCL_EXPORT #include // for createSyclObjFr... @@ -23,13 +22,10 @@ #include // for event, get_native #include // for make_error_code #include // for SYCL_BACKEND_OP... -#include // for buffer #include // for image, image_al... -#include // for kernel, get_native #include // for kernel_bundle #include // for bundle_state #include // for platform, get_n... 
-#include // for property_list #include // for queue, get_native #include // for ur_native_handle_t @@ -60,13 +56,12 @@ namespace sycl { inline namespace _V1 { +class property_list; + namespace detail { // TODO each backend can have its own custom errc enumeration // but the details for this are not fully specified yet enum class backend_errc : unsigned int {}; - -// Convert from UR backend to SYCL backend enum -backend convertUrBackend(ur_platform_backend_t UrBackend); } // namespace detail template class backend_traits { @@ -78,14 +73,6 @@ template class backend_traits { using return_type = typename detail::BackendReturn::type; }; -template -using backend_input_t = - typename backend_traits::template input_type; - -template -using backend_return_t = - typename backend_traits::template return_type; - namespace detail { template struct BufferInterop { diff --git a/sycl/include/sycl/builtins_esimd.hpp b/sycl/include/sycl/builtins_esimd.hpp index fa720a405f9be..49566ce118eba 100644 --- a/sycl/include/sycl/builtins_esimd.hpp +++ b/sycl/include/sycl/builtins_esimd.hpp @@ -11,7 +11,7 @@ #include #include #include -#include +#include // TODO Decide whether to mark functions with this attribute. 
#define __NOEXC /*noexcept*/ diff --git a/sycl/include/sycl/builtins_utils_vec.hpp b/sycl/include/sycl/builtins_utils_vec.hpp index 178c696495c8e..9504b8d3aa295 100644 --- a/sycl/include/sycl/builtins_utils_vec.hpp +++ b/sycl/include/sycl/builtins_utils_vec.hpp @@ -13,8 +13,9 @@ #include #include +#include #include // for marray -#include // for vec +#include // for vec namespace sycl { inline namespace _V1 { diff --git a/sycl/include/sycl/detail/backend_traits.hpp b/sycl/include/sycl/detail/backend_traits.hpp index adbbab78642ff..87c00ce6d63d3 100644 --- a/sycl/include/sycl/detail/backend_traits.hpp +++ b/sycl/include/sycl/detail/backend_traits.hpp @@ -8,10 +8,9 @@ #pragma once -#include - namespace sycl { inline namespace _V1 { +enum class backend : char; namespace detail { template struct interop; diff --git a/sycl/include/sycl/detail/device_filter.hpp b/sycl/include/sycl/detail/device_filter.hpp index 5574bf69a3484..9ca26333ab15a 100644 --- a/sycl/include/sycl/detail/device_filter.hpp +++ b/sycl/include/sycl/detail/device_filter.hpp @@ -8,7 +8,6 @@ #pragma once -#include #include #include @@ -18,6 +17,7 @@ namespace sycl { inline namespace _V1 { +enum class backend : char; namespace detail { // --------------------------------------- diff --git a/sycl/include/sycl/detail/image_accessor_util.hpp b/sycl/include/sycl/detail/image_accessor_util.hpp index d87038a8c9ce3..9b1c519301bf6 100644 --- a/sycl/include/sycl/detail/image_accessor_util.hpp +++ b/sycl/include/sycl/detail/image_accessor_util.hpp @@ -23,7 +23,7 @@ #include // for image_channel_type #include // for range #include // for addressing_mode, coor... -#include // for vec, operator*, round... +#include // for vec, operator*, round... 
#include // for int32_t, uint16_t #include // for size_t diff --git a/sycl/include/sycl/detail/ur.hpp b/sycl/include/sycl/detail/ur.hpp index 70b6517b43748..1ed65046c0c1b 100644 --- a/sycl/include/sycl/detail/ur.hpp +++ b/sycl/include/sycl/detail/ur.hpp @@ -14,7 +14,6 @@ #pragma once -#include #include #include #include @@ -42,6 +41,7 @@ struct trace_event_data_t; namespace sycl { inline namespace _V1 { +enum class backend : char; class context; namespace detail { diff --git a/sycl/include/sycl/detail/util.hpp b/sycl/include/sycl/detail/util.hpp index d858aba279f41..56d4b12df2bc9 100644 --- a/sycl/include/sycl/detail/util.hpp +++ b/sycl/include/sycl/detail/util.hpp @@ -15,6 +15,9 @@ #include #include +#include +#include +#include #include namespace sycl { @@ -83,6 +86,40 @@ template <> struct ABINeutralT> { }; template using ABINeutralT_t = typename ABINeutralT::type; + +template auto convert_to_abi_neutral(ParamT &&Info) { + using ParamNoRef = std::remove_reference_t; + if constexpr (std::is_same_v) { + return detail::string{Info}; + } else if constexpr (std::is_same_v>) { + std::vector Res; + Res.reserve(Info.size()); + for (std::string &Str : Info) { + Res.push_back(detail::string{Str}); + } + return Res; + } else { + return std::forward(Info); + } +} + +template auto convert_from_abi_neutral(ParamT &&Info) { + using ParamNoRef = std::remove_reference_t; + if constexpr (std::is_same_v) { + return Info.c_str(); + } else if constexpr (std::is_same_v>) { + std::vector Res; + Res.reserve(Info.size()); + for (detail::string &Str : Info) { + Res.push_back(Str.c_str()); + } + return Res; + } else { + return std::forward(Info); + } +} + } // namespace detail } // namespace _V1 } // namespace sycl diff --git a/sycl/include/sycl/device.hpp b/sycl/include/sycl/device.hpp index d49a1f1d1ff9a..57b193b4987b3 100644 --- a/sycl/include/sycl/device.hpp +++ b/sycl/include/sycl/device.hpp @@ -8,7 +8,6 @@ #pragma once -#include #include #include #include @@ -21,9 +20,12 @@ #include 
#include #include -#include #include +#ifdef __SYCL_INTERNAL_API +#include +#endif + #include #include #include @@ -35,7 +37,7 @@ namespace sycl { inline namespace _V1 { // Forward declarations -class device_selector; +class platform; template auto get_native(const SyclObjectT &Obj) -> backend_return_t; diff --git a/sycl/include/sycl/ext/intel/esimd/detail/memory_intrin.hpp b/sycl/include/sycl/ext/intel/esimd/detail/memory_intrin.hpp index 89636fda85019..2240379d5eed7 100644 --- a/sycl/include/sycl/ext/intel/esimd/detail/memory_intrin.hpp +++ b/sycl/include/sycl/ext/intel/esimd/detail/memory_intrin.hpp @@ -24,7 +24,7 @@ #include #include #include -#include +#include #include diff --git a/sycl/include/sycl/ext/intel/fpga_device_selector.hpp b/sycl/include/sycl/ext/intel/fpga_device_selector.hpp index f3d6b1bb00d70..91ec593fe6f6a 100644 --- a/sycl/include/sycl/ext/intel/fpga_device_selector.hpp +++ b/sycl/include/sycl/ext/intel/fpga_device_selector.hpp @@ -10,6 +10,7 @@ #include #include +#include #include #include diff --git a/sycl/include/sycl/ext/oneapi/backend/hip.hpp b/sycl/include/sycl/ext/oneapi/backend/hip.hpp index 0f59dd2f4116a..86f22d74e78d9 100644 --- a/sycl/include/sycl/ext/oneapi/backend/hip.hpp +++ b/sycl/include/sycl/ext/oneapi/backend/hip.hpp @@ -8,7 +8,7 @@ #pragma once -#include +#include #include namespace sycl { diff --git a/sycl/include/sycl/ext/oneapi/backend/level_zero.hpp b/sycl/include/sycl/ext/oneapi/backend/level_zero.hpp index 7ff2845f6bde4..14969a309e4da 100644 --- a/sycl/include/sycl/ext/oneapi/backend/level_zero.hpp +++ b/sycl/include/sycl/ext/oneapi/backend/level_zero.hpp @@ -9,8 +9,7 @@ #pragma once #include // for async_han... -#include // for backend_i... -#include // for backend +#include // for backend #include // for buffer_al... 
#include // for buffer #include // for context diff --git a/sycl/include/sycl/ext/oneapi/dot_product.hpp b/sycl/include/sycl/ext/oneapi/dot_product.hpp index 4fda07052e25a..cec308ba26b55 100644 --- a/sycl/include/sycl/ext/oneapi/dot_product.hpp +++ b/sycl/include/sycl/ext/oneapi/dot_product.hpp @@ -11,7 +11,7 @@ #pragma once #include -#include +#include namespace sycl { inline namespace _V1 { diff --git a/sycl/include/sycl/ext/oneapi/experimental/bfloat16_math.hpp b/sycl/include/sycl/ext/oneapi/experimental/bfloat16_math.hpp index ed513ae3d2098..f82014fe3e209 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/bfloat16_math.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/bfloat16_math.hpp @@ -8,9 +8,10 @@ #pragma once -#include // for ceil, cos, exp, exp10, exp2 -#include // For simplify_if_swizzle, is_swizzle -#include // sycl::detail::memcpy +#include // for ceil, cos, exp, exp10, exp2 +#include // For simplify_if_swizzle, is_swizzle +#include // sycl::detail::memcpy +#include #include // for bfloat16, bfloat16ToBits #include // for marray diff --git a/sycl/include/sycl/ext/oneapi/experimental/builtins.hpp b/sycl/include/sycl/ext/oneapi/experimental/builtins.hpp index facc486ca2f84..d42df1fee26c8 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/builtins.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/builtins.hpp @@ -15,7 +15,7 @@ #include // for is_svgenfloath, is_sv... 
#include // detail::memcpy #include // for marray -#include // for vec +#include // for vec #include // for size_t #include // for printf diff --git a/sycl/include/sycl/ext/oneapi/experimental/cuda/builtins.hpp b/sycl/include/sycl/ext/oneapi/experimental/cuda/builtins.hpp index 067e238c2e36c..3609c282a5319 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/cuda/builtins.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/cuda/builtins.hpp @@ -10,7 +10,7 @@ #define SYCL_EXT_ONEAPI_CUDA_TEX_CACHE_READ 1 -#include +#include #if defined(_WIN32) || defined(_WIN64) #define ATTRIBUTE_EXT_VEC_TYPE(N) __declspec(ext_vector_type(N)) diff --git a/sycl/include/sycl/ext/oneapi/experimental/non_uniform_groups.hpp b/sycl/include/sycl/ext/oneapi/experimental/non_uniform_groups.hpp index af68ce0e10e0f..bbe619834dcdc 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/non_uniform_groups.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/non_uniform_groups.hpp @@ -10,7 +10,7 @@ #include // for sub_group_mask #include // for marray -#include // for vec +#include // for vec #include // for size_t #include // for uint32_t diff --git a/sycl/include/sycl/ext/oneapi/experimental/prefetch.hpp b/sycl/include/sycl/ext/oneapi/experimental/prefetch.hpp index 441e32a085990..29c25d6a0860b 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/prefetch.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/prefetch.hpp @@ -10,7 +10,7 @@ #include #include -#include +#include namespace sycl { inline namespace _V1 { diff --git a/sycl/include/sycl/ext/oneapi/sub_group_mask.hpp b/sycl/include/sycl/ext/oneapi/sub_group_mask.hpp index 7a3bef52110db..a99e1f9970165 100644 --- a/sycl/include/sycl/ext/oneapi/sub_group_mask.hpp +++ b/sycl/include/sycl/ext/oneapi/sub_group_mask.hpp @@ -14,7 +14,7 @@ #include // for SYCL_EXT_ONEAPI_SUB_GROUP_MASK #include // for id #include // for marray -#include // for vec +#include // for vec #include // for assert #include // for CHAR_BIT diff --git 
a/sycl/include/sycl/group_algorithm.hpp b/sycl/include/sycl/group_algorithm.hpp index e3192a481bdb1..9547039d45b69 100644 --- a/sycl/include/sycl/group_algorithm.hpp +++ b/sycl/include/sycl/group_algorithm.hpp @@ -20,7 +20,7 @@ #include // for known_identity_v #include // for nd_item #include // for range -#include // for vec +#include // for vec #ifdef __SYCL_DEVICE_ONLY__ #include diff --git a/sycl/include/sycl/image.hpp b/sycl/include/sycl/image.hpp index 45cfb884dae3f..4fcd2f5322990 100644 --- a/sycl/include/sycl/image.hpp +++ b/sycl/include/sycl/image.hpp @@ -28,7 +28,7 @@ #include // for property_list #include // for range, rangeTo... #include // for image_sampler -#include // for vec +#include // for vec #include // for ur_native_hand... #include // for size_t, nullptr_t diff --git a/sycl/include/sycl/interop_handle.hpp b/sycl/include/sycl/interop_handle.hpp index 81e4a9d559caa..2e7408cf5c0f9 100644 --- a/sycl/include/sycl/interop_handle.hpp +++ b/sycl/include/sycl/interop_handle.hpp @@ -11,16 +11,12 @@ #include // for target, mode, place... #include // for AccessorBaseHost #include // for backend, backend_re... -#include // for context +#include // for buffer #include // for __SYCL_EXPORT -#include // for context_impl #include // for getSyclObjImpl -#include // for device, device_impl #include -#include // for queue_impl #include // for accessor_property_list #include // for image -#include // for buffer #include // for ur_mem_handle_t, ur... #include // for shared_ptr diff --git a/sycl/include/sycl/kernel.hpp b/sycl/include/sycl/kernel.hpp index dac7f619d745e..654373c104c85 100644 --- a/sycl/include/sycl/kernel.hpp +++ b/sycl/include/sycl/kernel.hpp @@ -8,28 +8,26 @@ #pragma once -#include // for size_t -#include // for shared_ptr, hash, opera... #include // for backend, backend_return_t -#include // for context #include // for __SYCL2020_DEPRECATED #include // for __SYCL_EXPORT #include // for is_kernel_device_specif... 
#include // for OwnerLessBase -#include -#include #include -#include // for device #include // for bundle_state -#include // for range #include // for ur_native_handle_t -#include // for hash + +#include // for size_t +#include // for shared_ptr, hash, opera... +#include // for hash namespace sycl { inline namespace _V1 { // Forward declaration class context; class queue; +class device; +template class range; template class backend_traits; template class kernel_bundle; template diff --git a/sycl/include/sycl/known_identity.hpp b/sycl/include/sycl/known_identity.hpp index 3aecad3188e49..92c3db7959a44 100644 --- a/sycl/include/sycl/known_identity.hpp +++ b/sycl/include/sycl/known_identity.hpp @@ -13,7 +13,7 @@ #include // for bit_and, bit_or, bit_xor #include // for half #include // for marray -#include // for vec +#include // for vec #include // for byte, size_t #include // for logical_and, logical_or diff --git a/sycl/include/sycl/platform.hpp b/sycl/include/sycl/platform.hpp index 0d10080e00142..ec57731141b32 100644 --- a/sycl/include/sycl/platform.hpp +++ b/sycl/include/sycl/platform.hpp @@ -8,9 +8,7 @@ #pragma once -#include #include -#include #include #include #include @@ -55,38 +53,6 @@ class platform_impl; /// \param Val Indicates if extension should be enabled/disabled void __SYCL_EXPORT enable_ext_oneapi_default_context(bool Val); -template auto convert_to_abi_neutral(ParamT &&Info) { - using ParamNoRef = std::remove_reference_t; - if constexpr (std::is_same_v) { - return detail::string{Info}; - } else if constexpr (std::is_same_v>) { - std::vector Res; - Res.reserve(Info.size()); - for (std::string &Str : Info) { - Res.push_back(detail::string{Str}); - } - return Res; - } else { - return std::forward(Info); - } -} - -template auto convert_from_abi_neutral(ParamT &&Info) { - using ParamNoRef = std::remove_reference_t; - if constexpr (std::is_same_v) { - return Info.c_str(); - } else if constexpr (std::is_same_v>) { - std::vector Res; - 
Res.reserve(Info.size()); - for (detail::string &Str : Info) { - Res.push_back(Str.c_str()); - } - return Res; - } else { - return std::forward(Info); - } -} } // namespace detail namespace ext::oneapi { // Forward declaration diff --git a/sycl/include/sycl/stream.hpp b/sycl/include/sycl/stream.hpp index 9b8483679197f..ca2eb8b41c0c5 100644 --- a/sycl/include/sycl/stream.hpp +++ b/sycl/include/sycl/stream.hpp @@ -24,12 +24,11 @@ #include // for half, operator-, operator< #include // for handler #include // for item -#include -#include // for nd_item -#include // for nd_range -#include // for property_list -#include // for range -#include // for vec, SwizzleOp +#include // for nd_item +#include // for nd_range +#include // for property_list +#include // for range +#include // for vec, SwizzleOp #include // for size_t, byte #include // for hash, shared_ptr diff --git a/sycl/include/sycl/sycl.hpp b/sycl/include/sycl/sycl.hpp index 846c4fb175c38..73aa4421d0caa 100644 --- a/sycl/include/sycl/sycl.hpp +++ b/sycl/include/sycl/sycl.hpp @@ -19,15 +19,18 @@ #if SYCL_BACKEND_OPENCL #include #endif +#if SYCL_EXT_ONEAPI_BACKEND_LEVEL_ZERO +#include +#endif #include #include #include +#include #include #include #include #include #include -#include #include #include #include @@ -54,14 +57,12 @@ #include #include #include -#include #include #include #include +#include #include -#if SYCL_EXT_ONEAPI_BACKEND_LEVEL_ZERO -#include -#endif + #include #include #include @@ -73,6 +74,7 @@ #include #include #include +#include #include #include #include @@ -94,6 +96,7 @@ #include #include #include +#include #include #include #include diff --git a/sycl/include/sycl/types.hpp b/sycl/include/sycl/types.hpp deleted file mode 100644 index ed5432a912af1..0000000000000 --- a/sycl/include/sycl/types.hpp +++ /dev/null @@ -1,27 +0,0 @@ -//==---------------- types.hpp --- SYCL types ------------------------------==// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
-// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#pragma once - -#include // for decorated, address_space -#include // for half, cl_char, cl_int -#include // for ArrayCreator, RepeatV... -#include // for __SYCL2020_DEPRECATED -#include // for is_sigeninteger, is_s... -#include -#include // for is_floating_point -#include // for make_error_code, errc -#include // for StorageT, half, Vec16... -#include // for __SYCL_BINOP, __SYCL_... -#include // for multi_ptr - -#include - -#include - -#include // bfloat16 diff --git a/sycl/include/sycl/vector.hpp b/sycl/include/sycl/vector.hpp index 01e70f639e7b5..97d9704c3cc26 100644 --- a/sycl/include/sycl/vector.hpp +++ b/sycl/include/sycl/vector.hpp @@ -51,7 +51,8 @@ namespace sycl { -// TODO: Fix in the next ABI breaking windows. +// TODO: It should be within _V1 namespace, fix in the next ABI breaking +// windows. 
enum class rounding_mode { automatic = 0, rte = 1, rtz = 2, rtp = 3, rtn = 4 }; inline namespace _V1 { diff --git a/sycl/source/backend/level_zero.cpp b/sycl/source/backend/level_zero.cpp index e6b42c366243c..75a67745f6849 100644 --- a/sycl/source/backend/level_zero.cpp +++ b/sycl/source/backend/level_zero.cpp @@ -10,7 +10,7 @@ #include #include #include -#include +#include namespace sycl { inline namespace _V1 { diff --git a/sycl/source/backend/opencl.cpp b/sycl/source/backend/opencl.cpp index 56627a22dbb56..e0c669cb8c267 100644 --- a/sycl/source/backend/opencl.cpp +++ b/sycl/source/backend/opencl.cpp @@ -10,7 +10,6 @@ #include #include #include -#include #include #include diff --git a/sycl/source/detail/adapter.hpp b/sycl/source/detail/adapter.hpp index 700585a4d6a72..d78743ac6159e 100644 --- a/sycl/source/detail/adapter.hpp +++ b/sycl/source/detail/adapter.hpp @@ -10,7 +10,6 @@ #include #include -#include #include #include #include @@ -35,6 +34,7 @@ namespace sycl { inline namespace _V1 { +enum class backend : char; namespace detail { /// The adapter class provides a unified interface to the underlying low-level diff --git a/sycl/source/detail/allowlist.cpp b/sycl/source/detail/allowlist.cpp index a6113130aabda..96b9577aca975 100644 --- a/sycl/source/detail/allowlist.cpp +++ b/sycl/source/detail/allowlist.cpp @@ -10,7 +10,7 @@ #include #include #include -#include +#include #include #include diff --git a/sycl/source/detail/buffer_impl.hpp b/sycl/source/detail/buffer_impl.hpp index f7a7e0999344d..be3a529f17718 100644 --- a/sycl/source/detail/buffer_impl.hpp +++ b/sycl/source/detail/buffer_impl.hpp @@ -17,7 +17,6 @@ #include // for iterator_to_const_type_t #include #include -#include #include #include diff --git a/sycl/source/detail/composite_device/composite_device.cpp b/sycl/source/detail/composite_device/composite_device.cpp index 6c57eb3015df1..f3eb568a6f4a0 100644 --- a/sycl/source/detail/composite_device/composite_device.cpp +++ 
b/sycl/source/detail/composite_device/composite_device.cpp @@ -6,9 +6,11 @@ // //===----------------------------------------------------------------------===// +#include #include #include +#include #include namespace sycl { diff --git a/sycl/source/detail/config.cpp b/sycl/source/detail/config.cpp index 21ce89458835f..ab25564c59370 100644 --- a/sycl/source/detail/config.cpp +++ b/sycl/source/detail/config.cpp @@ -7,6 +7,7 @@ //===----------------------------------------------------------------------===// #include +#include #include #include #include diff --git a/sycl/source/detail/config.hpp b/sycl/source/detail/config.hpp index 49bef4fbb6cf1..3c1f2f6822807 100644 --- a/sycl/source/detail/config.hpp +++ b/sycl/source/detail/config.hpp @@ -9,7 +9,6 @@ #pragma once #include -#include #include #include #include @@ -25,6 +24,7 @@ namespace sycl { inline namespace _V1 { +enum class backend : char; namespace detail { #ifdef DISABLE_CONFIG_FROM_ENV diff --git a/sycl/source/detail/image_accessor_util.cpp b/sycl/source/detail/image_accessor_util.cpp index 82d63d5b4fdc7..88ca0d1f58178 100644 --- a/sycl/source/detail/image_accessor_util.cpp +++ b/sycl/source/detail/image_accessor_util.cpp @@ -9,6 +9,7 @@ #include #include #include +#include #include namespace sycl { diff --git a/sycl/source/detail/kernel_impl.hpp b/sycl/source/detail/kernel_impl.hpp index 1b71eb3e659ad..2dee6d9da151e 100644 --- a/sycl/source/detail/kernel_impl.hpp +++ b/sycl/source/detail/kernel_impl.hpp @@ -17,6 +17,7 @@ #include #include #include +#include #include #include diff --git a/sycl/source/detail/platform_impl.cpp b/sycl/source/detail/platform_impl.cpp index a6ddbc9156043..cb9a9f0f1b97f 100644 --- a/sycl/source/detail/platform_impl.cpp +++ b/sycl/source/detail/platform_impl.cpp @@ -13,7 +13,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/sycl/source/detail/platform_impl.hpp b/sycl/source/detail/platform_impl.hpp index d50b3d1b5d8c9..e7e76334efaa6 100644 
--- a/sycl/source/detail/platform_impl.hpp +++ b/sycl/source/detail/platform_impl.hpp @@ -10,8 +10,8 @@ #include #include +#include #include -#include #include #include #include diff --git a/sycl/source/detail/sampler_impl.cpp b/sycl/source/detail/sampler_impl.cpp index 07582dee0dd31..81989a72c1146 100644 --- a/sycl/source/detail/sampler_impl.cpp +++ b/sycl/source/detail/sampler_impl.cpp @@ -9,6 +9,7 @@ #include #include #include +#include namespace sycl { inline namespace _V1 { diff --git a/sycl/source/detail/ur.hpp b/sycl/source/detail/ur.hpp index e0014866ebeb8..a599169ee3aec 100644 --- a/sycl/source/detail/ur.hpp +++ b/sycl/source/detail/ur.hpp @@ -14,7 +14,6 @@ #pragma once -#include #include #include @@ -22,6 +21,7 @@ namespace sycl { inline namespace _V1 { +enum class backend : char; namespace detail { class Adapter; using AdapterPtr = std::shared_ptr; @@ -36,6 +36,10 @@ initializeUr(ur_loader_config_handle_t LoaderConfig = nullptr); // Get the adapter serving given backend. template const AdapterPtr &getAdapter(); } // namespace ur + +// Convert from UR backend to SYCL backend enum +backend convertUrBackend(ur_platform_backend_t UrBackend); + } // namespace detail } // namespace _V1 } // namespace sycl diff --git a/sycl/source/detail/windows_ur.cpp b/sycl/source/detail/windows_ur.cpp index 6f8d1f7ae6bdf..bac1163d6d3bd 100644 --- a/sycl/source/detail/windows_ur.cpp +++ b/sycl/source/detail/windows_ur.cpp @@ -6,7 +6,6 @@ // //===----------------------------------------------------------------------===// -#include #include #include diff --git a/sycl/test-e2e/BFloat16/bfloat16_vec.cpp b/sycl/test-e2e/BFloat16/bfloat16_vec.cpp index 50364541f4272..6b95296c64c7b 100644 --- a/sycl/test-e2e/BFloat16/bfloat16_vec.cpp +++ b/sycl/test-e2e/BFloat16/bfloat16_vec.cpp @@ -15,6 +15,7 @@ // RUN: %if preview-breaking-changes-supported %{ %{run} %t2.out %} #include +#include #include #include diff --git a/sycl/test-e2e/Basic/buffer/buffer.cpp 
b/sycl/test-e2e/Basic/buffer/buffer.cpp index 7c32c47cc570e..e0676207b4efe 100644 --- a/sycl/test-e2e/Basic/buffer/buffer.cpp +++ b/sycl/test-e2e/Basic/buffer/buffer.cpp @@ -13,7 +13,7 @@ #include #include #include -#include +#include using namespace sycl; diff --git a/sycl/test-e2e/Basic/group_async_copy.cpp b/sycl/test-e2e/Basic/group_async_copy.cpp index 2eceadd3e9898..fc75189d147f2 100644 --- a/sycl/test-e2e/Basic/group_async_copy.cpp +++ b/sycl/test-e2e/Basic/group_async_copy.cpp @@ -6,7 +6,7 @@ #include #include -#include +#include #include using namespace sycl; diff --git a/sycl/test-e2e/Basic/group_async_copy_legacy.cpp b/sycl/test-e2e/Basic/group_async_copy_legacy.cpp index ac5fac91e0cdb..9b2279fdcbb83 100644 --- a/sycl/test-e2e/Basic/group_async_copy_legacy.cpp +++ b/sycl/test-e2e/Basic/group_async_copy_legacy.cpp @@ -6,7 +6,7 @@ #include #include -#include +#include #include using namespace sycl; diff --git a/sycl/test-e2e/Basic/half_type.cpp b/sycl/test-e2e/Basic/half_type.cpp index 4a1a43af90448..930e46b85040a 100644 --- a/sycl/test-e2e/Basic/half_type.cpp +++ b/sycl/test-e2e/Basic/half_type.cpp @@ -13,7 +13,7 @@ #include #include #include -#include +#include #include using namespace sycl; diff --git a/sycl/test-e2e/Basic/parallel_for_indexers.cpp b/sycl/test-e2e/Basic/parallel_for_indexers.cpp index 3c41ac04fc8b1..59dbf3329e82d 100644 --- a/sycl/test-e2e/Basic/parallel_for_indexers.cpp +++ b/sycl/test-e2e/Basic/parallel_for_indexers.cpp @@ -2,7 +2,7 @@ // RUN: %{run} %t2.out #include -#include +#include #include #include diff --git a/sycl/test-e2e/Basic/parallel_for_range_roundup.cpp b/sycl/test-e2e/Basic/parallel_for_range_roundup.cpp index 0a1d7e81fd054..762dd1485353f 100644 --- a/sycl/test-e2e/Basic/parallel_for_range_roundup.cpp +++ b/sycl/test-e2e/Basic/parallel_for_range_roundup.cpp @@ -24,7 +24,7 @@ // #include #include -#include +#include using namespace sycl; diff --git a/sycl/test-e2e/Basic/swizzle_op.cpp b/sycl/test-e2e/Basic/swizzle_op.cpp 
index 9e96c51f9483f..0d800faa5769b 100644 --- a/sycl/test-e2e/Basic/swizzle_op.cpp +++ b/sycl/test-e2e/Basic/swizzle_op.cpp @@ -11,7 +11,7 @@ #include #include -#include +#include using namespace sycl; diff --git a/sycl/test-e2e/Basic/sycl_2020_images/common.hpp b/sycl/test-e2e/Basic/sycl_2020_images/common.hpp index b5b97a66d2f2f..4433d3d3216b0 100644 --- a/sycl/test-e2e/Basic/sycl_2020_images/common.hpp +++ b/sycl/test-e2e/Basic/sycl_2020_images/common.hpp @@ -3,6 +3,7 @@ #include #include #include +#include #include using namespace sycl; diff --git a/sycl/test-e2e/Basic/vector/bool.cpp b/sycl/test-e2e/Basic/vector/bool.cpp index 51934c41b070c..b440f65e3261e 100644 --- a/sycl/test-e2e/Basic/vector/bool.cpp +++ b/sycl/test-e2e/Basic/vector/bool.cpp @@ -13,7 +13,8 @@ //===----------------------------------------------------------------------===// #include -#include +#include +#include constexpr int size = 2; diff --git a/sycl/test-e2e/Basic/vector/byte.cpp b/sycl/test-e2e/Basic/vector/byte.cpp index 10c8e097d34ed..6eadf6a177b7b 100644 --- a/sycl/test-e2e/Basic/vector/byte.cpp +++ b/sycl/test-e2e/Basic/vector/byte.cpp @@ -13,7 +13,8 @@ //===----------------------------------------------------------------------===// #include -#include +#include +#include #include // std::byte #include // std::ignore diff --git a/sycl/test-e2e/Basic/vector/int-convert.cpp b/sycl/test-e2e/Basic/vector/int-convert.cpp index 5af5911d3b7eb..ba5060292a164 100644 --- a/sycl/test-e2e/Basic/vector/int-convert.cpp +++ b/sycl/test-e2e/Basic/vector/int-convert.cpp @@ -11,7 +11,8 @@ // RUN: %if preview-breaking-changes-supported %{ %{run} %t2.out %} #include -#include +#include +#include #include #include diff --git a/sycl/test-e2e/Basic/vector/load_store.cpp b/sycl/test-e2e/Basic/vector/load_store.cpp index 626fd0264fb71..df4af3976a1ed 100644 --- a/sycl/test-e2e/Basic/vector/load_store.cpp +++ b/sycl/test-e2e/Basic/vector/load_store.cpp @@ -9,7 +9,7 @@ #include #include #include -#include 
+#include namespace syclex = sycl::ext::oneapi; diff --git a/sycl/test-e2e/Basic/vector/operators.cpp b/sycl/test-e2e/Basic/vector/operators.cpp index 99f1f251762e9..f45c6e8e49e79 100644 --- a/sycl/test-e2e/Basic/vector/operators.cpp +++ b/sycl/test-e2e/Basic/vector/operators.cpp @@ -11,7 +11,7 @@ #define SYCL_SIMPLE_SWIZZLES #include -#include +#include namespace s = sycl; template diff --git a/sycl/test-e2e/Basic/vector/scalar_access.cpp b/sycl/test-e2e/Basic/vector/scalar_access.cpp index 2c474c07e64cc..8b67b5fb8faeb 100644 --- a/sycl/test-e2e/Basic/vector/scalar_access.cpp +++ b/sycl/test-e2e/Basic/vector/scalar_access.cpp @@ -17,7 +17,7 @@ #include #include -#include +#include typedef float float4_t __attribute__((ext_vector_type(4))); diff --git a/sycl/test-e2e/Basic/vector/vec_binary_scalar_order.hpp b/sycl/test-e2e/Basic/vector/vec_binary_scalar_order.hpp index c9f0876366305..3d825665a7690 100644 --- a/sycl/test-e2e/Basic/vector/vec_binary_scalar_order.hpp +++ b/sycl/test-e2e/Basic/vector/vec_binary_scalar_order.hpp @@ -3,7 +3,7 @@ #pragma once #include -#include +#include template using rel_t = std::conditional_t< diff --git a/sycl/test-e2e/Regression/get_spec_const_vec16.cpp b/sycl/test-e2e/Regression/get_spec_const_vec16.cpp index fc9437c2c79d9..e851d2fe1d1e5 100644 --- a/sycl/test-e2e/Regression/get_spec_const_vec16.cpp +++ b/sycl/test-e2e/Regression/get_spec_const_vec16.cpp @@ -6,7 +6,7 @@ #include #include -#include +#include #include diff --git a/sycl/test-e2e/Regression/group.cpp b/sycl/test-e2e/Regression/group.cpp index d906c3500a095..5c1d256550c0c 100644 --- a/sycl/test-e2e/Regression/group.cpp +++ b/sycl/test-e2e/Regression/group.cpp @@ -11,7 +11,7 @@ #include #include -#include +#include using namespace sycl; diff --git a/sycl/test-e2e/Regression/local-arg-align.cpp b/sycl/test-e2e/Regression/local-arg-align.cpp index b0e7336e14d33..4eca3aeff7f84 100644 --- a/sycl/test-e2e/Regression/local-arg-align.cpp +++ 
b/sycl/test-e2e/Regression/local-arg-align.cpp @@ -16,7 +16,7 @@ #include #include -#include +#include using namespace sycl; diff --git a/sycl/test-e2e/Regression/swizzle_opassign.cpp b/sycl/test-e2e/Regression/swizzle_opassign.cpp index 1d52cdc397c36..f8ee858edd655 100644 --- a/sycl/test-e2e/Regression/swizzle_opassign.cpp +++ b/sycl/test-e2e/Regression/swizzle_opassign.cpp @@ -7,8 +7,8 @@ // and correctly mutate the elements in the corresponding vector. #include -#include #include +#include constexpr std::string_view OpNames[] = { "+=", "-=", "*=", "/=", "%=", "&=", "|=", diff --git a/sycl/test-e2e/Regression/vec_rel_swizzle_ops.cpp b/sycl/test-e2e/Regression/vec_rel_swizzle_ops.cpp index aca726b62cbb6..6184f0905f86a 100644 --- a/sycl/test-e2e/Regression/vec_rel_swizzle_ops.cpp +++ b/sycl/test-e2e/Regression/vec_rel_swizzle_ops.cpp @@ -6,7 +6,7 @@ #include #include -#include +#include template bool testAndOperator(const std::string &typeName) { diff --git a/sycl/test/CMakeLists.txt b/sycl/test/CMakeLists.txt index 144e2204361ef..b0b0629cffbd6 100644 --- a/sycl/test/CMakeLists.txt +++ b/sycl/test/CMakeLists.txt @@ -15,7 +15,6 @@ set(SYCL_THREADS_LIB ${CMAKE_THREAD_LIBS_INIT}) # TEST_INCLUDE_PATH is used for syntax-only verification of type information. 
list(APPEND test_includes ${SYCL_INCLUDE}) -list(APPEND test_includes ${SYCL_INCLUDE}/sycl) list(APPEND test_includes ${SYCL_SOURCE_DIR}/source) list(APPEND test_includes ${BOOST_UNORDERED_INCLUDE_DIRS}) if(SYCL_ENABLE_EXTENSION_JIT) diff --git a/sycl/test/abi/layout_vec.cpp b/sycl/test/abi/layout_vec.cpp index db3b724d1873c..ade053f068f30 100644 --- a/sycl/test/abi/layout_vec.cpp +++ b/sycl/test/abi/layout_vec.cpp @@ -6,7 +6,7 @@ // clang-format off -#include +#include SYCL_EXTERNAL void foo(sycl::vec) {} diff --git a/sycl/test/abi/symbol_size_alignment.cpp b/sycl/test/abi/symbol_size_alignment.cpp index 03ab7022045c6..63286da1b9786 100644 --- a/sycl/test/abi/symbol_size_alignment.cpp +++ b/sycl/test/abi/symbol_size_alignment.cpp @@ -18,7 +18,7 @@ #include #include #include -#include +#include using namespace sycl; diff --git a/sycl/test/basic_tests/vectors/size_one_checks.cpp b/sycl/test/basic_tests/vectors/size_one_checks.cpp index bca51592f8eeb..c1dbb7bfa20e0 100644 --- a/sycl/test/basic_tests/vectors/size_one_checks.cpp +++ b/sycl/test/basic_tests/vectors/size_one_checks.cpp @@ -3,7 +3,7 @@ // RUN: %if preview-breaking-changes-supported %{ %clangxx -fsycl -fpreview-breaking-changes %s -o %t_vec.out %} // RUN: %if preview-breaking-changes-supported %{ %t_vec.out %} -#include +#include int main() { sycl::vec v1{42}; diff --git a/sycl/test/check_device_code/vector/vector_bf16_builtins.cpp b/sycl/test/check_device_code/vector/vector_bf16_builtins.cpp index 517bcba4c3732..891c79a1a8f94 100644 --- a/sycl/test/check_device_code/vector/vector_bf16_builtins.cpp +++ b/sycl/test/check_device_code/vector/vector_bf16_builtins.cpp @@ -13,6 +13,7 @@ // This test checks the device code generated for vec math builtins. 
#include +#include #include using namespace sycl; diff --git a/sycl/test/check_device_code/vector/vector_convert_bfloat.cpp b/sycl/test/check_device_code/vector/vector_convert_bfloat.cpp index e880e1ed3713a..3166aacc915e3 100644 --- a/sycl/test/check_device_code/vector/vector_convert_bfloat.cpp +++ b/sycl/test/check_device_code/vector/vector_convert_bfloat.cpp @@ -5,8 +5,9 @@ // REQUIRES: linux #include +#include #include -#include +#include using namespace sycl; using bfloat16 = sycl::ext::oneapi::bfloat16; diff --git a/sycl/test/include_deps/sycl_accessor.hpp.cpp b/sycl/test/include_deps/sycl_accessor.hpp.cpp index 6f961731eebfe..7f50f997fa228 100644 --- a/sycl/test/include_deps/sycl_accessor.hpp.cpp +++ b/sycl/test/include_deps/sycl_accessor.hpp.cpp @@ -17,14 +17,14 @@ // CHECK-NEXT: __spirv/spirv_vars.hpp // CHECK-NEXT: multi_ptr.hpp // CHECK-NEXT: aliases.hpp +// CHECK-NEXT: detail/type_traits.hpp +// CHECK-NEXT: detail/type_traits/vec_marray_traits.hpp // CHECK-NEXT: half_type.hpp // CHECK-NEXT: bit_cast.hpp // CHECK-NEXT: detail/iostream_proxy.hpp // CHECK-NEXT: aspects.hpp // CHECK-NEXT: info/aspects.def // CHECK-NEXT: info/aspects_deprecated.def -// CHECK-NEXT: detail/type_traits.hpp -// CHECK-NEXT: detail/type_traits/vec_marray_traits.hpp // CHECK-NEXT: buffer.hpp // CHECK-NEXT: backend_types.hpp // CHECK-NEXT: detail/array.hpp diff --git a/sycl/test/include_deps/sycl_detail_core.hpp.cpp b/sycl/test/include_deps/sycl_detail_core.hpp.cpp index 8b0144fdbf44f..60a935d9ae465 100644 --- a/sycl/test/include_deps/sycl_detail_core.hpp.cpp +++ b/sycl/test/include_deps/sycl_detail_core.hpp.cpp @@ -18,14 +18,14 @@ // CHECK-NEXT: __spirv/spirv_vars.hpp // CHECK-NEXT: multi_ptr.hpp // CHECK-NEXT: aliases.hpp +// CHECK-NEXT: detail/type_traits.hpp +// CHECK-NEXT: detail/type_traits/vec_marray_traits.hpp // CHECK-NEXT: half_type.hpp // CHECK-NEXT: bit_cast.hpp // CHECK-NEXT: detail/iostream_proxy.hpp // CHECK-NEXT: aspects.hpp // CHECK-NEXT: info/aspects.def // 
CHECK-NEXT: info/aspects_deprecated.def -// CHECK-NEXT: detail/type_traits.hpp -// CHECK-NEXT: detail/type_traits/vec_marray_traits.hpp // CHECK-NEXT: buffer.hpp // CHECK-NEXT: backend_types.hpp // CHECK-NEXT: detail/array.hpp From 249a89fb2ec13024d39449bd8b93bd8526d572c3 Mon Sep 17 00:00:00 2001 From: Kseniya Tikhomirova Date: Fri, 15 Nov 2024 16:12:46 +0100 Subject: [PATCH 03/36] [SYCL][NFC] Remove unnecessary object copy in MEvents analysis (#15999) Signed-off-by: Tikhomirova, Kseniya --- sycl/source/detail/scheduler/graph_builder.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sycl/source/detail/scheduler/graph_builder.cpp b/sycl/source/detail/scheduler/graph_builder.cpp index 5f95995e279d7..5636309cdccc1 100644 --- a/sycl/source/detail/scheduler/graph_builder.cpp +++ b/sycl/source/detail/scheduler/graph_builder.cpp @@ -1024,7 +1024,7 @@ Command *Scheduler::GraphBuilder::addCG( } // Register all the events as dependencies - for (detail::EventImplPtr e : Events) { + for (const detail::EventImplPtr &e : Events) { if (Command *ConnCmd = NewCmd->addDep(e, ToCleanUp)) ToEnqueue.push_back(ConnCmd); } From 27dab62f66270ab4aadcefc92038f367225b65cf Mon Sep 17 00:00:00 2001 From: Ross Brunton Date: Fri, 15 Nov 2024 16:36:22 +0000 Subject: [PATCH 04/36] [UR] Bump UR with cfi sanitizer (#16040) Pre-commit MR for: https://github.com/oneapi-src/unified-runtime/pull/2222 --------- Co-authored-by: Callum Fare --- sycl/cmake/modules/UnifiedRuntimeTag.cmake | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/sycl/cmake/modules/UnifiedRuntimeTag.cmake b/sycl/cmake/modules/UnifiedRuntimeTag.cmake index 1d3935f9e5e2a..9b405334a6816 100644 --- a/sycl/cmake/modules/UnifiedRuntimeTag.cmake +++ b/sycl/cmake/modules/UnifiedRuntimeTag.cmake @@ -1,7 +1,7 @@ -# commit 3a5b23c8b475712f9107c1d5ab41f27a1465578e -# Merge: f9f71f17 1696524d -# Author: Piotr Balcer -# Date: Thu Nov 14 14:38:05 2024 +0100 -# Merge pull request #2253 from 
pbalcer/low-power-events -# add low-power events experimental extension spec -set(UNIFIED_RUNTIME_TAG 3a5b23c8b475712f9107c1d5ab41f27a1465578e) +# commit 30391c65d2d2ccc7ee3688a14815804bfb7fdf05 +# Merge: 5e6d79b3 58dabfe8 +# Author: Callum Fare +# Date: Fri Nov 15 15:13:20 2024 +0000 +# Merge pull request #2222 from RossBrunton/ross/cfi +# Enable -flto and -fsanitize=cfi in clang +set(UNIFIED_RUNTIME_TAG 30391c65d2d2ccc7ee3688a14815804bfb7fdf05) From 61bd6a504d5a188e84bf938f8e7a3d656b047c99 Mon Sep 17 00:00:00 2001 From: aelovikov-intel Date: Fri, 15 Nov 2024 09:24:55 -0800 Subject: [PATCH 05/36] [NFC][SYCL] Refactor `merged_properties_t` (#16093) New `merge_properties` can work on property lists containing run-time properties as long as their keys don't conflict. That functionality isn't used anywhere as of now though, hence `NFC` tag. --- .../sycl/ext/oneapi/properties/properties.hpp | 14 +- .../ext/oneapi/properties/property_utils.hpp | 129 ++++++------------ .../properties/properties_kernel_negative.cpp | 1 - 3 files changed, 40 insertions(+), 104 deletions(-) diff --git a/sycl/include/sycl/ext/oneapi/properties/properties.hpp b/sycl/include/sycl/ext/oneapi/properties/properties.hpp index 190a16ce5d4c3..12c0af87a0e87 100644 --- a/sycl/include/sycl/ext/oneapi/properties/properties.hpp +++ b/sycl/include/sycl/ext/oneapi/properties/properties.hpp @@ -343,19 +343,9 @@ template using properties_t = properties>; -// Helper for merging two property lists; template -struct merged_properties; -template -struct merged_properties, - properties_t> { - using type = properties< - typename MergeProperties, - properties_type_list>::type>; -}; -template -using merged_properties_t = - typename merged_properties::type; +using merged_properties_t = decltype(merge_properties( + std::declval(), std::declval())); template struct ValueOrDefault { diff --git a/sycl/include/sycl/ext/oneapi/properties/property_utils.hpp b/sycl/include/sycl/ext/oneapi/properties/property_utils.hpp index 
2a1d89a4ebc96..2e10699a65fea 100644 --- a/sycl/include/sycl/ext/oneapi/properties/property_utils.hpp +++ b/sycl/include/sycl/ext/oneapi/properties/property_utils.hpp @@ -43,12 +43,6 @@ template using nth_type_t = typename nth_type::type; #endif -template struct PrependProperty {}; -template -struct PrependProperty> { - using type = properties_type_list; -}; - //****************************************************************************** // Property identification //****************************************************************************** @@ -63,91 +57,6 @@ struct AllPropertyValues> AllPropertyValues>, std::false_type> {}; -//****************************************************************************** -// Property type sorting -//****************************************************************************** - -// Splits a tuple into head and tail if ShouldSplit is true. If ShouldSplit is -// false the head will be void and the tail will be the full tuple. -template struct HeadSplit {}; -template -struct HeadSplit, true> { - using htype = T; - using ttype = properties_type_list; -}; -template struct HeadSplit, false> { - using htype = void; - using ttype = properties_type_list; -}; - -// Selects the one of two types that is not void. This assumes that at least one -// of the two template arguemnts is void. -template struct SelectNonVoid {}; -template struct SelectNonVoid { - using type = LHS; -}; -template struct SelectNonVoid { - using type = RHS; -}; - -//****************************************************************************** -// Property merging -//****************************************************************************** - -// Merges two sets of properties, failing if two properties are the same but -// with different values. -// NOTE: This assumes that the properties are in sorted order. 
-template struct MergeProperties; - -template <> -struct MergeProperties, properties_type_list<>> { - using type = properties_type_list<>; -}; - -template -struct MergeProperties, - properties_type_list<>> { - using type = properties_type_list; -}; - -template -struct MergeProperties, - properties_type_list> { - using type = properties_type_list; -}; - -// Identical properties are allowed, but only one will carry over. -template -struct MergeProperties, - properties_type_list> { - using merge_tails = - typename MergeProperties, - properties_type_list>::type; - using type = typename PrependProperty::type; -}; - -template -struct MergeProperties, - properties_type_list> { - using l_head = nth_type_t<0, LHSPropertyTs...>; - using r_head = nth_type_t<0, RHSPropertyTs...>; - static_assert( - PropertyID::value != PropertyID::value, - "Failed to merge property lists due to conflicting properties."); - static constexpr bool left_has_min = - PropertyID::value < PropertyID::value; - using l_split = - HeadSplit, left_has_min>; - using r_split = - HeadSplit, !left_has_min>; - using min = typename SelectNonVoid::type; - using merge_tails = typename MergeProperties::type; - using type = typename PrependProperty::type; -}; - //****************************************************************************** // Property value tooling //****************************************************************************** @@ -349,6 +258,44 @@ constexpr auto filter_properties( return filter_properties_impl::apply(props); } +template struct merge_filter { + template + struct predicate + : std::bool_constant || + ...))> {}; +}; + +template +constexpr auto merge_properties( + const properties> &lhs, + const properties> &rhs) { + auto rhs_unique_props = + filter_properties::template predicate>( + rhs); + if constexpr (std::is_same_v, + std::decay_t>) { + // None of RHS properties share keys with LHS, no conflicts possible. 
+ return properties{ + lhs.template get_property()..., + rhs.template get_property()...}; + } else { + // Ensure no conflicts, then merge. + constexpr auto has_conflict = [](auto *lhs_prop) constexpr { + using lhs_property_ty = std::remove_pointer_t; + return (((std::is_same_v && + (!std::is_same_v || + !std::is_empty_v)) || + ...)); + }; + static_assert( + !((has_conflict(static_cast(nullptr)) || ...)), + "Failed to merge property lists due to conflicting properties."); + return merge_properties(lhs, rhs_unique_props); + } +} + } // namespace detail } // namespace ext::oneapi::experimental } // namespace _V1 diff --git a/sycl/test/extensions/properties/properties_kernel_negative.cpp b/sycl/test/extensions/properties/properties_kernel_negative.cpp index 5b00fa954fd7e..541d1b248950b 100644 --- a/sycl/test/extensions/properties/properties_kernel_negative.cpp +++ b/sycl/test/extensions/properties/properties_kernel_negative.cpp @@ -83,7 +83,6 @@ void check_work_group_size() { sycl::queue Q; // expected-error-re@sycl/ext/oneapi/properties/property_utils.hpp:* {{static assertion failed due to requirement {{.+}}: Failed to merge property lists due to conflicting properties.}} - // expected-error-re@sycl/handler.hpp:* {{static assertion failed due to requirement {{.+}}: Template type is not a property list.}} // expected-note-re@+1 {{in instantiation of function template specialization {{.+}}}} Q.single_task( sycl::ext::oneapi::experimental::properties{ From 5fb74982d0712a00d1db487f18d1d721f59a4bfc Mon Sep 17 00:00:00 2001 From: Buildbot for SYCL Date: Sat, 16 Nov 2024 01:40:30 +0800 Subject: [PATCH 06/36] [GHA] Uplift Linux IGC Dev RT version to igc-dev-6ba42ba (#16082) Scheduled igc dev drivers uplift Co-authored-by: GitHub Actions Co-authored-by: Nick Sarnie --- devops/dependencies-igc-dev.json | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/devops/dependencies-igc-dev.json b/devops/dependencies-igc-dev.json index 6f3cbc7da4273..6b2ee8953e920 
100644 --- a/devops/dependencies-igc-dev.json +++ b/devops/dependencies-igc-dev.json @@ -1,10 +1,10 @@ { "linux": { "igc_dev": { - "github_tag": "igc-dev-ad75a20", - "version": "ad75a20", - "updated_at": "2024-11-10T01:11:34Z", - "url": "https://api.github.com/repos/intel/intel-graphics-compiler/actions/artifacts/2167439771/zip", + "github_tag": "igc-dev-6ba42ba", + "version": "6ba42ba", + "updated_at": "2024-11-13T17:40:54Z", + "url": "https://api.github.com/repos/intel/intel-graphics-compiler/actions/artifacts/2183383213/zip", "root": "{DEPS_ROOT}/opencl/runtime/linux/oclgpu" } } From 620ac3520e6ff67afd8f382b3e1753955e5d3e4f Mon Sep 17 00:00:00 2001 From: Udit Agarwal Date: Fri, 15 Nov 2024 10:13:57 -0800 Subject: [PATCH 07/36] [Clang] Link ITT libraries in device code by default (#16094) Fixes: CMPLRLLVM-63157 **Problem** Consider the following case: ``` clang++ -fsycl -c testFile.cpp -o obj1.o clang++ -fsycl -c testFile2.cpp -o obj2.o -fsycl-instrument-device-code clang++ -fsycl obj1.o obj2.o -o test.exe // test.exe fails with: JIT session error: Symbols not found: [ __itt_offload_wi_finish_wrapper, __itt_offload_wi_start_wrapper ] ``` This issue was observed while using MKL static libraries built with `-fsycl-instrument-device-code` with the latest compiler that does not link ITT annotations by default. With this change, we link in ITT libraries by default to stay ABI-compatible with the previous release. During device code linking, if the device code is not instrumented with ITT annotations, this library will be omitted. Note that, even with this change, we are not instrumenting device code with ITT annotations by default. 
--- clang/lib/Driver/ToolChains/SYCL.cpp | 5 ++++- clang/test/Driver/sycl-instrumentation-old-model.c | 8 ++++++-- clang/test/Driver/sycl-instrumentation.c | 8 ++++++-- clang/test/Driver/sycl-offload-new-driver.c | 2 +- 4 files changed, 17 insertions(+), 6 deletions(-) diff --git a/clang/lib/Driver/ToolChains/SYCL.cpp b/clang/lib/Driver/ToolChains/SYCL.cpp index 1d903aa1fc17c..6831938fd4c6d 100644 --- a/clang/lib/Driver/ToolChains/SYCL.cpp +++ b/clang/lib/Driver/ToolChains/SYCL.cpp @@ -650,8 +650,11 @@ SYCL::getDeviceLibraries(const Compilation &C, const llvm::Triple &TargetTriple, addLibraries(SYCLDeviceBfloat16FallbackLib); } + // Link in ITT annotations library unless fsycl-no-instrument-device-code + // is specified. This ensures that we are ABI-compatible with the + // instrumented device code, which was the default not so long ago. if (Args.hasFlag(options::OPT_fsycl_instrument_device_code, - options::OPT_fno_sycl_instrument_device_code, false)) + options::OPT_fno_sycl_instrument_device_code, true)) addLibraries(SYCLDeviceAnnotationLibs); #if !defined(_WIN32) diff --git a/clang/test/Driver/sycl-instrumentation-old-model.c b/clang/test/Driver/sycl-instrumentation-old-model.c index 3e07d6d1ca298..bf1471fad84ef 100644 --- a/clang/test/Driver/sycl-instrumentation-old-model.c +++ b/clang/test/Driver/sycl-instrumentation-old-model.c @@ -20,12 +20,16 @@ // CHECK-SPIRV-SAME: "{{.*}}libsycl-itt-stubs.bc" // CHECK-HOST-NOT: "-cc1"{{.*}} "-fsycl-is-host"{{.*}} "-fsycl-instrument-device-code" -// ITT annotations in device code are disabled by default. +// ITT annotations in device code are disabled by default. However, for SYCL offloading, +// we still link ITT annotations libraries to ensure ABI compatibility with previous release. 
// RUN: %clangxx -fsycl --no-offload-new-driver -fsycl-targets=spir64 -### %s 2>&1 \ -// RUN: | FileCheck -check-prefixes=CHECK-NONPASSED %s +// RUN: | FileCheck -check-prefixes=CHECK-ITT-LINK-ONLY %s // RUN: %clangxx -fsycl --no-offload-new-driver -fsycl-targets=nvptx64-nvidia-cuda -nocudalib -### %s 2>&1 \ // RUN: | FileCheck -check-prefixes=CHECK-NONPASSED %s +// CHECK-ITT-LINK-ONLY-NOT: "-fsycl-instrument-device-code" +// CHECK-ITT-LINK-ONLY: llvm-link{{.*}} {{.*}}libsycl-itt-{{.*}} + // RUN: %clangxx -fsycl --no-offload-new-driver -fno-sycl-instrument-device-code -fsycl-targets=spir64 -### %s 2>&1 \ // RUN: | FileCheck -check-prefixes=CHECK-NONPASSED %s // RUN: %clangxx -fsycl --no-offload-new-driver -fsycl-targets=nvptx64-nvidia-cuda -fno-sycl-instrument-device-code -nocudalib -### %s 2>&1 \ diff --git a/clang/test/Driver/sycl-instrumentation.c b/clang/test/Driver/sycl-instrumentation.c index ccb3d857d46af..c2dbf8b6b83f7 100644 --- a/clang/test/Driver/sycl-instrumentation.c +++ b/clang/test/Driver/sycl-instrumentation.c @@ -19,12 +19,16 @@ // CHECK-SPIRV-SAME: libsycl-itt-compiler-wrappers.new.o // CHECK-SPIRV-SAME: libsycl-itt-stubs.new.o -// ITT annotations in device code are disabled by default. +// ITT annotations in device code are disabled by default. However, for SYCL offloading, +// we still link ITT annotations libraries to ensure ABI compatibility with previous release. 
// RUN: %clangxx -fsycl --offload-new-driver -fsycl-targets=spir64 -### %s 2>&1 \ -// RUN: | FileCheck -check-prefixes=CHECK-NONPASSED %s +// RUN: | FileCheck -check-prefixes=CHECK-ITT-LINK-ONLY %s // RUN: %clangxx -fsycl --offload-new-driver -fsycl-targets=nvptx64-nvidia-cuda -nocudalib -### %s 2>&1 \ // RUN: | FileCheck -check-prefixes=CHECK-NONPASSED %s +// CHECK-ITT-LINK-ONLY-NOT: "-fsycl-instrument-device-code" +// CHECK-ITT-LINK-ONLY: clang-linker-wrapper{{.*}} {{.*}}libsycl-itt-{{.*}} + // RUN: %clangxx -fsycl --offload-new-driver -fno-sycl-instrument-device-code -fsycl-targets=spir64 -### %s 2>&1 \ // RUN: | FileCheck -check-prefixes=CHECK-NONPASSED %s // RUN: %clangxx -fsycl --offload-new-driver -fsycl-targets=nvptx64-nvidia-cuda -fno-sycl-instrument-device-code -nocudalib -### %s 2>&1 \ diff --git a/clang/test/Driver/sycl-offload-new-driver.c b/clang/test/Driver/sycl-offload-new-driver.c index b6732b3e9312e..dd656192b80f3 100644 --- a/clang/test/Driver/sycl-offload-new-driver.c +++ b/clang/test/Driver/sycl-offload-new-driver.c @@ -34,7 +34,7 @@ // RUN: %clangxx --target=x86_64-unknown-linux-gnu -fsycl --offload-new-driver \ // RUN: --sysroot=%S/Inputs/SYCL -### %s 2>&1 \ // RUN: | FileCheck -check-prefix WRAPPER_OPTIONS %s -// WRAPPER_OPTIONS: clang-linker-wrapper{{.*}} "-sycl-device-libraries=libsycl-crt.new.o,libsycl-complex.new.o,libsycl-complex-fp64.new.o,libsycl-cmath.new.o,libsycl-cmath-fp64.new.o,libsycl-imf.new.o,libsycl-imf-fp64.new.o,libsycl-imf-bf16.new.o,libsycl-fallback-cassert.new.o,libsycl-fallback-cstring.new.o,libsycl-fallback-complex.new.o,libsycl-fallback-complex-fp64.new.o,libsycl-fallback-cmath.new.o,libsycl-fallback-cmath-fp64.new.o,libsycl-fallback-imf.new.o,libsycl-fallback-imf-fp64.new.o,libsycl-fallback-imf-bf16.new.o" +// WRAPPER_OPTIONS: clang-linker-wrapper{{.*}} 
"-sycl-device-libraries=libsycl-crt.new.o,libsycl-complex.new.o,libsycl-complex-fp64.new.o,libsycl-cmath.new.o,libsycl-cmath-fp64.new.o,libsycl-imf.new.o,libsycl-imf-fp64.new.o,libsycl-imf-bf16.new.o,libsycl-fallback-cassert.new.o,libsycl-fallback-cstring.new.o,libsycl-fallback-complex.new.o,libsycl-fallback-complex-fp64.new.o,libsycl-fallback-cmath.new.o,libsycl-fallback-cmath-fp64.new.o,libsycl-fallback-imf.new.o,libsycl-fallback-imf-fp64.new.o,libsycl-fallback-imf-bf16.new.o,libsycl-itt-user-wrappers.new.o,libsycl-itt-compiler-wrappers.new.o,libsycl-itt-stubs.new.o" // WRAPPER_OPTIONS-SAME: "-sycl-device-library-location={{.*}}/lib" /// Verify phases used to generate SPIR-V instead of LLVM-IR From 01f7e442783215cdd23cc042f7bdbfbe4c9ab335 Mon Sep 17 00:00:00 2001 From: Nick Sarnie Date: Sat, 16 Nov 2024 05:13:25 +0900 Subject: [PATCH 08/36] [SYCL][E2E] Move sporadicly passing test vector_with_virtual_mem.cpp to UNSUPPORTED (#16098) XPASSing rarely like [here](https://github.com/intel/llvm/actions/runs/11859980629/job/33054700795). I updated the GH issue. 
Signed-off-by: Sarnie, Nick --- sycl/test-e2e/VirtualMem/vector_with_virtual_mem.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sycl/test-e2e/VirtualMem/vector_with_virtual_mem.cpp b/sycl/test-e2e/VirtualMem/vector_with_virtual_mem.cpp index 4d80ade89df04..35a095efcff98 100644 --- a/sycl/test-e2e/VirtualMem/vector_with_virtual_mem.cpp +++ b/sycl/test-e2e/VirtualMem/vector_with_virtual_mem.cpp @@ -1,7 +1,7 @@ // REQUIRES: aspect-usm_shared_allocations -// XFAIL: linux && gpu-intel-dg2 -// XFAIL-TRACKER: https://github.com/intel/llvm/issues/15812 +// UNSUPPORTED: linux && gpu-intel-dg2 +// UNSUPPORTED-TRACKER: https://github.com/intel/llvm/issues/15812 // RUN: %{build} -o %t.out // RUN: %{run} %t.out From 08a2edc060dd458528f0dfbe5310029ada8983ba Mon Sep 17 00:00:00 2001 From: Mariya Podchishchaeva Date: Mon, 18 Nov 2024 09:50:26 +0100 Subject: [PATCH 09/36] [SYCL][clang] Emit default template arguments in integration header (#16005) To support free function kernels, clang forward-declares the kernel itself as well as its parameter types. In case a free function kernel has a parameter that is templated and has a default template argument, all template arguments including arguments that match default arguments must be printed in kernel's forward declarations, for example ``` template struct Arg { T val; }; // For the kernel SYCL_EXT_ONEAPI_FUNCTION_PROPERTY( (ext::oneapi::experimental::nd_range_kernel<1>)) void foo(Arg arg) { arg.val = 42; } // Integration header must contain void foo(Arg arg); ``` Unfortunately, even though integration header emission already has extensive support for printing forward declarations, some modifications to clang's type printing are still required, since none of the existing PrintingPolicy flags helps to reach the correct result. 
Using `SuppressDefaultTemplateArgs = true` doesn't help without printing canonical types, printing canonical types for the case like ``` template SYCL_EXT_ONEAPI_FUNCTION_PROPERTY( (ext::oneapi::experimental::nd_range_kernel<1>)) void foo(Arg arg) { arg.val = 42; } // Printing canonical types is causing the following integration header template void foo(Arg arg); ``` Using `SkipCanonicalizationOfTemplateTypeParms` field of printing policy doesn't help here since at the one point where it is checked we take canonical type of `Arg`, not its parameters and it will contain template argument types in canonical type after that. --- clang/include/clang/AST/PrettyPrinter.h | 20 +++- clang/lib/AST/TypePrinter.cpp | 60 +++++++---- clang/lib/Sema/SemaSYCL.cpp | 32 +++++- ...ee_function_default_template_arguments.cpp | 100 ++++++++++++++++++ 4 files changed, 185 insertions(+), 27 deletions(-) create mode 100644 clang/test/CodeGenSYCL/free_function_default_template_arguments.cpp diff --git a/clang/include/clang/AST/PrettyPrinter.h b/clang/include/clang/AST/PrettyPrinter.h index a50216615c4a9..beadfea6e3ae1 100644 --- a/clang/include/clang/AST/PrettyPrinter.h +++ b/clang/include/clang/AST/PrettyPrinter.h @@ -68,17 +68,18 @@ struct PrintingPolicy { SuppressStrongLifetime(false), SuppressLifetimeQualifiers(false), SuppressTypedefs(false), SuppressFinalSpecifier(false), SuppressTemplateArgsInCXXConstructors(false), - SuppressDefaultTemplateArgs(true), Bool(LO.Bool), - Nullptr(LO.CPlusPlus11 || LO.C23), NullptrTypeInNamespace(LO.CPlusPlus), - Restrict(LO.C99), Alignof(LO.CPlusPlus11), UnderscoreAlignof(LO.C11), + SuppressDefaultTemplateArgs(true), EnforceDefaultTemplateArgs(false), + Bool(LO.Bool), Nullptr(LO.CPlusPlus11 || LO.C23), + NullptrTypeInNamespace(LO.CPlusPlus), Restrict(LO.C99), + Alignof(LO.CPlusPlus11), UnderscoreAlignof(LO.C11), UseVoidForZeroParams(!LO.CPlusPlus), SplitTemplateClosers(!LO.CPlusPlus11), TerseOutput(false), PolishForDeclaration(false), Half(LO.Half), 
MSWChar(LO.MicrosoftExt && !LO.WChar), IncludeNewlines(true), MSVCFormatting(false), ConstantsAsWritten(false), SuppressImplicitBase(false), FullyQualifiedName(false), - SuppressDefinition(false), SuppressDefaultTemplateArguments(false), - PrintCanonicalTypes(false), + EnforceScopeForElaboratedTypes(false), SuppressDefinition(false), + SuppressDefaultTemplateArguments(false), PrintCanonicalTypes(false), SkipCanonicalizationOfTemplateTypeParms(false), PrintInjectedClassNameWithArguments(true), UsePreferredNames(true), AlwaysIncludeTypeForTemplateArgument(false), @@ -241,6 +242,11 @@ struct PrintingPolicy { LLVM_PREFERRED_TYPE(bool) unsigned SuppressDefaultTemplateArgs : 1; + /// When true, print template arguments that match the default argument for + /// the parameter, even if they're not specified in the source. + LLVM_PREFERRED_TYPE(bool) + unsigned EnforceDefaultTemplateArgs : 1; + /// Whether we can use 'bool' rather than '_Bool' (even if the language /// doesn't actually have 'bool', because, e.g., it is defined as a macro). LLVM_PREFERRED_TYPE(bool) @@ -339,6 +345,10 @@ struct PrintingPolicy { LLVM_PREFERRED_TYPE(bool) unsigned FullyQualifiedName : 1; + /// Enforce fully qualified name printing for elaborated types. + LLVM_PREFERRED_TYPE(bool) + unsigned EnforceScopeForElaboratedTypes : 1; + /// When true does not print definition of a type. E.g. 
/// \code /// template class C0 : public C1 {...} diff --git a/clang/lib/AST/TypePrinter.cpp b/clang/lib/AST/TypePrinter.cpp index 636ddaddf8769..49eb096cf369f 100644 --- a/clang/lib/AST/TypePrinter.cpp +++ b/clang/lib/AST/TypePrinter.cpp @@ -101,7 +101,7 @@ class ElaboratedTypePolicyRAII { SuppressTagKeyword = Policy.SuppressTagKeyword; SuppressScope = Policy.SuppressScope; Policy.SuppressTagKeyword = true; - Policy.SuppressScope = true; + Policy.SuppressScope = !Policy.EnforceScopeForElaboratedTypes; } ~ElaboratedTypePolicyRAII() { @@ -1728,8 +1728,10 @@ void TypePrinter::printElaboratedBefore(const ElaboratedType *T, Policy.SuppressScope = OldSupressScope; return; } - if (Qualifier && !(Policy.SuppressTypedefs && - T->getNamedType()->getTypeClass() == Type::Typedef)) + if (Qualifier && + !(Policy.SuppressTypedefs && + T->getNamedType()->getTypeClass() == Type::Typedef) && + !Policy.EnforceScopeForElaboratedTypes) Qualifier->print(OS, Policy); } @@ -2220,15 +2222,6 @@ static void printArgument(const TemplateArgument &A, const PrintingPolicy &PP, A.print(PP, OS, IncludeType); } -static void printArgument(const TemplateArgumentLoc &A, - const PrintingPolicy &PP, llvm::raw_ostream &OS, - bool IncludeType) { - const TemplateArgument::ArgKind &Kind = A.getArgument().getKind(); - if (Kind == TemplateArgument::ArgKind::Type) - return A.getTypeSourceInfo()->getType().print(OS, PP); - return A.getArgument().print(PP, OS, IncludeType); -} - static bool isSubstitutedTemplateArgument(ASTContext &Ctx, TemplateArgument Arg, TemplateArgument Pattern, ArrayRef Args, @@ -2399,15 +2392,40 @@ template static void printTo(raw_ostream &OS, ArrayRef Args, const PrintingPolicy &Policy, const TemplateParameterList *TPL, bool IsPack, unsigned ParmIndex) { - // Drop trailing template arguments that match default arguments. 
- if (TPL && Policy.SuppressDefaultTemplateArgs && - !Policy.PrintCanonicalTypes && !Args.empty() && !IsPack && + llvm::SmallVector ArgsToPrint; + for (const TA &A : Args) + ArgsToPrint.push_back(getArgument(A)); + if (TPL && !Policy.PrintCanonicalTypes && !IsPack && Args.size() <= TPL->size()) { - llvm::SmallVector OrigArgs; - for (const TA &A : Args) - OrigArgs.push_back(getArgument(A)); - while (!Args.empty() && getArgument(Args.back()).getIsDefaulted()) - Args = Args.drop_back(); + // Drop trailing template arguments that match default arguments. + if (Policy.SuppressDefaultTemplateArgs) { + while (!ArgsToPrint.empty() && + getArgument(ArgsToPrint.back()).getIsDefaulted()) + ArgsToPrint.pop_back(); + } else if (Policy.EnforceDefaultTemplateArgs) { + for (unsigned I = Args.size(); I < TPL->size(); ++I) { + auto Param = TPL->getParam(I); + if (auto *TTPD = dyn_cast(Param)) { + // If we met a non default-argument past provided list of arguments, + // it is either a pack which must be the last arguments, or provided + // argument list was problematic. Bail out either way. Do the same + // for each kind of template argument. + if (!TTPD->hasDefaultArgument()) + break; + ArgsToPrint.push_back(getArgument(TTPD->getDefaultArgument())); + } else if (auto *TTPD = dyn_cast(Param)) { + if (!TTPD->hasDefaultArgument()) + break; + ArgsToPrint.push_back(getArgument(TTPD->getDefaultArgument())); + } else if (auto *NTTPD = dyn_cast(Param)) { + if (!NTTPD->hasDefaultArgument()) + break; + ArgsToPrint.push_back(getArgument(NTTPD->getDefaultArgument())); + } else { + llvm_unreachable("unexpected template parameter"); + } + } + } } const char *Comma = Policy.MSVCFormatting ? "," : ", "; @@ -2416,7 +2434,7 @@ printTo(raw_ostream &OS, ArrayRef Args, const PrintingPolicy &Policy, bool NeedSpace = false; bool FirstArg = true; - for (const auto &Arg : Args) { + for (const auto &Arg : ArgsToPrint) { // Print the argument into a string. 
SmallString<128> Buf; llvm::raw_svector_ostream ArgOS(Buf); diff --git a/clang/lib/Sema/SemaSYCL.cpp b/clang/lib/Sema/SemaSYCL.cpp index 7d53638c8eff3..767dde6512b83 100644 --- a/clang/lib/Sema/SemaSYCL.cpp +++ b/clang/lib/Sema/SemaSYCL.cpp @@ -6509,16 +6509,46 @@ void SYCLIntegrationHeader::emit(raw_ostream &O) { O << "extern \"C\" "; std::string ParmList; bool FirstParam = true; + Policy.SuppressDefaultTemplateArgs = false; for (ParmVarDecl *Param : K.SyclKernel->parameters()) { if (FirstParam) FirstParam = false; else ParmList += ", "; - ParmList += Param->getType().getCanonicalType().getAsString(); + ParmList += Param->getType().getCanonicalType().getAsString(Policy); } FunctionTemplateDecl *FTD = K.SyclKernel->getPrimaryTemplate(); Policy.SuppressDefinition = true; Policy.PolishForDeclaration = true; + Policy.FullyQualifiedName = true; + Policy.EnforceScopeForElaboratedTypes = true; + + // Now we need to print the declaration of the kernel itself. + // Example: + // template struct Arg { + // T val; + // }; + // For the following free function kernel: + // template + // SYCL_EXT_ONEAPI_FUNCTION_PROPERTY( + // (ext::oneapi::experimental::nd_range_kernel<1>)) + // void foo(Arg arg) {} + // Integration header must contain the following declaration: + // template + // void foo(Arg arg); + // SuppressDefaultTemplateArguments is a downstream addition that suppresses + // default template arguments in the function declaration. It should be set + // to true to emit function declaration that won't cause any compilation + // errors when present in the integration header. + // To print Arg in the function declaration and shim functions we + // need to disable default arguments printing suppression via community flag + // SuppressDefaultTemplateArgs, otherwise they will be suppressed even for + // canonical types or if even written in the original source code. 
+ Policy.SuppressDefaultTemplateArguments = true; + // EnforceDefaultTemplateArgs is a downstream addition that forces printing + // template arguments that match default template arguments while printing + // template-ids, even if the source code doesn't reference them. + Policy.EnforceDefaultTemplateArgs = true; if (FTD) { FTD->print(O, Policy); } else { diff --git a/clang/test/CodeGenSYCL/free_function_default_template_arguments.cpp b/clang/test/CodeGenSYCL/free_function_default_template_arguments.cpp new file mode 100644 index 0000000000000..808f7b93d8112 --- /dev/null +++ b/clang/test/CodeGenSYCL/free_function_default_template_arguments.cpp @@ -0,0 +1,100 @@ +// RUN: %clang_cc1 -fsycl-is-device -internal-isystem %S/Inputs -triple spir64-unknown-unknown -sycl-std=2020 -fsycl-int-header=%t.h %s +// RUN: FileCheck -input-file=%t.h %s + +// This test checks integration header contents for free functions kernels with +// parameter types that have default template arguments. + +#include "mock_properties.hpp" +#include "sycl.hpp" + +namespace ns { + +struct notatuple { + int a; +}; + +namespace ns1 { +template +class hasDefaultArg { + +}; +} + +template struct Arg { + T val; +}; + +[[__sycl_detail__::add_ir_attributes_function("sycl-single-task-kernel", + 2)]] void +simple(Arg){ +} + +} + +[[__sycl_detail__::add_ir_attributes_function("sycl-single-task-kernel", + 2)]] void +simple1(ns::Arg>){ +} + + +template +[[__sycl_detail__::add_ir_attributes_function("sycl-nd-range-kernel", 2)]] void +templated(ns::Arg, T end) { +} + +template void templated(ns::Arg, int); + +using namespace ns; + +template +[[__sycl_detail__::add_ir_attributes_function("sycl-nd-range-kernel", 2)]] void +templated2(Arg, T end) { +} + +template void templated2(Arg, int); + +template +[[__sycl_detail__::add_ir_attributes_function("sycl-nd-range-kernel", 2)]] void +templated3(Arg, int, int>, T end) { +} + +template void templated3(Arg, int, int>, int); + +// CHECK: Forward declarations of kernel 
and its argument types: +// CHECK-NEXT: namespace ns { +// CHECK-NEXT: struct notatuple; +// CHECK-NEXT: } +// CHECK-NEXT: namespace ns { +// CHECK-NEXT: template struct Arg; +// CHECK-NEXT: } + +// CHECK: void ns::simple(ns::Arg); +// CHECK-NEXT: static constexpr auto __sycl_shim1() { +// CHECK-NEXT: return (void (*)(struct ns::Arg))simple; +// CHECK-NEXT: } + +// CHECK: Forward declarations of kernel and its argument types: +// CHECK: namespace ns { +// CHECK: namespace ns1 { +// CHECK-NEXT: template class hasDefaultArg; +// CHECK-NEXT: } + +// CHECK: void simple1(ns::Arg, int, 12, ns::notatuple>); +// CHECK-NEXT: static constexpr auto __sycl_shim2() { +// CHECK-NEXT: return (void (*)(struct ns::Arg, int, 12, struct ns::notatuple>))simple1; +// CHECK-NEXT: } + +// CHECK: template void templated(ns::Arg, T end); +// CHECK-NEXT: static constexpr auto __sycl_shim3() { +// CHECK-NEXT: return (void (*)(struct ns::Arg, int))templated; +// CHECK-NEXT: } + +// CHECK: template void templated2(ns::Arg, T end); +// CHECK-NEXT: static constexpr auto __sycl_shim4() { +// CHECK-NEXT: return (void (*)(struct ns::Arg, int))templated2; +// CHECK-NEXT: } + +// CHECK: template void templated3(ns::Arg, int, int>, T end); +// CHECK-NEXT: static constexpr auto __sycl_shim5() { +// CHECK-NEXT: return (void (*)(struct ns::Arg, int, int>, int))templated3; +// CHECK-NEXT: } From 2e11d261bd62db670a1ca2a630385860e8600003 Mon Sep 17 00:00:00 2001 From: Nick Sarnie Date: Tue, 19 Nov 2024 00:00:33 +0900 Subject: [PATCH 10/36] [CI] Windows test hang detection should always run (#16100) If the tests fail or the job is cancelled this should run. 
Signed-off-by: Sarnie, Nick --- .github/workflows/sycl-windows-run-tests.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/sycl-windows-run-tests.yml b/.github/workflows/sycl-windows-run-tests.yml index 65c2df1290771..9186392f8fa46 100644 --- a/.github/workflows/sycl-windows-run-tests.yml +++ b/.github/workflows/sycl-windows-run-tests.yml @@ -88,6 +88,7 @@ jobs: cmake --build build-e2e --target check-sycl-e2e - name: Detect hung tests shell: powershell + if: always() run: | $exitCode = 0 $hungTests = Get-Process | Where-Object { ($_.Path -match "llvm\\install") -or ($_.Path -match "llvm\\build-e2e") } From c5845a7276b012b996b68b26bb66df5705361e71 Mon Sep 17 00:00:00 2001 From: aelovikov-intel Date: Mon, 18 Nov 2024 08:56:04 -0800 Subject: [PATCH 11/36] [NFCI][SYCL] Remove most uses of `ContainsProperty` (#16103) First, `properties` itself has an interface that does exactly that. Second, this trait uses `std::tuple` which is relatively expensive to instantiate and that happens *on top* of the normal `properties` instantiation that doesn't use `std::tuple` anymore. Remaining uses are related to `ConflictingProperties` which is ugly enough to be addressed in a separate PR.
--- .../fpga_annotated_properties.hpp | 25 ++++++------- .../annotated_arg/annotated_arg.hpp | 8 ++--- .../annotated_ptr/annotated_ptr.hpp | 32 ++++++++--------- .../annotated_ptr_properties.hpp | 36 ++++++++++--------- .../experimental/annotated_usm/alloc_util.hpp | 8 ++--- 5 files changed, 51 insertions(+), 58 deletions(-) diff --git a/sycl/include/sycl/ext/intel/experimental/fpga_annotated_properties.hpp b/sycl/include/sycl/ext/intel/experimental/fpga_annotated_properties.hpp index ea9f531c764b9..bbf55d469809a 100644 --- a/sycl/include/sycl/ext/intel/experimental/fpga_annotated_properties.hpp +++ b/sycl/include/sycl/ext/intel/experimental/fpga_annotated_properties.hpp @@ -319,28 +319,25 @@ struct propagateToPtrAnnotation : std::true_type {}; //===----------------------------------------------------------------------===// // namespace detail { -template struct checkValidFPGAPropertySet { - using list = std::tuple; - static constexpr bool has_BufferLocation = - ContainsProperty::value; +template struct checkValidFPGAPropertySet { + template + static constexpr bool has_one_of = + ((property_list_t::template has_property() || ...)); + + static constexpr bool has_BufferLocation = has_one_of; static constexpr bool has_InterfaceConfig = - ContainsProperty::value || - ContainsProperty::value || - ContainsProperty::value || - ContainsProperty::value || - ContainsProperty::value || - ContainsProperty::value; + has_one_of; static constexpr bool value = !(!has_BufferLocation && has_InterfaceConfig); }; -template struct checkHasConduitAndRegisterMap { - using list = std::tuple; +template struct checkHasConduitAndRegisterMap { static constexpr bool has_Conduit = - ContainsProperty::value; + property_list_t::template has_property(); static constexpr bool has_RegisterMap = - ContainsProperty::value; + property_list_t::template has_property(); static constexpr bool value = !(has_Conduit && has_RegisterMap); }; } // namespace detail diff --git 
a/sycl/include/sycl/ext/oneapi/experimental/annotated_arg/annotated_arg.hpp b/sycl/include/sycl/ext/oneapi/experimental/annotated_arg/annotated_arg.hpp index 91baca3e14e3f..851497b7ead3b 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/annotated_arg/annotated_arg.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/annotated_arg/annotated_arg.hpp @@ -214,13 +214,13 @@ __SYCL_TYPE(annotated_arg) annotated_arg> { "The property list contains invalid property."); // check the set if FPGA specificed properties are used static constexpr bool hasValidFPGAProperties = - detail::checkValidFPGAPropertySet::value; + detail::checkValidFPGAPropertySet::value; static_assert(hasValidFPGAProperties, "FPGA Interface properties (i.e. awidth, dwidth, etc.) " "can only be set with BufferLocation together."); // check if conduit and register_map properties are specified together static constexpr bool hasConduitAndRegisterMapProperties = - detail::checkHasConduitAndRegisterMap::value; + detail::checkHasConduitAndRegisterMap::value; static_assert(hasConduitAndRegisterMapProperties, "The properties conduit and register_map cannot be " "specified at the same time."); @@ -447,13 +447,13 @@ __SYCL_TYPE(annotated_arg) annotated_arg> { "The property list contains invalid property."); // check the set if FPGA specificed properties are used static constexpr bool hasValidFPGAProperties = - detail::checkValidFPGAPropertySet::value; + detail::checkValidFPGAPropertySet::value; static_assert(hasValidFPGAProperties, "FPGA Interface properties (i.e. awidth, dwidth, etc.) 
" "can only be set with BufferLocation together."); // check if conduit and register_map properties are specified together static constexpr bool hasConduitAndRegisterMapProperties = - detail::checkHasConduitAndRegisterMap::value; + detail::checkHasConduitAndRegisterMap::value; static_assert(hasConduitAndRegisterMapProperties, "The properties conduit and register_map cannot be " "specified at the same time."); diff --git a/sycl/include/sycl/ext/oneapi/experimental/annotated_ptr/annotated_ptr.hpp b/sycl/include/sycl/ext/oneapi/experimental/annotated_ptr/annotated_ptr.hpp index c9a59a8d85c0c..13db0b377aecc 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/annotated_ptr/annotated_ptr.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/annotated_ptr/annotated_ptr.hpp @@ -50,10 +50,6 @@ template constexpr bool is_ann_ref_v = is_ann_ref_impl>::value; -template -using contains_alignment = - detail::ContainsProperty>; - // filter properties that are applied on annotations template using annotation_filter = decltype(filter_properties( @@ -392,38 +388,38 @@ __SYCL_TYPE(annotated_ptr) annotated_ptr> { // turned off for these operators to make sure the complete error notes are // printed // clang-format off - template ::value, + template (), class = std::enable_if_t> reference operator[](std::ptrdiff_t idx) const noexcept { return reference(m_Ptr + idx); } - template ::value, + template (), class = std::enable_if_t> auto operator[](std::ptrdiff_t idx) const noexcept -> decltype("operator[] is not available when alignment is specified!") = delete; - template ::value, + template (), class = std::enable_if_t> annotated_ptr operator+(size_t offset) const noexcept { return annotated_ptr(m_Ptr + offset); } - template ::value, + template (), class = std::enable_if_t> auto operator+(size_t offset) const noexcept -> decltype("operator+ is not available when alignment is specified!") = delete; - template ::value, + template (), class = std::enable_if_t> annotated_ptr &operator++() 
noexcept { m_Ptr += 1; return *this; } - template ::value, + template (), class = std::enable_if_t> auto operator++() noexcept -> decltype("operator++ is not available when alignment is specified!") = delete; - template ::value, + template (), class = std::enable_if_t> annotated_ptr operator++(int) noexcept { auto tmp = *this; @@ -431,22 +427,22 @@ __SYCL_TYPE(annotated_ptr) annotated_ptr> { return tmp; } - template ::value, + template (), class = std::enable_if_t> auto operator++(int) noexcept -> decltype("operator++ is not available when alignment is specified!") = delete; - template ::value, + template (), class = std::enable_if_t> annotated_ptr &operator--() noexcept { m_Ptr -= 1; return *this; } - template ::value, + template (), class = std::enable_if_t> auto operator--() noexcept -> decltype("operator-- is not available when alignment is specified!") = delete; - template ::value, + template (), class = std::enable_if_t> annotated_ptr operator--(int) noexcept { auto tmp = *this; @@ -454,7 +450,7 @@ __SYCL_TYPE(annotated_ptr) annotated_ptr> { return tmp; } - template ::value, + template (), class = std::enable_if_t> auto operator--(int) noexcept -> decltype("operator-- is not available when alignment is specified!") = delete; @@ -485,13 +481,13 @@ __SYCL_TYPE(annotated_ptr) annotated_ptr> { "The property list contains invalid property."); // check the set if FPGA specificed properties are used static constexpr bool hasValidFPGAProperties = - detail::checkValidFPGAPropertySet::value; + detail::checkValidFPGAPropertySet::value; static_assert(hasValidFPGAProperties, "FPGA Interface properties (i.e. awidth, dwidth, etc.) 
" "can only be set with BufferLocation together."); // check if conduit and register_map properties are specified together static constexpr bool hasConduitAndRegisterMapProperties = - detail::checkHasConduitAndRegisterMap::value; + detail::checkHasConduitAndRegisterMap::value; static_assert(hasConduitAndRegisterMapProperties, "The properties conduit and register_map cannot be " "specified at the same time."); diff --git a/sycl/include/sycl/ext/oneapi/experimental/annotated_ptr/annotated_ptr_properties.hpp b/sycl/include/sycl/ext/oneapi/experimental/annotated_ptr/annotated_ptr_properties.hpp index fcdd05f3a497e..55dfa02fef68f 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/annotated_ptr/annotated_ptr_properties.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/annotated_ptr/annotated_ptr_properties.hpp @@ -55,24 +55,26 @@ struct PropertyMetaInfo> { static constexpr sycl::usm::alloc value = Kind; }; -template struct IsUsmKindDevice : std::false_type {}; -template -struct IsUsmKindDevice> - : detail::ContainsProperty, - std::tuple> {}; - -template struct IsUsmKindHost : std::false_type {}; -template -struct IsUsmKindHost> - : detail::ContainsProperty, - std::tuple> {}; - -template struct IsUsmKindShared : std::false_type {}; -template -struct IsUsmKindShared> - : detail::ContainsProperty, - std::tuple> {}; +template +inline constexpr bool is_usm_kind = []() constexpr { + if constexpr (PropertyListT::template has_property()) + return PropertyListT::template get_property() == + usm_kind; + else + return false; +}(); +template +struct IsUsmKindDevice + : std::bool_constant> { +}; +template +struct IsUsmKindHost + : std::bool_constant> {}; +template +struct IsUsmKindShared + : std::bool_constant> { +}; } // namespace detail } // namespace experimental diff --git a/sycl/include/sycl/ext/oneapi/experimental/annotated_usm/alloc_util.hpp b/sycl/include/sycl/ext/oneapi/experimental/annotated_usm/alloc_util.hpp index 6548f3d3c3673..cb8dc3bbb7376 100644 --- 
a/sycl/include/sycl/ext/oneapi/experimental/annotated_usm/alloc_util.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/annotated_usm/alloc_util.hpp @@ -32,11 +32,9 @@ using MergeUsmKind = decltype(properties{usm_kind})>; // Check if a property list contains the a certain property -template struct HasProperty {}; - -template -struct HasProperty> - : detail::ContainsProperty> {}; +template +struct HasProperty + : std::bool_constant()> {}; template using HasAlign = HasProperty; From a0401d0bc3b26162a83e56c9b1960a9068f8fe9a Mon Sep 17 00:00:00 2001 From: Buildbot for SYCL Date: Tue, 19 Nov 2024 01:08:47 +0800 Subject: [PATCH 12/36] [GHA] Uplift Linux IGC Dev RT version to igc-dev-0b4b682 (#16104) Scheduled igc dev drivers uplift Co-authored-by: GitHub Actions --- devops/dependencies-igc-dev.json | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/devops/dependencies-igc-dev.json b/devops/dependencies-igc-dev.json index 6b2ee8953e920..72a9ef49b2e89 100644 --- a/devops/dependencies-igc-dev.json +++ b/devops/dependencies-igc-dev.json @@ -1,10 +1,10 @@ { "linux": { "igc_dev": { - "github_tag": "igc-dev-6ba42ba", - "version": "6ba42ba", - "updated_at": "2024-11-13T17:40:54Z", - "url": "https://api.github.com/repos/intel/intel-graphics-compiler/actions/artifacts/2183383213/zip", + "github_tag": "igc-dev-0b4b682", + "version": "0b4b682", + "updated_at": "2024-11-17T01:09:50Z", + "url": "https://api.github.com/repos/intel/intel-graphics-compiler/actions/artifacts/2197388704/zip", "root": "{DEPS_ROOT}/opencl/runtime/linux/oclgpu" } } From 69572a2843dd32656332c10f45ae0b6a3f8f12ab Mon Sep 17 00:00:00 2001 From: aelovikov-intel Date: Mon, 18 Nov 2024 09:53:07 -0800 Subject: [PATCH 13/36] [SYCL] Refactor compile-time properties' `detail::` traits (#16099) * Prefer to perform the check on values and not keys * Use `std::is_empty_v` to distinguish between compile/run-time properties * Remove unused traits (including the ones that became such after changes 
above) Technically, I'm (almost?) able to remove the `compile_time_property_key` and `run_time_property_key` base classes at this point, but I'd prefer to do that later once `property_value` template is removed from the spec too, as both of these changes will require updating all the properties and I'd rather do that once. --- .../experimental/annotated_usm/alloc_util.hpp | 16 ++--- .../sycl/ext/oneapi/properties/property.hpp | 23 ------- .../ext/oneapi/properties/property_utils.hpp | 14 ---- .../ext/oneapi/properties/property_value.hpp | 43 ++++-------- .../annotated_usm/fake_properties.hpp | 66 +++++++++---------- 5 files changed, 55 insertions(+), 107 deletions(-) diff --git a/sycl/include/sycl/ext/oneapi/experimental/annotated_usm/alloc_util.hpp b/sycl/include/sycl/ext/oneapi/experimental/annotated_usm/alloc_util.hpp index cb8dc3bbb7376..450f6b087d9c3 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/annotated_usm/alloc_util.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/annotated_usm/alloc_util.hpp @@ -89,13 +89,13 @@ struct ValidAllocPropertyList> IsRuntimePropertyValid::value) && ValidAllocPropertyList< T, detail::properties_t>::value> { + static_assert(is_property_value_v); + static constexpr bool is_compile_time = std::is_empty_v; // check if a compile-time property is valid for annotated_ptr - static_assert(!detail::IsCompileTimePropertyValue::value || - is_valid_property::value, + static_assert(!is_compile_time || is_valid_property::value, "Found invalid compile-time property in the property list."); // check if a runtime property is valid for malloc - static_assert(!detail::IsRuntimeProperty::value || - IsRuntimePropertyValid::value, + static_assert(is_compile_time || IsRuntimePropertyValid::value, "Found invalid runtime property in the property list."); }; @@ -110,15 +110,15 @@ template <> struct GetCompileTimeProperties { template struct GetCompileTimeProperties> { using type = - std::conditional_t::value, - detail::properties_t, 
empty_properties_t>; + std::conditional_t, detail::properties_t, + empty_properties_t>; }; template struct GetCompileTimeProperties> { using filtered_this_property_t = - std::conditional_t::value, - detail::properties_t, empty_properties_t>; + std::conditional_t, detail::properties_t, + empty_properties_t>; using filtered_other_properties_t = typename GetCompileTimeProperties>::type; using type = detail::merged_properties_t struct PropertyID { static_cast(PropertyToKind::Kind); }; -// Trait for identifying runtime properties. -template -struct IsRuntimeProperty - : std::bool_constant< - !is_property_list_v && - std::is_base_of_v && - !std::is_base_of_v> {}; - -// Trait for identifying compile-time properties. -template -struct IsCompileTimeProperty - : std::bool_constant< - !is_property_list_v && - std::is_base_of_v && - std::is_base_of_v> {}; - -// Checks if a type is either a runtime property or if it is a compile-time -// property -template struct IsProperty { - static constexpr bool value = - IsRuntimeProperty::value || IsCompileTimeProperty::value; -}; - // Trait for property compile-time meta names and values. template struct PropertyMetaInfo { // Some properties don't have meaningful compile-time values. diff --git a/sycl/include/sycl/ext/oneapi/properties/property_utils.hpp b/sycl/include/sycl/ext/oneapi/properties/property_utils.hpp index 2e10699a65fea..0fdbc1e82e518 100644 --- a/sycl/include/sycl/ext/oneapi/properties/property_utils.hpp +++ b/sycl/include/sycl/ext/oneapi/properties/property_utils.hpp @@ -43,20 +43,6 @@ template using nth_type_t = typename nth_type::type; #endif -//****************************************************************************** -// Property identification -//****************************************************************************** - -// Checks that all types in a tuple are valid properties. 
-template struct AllPropertyValues {}; -template -struct AllPropertyValues> : std::true_type {}; -template -struct AllPropertyValues> - : std::conditional_t::value, - AllPropertyValues>, - std::false_type> {}; - //****************************************************************************** // Property value tooling //****************************************************************************** diff --git a/sycl/include/sycl/ext/oneapi/properties/property_value.hpp b/sycl/include/sycl/ext/oneapi/properties/property_value.hpp index 024819d3f77c1..c3b825e6054d6 100644 --- a/sycl/include/sycl/ext/oneapi/properties/property_value.hpp +++ b/sycl/include/sycl/ext/oneapi/properties/property_value.hpp @@ -48,7 +48,7 @@ struct property_value PropertyT> {}; template -constexpr std::enable_if_t::value, +constexpr std::enable_if_t>, bool> operator==(const property_value &, const property_value &) { @@ -56,32 +56,30 @@ operator==(const property_value &, } template -constexpr std::enable_if_t::value, +constexpr std::enable_if_t>, bool> operator!=(const property_value &, const property_value &) { return (!std::is_same::value || ...); } -template struct is_property_value { - static constexpr bool value = - detail::IsRuntimeProperty::value && is_property_key::value; -}; -template struct is_property_value_of { - static constexpr bool value = - detail::IsRuntimeProperty::value && is_property_key_of::value; -}; -// Specialization for compile-time-constant properties template -struct is_property_value> - : is_property_key {}; -template -struct is_property_value_of> - : is_property_key_of {}; +struct is_property_value + : std::bool_constant && + std::is_base_of_v> {}; template inline constexpr bool is_property_value_v = is_property_value::value; +template struct is_property_value_of { + static constexpr bool value = []() constexpr { + if constexpr (is_property_value_v) + return is_property_key_of::value; + else + return false; + }(); +}; + namespace detail { // Specialization of 
PropertyID for propagating IDs through property_value. @@ -89,19 +87,6 @@ template struct PropertyID> : PropertyID {}; -// Checks if a type is a compile-time property values. -template -struct IsCompileTimePropertyValue : std::false_type {}; -template -struct IsCompileTimePropertyValue> - : IsCompileTimeProperty {}; - -// Checks if a type is a valid property value, i.e either runtime property or -// property_value with a valid compile-time property -template struct IsPropertyValue { - static constexpr bool value = - IsRuntimeProperty::value || IsCompileTimePropertyValue::value; -}; } // namespace detail } // namespace ext::oneapi::experimental } // namespace _V1 diff --git a/sycl/test/extensions/annotated_usm/fake_properties.hpp b/sycl/test/extensions/annotated_usm/fake_properties.hpp index 0462b96e8e24c..f2151f32adfdc 100644 --- a/sycl/test/extensions/annotated_usm/fake_properties.hpp +++ b/sycl/test/extensions/annotated_usm/fake_properties.hpp @@ -62,39 +62,39 @@ struct foz : detail::run_time_property_key { }; // clang-format off -struct rt_prop1 : detail::run_time_property_key {}; -struct rt_prop2 : detail::run_time_property_key {}; -struct rt_prop3 : detail::run_time_property_key {}; -struct rt_prop4 : detail::run_time_property_key {}; -struct rt_prop5 : detail::run_time_property_key {}; -struct rt_prop6 : detail::run_time_property_key {}; -struct rt_prop7 : detail::run_time_property_key {}; -struct rt_prop8 : detail::run_time_property_key {}; -struct rt_prop9 : detail::run_time_property_key {}; -struct rt_prop10 : detail::run_time_property_key {}; -struct rt_prop11 : detail::run_time_property_key {}; -struct rt_prop12 : detail::run_time_property_key {}; -struct rt_prop13 : detail::run_time_property_key {}; -struct rt_prop14 : detail::run_time_property_key {}; -struct rt_prop15 : detail::run_time_property_key {}; -struct rt_prop16 : detail::run_time_property_key {}; -struct rt_prop17 : detail::run_time_property_key {}; -struct rt_prop18 : 
detail::run_time_property_key {}; -struct rt_prop19 : detail::run_time_property_key {}; -struct rt_prop20 : detail::run_time_property_key {}; -struct rt_prop21 : detail::run_time_property_key {}; -struct rt_prop22 : detail::run_time_property_key {}; -struct rt_prop23 : detail::run_time_property_key {}; -struct rt_prop24 : detail::run_time_property_key {}; -struct rt_prop25 : detail::run_time_property_key {}; -struct rt_prop26 : detail::run_time_property_key {}; -struct rt_prop27 : detail::run_time_property_key {}; -struct rt_prop28 : detail::run_time_property_key {}; -struct rt_prop29 : detail::run_time_property_key {}; -struct rt_prop30 : detail::run_time_property_key {}; -struct rt_prop31 : detail::run_time_property_key {}; -struct rt_prop32 : detail::run_time_property_key {}; -struct rt_prop33 : detail::run_time_property_key {}; +struct rt_prop1 : detail::run_time_property_key { int x; }; +struct rt_prop2 : detail::run_time_property_key { int x; }; +struct rt_prop3 : detail::run_time_property_key { int x; }; +struct rt_prop4 : detail::run_time_property_key { int x; }; +struct rt_prop5 : detail::run_time_property_key { int x; }; +struct rt_prop6 : detail::run_time_property_key { int x; }; +struct rt_prop7 : detail::run_time_property_key { int x; }; +struct rt_prop8 : detail::run_time_property_key { int x; }; +struct rt_prop9 : detail::run_time_property_key { int x; }; +struct rt_prop10 : detail::run_time_property_key { int x; }; +struct rt_prop11 : detail::run_time_property_key { int x; }; +struct rt_prop12 : detail::run_time_property_key { int x; }; +struct rt_prop13 : detail::run_time_property_key { int x; }; +struct rt_prop14 : detail::run_time_property_key { int x; }; +struct rt_prop15 : detail::run_time_property_key { int x; }; +struct rt_prop16 : detail::run_time_property_key { int x; }; +struct rt_prop17 : detail::run_time_property_key { int x; }; +struct rt_prop18 : detail::run_time_property_key { int x; }; +struct rt_prop19 : 
detail::run_time_property_key { int x; }; +struct rt_prop20 : detail::run_time_property_key { int x; }; +struct rt_prop21 : detail::run_time_property_key { int x; }; +struct rt_prop22 : detail::run_time_property_key { int x; }; +struct rt_prop23 : detail::run_time_property_key { int x; }; +struct rt_prop24 : detail::run_time_property_key { int x; }; +struct rt_prop25 : detail::run_time_property_key { int x; }; +struct rt_prop26 : detail::run_time_property_key { int x; }; +struct rt_prop27 : detail::run_time_property_key { int x; }; +struct rt_prop28 : detail::run_time_property_key { int x; }; +struct rt_prop29 : detail::run_time_property_key { int x; }; +struct rt_prop30 : detail::run_time_property_key { int x; }; +struct rt_prop31 : detail::run_time_property_key { int x; }; +struct rt_prop32 : detail::run_time_property_key { int x; }; +struct rt_prop33 : detail::run_time_property_key { int x; }; // clang-format on using foo_key = foo; From 121997271ec629b69e4eb21f320971a1d2d67dbe Mon Sep 17 00:00:00 2001 From: Udit Agarwal Date: Mon, 18 Nov 2024 10:21:06 -0800 Subject: [PATCH 14/36] [SYCL] Remove build options from fast kernel cache key (#16101) Build options were removed from Kernel cache key in https://github.com/intel/llvm/pull/11351 to reduce the kernel lookup overhead. This PR removes build options from fast kernel cache key as well. Quoting https://github.com/intel/llvm/pull/11351 > This can be done because they are either empty or set by the environment variable so they stay the same for the entire program lifecycle for the purposes of in-memory caching. 
--- sycl/source/detail/kernel_program_cache.hpp | 19 ++++++++++++++----- .../program_manager/program_manager.cpp | 10 ++-------- 2 files changed, 16 insertions(+), 13 deletions(-) diff --git a/sycl/source/detail/kernel_program_cache.hpp b/sycl/source/detail/kernel_program_cache.hpp index 5794d6930016a..f58cda059bcce 100644 --- a/sycl/source/detail/kernel_program_cache.hpp +++ b/sycl/source/detail/kernel_program_cache.hpp @@ -163,10 +163,19 @@ class KernelProgramCache { ::boost::unordered_map; using KernelFastCacheKeyT = - std::tuple; + std::tuple; + using KernelFastCacheValT = - std::tuple; + std::tuple; + // This container is used as a fast path for retrieving cached kernels. // unordered_flat_map is used here to reduce lookup overhead. // The slow path is used only once for each newly created kernel, so the @@ -283,7 +292,7 @@ class KernelProgramCache { std::unique_lock Lock(MKernelFastCacheMutex); auto It = MKernelFastCache.find(CacheKey); if (It != MKernelFastCache.end()) { - traceKernel("Kernel fetched.", std::get<3>(CacheKey), true); + traceKernel("Kernel fetched.", std::get<2>(CacheKey), true); return It->second; } return std::make_tuple(nullptr, nullptr, nullptr, nullptr); @@ -294,7 +303,7 @@ class KernelProgramCache { std::unique_lock Lock(MKernelFastCacheMutex); // if no insertion took place, thus some other thread has already inserted // smth in the cache - traceKernel("Kernel inserted.", std::get<3>(CacheKey), true); + traceKernel("Kernel inserted.", std::get<2>(CacheKey), true); MKernelFastCache.emplace(CacheKey, CacheVal); } diff --git a/sycl/source/detail/program_manager/program_manager.cpp b/sycl/source/detail/program_manager/program_manager.cpp index d6602725663ff..8f13c0745ad21 100644 --- a/sycl/source/detail/program_manager/program_manager.cpp +++ b/sycl/source/detail/program_manager/program_manager.cpp @@ -969,17 +969,11 @@ ProgramManager::getOrCreateKernel(const ContextImplPtr &ContextImpl, using KernelArgMaskPairT = 
KernelProgramCache::KernelArgMaskPairT; KernelProgramCache &Cache = ContextImpl->getKernelProgramCache(); - - std::string CompileOpts, LinkOpts; SerializedObj SpecConsts; - applyOptionsFromEnvironment(CompileOpts, LinkOpts); - // Should always come last! - appendCompileEnvironmentVariablesThatAppend(CompileOpts); - appendLinkEnvironmentVariablesThatAppend(LinkOpts); + ur_device_handle_t UrDevice = DeviceImpl->getHandleRef(); - auto key = std::make_tuple(std::move(SpecConsts), UrDevice, - CompileOpts + LinkOpts, KernelName); + auto key = std::make_tuple(std::move(SpecConsts), UrDevice, KernelName); if (SYCLConfig::get()) { auto ret_tuple = Cache.tryToGetKernelFast(key); constexpr size_t Kernel = 0; // see KernelFastCacheValT tuple From 33fe64c3973db2300cdc955a309de259e839dce8 Mon Sep 17 00:00:00 2001 From: Maosu Zhao Date: Tue, 19 Nov 2024 22:07:31 +0800 Subject: [PATCH 15/36] [DeviceSanitizer] Do sanitize for device globals in AddressSanitizer pass (#13678) Now UR already implemented API "urProgramGetGlobalVariablePointer", so we can use it to query the size of device globals and remove "__AsanDeviceGlobalCount". 
UR Part: https://github.com/oneapi-src/unified-runtime/pull/1584 --------- Co-authored-by: Callum Fare --- clang/lib/Driver/OffloadBundler.cpp | 7 +- .../llvm/SYCLLowerIR/SanitizeDeviceGlobal.h | 23 --- llvm/lib/SYCLLowerIR/CMakeLists.txt | 1 - llvm/lib/SYCLLowerIR/SanitizeDeviceGlobal.cpp | 144 ------------------ .../Instrumentation/AddressSanitizer.cpp | 87 +++++++++++ .../Transforms/Instrumentation/CMakeLists.txt | 1 + .../AddressSanitizer/SPIRV/device_global.ll | 13 ++ .../SPIRV/device_global_non_image_scope.ll | 11 ++ llvm/tools/sycl-post-link/sycl-post-link.cpp | 6 - sycl/cmake/modules/UnifiedRuntimeTag.cmake | 12 +- 10 files changed, 123 insertions(+), 182 deletions(-) delete mode 100644 llvm/include/llvm/SYCLLowerIR/SanitizeDeviceGlobal.h delete mode 100644 llvm/lib/SYCLLowerIR/SanitizeDeviceGlobal.cpp create mode 100644 llvm/test/Instrumentation/AddressSanitizer/SPIRV/device_global.ll create mode 100644 llvm/test/Instrumentation/AddressSanitizer/SPIRV/device_global_non_image_scope.ll diff --git a/clang/lib/Driver/OffloadBundler.cpp b/clang/lib/Driver/OffloadBundler.cpp index f6d555aaa8ce3..d026805d22f4a 100644 --- a/clang/lib/Driver/OffloadBundler.cpp +++ b/clang/lib/Driver/OffloadBundler.cpp @@ -688,8 +688,11 @@ class ObjectFileHandler final : public FileHandler { return std::move(Err); // If we are dealing with a bitcode file do not add special globals - // llvm.used and llvm.compiler.used to the list of defined symbols. - if (SF->isIR() && (Name == "llvm.used" || Name == "llvm.compiler.used")) + // llvm.used and llvm.compiler.used and __AsanDeviceGlobalMetadata to + // the list of defined symbols. + if (SF->isIR() && + (Name == "llvm.used" || Name == "llvm.compiler.used" || + Name == "__AsanDeviceGlobalMetadata")) continue; // Add symbol name with the target prefix to the buffer. 
diff --git a/llvm/include/llvm/SYCLLowerIR/SanitizeDeviceGlobal.h b/llvm/include/llvm/SYCLLowerIR/SanitizeDeviceGlobal.h deleted file mode 100644 index a0e7b2999b480..0000000000000 --- a/llvm/include/llvm/SYCLLowerIR/SanitizeDeviceGlobal.h +++ /dev/null @@ -1,23 +0,0 @@ -//===-- SanitizeDeviceGlobal.h - instrument device global for sanitizer ---===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// This pass adds red zone to each image scope device global and record the -// information like size, red zone size and beginning address. The information -// will be used by address sanitizer. -//===----------------------------------------------------------------------===// - -#include "llvm/IR/PassManager.h" - -namespace llvm { - -class SanitizeDeviceGlobalPass - : public PassInfoMixin { -public: - PreservedAnalyses run(Module &M, ModuleAnalysisManager &MAM); -}; - -} // namespace llvm diff --git a/llvm/lib/SYCLLowerIR/CMakeLists.txt b/llvm/lib/SYCLLowerIR/CMakeLists.txt index 0ce2a91f91a29..9f0b7fe7e43b0 100644 --- a/llvm/lib/SYCLLowerIR/CMakeLists.txt +++ b/llvm/lib/SYCLLowerIR/CMakeLists.txt @@ -69,7 +69,6 @@ add_llvm_component_library(LLVMSYCLLowerIR SYCLPropagateJointMatrixUsage.cpp SYCLVirtualFunctionsAnalysis.cpp SYCLUtils.cpp - SanitizeDeviceGlobal.cpp LocalAccessorToSharedMemory.cpp GlobalOffset.cpp diff --git a/llvm/lib/SYCLLowerIR/SanitizeDeviceGlobal.cpp b/llvm/lib/SYCLLowerIR/SanitizeDeviceGlobal.cpp deleted file mode 100644 index 81415b0f6f9dc..0000000000000 --- a/llvm/lib/SYCLLowerIR/SanitizeDeviceGlobal.cpp +++ /dev/null @@ -1,144 +0,0 @@ -//===-- SanitizeDeviceGlobal.cpp - instrument device global for sanitizer -===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
-// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// This pass adds red zone to each image scope device global and record the -// information like size, red zone size and beginning address. The information -// will be used by address sanitizer. -// TODO: Do this in AddressSanitizer pass when urProgramGetGlobalVariablePointer -// is implemented. -//===----------------------------------------------------------------------===// - -#include "llvm/SYCLLowerIR/SanitizeDeviceGlobal.h" -#include "llvm/IR/IRBuilder.h" -#include "llvm/SYCLLowerIR/DeviceGlobals.h" - -#define DEBUG_TYPE "SanitizeDeviceGlobal" - -using namespace llvm; - -namespace { - -// Add extra red zone to each image scope device globals if the module has been -// instrumented by sanitizer pass. And record their infomation like size, red -// zone size, beginning address. -static bool instrumentDeviceGlobal(Module &M) { - auto &DL = M.getDataLayout(); - IRBuilder<> IRB(M.getContext()); - SmallVector GlobalsToRemove; - SmallVector NewDeviceGlobals; - SmallVector DeviceGlobalMetadata; - - constexpr uint64_t MaxRZ = 1 << 18; - constexpr uint64_t MinRZ = 32; - - Type *IntTy = Type::getIntNTy(M.getContext(), DL.getPointerSizeInBits()); - - // Device global meta data is described by a structure - // size_t device_global_size - // size_t device_global_size_with_red_zone - // size_t beginning address of the device global - StructType *StructTy = StructType::get(IntTy, IntTy, IntTy); - - for (auto &G : M.globals()) { - // Non image scope device globals are implemented by device USM, and the - // out-of-bounds check for them will be done by sanitizer USM part. So we - // exclude them here. 
- if (!isDeviceGlobalVariable(G) || !hasDeviceImageScopeProperty(G)) - continue; - - Type *Ty = G.getValueType(); - const uint64_t SizeInBytes = DL.getTypeAllocSize(Ty); - const uint64_t RightRedzoneSize = [&] { - // The algorithm for calculating red zone size comes from - // llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp - uint64_t RZ = 0; - if (SizeInBytes <= MinRZ / 2) { - // Reduce redzone size for small size objects, e.g. int, char[1]. - // Optimize when SizeInBytes is less than or equal to half of MinRZ. - RZ = MinRZ - SizeInBytes; - } else { - // Calculate RZ, where MinRZ <= RZ <= MaxRZ, and RZ ~ 1/4 * - // SizeInBytes. - RZ = std::clamp((SizeInBytes / MinRZ / 4) * MinRZ, MinRZ, MaxRZ); - - // Round up to multiple of MinRZ. - if (SizeInBytes % MinRZ) - RZ += MinRZ - (SizeInBytes % MinRZ); - } - - assert((RZ + SizeInBytes) % MinRZ == 0); - return RZ; - }(); - Type *RightRedZoneTy = ArrayType::get(IRB.getInt8Ty(), RightRedzoneSize); - StructType *NewTy = StructType::get(Ty, RightRedZoneTy); - Constant *NewInitializer = ConstantStruct::get( - NewTy, G.getInitializer(), Constant::getNullValue(RightRedZoneTy)); - - // Create a new global variable with enough space for a redzone. 
- GlobalVariable *NewGlobal = new GlobalVariable( - M, NewTy, G.isConstant(), G.getLinkage(), NewInitializer, "", &G, - G.getThreadLocalMode(), G.getAddressSpace()); - NewGlobal->copyAttributesFrom(&G); - NewGlobal->setComdat(G.getComdat()); - NewGlobal->setAlignment(Align(MinRZ)); - NewGlobal->copyMetadata(&G, 0); - - Value *Indices2[2]; - Indices2[0] = IRB.getInt32(0); - Indices2[1] = IRB.getInt32(0); - - G.replaceAllUsesWith( - ConstantExpr::getGetElementPtr(NewTy, NewGlobal, Indices2, true)); - NewGlobal->takeName(&G); - GlobalsToRemove.push_back(&G); - NewDeviceGlobals.push_back(NewGlobal); - DeviceGlobalMetadata.push_back(ConstantStruct::get( - StructTy, ConstantInt::get(IntTy, SizeInBytes), - ConstantInt::get(IntTy, SizeInBytes + RightRedzoneSize), - ConstantExpr::getPointerCast(NewGlobal, IntTy))); - } - - if (GlobalsToRemove.empty()) - return false; - - // Create global to record number of device globals - GlobalVariable *NumOfDeviceGlobals = new GlobalVariable( - M, IntTy, false, GlobalValue::ExternalLinkage, - ConstantInt::get(IntTy, NewDeviceGlobals.size()), - "__AsanDeviceGlobalCount", nullptr, GlobalValue::NotThreadLocal, 1); - NumOfDeviceGlobals->setUnnamedAddr(GlobalValue::UnnamedAddr::Local); - - // Create meta data global to record device globals' information - ArrayType *ArrayTy = ArrayType::get(StructTy, NewDeviceGlobals.size()); - Constant *MetadataInitializer = - ConstantArray::get(ArrayTy, DeviceGlobalMetadata); - GlobalVariable *AsanDeviceGlobalMetadata = new GlobalVariable( - M, MetadataInitializer->getType(), false, GlobalValue::ExternalLinkage, - MetadataInitializer, "__AsanDeviceGlobalMetadata", nullptr, - GlobalValue::NotThreadLocal, 1); - AsanDeviceGlobalMetadata->setUnnamedAddr(GlobalValue::UnnamedAddr::Local); - - for (auto *G : GlobalsToRemove) - G->eraseFromParent(); - - return true; -} - -} - -namespace llvm { - -PreservedAnalyses SanitizeDeviceGlobalPass::run(Module &M, - ModuleAnalysisManager &MAM) { - bool Modified = false; - - 
Modified |= instrumentDeviceGlobal(M); - - return Modified ? PreservedAnalyses::none() : PreservedAnalyses::all(); -} - -} diff --git a/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp index 84d839ee2fba9..58d603524cd79 100644 --- a/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp +++ b/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp @@ -64,6 +64,7 @@ #include "llvm/IR/Use.h" #include "llvm/IR/Value.h" #include "llvm/MC/MCSectionMachO.h" +#include "llvm/SYCLLowerIR/DeviceGlobals.h" #include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" @@ -465,6 +466,10 @@ static cl::opt cl::desc("instrument generic pointer"), cl::Hidden, cl::init(true)); +static cl::opt ClDeviceGlobals("asan-device-globals", + cl::desc("instrument device globals"), + cl::Hidden, cl::init(true)); + // Debug flags. static cl::opt ClDebug("asan-debug", cl::desc("debug"), cl::Hidden, @@ -970,6 +975,7 @@ class ModuleAddressSanitizer { private: void initializeCallbacks(); + void instrumentDeviceGlobal(IRBuilder<> &IRB); void instrumentGlobals(IRBuilder<> &IRB, bool *CtorComdat); void InstrumentGlobalsCOFF(IRBuilder<> &IRB, ArrayRef ExtendedGlobals, @@ -1556,12 +1562,23 @@ static bool isJointMatrixAccess(Value *V) { return false; } +static bool isUnsupportedDeviceGlobal(GlobalVariable *G) { + // Non image scope device globals are implemented by device USM, and the + // out-of-bounds check for them will be done by sanitizer USM part. So we + // exclude them here. 
+ return (!isDeviceGlobalVariable(*G) || !hasDeviceImageScopeProperty(*G)); +} + static bool isUnsupportedSPIRAccess(Value *Addr, Instruction *Inst) { // Skip SPIR-V built-in varibles auto *OrigValue = Addr->stripInBoundsOffsets(); if (OrigValue->getName().starts_with("__spirv_BuiltIn")) return true; + GlobalVariable *GV = dyn_cast(OrigValue); + if (GV && isUnsupportedDeviceGlobal(GV)) + return true; + // Ignore load/store for target ext type since we can't know exactly what size // it is. if (auto *SI = dyn_cast(Inst)) @@ -2766,6 +2783,71 @@ Instruction *ModuleAddressSanitizer::CreateAsanModuleDtor() { return ReturnInst::Create(*C, AsanDtorBB); } +void ModuleAddressSanitizer::instrumentDeviceGlobal(IRBuilder<> &IRB) { + auto &DL = M.getDataLayout(); + SmallVector GlobalsToRemove; + SmallVector DeviceGlobalMetadata; + + Type *IntptrTy = M.getDataLayout().getIntPtrType(*C, kSpirOffloadGlobalAS); + + // Device global meta data is described by a structure + // size_t device_global_size + // size_t device_global_size_with_red_zone + // size_t beginning address of the device global + StructType *StructTy = StructType::get(IntptrTy, IntptrTy, IntptrTy); + + for (auto &G : M.globals()) { + if (isUnsupportedDeviceGlobal(&G)) + continue; + + Type *Ty = G.getValueType(); + const uint64_t SizeInBytes = DL.getTypeAllocSize(Ty); + const uint64_t RightRedzoneSize = getRedzoneSizeForGlobal(SizeInBytes); + Type *RightRedZoneTy = ArrayType::get(IRB.getInt8Ty(), RightRedzoneSize); + StructType *NewTy = StructType::get(Ty, RightRedZoneTy); + Constant *NewInitializer = ConstantStruct::get( + NewTy, G.getInitializer(), Constant::getNullValue(RightRedZoneTy)); + + // Create a new global variable with enough space for a redzone. 
+ GlobalVariable *NewGlobal = new GlobalVariable( + M, NewTy, G.isConstant(), G.getLinkage(), NewInitializer, "", &G, + G.getThreadLocalMode(), G.getAddressSpace()); + NewGlobal->copyAttributesFrom(&G); + NewGlobal->setComdat(G.getComdat()); + NewGlobal->setAlignment(Align(getMinRedzoneSizeForGlobal())); + NewGlobal->copyMetadata(&G, 0); + + Value *Indices2[2]; + Indices2[0] = IRB.getInt32(0); + Indices2[1] = IRB.getInt32(0); + + G.replaceAllUsesWith( + ConstantExpr::getGetElementPtr(NewTy, NewGlobal, Indices2, true)); + NewGlobal->takeName(&G); + GlobalsToRemove.push_back(&G); + DeviceGlobalMetadata.push_back(ConstantStruct::get( + StructTy, ConstantInt::get(IntptrTy, SizeInBytes), + ConstantInt::get(IntptrTy, SizeInBytes + RightRedzoneSize), + ConstantExpr::getPointerCast(NewGlobal, IntptrTy))); + } + + if (GlobalsToRemove.empty()) + return; + + // Create meta data global to record device globals' information + ArrayType *ArrayTy = ArrayType::get(StructTy, DeviceGlobalMetadata.size()); + Constant *MetadataInitializer = + ConstantArray::get(ArrayTy, DeviceGlobalMetadata); + GlobalVariable *AsanDeviceGlobalMetadata = new GlobalVariable( + M, MetadataInitializer->getType(), false, GlobalValue::AppendingLinkage, + MetadataInitializer, "__AsanDeviceGlobalMetadata", nullptr, + GlobalValue::NotThreadLocal, 1); + AsanDeviceGlobalMetadata->setUnnamedAddr(GlobalValue::UnnamedAddr::Local); + + for (auto *G : GlobalsToRemove) + G->eraseFromParent(); +} + void ModuleAddressSanitizer::InstrumentGlobalsCOFF( IRBuilder<> &IRB, ArrayRef ExtendedGlobals, ArrayRef MetadataInitializers) { @@ -3234,6 +3316,11 @@ bool ModuleAddressSanitizer::instrumentModule() { auto *MD = M.getOrInsertNamedMetadata("device.sanitizer"); Metadata *MDVals[] = {MDString::get(Ctx, "asan")}; MD->addOperand(MDNode::get(Ctx, MDVals)); + + if (ClDeviceGlobals) { + IRBuilder<> IRB(*C); + instrumentDeviceGlobal(IRB); + } } const uint64_t Priority = GetCtorAndDtorPriority(TargetTriple); diff --git 
a/llvm/lib/Transforms/Instrumentation/CMakeLists.txt b/llvm/lib/Transforms/Instrumentation/CMakeLists.txt index f08d936a5bcba..e96faba7cc323 100644 --- a/llvm/lib/Transforms/Instrumentation/CMakeLists.txt +++ b/llvm/lib/Transforms/Instrumentation/CMakeLists.txt @@ -39,6 +39,7 @@ add_llvm_component_library(LLVMInstrumentation Core Demangle MC + SYCLLowerIR Support TargetParser TransformUtils diff --git a/llvm/test/Instrumentation/AddressSanitizer/SPIRV/device_global.ll b/llvm/test/Instrumentation/AddressSanitizer/SPIRV/device_global.ll new file mode 100644 index 0000000000000..a30eca4bc75be --- /dev/null +++ b/llvm/test/Instrumentation/AddressSanitizer/SPIRV/device_global.ll @@ -0,0 +1,13 @@ +; RUN: opt < %s -passes=asan -asan-instrumentation-with-call-threshold=0 -asan-stack=0 -asan-globals=0 -asan-constructor-kind=none -S | FileCheck %s + +; check that image scope device globals can be correctly instrumented. + +target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-n8:16:32:64" +target triple = "spir64-unknown-unknown" + +@dev_global = addrspace(1) global { [4 x i32] } zeroinitializer #0 + +; CHECK: @dev_global = addrspace(1) global { { [4 x i32] }, [16 x i8] } +; CHECK: @__AsanDeviceGlobalMetadata = appending local_unnamed_addr addrspace(1) global [1 x { i64, i64, i64 }] [{ i64, i64, i64 } { i64 16, i64 32, i64 ptrtoint (ptr addrspace(1) @dev_global to i64) }] + +attributes #0 = { "sycl-device-global-size"="16" "sycl-device-image-scope" } diff --git a/llvm/test/Instrumentation/AddressSanitizer/SPIRV/device_global_non_image_scope.ll b/llvm/test/Instrumentation/AddressSanitizer/SPIRV/device_global_non_image_scope.ll new file mode 100644 index 0000000000000..735c437c47169 --- /dev/null +++ b/llvm/test/Instrumentation/AddressSanitizer/SPIRV/device_global_non_image_scope.ll @@ -0,0 +1,11 @@ +; RUN: opt < %s -passes=asan -asan-instrumentation-with-call-threshold=0 -asan-stack=0 -asan-globals=0 
-asan-constructor-kind=none -S | FileCheck %s + +; check non image scope device globals will not be instrumented. + +target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-n8:16:32:64" +target triple = "spir64-unknown-unknown" + +@dev_global = addrspace(1) global { ptr addrspace(1), [4 x i32] } zeroinitializer + +; CHECK: @dev_global = addrspace(1) global { ptr addrspace(1), [4 x i32] } +; CHECK-NOT: @__AsanDeviceGlobalMetadata diff --git a/llvm/tools/sycl-post-link/sycl-post-link.cpp b/llvm/tools/sycl-post-link/sycl-post-link.cpp index 3800c5875e44f..0a15c42dc4333 100644 --- a/llvm/tools/sycl-post-link/sycl-post-link.cpp +++ b/llvm/tools/sycl-post-link/sycl-post-link.cpp @@ -40,7 +40,6 @@ #include "llvm/SYCLLowerIR/ModuleSplitter.h" #include "llvm/SYCLLowerIR/SYCLJointMatrixTransform.h" #include "llvm/SYCLLowerIR/SYCLUtils.h" -#include "llvm/SYCLLowerIR/SanitizeDeviceGlobal.h" #include "llvm/SYCLLowerIR/SpecConstants.h" #include "llvm/SYCLLowerIR/Support.h" #include "llvm/Support/CommandLine.h" @@ -791,11 +790,6 @@ processInputModule(std::unique_ptr M) { if (M->getTargetTriple().find("spir") != std::string::npos) Modified |= removeDeviceGlobalFromCompilerUsed(*M.get()); - // Instrument each image scope device globals if the module has been - // instrumented by sanitizer pass. - if (isModuleUsingAsan(*M)) - Modified |= runModulePass(*M); - // Transform Joint Matrix builtin calls to align them with SPIR-V friendly // LLVM IR specification. 
Modified |= runModulePass(*M); diff --git a/sycl/cmake/modules/UnifiedRuntimeTag.cmake b/sycl/cmake/modules/UnifiedRuntimeTag.cmake index 9b405334a6816..3744a4e87ad76 100644 --- a/sycl/cmake/modules/UnifiedRuntimeTag.cmake +++ b/sycl/cmake/modules/UnifiedRuntimeTag.cmake @@ -1,7 +1,7 @@ -# commit 30391c65d2d2ccc7ee3688a14815804bfb7fdf05 -# Merge: 5e6d79b3 58dabfe8 +# commit 0ea47d7c70b9a21a3d90612a0a0e7525034e62f7 +# Merge: e3247c23 e36941cb # Author: Callum Fare -# Date: Fri Nov 15 15:13:20 2024 +0000 -# Merge pull request #2222 from RossBrunton/ross/cfi -# Enable -flto and -fsanitize=cfi in clang -set(UNIFIED_RUNTIME_TAG 30391c65d2d2ccc7ee3688a14815804bfb7fdf05) +# Date: Tue Nov 19 10:24:08 2024 +0000 +# Merge pull request #1584 from zhaomaosu/simplify-device-global +# [DeviceSanitizer] Remove device global "__AsanDeviceGlobalCount" +set(UNIFIED_RUNTIME_TAG 0ea47d7c70b9a21a3d90612a0a0e7525034e62f7) From f741bdd8d49d3b3896d1250b101e90ed6fbf7541 Mon Sep 17 00:00:00 2001 From: Lorenc Bushi Date: Tue, 19 Nov 2024 09:38:56 -0500 Subject: [PATCH 16/36] [SYCL] Write tests following the test plan for the work group memory extension (#15928) This PR adds feature tests as per the work group memory extension [test plan](https://github.com/intel/llvm/blob/sycl/sycl/test-e2e/WorkGroupMemory/test-plan.md). 
--------- Co-authored-by: Steffen Larsen --- .../oneapi/experimental/work_group_memory.hpp | 19 +- .../{swap_test.cpp => basic_usage.cpp} | 66 +++-- sycl/test-e2e/WorkGroupMemory/common.hpp | 57 +++++ .../WorkGroupMemory/common_free_function.hpp | 121 +++++++++ .../WorkGroupMemory/common_lambda.hpp | 22 ++ .../reduction_free_function.cpp | 132 ++++++++++ .../WorkGroupMemory/reduction_lambda.cpp | 242 ++++++++++++++++++ .../WorkGroupMemory/api_misuse_test.cpp | 43 ++++ .../extensions/WorkGroupMemory/api_test.cpp | 102 ++++++++ .../WorkGroupMemory/empty_properties_test.cpp | 22 ++ 10 files changed, 808 insertions(+), 18 deletions(-) rename sycl/test-e2e/WorkGroupMemory/{swap_test.cpp => basic_usage.cpp} (88%) create mode 100644 sycl/test-e2e/WorkGroupMemory/common.hpp create mode 100644 sycl/test-e2e/WorkGroupMemory/common_free_function.hpp create mode 100644 sycl/test-e2e/WorkGroupMemory/common_lambda.hpp create mode 100644 sycl/test-e2e/WorkGroupMemory/reduction_free_function.cpp create mode 100644 sycl/test-e2e/WorkGroupMemory/reduction_lambda.cpp create mode 100644 sycl/test/extensions/WorkGroupMemory/api_misuse_test.cpp create mode 100644 sycl/test/extensions/WorkGroupMemory/api_test.cpp create mode 100644 sycl/test/extensions/WorkGroupMemory/empty_properties_test.cpp diff --git a/sycl/include/sycl/ext/oneapi/experimental/work_group_memory.hpp b/sycl/include/sycl/ext/oneapi/experimental/work_group_memory.hpp index 0be24c912907b..c156c484f539d 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/work_group_memory.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/work_group_memory.hpp @@ -46,6 +46,15 @@ class __SYCL_SPECIAL_CLASS __SYCL_TYPE(work_group_memory) work_group_memory using value_type = std::remove_all_extents_t; private: + // At the moment we do not have a way to set properties nor property values to + // set for work group memory. So, we check here for diagnostic purposes that + // the property list is empty. 
+ // TODO: Remove this function and its occurrences in this file once properties + // have been created for work group memory. + void check_props_empty() const { + static_assert(std::is_same_v && + "Work group memory class does not support properties yet!"); + } using decoratedPtr = typename sycl::detail::DecoratedType< value_type, access::address_space::local_space>::type *; @@ -62,18 +71,22 @@ class __SYCL_SPECIAL_CLASS __SYCL_TYPE(work_group_memory) work_group_memory #endif public: - work_group_memory(const indeterminate_t &) {}; + work_group_memory(const indeterminate_t &) { check_props_empty(); }; work_group_memory(const work_group_memory &rhs) = default; work_group_memory &operator=(const work_group_memory &rhs) = default; template >> work_group_memory(handler &) - : sycl::detail::work_group_memory_impl(sizeof(DataT)) {} + : sycl::detail::work_group_memory_impl(sizeof(DataT)) { + check_props_empty(); + } template >> work_group_memory(size_t num, handler &) : sycl::detail::work_group_memory_impl( - num * sizeof(std::remove_extent_t)) {} + num * sizeof(std::remove_extent_t)) { + check_props_empty(); + } template multi_ptr get_multi_ptr() const { diff --git a/sycl/test-e2e/WorkGroupMemory/swap_test.cpp b/sycl/test-e2e/WorkGroupMemory/basic_usage.cpp similarity index 88% rename from sycl/test-e2e/WorkGroupMemory/swap_test.cpp rename to sycl/test-e2e/WorkGroupMemory/basic_usage.cpp index 7552774edcbbf..4dbc2073d7009 100644 --- a/sycl/test-e2e/WorkGroupMemory/swap_test.cpp +++ b/sycl/test-e2e/WorkGroupMemory/basic_usage.cpp @@ -5,6 +5,8 @@ #include #include #include +#include + namespace syclexp = sycl::ext::oneapi::experimental; sycl::queue q; @@ -50,7 +52,9 @@ template void swap_scalar(T &a, T &b) { sycl::nd_range<1> ndr{size, wgsize}; cgh.parallel_for(ndr, [=](sycl::nd_item<1> it) { syclexp::work_group_memory temp2{syclexp::indeterminate}; - temp2 = temp; // temp and temp2 have the same underlying data + temp2 = temp; // temp and temp2 have the same underlying 
data + assert(&temp2 == &temp); // check that both objects return same + // underlying address after assignment temp = acc_a[0]; acc_a[0] = acc_b[0]; acc_b[0] = temp2; // safe to use temp2 @@ -86,6 +90,8 @@ template void swap_scalar(T &a, T &b) { assert(a == old_b && b == old_a && "Incorrect swap!"); // Same as above but instead of using multi_ptr, use address-of operator. + // Also verify that get_multi_ptr() returns the same address as address-of + // operator. { sycl::buffer buf_a{&a, 1}; sycl::buffer buf_b{&b, 1}; @@ -96,6 +102,7 @@ template void swap_scalar(T &a, T &b) { syclexp::work_group_memory temp2{cgh}; sycl::nd_range<1> ndr{size, wgsize}; cgh.parallel_for(ndr, [=](sycl::nd_item<> it) { + assert(&temp == temp.get_multi_ptr().get()); temp = acc_a[0]; acc_a[0] = acc_b[0]; temp2 = *(&temp); @@ -294,6 +301,8 @@ void swap_array_2d(T (&a)[N][N], T (&b)[N][N], size_t batch_size) { temp[i][j] = acc_a[i][j]; acc_a[i][j] = acc_b[i][j]; syclexp::work_group_memory temp2{temp}; + assert(&temp2 == &temp); // check both objects return same underlying + // address after copy construction. acc_b[i][j] = temp2[i][j]; }); }); @@ -342,28 +351,28 @@ void swap_array_2d(T (&a)[N][N], T (&b)[N][N], size_t batch_size) { // so we can verify that each work-item sees the value written by its leader. // The test also is a sanity check that different work groups get different // work group memory locations as otherwise we'd have data races. 
-void coherency(size_t size, size_t wgsize) { +template void coherency(size_t size, size_t wgsize) { q.submit([&](sycl::handler &cgh) { - syclexp::work_group_memory data{cgh}; + syclexp::work_group_memory data{cgh}; sycl::nd_range<1> ndr{size, wgsize}; cgh.parallel_for(ndr, [=](sycl::nd_item<1> it) { if (it.get_group().leader()) { - data = it.get_global_id() / wgsize; + data = T(it.get_global_id() / wgsize); } sycl::group_barrier(it.get_group()); - assert(data == it.get_global_id() / wgsize); + assert(data == T(it.get_global_id() / wgsize)); }); }); } constexpr size_t N = 32; -int main() { - int intarr1[N][N]; - int intarr2[N][N]; +template void test() { + T intarr1[N][N]; + T intarr2[N][N]; for (int i = 0; i < N; ++i) { for (int j = 0; j < N; ++j) { - intarr1[i][j] = i + j; - intarr2[i][j] = i * j; + intarr1[i][j] = T(i) + T(j); + intarr2[i][j] = T(i) * T(j); } } for (int i = 0; i < N; ++i) { @@ -373,10 +382,37 @@ int main() { swap_array_1d(intarr1[i], intarr2[i], 8); } swap_array_2d(intarr1, intarr2, 8); - coherency(N, N / 2); - coherency(N, N / 4); - coherency(N, N / 8); - coherency(N, N / 16); - coherency(N, N / 32); + coherency(N, N / 2); + coherency(N, N / 4); + coherency(N, N / 8); + coherency(N, N / 16); + coherency(N, N / 32); +} + +template void test_ptr() { + T arr1[N][N]; + T arr2[N][N]; + for (int i = 0; i < N; ++i) { + for (int j = 0; j < N; ++j) { + swap_scalar(arr1[i][j], arr2[i][j]); + } + swap_array_1d(arr1[i], arr2[i], 8); + } + swap_array_2d(arr1, arr2, 8); +} + +int main() { + test(); + test(); + test(); + if (q.get_device().has(sycl::aspect::fp16)) + test(); + test_ptr(); + test_ptr(); + test_ptr(); + test_ptr(); + if (q.get_device().has(sycl::aspect::fp16)) + test_ptr(); + test_ptr(); return 0; } diff --git a/sycl/test-e2e/WorkGroupMemory/common.hpp b/sycl/test-e2e/WorkGroupMemory/common.hpp new file mode 100644 index 0000000000000..64452745ede1b --- /dev/null +++ b/sycl/test-e2e/WorkGroupMemory/common.hpp @@ -0,0 +1,57 @@ +#pragma once + 
+#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +using namespace sycl; + +template bool check_half_aspect(queue &q) { + if (std::is_same_v && + !q.get_device().has(sycl::aspect::fp16)) { + std::cout << "Device does not support fp16 aspect. Skipping all tests with " + "sycl::half type!" + << std::endl; + return false; + } + return true; +} + +template bool check_double_aspect(queue &q) { + if (std::is_same_v && !q.get_device().has(aspect::fp64)) { + std::cout << "Device does not support fp64 aspect. Skipping all tests with " + "double type!" + << std::endl; + return false; + } + return true; +} + +template struct S { + T val; +}; + +template struct M { + T val; +}; + +union U { + S s; + M m; +}; + +template +void sum_helper(sycl::ext::oneapi::experimental::work_group_memory mem, + sycl::ext::oneapi::experimental::work_group_memory ret, + size_t WGSIZE) { + for (int i = 0; i < WGSIZE; ++i) { + ret = ret + mem[i]; + } +} diff --git a/sycl/test-e2e/WorkGroupMemory/common_free_function.hpp b/sycl/test-e2e/WorkGroupMemory/common_free_function.hpp new file mode 100644 index 0000000000000..e13f50214593d --- /dev/null +++ b/sycl/test-e2e/WorkGroupMemory/common_free_function.hpp @@ -0,0 +1,121 @@ +#pragma once + +#include "common.hpp" +#include "common_lambda.hpp" +#include +#include +#include +#include +#include +#include + +using namespace sycl; + +template +SYCL_EXT_ONEAPI_FUNCTION_PROPERTY( + (ext::oneapi::experimental::nd_range_kernel<1>)) +void sum(sycl::ext::oneapi::experimental::work_group_memory mem, T *buf, + sycl::ext::oneapi::experimental::work_group_memory result, + T expected, size_t WGSIZE, bool UseHelper) { + const auto it = sycl::ext::oneapi::this_work_item::get_nd_item<1>(); + size_t local_id = it.get_local_id(); + mem[local_id] = buf[local_id]; + group_barrier(it.get_group()); + if (it.get_group().leader()) { + result = 0; + if (!UseHelper) { + for (int i = 0; i < WGSIZE; ++i) { + result = result + 
mem[i]; + } + } else { + sum_helper(mem, result, WGSIZE); + } + assert(result == expected); + } +} + +// Explicit instantiations for the relevant data types. +// These are needed because free function kernel support is not fully +// implemented yet. +// TODO: Remove these once free function kernel support is fully there. +#define SUM(T) \ + template void sum( \ + sycl::ext::oneapi::experimental::work_group_memory mem, T * buf, \ + sycl::ext::oneapi::experimental::work_group_memory result, \ + T expected, size_t WGSIZE, bool UseHelper); + +SUM(int) +SUM(uint16_t) +SUM(half) +SUM(double) +SUM(float) +SUM(char) +SUM(bool) + +template +SYCL_EXT_ONEAPI_FUNCTION_PROPERTY( + (ext::oneapi::experimental::nd_range_kernel<1>)) +void sum_marray( + sycl::ext::oneapi::experimental::work_group_memory> mem, + T *buf, sycl::ext::oneapi::experimental::work_group_memory result, + T expected) { + const auto it = sycl::ext::oneapi::this_work_item::get_nd_item<1>(); + size_t local_id = it.get_local_id(); + constexpr T tolerance = 0.0001; + sycl::marray &data = mem; + data[local_id] = buf[local_id]; + group_barrier(it.get_group()); + if (it.get_group().leader()) { + result = 0; + for (int i = 0; i < 16; ++i) { + result = result + data[i]; + } + assert((result - expected) * (result - expected) <= tolerance); + } +} + +// Explicit instantiations for the relevant data types. 
+#define SUM_MARRAY(T) \ + template void sum_marray( \ + sycl::ext::oneapi::experimental::work_group_memory> \ + mem, \ + T * buf, sycl::ext::oneapi::experimental::work_group_memory result, \ + T expected); + +SUM_MARRAY(float); +SUM_MARRAY(double); +SUM_MARRAY(half); + +template +SYCL_EXT_ONEAPI_FUNCTION_PROPERTY( + (ext::oneapi::experimental::nd_range_kernel<1>)) +void sum_vec( + sycl::ext::oneapi::experimental::work_group_memory> mem, + T *buf, sycl::ext::oneapi::experimental::work_group_memory result, + T expected) { + const auto it = sycl::ext::oneapi::this_work_item::get_nd_item<1>(); + size_t local_id = it.get_local_id(); + constexpr T tolerance = 0.0001; + sycl::vec &data = mem; + data[local_id] = buf[local_id]; + group_barrier(it.get_group()); + if (it.get_group().leader()) { + result = 0; + for (int i = 0; i < 16; ++i) { + result = result + data[i]; + } + assert((result - expected) * (result - expected) <= tolerance); + } +} + +// Explicit instantiations for the relevant data types. 
+#define SUM_VEC(T) \ + template void sum_vec( \ + sycl::ext::oneapi::experimental::work_group_memory> \ + mem, \ + T * buf, sycl::ext::oneapi::experimental::work_group_memory result, \ + T expected); + +SUM_VEC(float); +SUM_VEC(double); +SUM_VEC(half); diff --git a/sycl/test-e2e/WorkGroupMemory/common_lambda.hpp b/sycl/test-e2e/WorkGroupMemory/common_lambda.hpp new file mode 100644 index 0000000000000..f5c8b6651ffcc --- /dev/null +++ b/sycl/test-e2e/WorkGroupMemory/common_lambda.hpp @@ -0,0 +1,22 @@ +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +using namespace sycl; + +template +void sum_helper(sycl::ext::oneapi::experimental::work_group_memory mem, + sycl::ext::oneapi::experimental::work_group_memory ret, + size_t WGSIZE) { + for (int i = 0; i < WGSIZE; ++i) { + ret = ret + mem[i]; + } +} diff --git a/sycl/test-e2e/WorkGroupMemory/reduction_free_function.cpp b/sycl/test-e2e/WorkGroupMemory/reduction_free_function.cpp new file mode 100644 index 0000000000000..ff2aa8aa19385 --- /dev/null +++ b/sycl/test-e2e/WorkGroupMemory/reduction_free_function.cpp @@ -0,0 +1,132 @@ +// REQUIRES: aspect-usm_shared_allocations +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out + +// UNSUPPORTED: cuda +// UNSUPPORTED-TRACKER: https://github.com/intel/llvm/issues/16004 + +// XFAIL: * +// XFAIL-TRACKER: https://github.com/intel/llvm/issues/15927 + +#include "common_free_function.hpp" + +// Basic usage reduction test using free function kernels. +// A global buffer is allocated using USM and it is passed to the kernel on the +// device. On the device, a work group memory buffer is allocated and each item +// copies the corresponding element of the global buffer to the corresponding +// element of the work group memory buffer using its local index. The leader of +// every work-group, after waiting for every work-item to complete, then sums +// these values storing the result in another work group memory object.
Finally, +// each work item then verifies that the sum of the work group memory elements +// equals the sum of the global buffer elements. This is repeated for several +// data types. + +queue q; +context ctx = q.get_context(); + +constexpr size_t SIZE = 128; +constexpr size_t VEC_SIZE = 16; + +template void test_marray() { + if (!check_half_aspect(q) || !check_double_aspect(q)) + return; + + constexpr size_t WGSIZE = VEC_SIZE; + T *buf = malloc_shared(WGSIZE, q); + assert(buf && "Shared USM allocation failed!"); + T expected = 0; + for (int i = 0; i < WGSIZE; ++i) { + buf[i] = T(i) / WGSIZE; + expected = expected + buf[i]; + } + nd_range ndr{{SIZE}, {WGSIZE}}; +#ifndef __SYCL_DEVICE_ONLY__ + // Get the kernel object for the "mykernel" kernel. + auto Bundle = get_kernel_bundle(ctx); + kernel_id sum_id = ext::oneapi::experimental::get_kernel_id>(); + kernel k_sum = Bundle.get_kernel(sum_id); + q.submit([&](sycl::handler &cgh) { + ext::oneapi::experimental::work_group_memory> mem{cgh}; + ext::oneapi::experimental ::work_group_memory result{cgh}; + cgh.set_args(mem, buf, result, expected); + cgh.parallel_for(ndr, k_sum); + }).wait(); +#endif // __SYCL_DEVICE_ONLY + free(buf, q); + if constexpr (sizeof...(Ts)) + test_marray(); +} + +template void test_vec() { + if (!check_half_aspect(q) || !check_double_aspect(q)) + return; + + constexpr size_t WGSIZE = VEC_SIZE; + T *buf = malloc_shared(WGSIZE, q); + assert(buf && "Shared USM allocation failed!"); + T expected = 0; + for (int i = 0; i < WGSIZE; ++i) { + buf[i] = T(i) / WGSIZE; + expected = expected + buf[i]; + } + nd_range ndr{{SIZE}, {WGSIZE}}; +#ifndef __SYCL_DEVICE_ONLY__ + // Get the kernel object for the "mykernel" kernel. 
+ auto Bundle = get_kernel_bundle(ctx); + kernel_id sum_id = ext::oneapi::experimental::get_kernel_id>(); + kernel k_sum = Bundle.get_kernel(sum_id); + q.submit([&](sycl::handler &cgh) { + ext::oneapi::experimental::work_group_memory> mem{cgh}; + ext::oneapi::experimental ::work_group_memory result{cgh}; + cgh.set_args(mem, buf, result, expected); + cgh.parallel_for(ndr, k_sum); + }).wait(); +#endif // __SYCL_DEVICE_ONLY + free(buf, q); + if constexpr (sizeof...(Ts)) + test_vec(); +} + +template +void test(size_t SIZE, size_t WGSIZE, bool UseHelper) { + if (!check_half_aspect(q) || !check_double_aspect(q)) + return; + + T *buf = malloc_shared(WGSIZE, q); + assert(buf && "Shared USM allocation failed!"); + T expected = 0; + for (int i = 0; i < WGSIZE; ++i) { + buf[i] = T(i); + expected = expected + buf[i]; + } + nd_range ndr{{SIZE}, {WGSIZE}}; + // The following ifndef is required due to a number of limitations of free + // function kernels. See CMPLRLLVM-61498. + // TODO: Remove it once these limitations are no longer there. +#ifndef __SYCL_DEVICE_ONLY__ + // Get the kernel object for the "mykernel" kernel. 
+ auto Bundle = get_kernel_bundle(ctx); + kernel_id sum_id = ext::oneapi::experimental::get_kernel_id>(); + kernel k_sum = Bundle.get_kernel(sum_id); + q.submit([&](sycl::handler &cgh) { + ext::oneapi::experimental::work_group_memory mem{WGSIZE, cgh}; + ext::oneapi::experimental ::work_group_memory result{cgh}; + cgh.set_args(mem, buf, result, expected, WGSIZE, UseHelper); + cgh.parallel_for(ndr, k_sum); + }).wait(); + +#endif // __SYCL_DEVICE_ONLY + free(buf, q); + if constexpr (sizeof...(Ts)) + test(SIZE, WGSIZE, UseHelper); +} + +int main() { + test(SIZE, SIZE, true /* UseHelper */); + test(SIZE, SIZE, false); + test(SIZE, SIZE / 2, false); + test(SIZE, SIZE / 4, false); + test_marray(); + test_vec(); + return 0; +} diff --git a/sycl/test-e2e/WorkGroupMemory/reduction_lambda.cpp b/sycl/test-e2e/WorkGroupMemory/reduction_lambda.cpp new file mode 100644 index 0000000000000..5759e86f17fe7 --- /dev/null +++ b/sycl/test-e2e/WorkGroupMemory/reduction_lambda.cpp @@ -0,0 +1,242 @@ +// REQUIRES: aspect-usm_shared_allocations +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out + +#include "common.hpp" + +queue q; +context ctx = q.get_context(); + +constexpr size_t SIZE = 128; + +template +void test_struct(size_t SIZE, size_t WGSIZE) { + if (!check_half_aspect(q) || !check_double_aspect(q)) + return; + S *buf = malloc_shared>(WGSIZE, q); + assert(buf && "Shared USM allocation failed!"); + T expected = 0; + for (int i = 0; i < WGSIZE; ++i) { + buf[i].val = T(i); + expected = expected + buf[i].val; + } + nd_range ndr{{SIZE}, {WGSIZE}}; + q.submit([&](sycl::handler &cgh) { + ext::oneapi::experimental::work_group_memory[]> mem{WGSIZE, cgh}; + ext::oneapi::experimental ::work_group_memory result{cgh}; + cgh.parallel_for(ndr, [=](nd_item<> it) { + size_t local_id = it.get_local_id(); + mem[local_id] = buf[local_id]; + group_barrier(it.get_group()); + if (it.get_group().leader()) { + result = 0; + for (int i = 0; i < WGSIZE; ++i) { + result = result + mem[i].val; + } + 
assert(result == expected); + } + }); + }).wait(); + free(buf, q); + if constexpr (sizeof...(Ts)) + test_struct(SIZE, WGSIZE); +} + +void test_union(size_t SIZE, size_t WGSIZE) { + U *buf = malloc_shared(WGSIZE, q); + assert(buf && "Shared USM allocation failed!"); + int expected = 0; + for (int i = 0; i < WGSIZE; ++i) { + if (i % 2) + buf[i].s = S{i}; + else + buf[i].m = M{i}; + expected = expected + (i % 2) ? buf[i].s.val : buf[i].m.val; + } + nd_range ndr{{SIZE}, {WGSIZE}}; + q.submit([&](sycl::handler &cgh) { + ext::oneapi::experimental::work_group_memory mem{WGSIZE, cgh}; + ext::oneapi::experimental::work_group_memory result{cgh}; + cgh.parallel_for(ndr, [=](nd_item<> it) { + size_t local_id = it.get_local_id(); + mem[local_id] = buf[local_id]; + group_barrier(it.get_group()); + if (it.get_group().leader()) { + result = 0; + for (int i = 0; i < WGSIZE; ++i) { + result = result + (i % 2) ? mem[i].s.val : mem[i].m.val; + } + assert(result == expected); + } + }); + }).wait(); + free(buf, q); +} + +template +void test(size_t SIZE, size_t WGSIZE, bool UseHelper) { + if (!check_half_aspect(q) || !check_double_aspect(q)) + return; + T *buf = malloc_shared(WGSIZE, q); + assert(buf && "Shared USM allocation failed!"); + T expected = 0; + for (int i = 0; i < WGSIZE; ++i) { + buf[i] = T(i); + expected = expected + buf[i]; + } + nd_range ndr{{SIZE}, {WGSIZE}}; + q.submit([&](sycl::handler &cgh) { + ext::oneapi::experimental::work_group_memory mem{WGSIZE, cgh}; + ext::oneapi::experimental ::work_group_memory result{cgh}; + cgh.parallel_for(ndr, [=](nd_item<> it) { + size_t local_id = it.get_local_id(); + mem[local_id] = buf[local_id]; + group_barrier(it.get_group()); + if (it.get_group().leader()) { + result = 0; + if (!UseHelper) { + for (int i = 0; i < WGSIZE; ++i) { + result = result + mem[i]; + } + } else { + sum_helper(mem, result, WGSIZE); + } + assert(result == expected); + } + }); + }).wait(); + free(buf, q); + if constexpr (sizeof...(Ts)) + test(SIZE, WGSIZE, 
UseHelper); +} + +template void test_marray() { + if (!check_half_aspect(q) || !check_double_aspect(q)) + return; + constexpr size_t WGSIZE = SIZE; + T *buf = malloc_shared(WGSIZE, q); + assert(buf && "Shared USM allocation failed!"); + T expected = 0; + for (int i = 0; i < WGSIZE; ++i) { + buf[i] = T(i) / WGSIZE; + expected = expected + buf[i]; + } + nd_range ndr{{SIZE}, {WGSIZE}}; + q.submit([&](sycl::handler &cgh) { + ext::oneapi::experimental::work_group_memory> mem{cgh}; + ext::oneapi::experimental ::work_group_memory result{cgh}; + cgh.parallel_for(ndr, [=](nd_item<> it) { + size_t local_id = it.get_local_id(); + constexpr T tolerance = 0.0001; + // User-defined conversion from work group memory to underlying type is + // not applied during member access calls so we have to explicitly + // convert to the value_type ourselves. + marray &data = mem; + data[local_id] = buf[local_id]; + group_barrier(it.get_group()); + if (it.get_group().leader()) { + result = 0; + for (int i = 0; i < WGSIZE; ++i) { + result = result + data[i]; + } + assert((result - expected) * (result - expected) <= tolerance); + } + }); + }).wait(); + free(buf, q); + if constexpr (sizeof...(Ts)) + test_marray(); +} + +template void test_vec() { + if (!check_half_aspect(q) || !check_double_aspect(q)) + return; + constexpr size_t WGSIZE = 8; + T *buf = malloc_shared(WGSIZE, q); + assert(buf && "Shared USM allocation failed!"); + T expected = 0; + for (int i = 0; i < WGSIZE; ++i) { + buf[i] = T(i) / WGSIZE; + expected = expected + buf[i]; + } + nd_range ndr{{SIZE}, {WGSIZE}}; + q.submit([&](sycl::handler &cgh) { + ext::oneapi::experimental::work_group_memory> mem{cgh}; + ext::oneapi::experimental ::work_group_memory result{cgh}; + cgh.parallel_for(ndr, [=](nd_item<> it) { + size_t local_id = it.get_local_id(); + constexpr T tolerance = 0.0001; + vec &data = mem; + data[local_id] = buf[local_id]; + group_barrier(it.get_group()); + if (it.get_group().leader()) { + result = 0; + for (int i = 0; i < 
WGSIZE; ++i) { + result = result + data[i]; + } + assert((result - expected) * (result - expected) <= tolerance); + } + }); + }).wait(); + free(buf, q); + if constexpr (sizeof...(Ts)) + test_vec(); +} + +template void test_atomic_ref() { + if (!(sizeof(T) == 4 || + (sizeof(T) == 8 && q.get_device().has(aspect::atomic64)))) { + std::cout << "Invalid type used with atomic_ref!\nSkipping the test!"; + return; + } + if (!check_half_aspect(q) || !check_double_aspect(q)) + return; + constexpr size_t WGSIZE = 8; + T *buf = malloc_shared(WGSIZE, q); + assert(buf && "Shared USM allocation failed!"); + T expected = 0; + for (int i = 0; i < WGSIZE; ++i) { + buf[i] = T(i); + expected = expected + buf[i]; + } + nd_range ndr{{SIZE}, {WGSIZE}}; + q.submit([&](sycl::handler &cgh) { + ext::oneapi::experimental::work_group_memory mem{WGSIZE, cgh}; + ext::oneapi::experimental::work_group_memory result{cgh}; + cgh.parallel_for(ndr, [=](nd_item<> it) { + size_t local_id = it.get_local_id(); + mem[local_id] = buf[local_id]; + atomic_ref + atomic_val{result}; + if (it.get_group().leader()) { + atomic_val.store(0); + } + group_barrier(it.get_group()); + atomic_val += mem[local_id]; + group_barrier(it.get_group()); + assert(atomic_val.load() == expected); + }); + }).wait(); + free(buf, q); + if constexpr (sizeof...(Ts)) + test_atomic_ref(); +} + +int main() { + test(SIZE, SIZE /* WorkGroupSize */, + true /* UseHelper */); + test(SIZE, SIZE, false); + test(SIZE, SIZE / 2, false); + test(SIZE, SIZE / 4, false); + test(SIZE, 1, false); + test(SIZE, 2, true); + test_marray(); + test_vec(); + test_atomic_ref(); + test_struct(SIZE, 4); + test_union(SIZE, SIZE); + test_union(SIZE, SIZE / 2); + test_union(SIZE, 1); + test_union(SIZE, 2); + return 0; +} diff --git a/sycl/test/extensions/WorkGroupMemory/api_misuse_test.cpp b/sycl/test/extensions/WorkGroupMemory/api_misuse_test.cpp new file mode 100644 index 0000000000000..d79067d34079b --- /dev/null +++ 
b/sycl/test/extensions/WorkGroupMemory/api_misuse_test.cpp @@ -0,0 +1,43 @@ +// RUN: %clangxx -fsycl -fsyntax-only -ferror-limit=0 -Xclang -verify -Xclang -verify-ignore-unexpected=note,warning %s + +#include + +using namespace sycl; +namespace syclexp = sycl::ext::oneapi::experimental; + +queue Q; + +// This test verifies the type restrictions on the two non-default constructors +// of work group memory. + +template void convertToDataT(DataT &data) {} + +template void test_bounded_arr() { + Q.submit([&](sycl::handler &cgh) { + nd_range<1> ndr{1, 1}; + // expected-error-re@+1 5{{no matching constructor for initialization of 'syclexp::work_group_memory<{{.*}}>'}} + syclexp::work_group_memory mem{1, cgh}; + // expected-error@+1 5{{no viable overloaded '='}} + cgh.parallel_for(ndr, [=](nd_item<1> it) { mem = {DataT{}}; }); + }); +} + +template void test_unbounded_arr() { + Q.submit([&](sycl::handler &cgh) { + nd_range<1> ndr{1, 1}; + // expected-error-re@+1 5{{no matching constructor for initialization of 'syclexp::work_group_memory<{{.*}}>'}} + syclexp::work_group_memory mem{cgh}; + // expected-error@+1 5{{no viable overloaded '='}} + cgh.parallel_for(ndr, [=](nd_item<1> it) { mem = {DataT{}}; }); + }); +} + +template void test() { + (test_bounded_arr(), ...); + (test_unbounded_arr(), ...); +} + +int main() { + test(); + return 0; +} diff --git a/sycl/test/extensions/WorkGroupMemory/api_test.cpp b/sycl/test/extensions/WorkGroupMemory/api_test.cpp new file mode 100644 index 0000000000000..81f7f9d01293d --- /dev/null +++ b/sycl/test/extensions/WorkGroupMemory/api_test.cpp @@ -0,0 +1,102 @@ +// RUN: %clangxx -fsycl -fsyntax-only %s +#include +#include +#include + +using namespace sycl; +namespace syclexp = sycl::ext::oneapi::experimental; + +queue Q; + +struct S { + int a; + char b; +}; + +union U { + int a; + char b; +}; + +template void convertToDataT(DataT &data) {} + +template void test_constness() { + Q.submit([&](sycl::handler &cgh) { + nd_range<1> ndr{1, 1}; + 
syclexp::work_group_memory mem{syclexp::indeterminate}; + cgh.parallel_for(ndr, [=](nd_item<1> it) { + const auto mem1 = mem; + // since mem1 is const, all of the following should succeed. + if constexpr (!std::is_array_v) + mem1 = DataT{}; + convertToDataT(mem1); + const auto *ptr = &mem1; + const auto &mptr = mem1.template get_multi_ptr<>(); + }); + }); +} + +template +void test_helper(syclexp::work_group_memory mem) { + static_assert( + std::is_same_v::value_type, + std::remove_all_extents_t>); + syclexp::work_group_memory dummy{mem}; + mem = dummy; + Q.submit([&](sycl::handler &cgh) { + if constexpr (sycl::detail::is_unbounded_array_v) + mem = syclexp::work_group_memory{1, cgh}; + else + mem = syclexp::work_group_memory{cgh}; + nd_range<1> ndr{1, 1}; + cgh.parallel_for(ndr, [=](nd_item<1> it) { + convertToDataT(mem); + if constexpr (!std::is_array_v) + mem = DataT{}; + static_assert( + std::is_same_v< + multi_ptr::value_type, + access::address_space::local_space, + access::decorated::no>, + decltype(mem.template get_multi_ptr())>); + static_assert( + std::is_same_v< + multi_ptr::value_type, + access::address_space::local_space, + access::decorated::no>, + decltype(mem.template get_multi_ptr<>())>); + static_assert( + std::is_same_v< + multi_ptr::value_type, + access::address_space::local_space, + access::decorated::no>, + decltype(mem.template get_multi_ptr())>); + + static_assert( + std::is_same_v< + multi_ptr::value_type, + access::address_space::local_space, + access::decorated::yes>, + decltype(mem.template get_multi_ptr())>); + }); + }); +} + +template void test() { + syclexp::work_group_memory mem{syclexp::indeterminate}; + test_constness(); + test_helper(mem); + if constexpr (sizeof...(rest)) + test(); +} + +int main() { + test(); + test, marray, marray>(); + test, vec, vec>(); + test(); + test(); + test(); + test(); + return 0; +} diff --git a/sycl/test/extensions/WorkGroupMemory/empty_properties_test.cpp 
b/sycl/test/extensions/WorkGroupMemory/empty_properties_test.cpp new file mode 100644 index 0000000000000..3d11b6d4ecbef --- /dev/null +++ b/sycl/test/extensions/WorkGroupMemory/empty_properties_test.cpp @@ -0,0 +1,22 @@ +// RUN: %clangxx -fsycl -fsyntax-only -Xclang -verify -Xclang -verify-ignore-unexpected=note,warning %s +#include + +using namespace sycl; +namespace syclexp = sycl::ext::oneapi::experimental; + +// This test checks that a diagnostic is emitted when +// instantiating a work group memory class with the properties set to anything +// other than empty_properties_t + +template +void test_properties() { + // expected-error-re@sycl/ext/oneapi/experimental/work_group_memory.hpp:* 2{{static assertion failed due to requirement 'std::is_same_v<{{.*}}, sycl::ext::oneapi::experimental::properties>>'}} + syclexp::work_group_memory{syclexp::indeterminate}; + if constexpr (sizeof...(PropertyListTs)) + test_properties(); +} + +int main() { + test_properties(); + return 0; +} From 5f65de4ad3c88f5797e9c19caaecff4665673a8f Mon Sep 17 00:00:00 2001 From: aarongreig Date: Tue, 19 Nov 2024 16:41:31 +0000 Subject: [PATCH 17/36] [UR][CL] Fix typo in CL symbol loading macro for windows. 
(#16106) Co-authored-by: Callum Fare --- sycl/cmake/modules/UnifiedRuntimeTag.cmake | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/sycl/cmake/modules/UnifiedRuntimeTag.cmake b/sycl/cmake/modules/UnifiedRuntimeTag.cmake index 3744a4e87ad76..b7c21f96e389d 100644 --- a/sycl/cmake/modules/UnifiedRuntimeTag.cmake +++ b/sycl/cmake/modules/UnifiedRuntimeTag.cmake @@ -1,7 +1,7 @@ -# commit 0ea47d7c70b9a21a3d90612a0a0e7525034e62f7 -# Merge: e3247c23 e36941cb +# commit 1675f053176f0860388aa67fd009750a7e03b2c2 +# Merge: 0ea47d7c 22ca5ee2 # Author: Callum Fare -# Date: Tue Nov 19 10:24:08 2024 +0000 -# Merge pull request #1584 from zhaomaosu/simplify-device-global -# [DeviceSanitizer] Remove device global "__AsanDeviceGlobalCount" -set(UNIFIED_RUNTIME_TAG 0ea47d7c70b9a21a3d90612a0a0e7525034e62f7) +# Date: Tue Nov 19 14:41:15 2024 +0000 +# Merge pull request #2337 from aarongreig/aaron/fixCoreFuncMacroWindows +# Fix the CL_CORE_FUNCTION macro on windows. +set(UNIFIED_RUNTIME_TAG 1675f053176f0860388aa67fd009750a7e03b2c2) From b7607f00d092e897acc669b84be47a135195343a Mon Sep 17 00:00:00 2001 From: Steffen Larsen Date: Tue, 19 Nov 2024 17:59:38 +0100 Subject: [PATCH 18/36] [SYCL] Fix the barrier dependency for OOO profiling tags (#16112) This commit fixes an issue where the barrier before the timestamp enqueued for the profiling tag in an out-of-order queue did not prevent future work from being enqueued prior to the start/end of the profiling tag. 
--------- Signed-off-by: Larsen, Steffen --- sycl/source/detail/scheduler/commands.cpp | 38 +++++++++--- sycl/unittests/Extensions/ProfilingTag.cpp | 67 ++++++++++++++++++++++ 2 files changed, 97 insertions(+), 8 deletions(-) diff --git a/sycl/source/detail/scheduler/commands.cpp b/sycl/source/detail/scheduler/commands.cpp index 091504a983ff3..63eb048212776 100644 --- a/sycl/source/detail/scheduler/commands.cpp +++ b/sycl/source/detail/scheduler/commands.cpp @@ -3441,25 +3441,47 @@ ur_result_t ExecCGCommand::enqueueImpQueue() { case CGType::ProfilingTag: { assert(MQueue && "Profiling tag requires a valid queue"); const auto &Adapter = MQueue->getAdapter(); + + bool IsInOrderQueue = MQueue->isInOrder(); + ur_event_handle_t *TimestampDeps = nullptr; + size_t NumTimestampDeps = 0; + + // If the queue is not in-order, the implementation will need to first + // insert a marker event that the timestamp waits for. + ur_event_handle_t PreTimestampMarkerEvent{}; + if (!IsInOrderQueue) { + // FIXME: urEnqueueEventsWait on the L0 adapter requires a double-release. + // Use that instead once it has been fixed. + // See https://github.com/oneapi-src/unified-runtime/issues/2347. + Adapter->call( + MQueue->getHandleRef(), + /*num_events_in_wait_list=*/0, + /*event_wait_list=*/nullptr, &PreTimestampMarkerEvent); + TimestampDeps = &PreTimestampMarkerEvent; + NumTimestampDeps = 1; + } + + Adapter->call( + MQueue->getHandleRef(), + /*blocking=*/false, NumTimestampDeps, TimestampDeps, Event); + // If the queue is not in-order, we need to insert a barrier. This barrier // does not need output events as it will implicitly enforce the following // enqueue is blocked until it finishes. - if (!MQueue->isInOrder()) { + if (!IsInOrderQueue) { + // We also need to release the timestamp event from the marker. + Adapter->call(PreTimestampMarkerEvent); // FIXME: Due to a bug in the L0 UR adapter, we will leak events if we do // not pass an output event to the UR call. 
Once that is fixed, // this immediately-deleted event can be removed. - ur_event_handle_t PreTimestampBarrierEvent{}; + ur_event_handle_t PostTimestampBarrierEvent{}; Adapter->call( MQueue->getHandleRef(), /*num_events_in_wait_list=*/0, - /*event_wait_list=*/nullptr, &PreTimestampBarrierEvent); - Adapter->call(PreTimestampBarrierEvent); + /*event_wait_list=*/nullptr, &PostTimestampBarrierEvent); + Adapter->call(PostTimestampBarrierEvent); } - Adapter->call( - MQueue->getHandleRef(), - /*blocking=*/false, - /*num_events_in_wait_list=*/0, /*event_wait_list=*/nullptr, Event); if (Event) MEvent->setHandle(*Event); return UR_RESULT_SUCCESS; diff --git a/sycl/unittests/Extensions/ProfilingTag.cpp b/sycl/unittests/Extensions/ProfilingTag.cpp index 7b18b9ba00e4e..394fba8497103 100644 --- a/sycl/unittests/Extensions/ProfilingTag.cpp +++ b/sycl/unittests/Extensions/ProfilingTag.cpp @@ -66,6 +66,8 @@ TEST_F(ProfilingTagTest, ProfilingTagSupportedDefaultQueue) { "urEnqueueTimestampRecordingExp", &after_urEnqueueTimestampRecordingExp); mock::getCallbacks().set_after_callback("urEventGetProfilingInfo", &after_urEventGetProfilingInfo); + mock::getCallbacks().set_after_callback( + "urEnqueueEventsWaitWithBarrier", &after_urEnqueueEventsWaitWithBarrier); sycl::context Ctx{sycl::platform()}; sycl::queue Queue{Ctx, sycl::default_selector_v}; @@ -75,6 +77,39 @@ TEST_F(ProfilingTagTest, ProfilingTagSupportedDefaultQueue) { sycl::event E = sycl::ext::oneapi::experimental::submit_profiling_tag(Queue); ASSERT_EQ(size_t{1}, counter_urEnqueueTimestampRecordingExp); + // TODO: We expect two barriers for now, while marker events leak. Adjust when + // addressed. 
+ ASSERT_EQ(size_t{2}, counter_urEnqueueEventsWaitWithBarrier); + + E.get_profiling_info(); + ASSERT_TRUE(LatestProfilingQuery.has_value()); + ASSERT_EQ(*LatestProfilingQuery, UR_PROFILING_INFO_COMMAND_START); + + E.get_profiling_info(); + ASSERT_TRUE(LatestProfilingQuery.has_value()); + ASSERT_EQ(*LatestProfilingQuery, UR_PROFILING_INFO_COMMAND_END); +} + +TEST_F(ProfilingTagTest, ProfilingTagSupportedInOrderQueue) { + mock::getCallbacks().set_after_callback("urDeviceGetInfo", + &after_urDeviceGetInfo); + mock::getCallbacks().set_after_callback( + "urEnqueueTimestampRecordingExp", &after_urEnqueueTimestampRecordingExp); + mock::getCallbacks().set_after_callback("urEventGetProfilingInfo", + &after_urEventGetProfilingInfo); + mock::getCallbacks().set_after_callback( + "urEnqueueEventsWaitWithBarrier", &after_urEnqueueEventsWaitWithBarrier); + + sycl::context Ctx{sycl::platform()}; + sycl::queue Queue{ + Ctx, sycl::default_selector_v, {sycl::property::queue::in_order()}}; + sycl::device Dev = Queue.get_device(); + + ASSERT_TRUE(Dev.has(sycl::aspect::ext_oneapi_queue_profiling_tag)); + + sycl::event E = sycl::ext::oneapi::experimental::submit_profiling_tag(Queue); + ASSERT_EQ(size_t{1}, counter_urEnqueueTimestampRecordingExp); + ASSERT_EQ(size_t{0}, counter_urEnqueueEventsWaitWithBarrier); E.get_profiling_info(); ASSERT_TRUE(LatestProfilingQuery.has_value()); @@ -113,6 +148,38 @@ TEST_F(ProfilingTagTest, ProfilingTagSupportedProfilingQueue) { ASSERT_EQ(*LatestProfilingQuery, UR_PROFILING_INFO_COMMAND_END); } +TEST_F(ProfilingTagTest, ProfilingTagSupportedProfilingInOrderQueue) { + mock::getCallbacks().set_after_callback("urDeviceGetInfo", + &after_urDeviceGetInfo); + mock::getCallbacks().set_after_callback( + "urEnqueueTimestampRecordingExp", &after_urEnqueueTimestampRecordingExp); + mock::getCallbacks().set_after_callback("urEventGetProfilingInfo", + &after_urEventGetProfilingInfo); + mock::getCallbacks().set_after_callback( + "urEnqueueEventsWaitWithBarrier", 
&after_urEnqueueEventsWaitWithBarrier); + + sycl::context Ctx{sycl::platform()}; + sycl::queue Queue{Ctx, + sycl::default_selector_v, + {sycl::property::queue::enable_profiling(), + sycl::property::queue::in_order()}}; + sycl::device Dev = Queue.get_device(); + + ASSERT_TRUE(Dev.has(sycl::aspect::ext_oneapi_queue_profiling_tag)); + + sycl::event E = sycl::ext::oneapi::experimental::submit_profiling_tag(Queue); + ASSERT_EQ(size_t{1}, counter_urEnqueueTimestampRecordingExp); + ASSERT_EQ(size_t{0}, counter_urEnqueueEventsWaitWithBarrier); + + E.get_profiling_info(); + ASSERT_TRUE(LatestProfilingQuery.has_value()); + ASSERT_EQ(*LatestProfilingQuery, UR_PROFILING_INFO_COMMAND_START); + + E.get_profiling_info(); + ASSERT_TRUE(LatestProfilingQuery.has_value()); + ASSERT_EQ(*LatestProfilingQuery, UR_PROFILING_INFO_COMMAND_END); +} + TEST_F(ProfilingTagTest, ProfilingTagFallbackDefaultQueue) { mock::getCallbacks().set_after_callback("urDeviceGetInfo", &after_urDeviceGetInfo); From eb9b82313f35ba5203deffd30cb58d6e8cb507f7 Mon Sep 17 00:00:00 2001 From: Mariya Podchishchaeva Date: Tue, 19 Nov 2024 18:38:03 +0100 Subject: [PATCH 19/36] [clang][SYCL] Print canonical types in free function shim functions (#16119) Somehow even if the outer type was canonical, TypePrinter manages to skip nns printing in case of nested default template arguments. See added test case for the example. 
--- clang/lib/Sema/SemaSYCL.cpp | 2 ++ ...ee_function_default_template_arguments.cpp | 32 +++++++++++++++++++ 2 files changed, 34 insertions(+) diff --git a/clang/lib/Sema/SemaSYCL.cpp b/clang/lib/Sema/SemaSYCL.cpp index 767dde6512b83..a4cf8c20058f8 100644 --- a/clang/lib/Sema/SemaSYCL.cpp +++ b/clang/lib/Sema/SemaSYCL.cpp @@ -6510,6 +6510,7 @@ void SYCLIntegrationHeader::emit(raw_ostream &O) { std::string ParmList; bool FirstParam = true; Policy.SuppressDefaultTemplateArgs = false; + Policy.PrintCanonicalTypes = true; for (ParmVarDecl *Param : K.SyclKernel->parameters()) { if (FirstParam) FirstParam = false; @@ -6518,6 +6519,7 @@ void SYCLIntegrationHeader::emit(raw_ostream &O) { ParmList += Param->getType().getCanonicalType().getAsString(Policy); } FunctionTemplateDecl *FTD = K.SyclKernel->getPrimaryTemplate(); + Policy.PrintCanonicalTypes = false; Policy.SuppressDefinition = true; Policy.PolishForDeclaration = true; Policy.FullyQualifiedName = true; diff --git a/clang/test/CodeGenSYCL/free_function_default_template_arguments.cpp b/clang/test/CodeGenSYCL/free_function_default_template_arguments.cpp index 808f7b93d8112..62a121d218b8b 100644 --- a/clang/test/CodeGenSYCL/free_function_default_template_arguments.cpp +++ b/clang/test/CodeGenSYCL/free_function_default_template_arguments.cpp @@ -60,6 +60,24 @@ templated3(Arg, int, int>, T end) { template void templated3(Arg, int, int>, int); + +namespace sycl { +template struct X {}; +template <> struct X {}; +namespace detail { +struct Y {}; +} // namespace detail +template <> struct X {}; +} // namespace sycl +using namespace sycl; +template > struct Arg1 { T val; }; + +[[__sycl_detail__::add_ir_attributes_function("sycl-single-task-kernel", + 2)]] void +foo(Arg1 arg) { + arg.val = 42; +} + // CHECK: Forward declarations of kernel and its argument types: // CHECK-NEXT: namespace ns { // CHECK-NEXT: struct notatuple; @@ -98,3 +116,17 @@ template void templated3(Arg, int, int>, // CHECK-NEXT: static constexpr auto 
__sycl_shim5() { // CHECK-NEXT: return (void (*)(struct ns::Arg, int, int>, int))templated3; // CHECK-NEXT: } + +// CHECK Forward declarations of kernel and its argument types: +// CHECK: namespace sycl { namespace detail { +// CHECK-NEXT: struct Y; +// CHECK-NEXT: }} +// CHECK-NEXT: namespace sycl { +// CHECK-NEXT: template struct X; +// CHECK-NEXT: } +// CHECK-NEXT: template struct Arg1; + +// CHECK: void foo(Arg1 > arg); +// CHECK-NEXT: static constexpr auto __sycl_shim6() { +// CHECK-NEXT: return (void (*)(struct Arg1 >))foo; +// CHECK-NEXT: } From 8922ee77cf68f322e2a762f33ee58e7a1623d037 Mon Sep 17 00:00:00 2001 From: Dmitry Sidorov Date: Tue, 19 Nov 2024 19:09:17 +0100 Subject: [PATCH 20/36] [SYCL] Enable nonsemantic.shader.debuginfo.200 by default (#16120) It's left disabled only for FPGA target. --------- Signed-off-by: Sidorov, Dmitry --- clang/lib/Driver/ToolChains/Clang.cpp | 8 ++--- .../sycl-spirv-default-options-old-model.c | 36 +++++++++++++++++++ .../test/Driver/sycl-spirv-default-options.c | 17 +++++++++ .../ClangLinkerWrapper.cpp | 14 +------- 4 files changed, 56 insertions(+), 19 deletions(-) create mode 100644 clang/test/Driver/sycl-spirv-default-options-old-model.c create mode 100644 clang/test/Driver/sycl-spirv-default-options.c diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp index 57949dbc3f1d2..2d2c5dab453bd 100644 --- a/clang/lib/Driver/ToolChains/Clang.cpp +++ b/clang/lib/Driver/ToolChains/Clang.cpp @@ -10674,12 +10674,8 @@ static void getTripleBasedSPIRVTransOpts(Compilation &C, ArgStringList &TranslatorArgs) { bool IsCPU = Triple.isSPIR() && Triple.getSubArch() == llvm::Triple::SPIRSubArch_x86_64; - // Enable NonSemanticShaderDebugInfo.200 for CPU AOT and for non-Windows - const bool IsWindowsMSVC = - Triple.isWindowsMSVCEnvironment() || - C.getDefaultToolChain().getTriple().isWindowsMSVCEnvironment(); - const bool EnableNonSemanticDebug = - IsCPU || (!IsWindowsMSVC && 
!C.getDriver().IsFPGAHWMode()); + // Enable NonSemanticShaderDebugInfo.200 for non-FPGA targets. + const bool EnableNonSemanticDebug = !C.getDriver().IsFPGAHWMode(); if (EnableNonSemanticDebug) { TranslatorArgs.push_back( "-spirv-debug-info-version=nonsemantic-shader-200"); diff --git a/clang/test/Driver/sycl-spirv-default-options-old-model.c b/clang/test/Driver/sycl-spirv-default-options-old-model.c new file mode 100644 index 0000000000000..66f63a69737e5 --- /dev/null +++ b/clang/test/Driver/sycl-spirv-default-options-old-model.c @@ -0,0 +1,36 @@ +// Test for default llvm-spirv options + +// RUN: %clang -target x86_64-unknown-linux-gnu -fsycl --no-offload-new-driver -fsycl-targets=spir64-unknown-unknown %s -### 2>&1 \ +// RUN: | FileCheck %s -check-prefixes=CHECK-DEFAULT + +// RUN: %clang -target x86_64-unknown-linux-gnu -fsycl --no-offload-new-driver -fsycl-targets=spir64-unknown-unknown %s -### 2>&1 \ +// RUN: | FileCheck %s -check-prefixes=CHECK-DEFAULT +// RUN: %clang -target x86_64-unknown-linux-gnu -fsycl --no-offload-new-driver -fsycl-targets=spir64_fpga-unknown-unknown %s -### 2>&1 \ +// RUN: | FileCheck %s -check-prefixes=CHECK-DEFAULT +// RUN: %clang -target x86_64-unknown-linux-gnu -fintelfpga %s -### 2>&1 \ +// RUN: | FileCheck %s -check-prefixes=CHECK-DEFAULT +// RUN: %clang -target x86_64-unknown-linux-gnu -fsycl --no-offload-new-driver -fsycl-targets=spir64_fpga-unknown-unknown -Xshardware %s -### 2>&1 \ +// RUN: | FileCheck %s -check-prefixes=CHECK-FPGA-HW +// RUN: %clang -target x86_64-unknown-linux-gnu -fintelfpga -Xshardware %s -### 2>&1 \ +// RUN: | FileCheck %s -check-prefixes=CHECK-FPGA-HW +// RUN: %clang -target x86_64-unknown-linux-gnu -fsycl --no-offload-new-driver -fsycl-targets=spir64_fpga-unknown-unknown -Xssimulation %s -### 2>&1 \ +// RUN: | FileCheck %s -check-prefixes=CHECK-FPGA-HW +// RUN: %clang -target x86_64-unknown-linux-gnu -fintelfpga -Xssimulation %s -### 2>&1 \ +// RUN: | FileCheck %s -check-prefixes=CHECK-FPGA-HW +// RUN: 
%clang -target x86_64-unknown-linux-gnu -fsycl --no-offload-new-driver -fsycl-targets=spir64_fpga-unknown-unknown -Xsemulator %s -### 2>&1 \ +// RUN: | FileCheck %s -check-prefixes=CHECK-DEFAULT +// RUN: %clang -target x86_64-unknown-linux-gnu -fintelfpga -Xsemulator %s -### 2>&1 \ +// RUN: | FileCheck %s -check-prefixes=CHECK-DEFAULT +// RUN: %clang -target x86_64-unknown-linux-gnu -fsycl --no-offload-new-driver -fsycl-targets=spir64_gen-unknown-unknown %s -### 2>&1 \ +// RUN: | FileCheck %s -check-prefixes=CHECK-DEFAULT +// RUN: %clang -target x86_64-unknown-linux-gnu -fsycl --no-offload-new-driver -fsycl-targets=spir64_gen-unknown-unknown %s -### 2>&1 \ +// RUN: | FileCheck %s -check-prefixes=CHECK-DEFAULT +// RUN: %clang -target x86_64-unknown-linux-gnu -fsycl --no-offload-new-driver -fsycl-targets=spir64_x86_64-unknown-unknown %s -### 2>&1 \ +// RUN: | FileCheck %s -check-prefixes=CHECK-DEFAULT + +// CHECK-DEFAULT: llvm-spirv{{.*}}-spirv-debug-info-version=nonsemantic-shader-200 +// CHECK-DEFAULT-NOT: -ocl-100 + +// CHECL-FPGA-HW: llvm-spirv{{.*}}-ocl-100 +// CHECK-FPGA-HW-NOT: spirv-debug-info-version=nonsemantic-shader-200 + diff --git a/clang/test/Driver/sycl-spirv-default-options.c b/clang/test/Driver/sycl-spirv-default-options.c new file mode 100644 index 0000000000000..c3ced7858e52f --- /dev/null +++ b/clang/test/Driver/sycl-spirv-default-options.c @@ -0,0 +1,17 @@ +// Generate .o file as SYCL device library file. 
+// +// RUN: touch %t.devicelib.cpp +// RUN: %clang %t.devicelib.cpp -fsycl -fsycl-targets=spir64-unknown-unknown -c --offload-new-driver -o %t_1.devicelib.o +// RUN: %clang %t.devicelib.cpp -fsycl -fsycl-targets=spir64_gen-unknown-unknown -c --offload-new-driver -o %t_2.devicelib.o +// RUN: %clang %t.devicelib.cpp -fsycl -fsycl-targets=spir64_x86_64-unknown-unknown -c --offload-new-driver -o %t_3.devicelib.o + +// Test for default llvm-spirv options + +// RUN: %clang -target x86_64-unknown-linux-gnu -fsycl --offload-new-driver \ +// RUN: -fsycl-targets=spir64-unknown-unknown -c %s -o %t_1.o +// RUN: clang-linker-wrapper -sycl-device-libraries=%t_1.devicelib.o \ +// RUN: "--host-triple=x86_64-unknown-linux-gnu" "--linker-path=/usr/bin/ld" \ +// RUN: "--" "-o" "a.out" %t_1.o --dry-run 2>&1 | FileCheck %s + +// CHECK: llvm-spirv{{.*}}-spirv-debug-info-version=nonsemantic-shader-200 +// CHECK-NOT: ocl-100 diff --git a/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp b/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp index 2f7993c6411f4..e4a37591384c7 100644 --- a/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp +++ b/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp @@ -830,19 +830,7 @@ getTripleBasedSPIRVTransOpts(const ArgList &Args, const llvm::Triple Triple) { bool IsCPU = Triple.isSPIR() && Triple.getSubArch() == llvm::Triple::SPIRSubArch_x86_64; - // Enable NonSemanticShaderDebugInfo.200 for CPU AOT and for non-Windows - const bool IsWindowsMSVC = Triple.isWindowsMSVCEnvironment() || - Args.hasArg(OPT_sycl_is_windows_msvc_env); - const bool EnableNonSemanticDebug = IsCPU || !IsWindowsMSVC; - if (EnableNonSemanticDebug) { - TranslatorArgs.push_back( - "-spirv-debug-info-version=nonsemantic-shader-200"); - } else { - TranslatorArgs.push_back("-spirv-debug-info-version=ocl-100"); - // Prevent crash in the translator if input IR contains DIExpression - // operations which don't have mapping to OpenCL.DebugInfo.100 spec. 
- TranslatorArgs.push_back("-spirv-allow-extra-diexpressions"); - } + TranslatorArgs.push_back("-spirv-debug-info-version=nonsemantic-shader-200"); std::string UnknownIntrinsics("-spirv-allow-unknown-intrinsics=llvm.genx."); if (IsCPU) UnknownIntrinsics += ",llvm.fpbuiltin"; From 39483ab51fb67b3ac50a82001a719ed261f57596 Mon Sep 17 00:00:00 2001 From: Lorenc Bushi Date: Tue, 19 Nov 2024 16:15:10 -0500 Subject: [PATCH 21/36] [SYCL] Add support for work group memory free function kernel parameter (#15861) This PR concludes the implementation of the work group memory [extension](https://github.com/intel/llvm/blob/sycl/sycl/doc/extensions/proposed/sycl_ext_oneapi_work_group_memory.asciidoc). It adds support for work group memory parameters when using free function kernels. --------- Co-authored-by: lorenc.bushi --- clang/lib/Sema/SemaSYCL.cpp | 141 +++++++++++++++--- .../CodeGenSYCL/free_function_int_header.cpp | 40 ++++- .../free_function_kernel_params.cpp | 17 ++- .../SemaSYCL/free_function_kernel_params.cpp | 22 ++- ...sycl_ext_oneapi_work_group_memory.asciidoc | 10 +- .../oneapi/experimental/work_group_memory.hpp | 4 +- sycl/include/sycl/handler.hpp | 4 +- sycl/source/feature_test.hpp.in | 1 + .../WorkGroupMemory/common_free_function.hpp | 1 - .../reduction_free_function.cpp | 3 - 10 files changed, 210 insertions(+), 33 deletions(-) rename sycl/doc/extensions/{proposed => experimental}/sycl_ext_oneapi_work_group_memory.asciidoc (98%) diff --git a/clang/lib/Sema/SemaSYCL.cpp b/clang/lib/Sema/SemaSYCL.cpp index a4cf8c20058f8..242e6c8a9d7d4 100644 --- a/clang/lib/Sema/SemaSYCL.cpp +++ b/clang/lib/Sema/SemaSYCL.cpp @@ -1522,7 +1522,7 @@ class KernelObjVisitor { void visitParam(ParmVarDecl *Param, QualType ParamTy, HandlerTys &...Handlers) { if (isSyclSpecialType(ParamTy, SemaSYCLRef)) - KP_FOR_EACH(handleOtherType, Param, ParamTy); + KP_FOR_EACH(handleSyclSpecialType, Param, ParamTy); else if (ParamTy->isStructureOrClassType()) { if (KP_FOR_EACH(handleStructType, Param, 
ParamTy)) { CXXRecordDecl *RD = ParamTy->getAsCXXRecordDecl(); @@ -2075,8 +2075,11 @@ class SyclKernelFieldChecker : public SyclKernelFieldHandler { } bool handleSyclSpecialType(ParmVarDecl *PD, QualType ParamTy) final { - Diag.Report(PD->getLocation(), diag::err_bad_kernel_param_type) << ParamTy; - IsInvalid = true; + if (!SemaSYCL::isSyclType(ParamTy, SYCLTypeAttr::work_group_memory)) { + Diag.Report(PD->getLocation(), diag::err_bad_kernel_param_type) + << ParamTy; + IsInvalid = true; + } return isValid(); } @@ -2228,8 +2231,8 @@ class SyclKernelUnionChecker : public SyclKernelFieldHandler { } bool handleSyclSpecialType(ParmVarDecl *PD, QualType ParamTy) final { - // TODO - unsupportedFreeFunctionParamType(); + if (!SemaSYCL::isSyclType(ParamTy, SYCLTypeAttr::work_group_memory)) + unsupportedFreeFunctionParamType(); // TODO return true; } @@ -3013,9 +3016,26 @@ class SyclKernelDeclCreator : public SyclKernelFieldHandler { return handleSpecialType(FD, FieldTy); } - bool handleSyclSpecialType(ParmVarDecl *, QualType) final { - // TODO - unsupportedFreeFunctionParamType(); + bool handleSyclSpecialType(ParmVarDecl *PD, QualType ParamTy) final { + if (SemaSYCL::isSyclType(ParamTy, SYCLTypeAttr::work_group_memory)) { + const auto *RecordDecl = ParamTy->getAsCXXRecordDecl(); + assert(RecordDecl && "The type must be a RecordDecl"); + CXXMethodDecl *InitMethod = getMethodByName(RecordDecl, InitMethodName); + assert(InitMethod && "The type must have the __init method"); + // Don't do -1 here because we count on this to be the first parameter + // added (if any). + size_t ParamIndex = Params.size(); + for (const ParmVarDecl *Param : InitMethod->parameters()) { + QualType ParamTy = Param->getType(); + addParam(Param, ParamTy.getCanonicalType()); + // Propagate add_ir_attributes_kernel_parameter attribute. 
+ if (const auto *AddIRAttr = + Param->getAttr()) + Params.back()->addAttr(AddIRAttr->clone(SemaSYCLRef.getASTContext())); + } + LastParamIndex = ParamIndex; + } else // TODO + unsupportedFreeFunctionParamType(); return true; } @@ -3291,9 +3311,7 @@ class SyclKernelArgsSizeChecker : public SyclKernelFieldHandler { } bool handleSyclSpecialType(ParmVarDecl *PD, QualType ParamTy) final { - // TODO - unsupportedFreeFunctionParamType(); - return true; + return handleSpecialType(ParamTy); } bool handleSyclSpecialType(const CXXRecordDecl *, const CXXBaseSpecifier &BS, @@ -4442,6 +4460,45 @@ class FreeFunctionKernelBodyCreator : public SyclKernelFieldHandler { {}); } + MemberExpr *buildMemberExpr(Expr *Base, ValueDecl *Member) { + DeclAccessPair MemberDAP = DeclAccessPair::make(Member, AS_none); + MemberExpr *Result = SemaSYCLRef.SemaRef.BuildMemberExpr( + Base, /*IsArrow */ false, FreeFunctionSrcLoc, NestedNameSpecifierLoc(), + FreeFunctionSrcLoc, Member, MemberDAP, + /*HadMultipleCandidates*/ false, + DeclarationNameInfo(Member->getDeclName(), FreeFunctionSrcLoc), + Member->getType(), VK_LValue, OK_Ordinary); + return Result; + } + + void createSpecialMethodCall(const CXXRecordDecl *RD, StringRef MethodName, + Expr *MemberBaseExpr, + SmallVectorImpl &AddTo) { + CXXMethodDecl *Method = getMethodByName(RD, MethodName); + if (!Method) + return; + unsigned NumParams = Method->getNumParams(); + llvm::SmallVector ParamDREs(NumParams); + llvm::ArrayRef KernelParameters = + DeclCreator.getParamVarDeclsForCurrentField(); + for (size_t I = 0; I < NumParams; ++I) { + QualType ParamType = KernelParameters[I]->getOriginalType(); + ParamDREs[I] = SemaSYCLRef.SemaRef.BuildDeclRefExpr( + KernelParameters[I], ParamType, VK_LValue, FreeFunctionSrcLoc); + } + MemberExpr *MethodME = buildMemberExpr(MemberBaseExpr, Method); + QualType ResultTy = Method->getReturnType(); + ExprValueKind VK = Expr::getValueKindForType(ResultTy); + ResultTy = 
ResultTy.getNonLValueExprType(SemaSYCLRef.getASTContext()); + llvm::SmallVector ParamStmts; + const auto *Proto = cast(Method->getType()); + SemaSYCLRef.SemaRef.GatherArgumentsForCall(FreeFunctionSrcLoc, Method, + Proto, 0, ParamDREs, ParamStmts); + AddTo.push_back(CXXMemberCallExpr::Create( + SemaSYCLRef.getASTContext(), MethodME, ParamStmts, ResultTy, VK, + FreeFunctionSrcLoc, FPOptionsOverride())); + } + public: static constexpr const bool VisitInsideSimpleContainers = false; @@ -4461,9 +4518,53 @@ class FreeFunctionKernelBodyCreator : public SyclKernelFieldHandler { return true; } - bool handleSyclSpecialType(ParmVarDecl *, QualType) final { - // TODO - unsupportedFreeFunctionParamType(); + // Default inits the type, then calls the init-method in the body. + // A type may not have a public default constructor as per its spec so + // typically if this is the case the default constructor will be private and + // in such cases we must manually override the access specifier from private + // to public just for the duration of this default initialization. + // TODO: Revisit this approach once https://github.com/intel/llvm/issues/16061 + // is closed. 
+ bool handleSyclSpecialType(ParmVarDecl *PD, QualType ParamTy) final { + if (SemaSYCL::isSyclType(ParamTy, SYCLTypeAttr::work_group_memory)) { + const auto *RecordDecl = ParamTy->getAsCXXRecordDecl(); + AccessSpecifier DefaultConstructorAccess; + auto DefaultConstructor = + std::find_if(RecordDecl->ctor_begin(), RecordDecl->ctor_end(), + [](auto it) { return it->isDefaultConstructor(); }); + DefaultConstructorAccess = DefaultConstructor->getAccess(); + DefaultConstructor->setAccess(AS_public); + + QualType Ty = PD->getOriginalType(); + ASTContext &Ctx = SemaSYCLRef.SemaRef.getASTContext(); + VarDecl *WorkGroupMemoryClone = VarDecl::Create( + Ctx, DeclCreator.getKernelDecl(), FreeFunctionSrcLoc, + FreeFunctionSrcLoc, PD->getIdentifier(), PD->getType(), + Ctx.getTrivialTypeSourceInfo(Ty), SC_None); + InitializedEntity VarEntity = + InitializedEntity::InitializeVariable(WorkGroupMemoryClone); + InitializationKind InitKind = + InitializationKind::CreateDefault(FreeFunctionSrcLoc); + InitializationSequence InitSeq(SemaSYCLRef.SemaRef, VarEntity, InitKind, + std::nullopt); + ExprResult Init = InitSeq.Perform(SemaSYCLRef.SemaRef, VarEntity, + InitKind, std::nullopt); + WorkGroupMemoryClone->setInit( + SemaSYCLRef.SemaRef.MaybeCreateExprWithCleanups(Init.get())); + WorkGroupMemoryClone->setInitStyle(VarDecl::CallInit); + DefaultConstructor->setAccess(DefaultConstructorAccess); + + Stmt *DS = new (SemaSYCLRef.getASTContext()) + DeclStmt(DeclGroupRef(WorkGroupMemoryClone), FreeFunctionSrcLoc, + FreeFunctionSrcLoc); + BodyStmts.push_back(DS); + Expr *MemberBaseExpr = SemaSYCLRef.SemaRef.BuildDeclRefExpr( + WorkGroupMemoryClone, Ty, VK_PRValue, FreeFunctionSrcLoc); + createSpecialMethodCall(RecordDecl, InitMethodName, MemberBaseExpr, + BodyStmts); + ArgExprs.push_back(MemberBaseExpr); + } else // TODO + unsupportedFreeFunctionParamType(); return true; } @@ -4748,9 +4849,11 @@ class SyclKernelIntHeaderCreator : public SyclKernelFieldHandler { return true; } - bool 
handleSyclSpecialType(ParmVarDecl *, QualType) final { - // TODO - unsupportedFreeFunctionParamType(); + bool handleSyclSpecialType(ParmVarDecl *PD, QualType ParamTy) final { + if (SemaSYCL::isSyclType(ParamTy, SYCLTypeAttr::work_group_memory)) + addParam(PD, ParamTy, SYCLIntegrationHeader::kind_work_group_memory); + else + unsupportedFreeFunctionParamType(); // TODO return true; } @@ -6227,7 +6330,6 @@ void SYCLIntegrationHeader::emit(raw_ostream &O) { O << "#include \n"; O << "#include \n"; O << "#include \n"; - O << "\n"; LangOptions LO; @@ -6502,6 +6604,7 @@ void SYCLIntegrationHeader::emit(raw_ostream &O) { O << "\n"; O << "// Forward declarations of kernel and its argument types:\n"; + Policy.SuppressDefaultTemplateArgs = false; FwdDeclEmitter.Visit(K.SyclKernel->getType()); O << "\n"; @@ -6579,6 +6682,8 @@ void SYCLIntegrationHeader::emit(raw_ostream &O) { } O << ";\n"; O << "}\n"; + Policy.SuppressDefaultTemplateArgs = true; + Policy.EnforceDefaultTemplateArgs = false; // Generate is_kernel, is_single_task_kernel and nd_range_kernel functions. O << "namespace sycl {\n"; diff --git a/clang/test/CodeGenSYCL/free_function_int_header.cpp b/clang/test/CodeGenSYCL/free_function_int_header.cpp index ccaf85aa897ca..6a196dedc2fc2 100644 --- a/clang/test/CodeGenSYCL/free_function_int_header.cpp +++ b/clang/test/CodeGenSYCL/free_function_int_header.cpp @@ -2,7 +2,7 @@ // RUN: FileCheck -input-file=%t.h %s // // This test checks integration header contents for free functions with scalar, -// pointer and non-decomposed struct parameters. +// pointer, non-decomposed struct parameters and work group memory parameters. 
#include "mock_properties.hpp" #include "sycl.hpp" @@ -96,6 +96,12 @@ void ff_7(KArgWithPtrArray KArg) { template void ff_7(KArgWithPtrArray KArg); +__attribute__((sycl_device)) +[[__sycl_detail__::add_ir_attributes_function("sycl-nd-range-kernel", 0)]] +void ff_8(sycl::work_group_memory) { +} + + // CHECK: const char* const kernel_names[] = { // CHECK-NEXT: {{.*}}__sycl_kernel_ff_2Piii // CHECK-NEXT: {{.*}}__sycl_kernel_ff_2Piiii @@ -105,6 +111,7 @@ template void ff_7(KArgWithPtrArray KArg); // CHECK-NEXT: {{.*}}__sycl_kernel_ff_410NoPointers8Pointers3Agg // CHECK-NEXT: {{.*}}__sycl_kernel_ff_6I3Agg7DerivedEvT_T0_i // CHECK-NEXT: {{.*}}__sycl_kernel_ff_7ILi3EEv16KArgWithPtrArrayIXT_EE +// CHECK-NEXT: {{.*}}__sycl_kernel_ff_8N4sycl3_V117work_group_memoryIiEE // CHECK-NEXT: "" // CHECK-NEXT: }; @@ -148,6 +155,9 @@ template void ff_7(KArgWithPtrArray KArg); // CHECK: //--- _Z18__sycl_kernel_ff_7ILi3EEv16KArgWithPtrArrayIXT_EE // CHECK-NEXT: { kernel_param_kind_t::kind_std_layout, 48, 0 }, +// CHECK: //--- _Z18__sycl_kernel_ff_8N4sycl3_V117work_group_memoryIiEE +// CHECK-NEXT: { kernel_param_kind_t::kind_work_group_memory, 8, 0 }, + // CHECK: { kernel_param_kind_t::kind_invalid, -987654321, -987654321 }, // CHECK-NEXT: }; @@ -294,6 +304,26 @@ template void ff_7(KArgWithPtrArray KArg); // CHECK-NEXT: }; // CHECK-NEXT: } +// CHECK: Definition of _Z18__sycl_kernel_ff_8N4sycl3_V117work_group_memoryIiEE as a free function kernel + +// CHECK: Forward declarations of kernel and its argument types: +// CHECK: template class work_group_memory; + +// CHECK: void ff_8(sycl::work_group_memory); +// CHECK-NEXT: static constexpr auto __sycl_shim9() { +// CHECK-NEXT: return (void (*)(class sycl::work_group_memory))ff_8; +// CHECK-NEXT: } +// CHECK-NEXT: namespace sycl { +// CHECK-NEXT: template <> +// CHECK-NEXT: struct ext::oneapi::experimental::is_kernel<__sycl_shim9()> { +// CHECK-NEXT: static constexpr bool value = true; +// CHECK-NEXT: }; +// CHECK-NEXT: template <> +// 
CHECK-NEXT: struct ext::oneapi::experimental::is_single_task_kernel<__sycl_shim9()> { +// CHECK-NEXT: static constexpr bool value = true; +// CHECK-NEXT: }; +// CHECK-NEXT: } + // CHECK: #include // CHECK: Definition of kernel_id of _Z18__sycl_kernel_ff_2Piii @@ -359,3 +389,11 @@ template void ff_7(KArgWithPtrArray KArg); // CHECK-NEXT: return sycl::detail::get_kernel_id_impl(std::string_view{"_Z18__sycl_kernel_ff_7ILi3EEv16KArgWithPtrArrayIXT_EE"}); // CHECK-NEXT: } // CHECK-NEXT: } + +// CHECK: Definition of kernel_id of _Z18__sycl_kernel_ff_8N4sycl3_V117work_group_memoryIiEE +// CHECK-NEXT: namespace sycl { +// CHECK-NEXT: template <> +// CHECK-NEXT: kernel_id ext::oneapi::experimental::get_kernel_id<__sycl_shim9()>() { +// CHECK-NEXT: return sycl::detail::get_kernel_id_impl(std::string_view{"_Z18__sycl_kernel_ff_8N4sycl3_V117work_group_memoryIiEE"}); +// CHECK-NEXT: } +// CHECK-NEXT: } diff --git a/clang/test/CodeGenSYCL/free_function_kernel_params.cpp b/clang/test/CodeGenSYCL/free_function_kernel_params.cpp index a11d55f483966..2e78116824ad2 100644 --- a/clang/test/CodeGenSYCL/free_function_kernel_params.cpp +++ b/clang/test/CodeGenSYCL/free_function_kernel_params.cpp @@ -1,7 +1,7 @@ // RUN: %clang_cc1 -internal-isystem %S/Inputs -fsycl-is-device -triple spir64 \ // RUN: -emit-llvm %s -o - | FileCheck %s // This test checks parameter IR generation for free functions with parameters -// of non-decomposed struct type. +// of non-decomposed struct type and work group memory type. 
#include "sycl.hpp" @@ -56,3 +56,18 @@ template void ff_6(KArgWithPtrArray KArg); // CHECK: %struct.KArgWithPtrArray = type { [3 x ptr addrspace(4)], [3 x i32], [3 x i32] } // CHECK: define dso_local spir_kernel void @{{.*}}__sycl_kernel{{.*}}(ptr noundef byval(%struct.NoPointers) align 4 %__arg_S1, ptr noundef byval(%struct.__generated_Pointers) align 8 %__arg_S2, ptr noundef byval(%struct.__generated_Agg) align 8 %__arg_S3) // CHECK: define dso_local spir_kernel void @{{.*}}__sycl_kernel_ff_6{{.*}}(ptr noundef byval(%struct.__generated_KArgWithPtrArray) align 8 %__arg_KArg) + +__attribute__((sycl_device)) +[[__sycl_detail__::add_ir_attributes_function("sycl-nd-range-kernel", 0)]] +void ff_7(sycl::work_group_memory mem) { +} + +// CHECK: define dso_local spir_kernel void @{{.*}}__sycl_kernel_ff_7{{.*}}(ptr addrspace(3) noundef align 4 %__arg_Ptr) +// CHECK: %__arg_Ptr.addr = alloca ptr addrspace(3), align 8 +// CHECK-NEXT: %mem = alloca %"class.sycl::_V1::work_group_memory", align 8 +// CHECK: %__arg_Ptr.addr.ascast = addrspacecast ptr %__arg_Ptr.addr to ptr addrspace(4) +// CHECK-NEXT: %mem.ascast = addrspacecast ptr %mem to ptr addrspace(4) +// CHECK: store ptr addrspace(3) %__arg_Ptr, ptr addrspace(4) %__arg_Ptr.addr.ascast, align 8 +// CHECK-NEXT: [[REGISTER:%[a-zA-Z0-9_]+]] = load ptr addrspace(3), ptr addrspace(4) %__arg_Ptr.addr.ascast, align 8 +// CHECK-NEXT: call spir_func void @{{.*}}work_group_memory{{.*}}__init{{.*}}(ptr addrspace(4) noundef align 8 dereferenceable_or_null(8) %mem.ascast, ptr addrspace(3) noundef [[REGISTER]]) + diff --git a/clang/test/SemaSYCL/free_function_kernel_params.cpp b/clang/test/SemaSYCL/free_function_kernel_params.cpp index 2de4f896a1513..da229145a34ad 100644 --- a/clang/test/SemaSYCL/free_function_kernel_params.cpp +++ b/clang/test/SemaSYCL/free_function_kernel_params.cpp @@ -1,7 +1,7 @@ // RUN: %clang_cc1 -internal-isystem %S/Inputs -fsycl-is-device -ast-dump \ // RUN: %s -o - | FileCheck %s // This test checks parameter 
rewriting for free functions with parameters -// of type scalar, pointer and non-decomposed struct. +// of type scalar, pointer, non-decomposed struct and work group memory. #include "sycl.hpp" @@ -171,3 +171,23 @@ template void ff_6(Agg S1, Derived1 S2, int); // CHECK-NEXT: DeclRefExpr {{.*}} '__generated_Derived1' lvalue ParmVar {{.*}} '__arg_S2' '__generated_Derived1' // CHECK-NEXT: ImplicitCastExpr {{.*}} 'int' // CHECK-NEXT: DeclRefExpr {{.*}} 'int' lvalue ParmVar {{.*}} '__arg_end' 'int' + +__attribute__((sycl_device)) +[[__sycl_detail__::add_ir_attributes_function("sycl-nd-range-kernel", 0)]] +void ff_7(sycl::work_group_memory mem) { +} +// CHECK: FunctionDecl {{.*}}__sycl_kernel{{.*}}'void (__local int *)' +// CHECK-NEXT: ParmVarDecl {{.*}} used __arg_Ptr '__local int *' +// CHECK-NEXT: CompoundStmt +// CHECK-NEXT: DeclStmt +// CHECK-NEXT: VarDecl {{.*}} used mem 'sycl::work_group_memory' callinit +// CHECK-NEXT: CXXConstructExpr {{.*}} 'sycl::work_group_memory' 'void () noexcept' +// CHECK-NEXT: CXXMemberCallExpr {{.*}} 'void' +// CHECK-NEXT: MemberExpr {{.*}} 'void (__local int *)' lvalue .__init +// CHECK-NEXT: DeclRefExpr {{.*}} 'sycl::work_group_memory' Var {{.*}} 'mem' 'sycl::work_group_memory' +// CHECK-NEXT: ImplicitCastExpr {{.*}} '__local int *' +// CHECK-NEXT: DeclRefExpr {{.*}} '__local int *' lvalue ParmVar {{.*}} '__arg_Ptr' '__local int *' +// CHECK-NEXT: CallExpr {{.*}} 'void' +// CHECK-NEXT: ImplicitCastExpr {{.*}} 'void (*)(sycl::work_group_memory)' +// CHECK-NEXT: DeclRefExpr {{.*}} 'void (sycl::work_group_memory)' lvalue Function {{.*}} 'ff_7' 'void (sycl::work_group_memory)' +// CHECK-NEXT: DeclRefExpr {{.*}} 'sycl::work_group_memory' Var {{.*}} 'mem' 'sycl::work_group_memory' diff --git a/sycl/doc/extensions/proposed/sycl_ext_oneapi_work_group_memory.asciidoc b/sycl/doc/extensions/experimental/sycl_ext_oneapi_work_group_memory.asciidoc similarity index 98% rename from 
sycl/doc/extensions/proposed/sycl_ext_oneapi_work_group_memory.asciidoc rename to sycl/doc/extensions/experimental/sycl_ext_oneapi_work_group_memory.asciidoc index 296b77acf82fb..2cbc9d0b2d28b 100644 --- a/sycl/doc/extensions/proposed/sycl_ext_oneapi_work_group_memory.asciidoc +++ b/sycl/doc/extensions/experimental/sycl_ext_oneapi_work_group_memory.asciidoc @@ -49,12 +49,10 @@ This extension also depends on the following other SYCL extensions: == Status -This is a proposed extension specification, intended to gather community -feedback. -Interfaces defined in this specification may not be implemented yet or may be -in a preliminary state. -The specification itself may also change in incompatible ways before it is -finalized. +This is an experimental extension specification, intended to provide early +access to features and gather community feedback. Interfaces defined in this +specification are implemented in {dpcpp}, but they are not finalized and may +change incompatibly in future versions of {dpcpp} without prior notice. *Shipping software products should not rely on APIs defined in this specification.* diff --git a/sycl/include/sycl/ext/oneapi/experimental/work_group_memory.hpp b/sycl/include/sycl/ext/oneapi/experimental/work_group_memory.hpp index c156c484f539d..254fd8d877f8e 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/work_group_memory.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/work_group_memory.hpp @@ -1,5 +1,4 @@ //===-------------------- work_group_memory.hpp ---------------------------===// -// // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception @@ -103,6 +102,9 @@ class __SYCL_SPECIAL_CLASS __SYCL_TYPE(work_group_memory) work_group_memory } private: + friend class sycl::handler; // needed in order for handler class to be aware + // of the private inheritance with + // work_group_memory_impl as base class decoratedPtr ptr = nullptr; }; } // namespace ext::oneapi::experimental diff --git a/sycl/include/sycl/handler.hpp b/sycl/include/sycl/handler.hpp index 4e8f62d53c36d..d0a9867ec4c40 100644 --- a/sycl/include/sycl/handler.hpp +++ b/sycl/include/sycl/handler.hpp @@ -1849,7 +1849,9 @@ class __SYCL_EXPORT handler { void set_arg( int ArgIndex, ext::oneapi::experimental::work_group_memory &Arg) { - setArgHelper(ArgIndex, Arg); + // slice the base class object out of Arg + detail::work_group_memory_impl &ArgImpl = Arg; + setArgHelper(ArgIndex, ArgImpl); } // set_arg for graph dynamic_parameters diff --git a/sycl/source/feature_test.hpp.in b/sycl/source/feature_test.hpp.in index 8f4fb05752efc..c1e62f5492abe 100644 --- a/sycl/source/feature_test.hpp.in +++ b/sycl/source/feature_test.hpp.in @@ -109,6 +109,7 @@ inline namespace _V1 { #define SYCL_EXT_ONEAPI_PROFILING_TAG 1 #define SYCL_EXT_ONEAPI_ENQUEUE_NATIVE_COMMAND 1 #define SYCL_EXT_ONEAPI_GET_KERNEL_INFO 1 +#define SYCL_EXT_ONEAPI_WORK_GROUP_MEMORY 1 // In progress yet #define SYCL_EXT_ONEAPI_ATOMIC16 0 diff --git a/sycl/test-e2e/WorkGroupMemory/common_free_function.hpp b/sycl/test-e2e/WorkGroupMemory/common_free_function.hpp index e13f50214593d..7cc1b6008bd78 100644 --- a/sycl/test-e2e/WorkGroupMemory/common_free_function.hpp +++ b/sycl/test-e2e/WorkGroupMemory/common_free_function.hpp @@ -1,7 +1,6 @@ #pragma once #include "common.hpp" -#include "common_lambda.hpp" #include #include #include diff --git a/sycl/test-e2e/WorkGroupMemory/reduction_free_function.cpp b/sycl/test-e2e/WorkGroupMemory/reduction_free_function.cpp index ff2aa8aa19385..1f2f5ccd0c5e1 100644 --- 
a/sycl/test-e2e/WorkGroupMemory/reduction_free_function.cpp +++ b/sycl/test-e2e/WorkGroupMemory/reduction_free_function.cpp @@ -5,9 +5,6 @@ // UNSUPPORTED: cuda // UNSUPPORTED-TRACKER: https://github.com/intel/llvm/issues/16004 -// XFAIL: * -// XFAIL-TRACKER: https://github.com/intel/llvm/issues/15927 - #include "common_free_function.hpp" // Basic usage reduction test using free function kernels. From f0899ff9b208221e12e98137a1514c3c4ab73c0d Mon Sep 17 00:00:00 2001 From: aelovikov-intel Date: Tue, 19 Nov 2024 22:55:08 -0800 Subject: [PATCH 22/36] [NFC][SYCL] Modernize (idiomatic C++17) `SingleNontypePropertyValueBase` (#16128) --- sycl/include/sycl/ext/oneapi/properties/property_value.hpp | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/sycl/include/sycl/ext/oneapi/properties/property_value.hpp b/sycl/include/sycl/ext/oneapi/properties/property_value.hpp index c3b825e6054d6..813e4ecf964ea 100644 --- a/sycl/include/sycl/ext/oneapi/properties/property_value.hpp +++ b/sycl/include/sycl/ext/oneapi/properties/property_value.hpp @@ -17,16 +17,11 @@ inline namespace _V1 { namespace ext::oneapi::experimental { namespace detail { -// Checks if a type T has a static value member variable. -template struct HasValue : std::false_type {}; -template -struct HasValue : std::true_type {}; - // Base class for property values with a single non-type value template struct SingleNontypePropertyValueBase {}; template -struct SingleNontypePropertyValueBase::value>> { +struct SingleNontypePropertyValueBase> { static constexpr auto value = T::value; }; From 0e2094de551ff3a2c4acb87c5ff250487867468e Mon Sep 17 00:00:00 2001 From: Udit Agarwal Date: Tue, 19 Nov 2024 23:03:23 -0800 Subject: [PATCH 23/36] [SYCL] Implement eviction for in-memory program cache (#16062) Fixes: CMPLRLLVM-27640, https://github.com/intel/llvm/issues/2517 The PR implements LRU cache eviction policy for in-memory program caches. 
The high-level idea is to store programs in a linked list, called the eviction list. When a program is first added to the cache, it is also added to the eviction list. When a program is fetched from the cache, we move it to the end of the eviction list, so that the programs at the beginning of the eviction list are always the least recently used. When adding a new program to the cache, we check whether the size of the program cache exceeds the threshold; if so, we evict the least recently used programs from the cache, along with the corresponding kernels from the kernel and fast kernel caches. This PR also adds a new environment variable, `SYCL_IN_MEM_CACHE_EVICTION_THRESHOLD`, that users can use to control the size of the in-memory cache. By default, cache eviction is disabled. --- sycl/doc/EnvironmentVariables.md | 1 + sycl/source/detail/config.def | 2 + sycl/source/detail/config.hpp | 50 +++ sycl/source/detail/kernel_program_cache.hpp | 341 +++++++++++++++++- .../program_manager/program_manager.cpp | 24 +- sycl/unittests/assert/assert.cpp | 12 + sycl/unittests/config/ConfigTests.cpp | 63 ++++ .../kernel-and-program/CMakeLists.txt | 1 + .../kernel-and-program/InMemCacheEviction.cpp | 225 ++++++++++++ 9 files changed, 708 insertions(+), 11 deletions(-) create mode 100644 sycl/unittests/kernel-and-program/InMemCacheEviction.cpp diff --git a/sycl/doc/EnvironmentVariables.md b/sycl/doc/EnvironmentVariables.md index 3172bc2446aee..5ee2c40542ced 100644 --- a/sycl/doc/EnvironmentVariables.md +++ b/sycl/doc/EnvironmentVariables.md @@ -14,6 +14,7 @@ compiler and runtime. | `SYCL_CACHE_DISABLE_PERSISTENT (deprecated)` | Any(\*) | Has no effect. | | `SYCL_CACHE_PERSISTENT` | Integer | Controls persistent device compiled code cache. Turns it on if set to '1' and turns it off if set to '0'. When cache is enabled SYCL runtime will try to cache and reuse JIT-compiled binaries. Default is off. | | `SYCL_CACHE_IN_MEM` | '1' or '0' | Enable ('1') or disable ('0') in-memory caching of device compiled code.
When cache is enabled SYCL runtime will try to cache and reuse JIT-compiled binaries. Default is '1'. | +| `SYCL_IN_MEM_CACHE_EVICTION_THRESHOLD` | Positive integer | `SYCL_IN_MEM_CACHE_EVICTION_THRESHOLD` accepts an integer that specifies the maximum size of the in-memory program cache in bytes. Eviction is performed when the cache size exceeds the threshold. The default value is 0 which means that eviction is disabled. | | `SYCL_CACHE_EVICTION_DISABLE` | Any(\*) | Switches persistent cache eviction off when the variable is set. | | `SYCL_CACHE_MAX_SIZE` | Positive integer | Persistent cache eviction is triggered once total size of cached images exceeds the value in megabytes (default - 8 192 for 8 GB). Set to 0 to disable size-based cache eviction. | | `SYCL_CACHE_THRESHOLD` | Positive integer | Persistent cache eviction threshold in days (default value is 7 for 1 week). Set to 0 for disabling time-based cache eviction. | diff --git a/sycl/source/detail/config.def b/sycl/source/detail/config.def index 9172df2a1497b..f459a2dffa50d 100644 --- a/sycl/source/detail/config.def +++ b/sycl/source/detail/config.def @@ -27,6 +27,8 @@ CONFIG(SYCL_HOST_UNIFIED_MEMORY, 1, __SYCL_HOST_UNIFIED_MEMORY) // 260 (Windows limit) - 12 (filename) - 84 (cache directory structure) CONFIG(SYCL_CACHE_DIR, 164, __SYCL_CACHE_DIR) CONFIG(SYCL_CACHE_TRACE, 4, __SYCL_CACHE_TRACE) +CONFIG(SYCL_IN_MEM_CACHE_EVICTION_THRESHOLD, 16, + __SYCL_IN_MEM_CACHE_EVICTION_THRESHOLD) CONFIG(SYCL_CACHE_DISABLE_PERSISTENT, 1, __SYCL_CACHE_DISABLE_PERSISTENT) CONFIG(SYCL_CACHE_PERSISTENT, 1, __SYCL_CACHE_PERSISTENT) CONFIG(SYCL_CACHE_EVICTION_DISABLE, 1, __SYCL_CACHE_EVICTION_DISABLE) diff --git a/sycl/source/detail/config.hpp b/sycl/source/detail/config.hpp index 3c1f2f6822807..ace69d0a9420e 100644 --- a/sycl/source/detail/config.hpp +++ b/sycl/source/detail/config.hpp @@ -756,6 +756,56 @@ template <> class SYCLConfig { } }; +// SYCL_IN_MEM_CACHE_EVICTION_THRESHOLD accepts an integer that specifies +// the 
maximum size of the in-memory Program cache. +// Cache eviction is performed when the cache size exceeds the threshold. +// The thresholds are specified in bytes. +// The default value is "0" which means that eviction is disabled. +template <> class SYCLConfig { + using BaseT = SYCLConfigBase; + +public: + static int get() { return getCachedValue(); } + static void reset() { (void)getCachedValue(true); } + + static int getProgramCacheSize() { return getCachedValue(); } + + static bool isProgramCacheEvictionEnabled() { + return getProgramCacheSize() > 0; + } + +private: + static int getCachedValue(bool ResetCache = false) { + const auto Parser = []() { + const char *ValStr = BaseT::getRawValue(); + + // Disable eviction by default. + if (!ValStr) + return 0; + + int CacheSize = 0; + try { + CacheSize = std::stoi(ValStr); + if (CacheSize < 0) + throw INVALID_CONFIG_EXCEPTION(BaseT, "Value must be non-negative"); + } catch (...) { + std::string Msg = std::string{ + "Invalid input to SYCL_IN_MEM_CACHE_EVICTION_THRESHOLD. Please try " + "a positive integer."}; + throw exception(make_error_code(errc::runtime), Msg); + } + + return CacheSize; + }; + + static auto EvictionThresholds = Parser(); + if (ResetCache) + EvictionThresholds = Parser(); + + return EvictionThresholds; + } +}; + #undef INVALID_CONFIG_EXCEPTION } // namespace detail diff --git a/sycl/source/detail/kernel_program_cache.hpp b/sycl/source/detail/kernel_program_cache.hpp index f58cda059bcce..9f06d0ebcde8d 100644 --- a/sycl/source/detail/kernel_program_cache.hpp +++ b/sycl/source/detail/kernel_program_cache.hpp @@ -21,7 +21,9 @@ #include #include #include +#include #include +#include #include #include #include @@ -36,6 +38,20 @@ namespace sycl { inline namespace _V1 { namespace detail { class context_impl; + +// During SYCL program execution SYCL runtime will create internal objects +// representing kernels and programs, it may also invoke JIT compiler to bring +// kernels in a program to executable state. 
Those runtime operations are quite +// expensive. To avoid redundant operations and to speed up the execution, SYCL +// runtime employs in-memory cache for kernels and programs. When a kernel is +// invoked multiple times, the runtime will fetch the kernel from the cache +// instead of creating it from scratch. +// By default, there is no upper bound on the cache size. +// When the system runs out of memory, the cache will be cleared. Alternatively, +// the cache size can be limited by setting SYCL_IN_MEM_CACHE_EVICTION_THRESHOLD +// to a positive value. When the cache size exceeds the threshold, the least +// recently used programs, and associated kernels, will be evicted from the +// cache. class KernelProgramCache { public: /// Denotes build error data. The data is filled in from sycl::exception @@ -127,10 +143,51 @@ class KernelProgramCache { using CommonProgramKeyT = std::pair>; + // A custom hashing and equality function for ProgramCacheKeyT. + // These are used to compare and hash the keys in the cache. + struct ProgramCacheKeyHash { + std::size_t operator()(const ProgramCacheKeyT &Key) const { + std::size_t Hash = 0; + // Hash the serialized object, representing spec consts. + for (const auto &Elem : Key.first.first) + Hash ^= std::hash{}(Elem); + + // Hash the imageId. + Hash ^= std::hash{}(Key.first.second); + + // Hash the devices. 
+ for (const auto &Elem : Key.second) + Hash ^= std::hash{}(static_cast(Elem)); + return Hash; + } + }; + + struct ProgramCacheKeyEqual { + bool operator()(const ProgramCacheKeyT &LHS, + const ProgramCacheKeyT &RHS) const { + // Check equality of SerializedObj (Spec const). The four-iterator + // std::equal also compares the lengths, so a shorter RHS is never read + // past its end and a strict prefix never compares equal. + return std::equal(LHS.first.first.begin(), LHS.first.first.end(), + RHS.first.first.begin(), RHS.first.first.end()) && + // Check equality of imageId + LHS.first.second == RHS.first.second && + // Check equality of devices + std::equal(LHS.second.begin(), LHS.second.end(), + RHS.second.begin(), RHS.second.end()); + } + }; + struct ProgramCache { ::boost::unordered_map Cache; ::boost::unordered_multimap KeyMap; + // Mapping between a UR program and its size. + std::unordered_map ProgramSizeMap; + + size_t ProgramCacheSizeInBytes = 0; + inline size_t GetProgramCacheSizeInBytes() const noexcept { + return ProgramCacheSizeInBytes; + } + // Returns number of entries in the cache. size_t size() const noexcept { return Cache.size(); } }; @@ -184,6 +241,62 @@ class KernelProgramCache { using KernelFastCacheT = ::boost::unordered_flat_map; + // DS to hold data and functions related to Program cache eviction. + struct EvictionList { + private: + // Linked list of cache entries to be evicted in case of cache overflow. + std::list MProgramEvictionList; + + // Mapping between a program cache key and its iterator in the eviction + // list. + std::unordered_map::iterator, + ProgramCacheKeyHash, ProgramCacheKeyEqual> + MProgramToEvictionListMap; + + public: + std::list &getProgramEvictionList() { + return MProgramEvictionList; + } + + void clear() { + MProgramEvictionList.clear(); + MProgramToEvictionListMap.clear(); + } + + void emplaceBack(const ProgramCacheKeyT &CacheKey) { + MProgramEvictionList.emplace_back(CacheKey); + + // In std::list, the iterators are not invalidated when elements are + // added/removed/moved to the list. So, we can safely store the iterators.
+ MProgramToEvictionListMap[CacheKey] = + std::prev(MProgramEvictionList.end()); + traceProgram("Program added to the end of eviction list.", CacheKey); + } + + // This function is called on the hot path, whenever a kernel/program + // is accessed. So, it should be very fast. + void moveToEnd(const ProgramCacheKeyT &CacheKey) { + auto It = MProgramToEvictionListMap.find(CacheKey); + if (It != MProgramToEvictionListMap.end()) { + MProgramEvictionList.splice(MProgramEvictionList.end(), + MProgramEvictionList, It->second); + traceProgram("Program moved to the end of eviction list.", CacheKey); + } + // else: This can happen if concurrently the program is removed from + // eviction list by another thread. + } + + bool empty() { return MProgramEvictionList.empty(); } + + size_t size() { return MProgramEvictionList.size(); } + + void popFront() { + if (!MProgramEvictionList.empty()) { + MProgramToEvictionListMap.erase(MProgramEvictionList.front()); + MProgramEvictionList.pop_front(); + } + } + }; + ~KernelProgramCache() = default; void setContextPtr(const ContextPtr &AContext) { MParentContext = AContext; } @@ -197,12 +310,24 @@ class KernelProgramCache { int ImageId = CacheKey.first.second; std::stringstream DeviceList; + std::vector SerializedObjVec = CacheKey.first.first; + + // Convert spec constants to string. Spec constants are stored as + // ASCII values, so we need to convert them to int and then to + // string.
+ std::string SerializedObjString; + SerializedObjString.reserve(SerializedObjVec.size() * sizeof(size_t)); + for (unsigned char c : SerializedObjVec) + SerializedObjString += std::to_string((int)c) + ","; + for (const auto &Device : CacheKey.second) DeviceList << "0x" << std::setbase(16) << reinterpret_cast(Device) << ","; std::string Identifier = "[Key:{imageId = " + std::to_string(ImageId) + - ",urDevice = " + DeviceList.str() + "}]: "; + ",urDevice = " + DeviceList.str() + + ", serializedObj = " + SerializedObjString + + "}]: "; std::cerr << "[In-Memory Cache][Thread Id:" << std::this_thread::get_id() << "][Program Cache]" << Identifier << Msg << std::endl; @@ -232,6 +357,10 @@ class KernelProgramCache { return {MKernelsPerProgramCache, MKernelsPerProgramCacheMutex}; } + Locked acquireEvictionList() { + return {MEvictionList, MProgramEvictionListMutex}; + } + std::pair getOrInsertProgram(const ProgramCacheKeyT &CacheKey) { auto LockedCache = acquireCachedPrograms(); @@ -268,8 +397,7 @@ class KernelProgramCache { std::make_pair(CacheKey.first.second, CacheKey.second); ProgCache.KeyMap.emplace(CommonKey, CacheKey); traceProgram("Program inserted.", CacheKey); - } else - traceProgram("Program fetched.", CacheKey); + } return DidInsert; } @@ -300,6 +428,23 @@ class KernelProgramCache { template void saveKernel(KeyT &&CacheKey, ValT &&CacheVal) { + + if (SYCLConfig:: + isProgramCacheEvictionEnabled()) { + + ur_program_handle_t Program = std::get<3>(CacheVal); + // Save kernel in fast cache only if the corresponding program is also + // in the cache. + auto LockedCache = acquireCachedPrograms(); + auto &ProgCache = LockedCache.get(); + if (ProgCache.ProgramSizeMap.find(Program) == + ProgCache.ProgramSizeMap.end()) + return; + + // Save reference between the program and the fast cache key. 
+ MProgramToKernelFastCacheKeyMap[Program].emplace_back(CacheKey); + } + std::unique_lock Lock(MKernelFastCacheMutex); // if no insertion took place, thus some other thread has already inserted // smth in the cache @@ -307,6 +452,167 @@ class KernelProgramCache { MKernelFastCache.emplace(CacheKey, CacheVal); } + // Evict programs from cache to free up space. + void evictPrograms(size_t DesiredCacheSize, size_t CurrentCacheSize) { + + // Figure out how many programs from the beginning we need to evict. + if (CurrentCacheSize < DesiredCacheSize || MCachedPrograms.Cache.empty()) + return; + + // Evict programs from the beginning of the cache. + { + std::lock_guard Lock(MProgramEvictionListMutex); + auto &ProgramEvictionList = MEvictionList.getProgramEvictionList(); + size_t CurrCacheSize = MCachedPrograms.ProgramCacheSizeInBytes; + + // Traverse the eviction list and remove the LRU programs. + // The LRU programs will be at the front of the list. + while (CurrCacheSize > DesiredCacheSize && !MEvictionList.empty()) { + ProgramCacheKeyT CacheKey = ProgramEvictionList.front(); + auto LockedCache = acquireCachedPrograms(); + auto &ProgCache = LockedCache.get(); + auto It = ProgCache.Cache.find(CacheKey); + + if (It != ProgCache.Cache.end()) { + // We are about to remove this program now. + // (1) Remove it from KernelPerProgram cache. + // (2) Remove corresponding entries from KernelFastCache. + // (3) Remove it from ProgramCache KeyMap. + // (4) Remove it from the ProgramCache. + // (5) Remove it from ProgramSizeMap. + // (6) Update the cache size. + + // Remove entry from the KernelsPerProgram cache. + ur_program_handle_t NativePrg = It->second->Val; + { + auto LockedCacheKP = acquireKernelsPerProgramCache(); + // List kernels that are to be removed from the cache, if tracing is + // enabled. 
+ if (SYCLConfig::isTraceInMemCache()) { + for (const auto &Kernel : LockedCacheKP.get()[NativePrg]) + traceKernel("Kernel evicted.", Kernel.first); + } + LockedCacheKP.get().erase(NativePrg); + } + + // Remove corresponding entries from KernelFastCache. + auto FastCacheKeyItr = + MProgramToKernelFastCacheKeyMap.find(NativePrg); + if (FastCacheKeyItr != MProgramToKernelFastCacheKeyMap.end()) { + for (const auto &FastCacheKey : FastCacheKeyItr->second) { + std::unique_lock Lock(MKernelFastCacheMutex); + MKernelFastCache.erase(FastCacheKey); + traceKernel("Kernel evicted.", std::get<2>(FastCacheKey), true); + } + MProgramToKernelFastCacheKeyMap.erase(FastCacheKeyItr); + } + + // Remove entry from ProgramCache KeyMap. + CommonProgramKeyT CommonKey = + std::make_pair(CacheKey.first.second, CacheKey.second); + // Since KeyMap is a multi-map, we need to iterate over all entries + // with this CommonKey and remove those that match the CacheKey. + auto KeyMapItrRange = LockedCache.get().KeyMap.equal_range(CommonKey); + for (auto KeyMapItr = KeyMapItrRange.first; + KeyMapItr != KeyMapItrRange.second; ++KeyMapItr) { + if (KeyMapItr->second == CacheKey) { + LockedCache.get().KeyMap.erase(KeyMapItr); + break; + } + } + + // Get size of the program. + size_t ProgramSize = MCachedPrograms.ProgramSizeMap[It->second->Val]; + // Evict program from the cache. + ProgCache.Cache.erase(It); + // Remove program size from the cache size. + MCachedPrograms.ProgramCacheSizeInBytes -= ProgramSize; + MCachedPrograms.ProgramSizeMap.erase(NativePrg); + + traceProgram("Program evicted.", CacheKey); + } else + // This should never happen. + throw sycl::exception(sycl::make_error_code(sycl::errc::runtime), + "Program not found in the cache."); + + CurrCacheSize = MCachedPrograms.ProgramCacheSizeInBytes; + // Remove the program from the eviction list. + MEvictionList.popFront(); + } + } + } + + // Register that a program has been fetched from the cache. 
+ // If it is the first time the program is fetched, add it to the eviction + // list. + void registerProgramFetch(const ProgramCacheKeyT &CacheKey, + const ur_program_handle_t &Program, + const bool IsBuilt) { + + size_t ProgramCacheEvictionThreshold = + SYCLConfig::getProgramCacheSize(); + + // No need to populate the eviction list if eviction is disabled. + if (ProgramCacheEvictionThreshold == 0) + return; + + // If the program is not in the cache, add it to the cache. + if (IsBuilt) { + // This is the first time we are adding this entry. Add it to the end of + // eviction list. + { + std::lock_guard Lock(MProgramEvictionListMutex); + MEvictionList.emplaceBack(CacheKey); + } + + // Store size of the program and check if we need to evict some entries. + // Get Size of the program. + size_t ProgramSize = 0; + auto Adapter = getAdapter(); + + try { + // Get number of devices this program was built for. + unsigned int DeviceNum = 0; + Adapter->call( + Program, UR_PROGRAM_INFO_NUM_DEVICES, sizeof(DeviceNum), &DeviceNum, + nullptr); + + // Get binary sizes for each device. + std::vector BinarySizes(DeviceNum); + Adapter->call( + Program, UR_PROGRAM_INFO_BINARY_SIZES, + sizeof(size_t) * BinarySizes.size(), BinarySizes.data(), nullptr); + + // Sum up binary sizes. The size_t initial value makes std::accumulate + // add in size_t; an int literal would narrow and could overflow. + ProgramSize = + std::accumulate(BinarySizes.begin(), BinarySizes.end(), size_t{0}); + } catch (const exception &Ex) { + std::cerr << "Failed to get program size: " << Ex.what() << std::endl; + // Re-throw the exception currently being handled. + throw; + } + // Store program size in the cache. + size_t CurrCacheSize = 0; + { + std::lock_guard Lock(MProgramCacheMutex); + MCachedPrograms.ProgramSizeMap[Program] = ProgramSize; + MCachedPrograms.ProgramCacheSizeInBytes += ProgramSize; + CurrCacheSize = MCachedPrograms.ProgramCacheSizeInBytes; + } + + // Evict programs if the cache size exceeds the threshold.
+ if (CurrCacheSize > ProgramCacheEvictionThreshold) + evictPrograms(ProgramCacheEvictionThreshold, CurrCacheSize); + } + // If the program is already in the cache, move it to the end of the list. + // Since we are following LRU eviction policy, we need to move the program + // to the end of the list. Items in the front of the list are the least + // recently used. This code path is "hot" and should be very fast. + else { + std::lock_guard Lock(MProgramEvictionListMutex); + MEvictionList.moveToEnd(CacheKey); + } + } + /// Clears cache state. /// /// This member function should only be used in unit tests. @@ -317,6 +623,11 @@ class KernelProgramCache { MCachedPrograms = ProgramCache{}; MKernelsPerProgramCache = KernelCacheT{}; MKernelFastCache = KernelFastCacheT{}; + MProgramToKernelFastCacheKeyMap.clear(); + + // Clear the eviction list while holding its mutex. + std::lock_guard EvictionListLock(MProgramEvictionListMutex); + MEvictionList.clear(); } /// Try to fetch entity (kernel or program) from cache. If there is no such @@ -341,8 +652,10 @@ class KernelProgramCache { /// /// \return a pointer to cached build result, return value must not be /// nullptr. - template - auto getOrBuild(GetCachedBuildFT &&GetCachedBuild, BuildFT &&Build) { + template + auto getOrBuild(GetCachedBuildFT &&GetCachedBuild, BuildFT &&Build, + EvictFT &&EvictFunc = nullptr) { using BuildState = KernelProgramCache::BuildState; constexpr size_t MaxAttempts = 2; for (size_t AttemptCounter = 0;; ++AttemptCounter) { @@ -356,8 +669,11 @@ class KernelProgramCache { BuildState NewState = BuildResult->waitUntilTransition(); // Build succeeded. - if (NewState == BuildState::BS_Done) + if (NewState == BuildState::BS_Done) { + if constexpr (!std::is_same_v) + EvictFunc(BuildResult->Val, /*IsBuilt=*/false); return BuildResult; + } // Build failed, or this is the last attempt.
if (NewState == BuildState::BS_Failed || @@ -381,6 +697,9 @@ class KernelProgramCache { try { BuildResult->Val = Build(); + if constexpr (!std::is_same_v) + EvictFunc(BuildResult->Val, /*IsBuilt=*/true); + BuildResult->updateAndNotify(BuildState::BS_Done); return BuildResult; } catch (const exception &Ex) { @@ -414,6 +733,16 @@ class KernelProgramCache { std::mutex MKernelFastCacheMutex; KernelFastCacheT MKernelFastCache; + + // Map between fast kernel cache keys and program handle. + // MKernelFastCacheMutex will be used for synchronization. + std::unordered_map> + MProgramToKernelFastCacheKeyMap; + + EvictionList MEvictionList; + // Mutexes that will be used when accessing the eviction lists. + std::mutex MProgramEvictionListMutex; + friend class ::MockKernelProgramCache; const AdapterPtr &getAdapter(); diff --git a/sycl/source/detail/program_manager/program_manager.cpp b/sycl/source/detail/program_manager/program_manager.cpp index 8f13c0745ad21..dfc5d019051a9 100644 --- a/sycl/source/detail/program_manager/program_manager.cpp +++ b/sycl/source/detail/program_manager/program_manager.cpp @@ -925,7 +925,13 @@ ur_program_handle_t ProgramManager::getBuiltURProgram( if (!SYCLConfig::get()) return BuildF(); - auto BuildResult = Cache.getOrBuild(GetCachedBuildF, BuildF); + auto EvictFunc = [&Cache, &CacheKey](ur_program_handle_t Program, + bool isBuilt) { + return Cache.registerProgramFetch(CacheKey, Program, isBuilt); + }; + + auto BuildResult = + Cache.getOrBuild(GetCachedBuildF, BuildF, EvictFunc); // getOrBuild is not supposed to return nullptr assert(BuildResult != nullptr && "Invalid build result"); @@ -939,10 +945,12 @@ ur_program_handle_t ProgramManager::getBuiltURProgram( // update it here and re-use that lambda. CacheKey.first.second = BImg->getImageID(); bool DidInsert = Cache.insertBuiltProgram(CacheKey, ResProgram); - if (DidInsert) { + + // Add to the eviction list. 
+ Cache.registerProgramFetch(CacheKey, ResProgram, DidInsert); + if (DidInsert) // For every cached copy of the program, we need to increment its refcount Adapter->call(ResProgram); - } } // If caching is enabled, one copy of the program handle will be @@ -2699,7 +2707,13 @@ device_image_plain ProgramManager::build(const device_image_plain &DeviceImage, return Cache.getOrInsertProgram(CacheKey); }; - auto BuildResult = Cache.getOrBuild(GetCachedBuildF, BuildF); + auto EvictFunc = [&Cache, &CacheKey](ur_program_handle_t Program, + bool isBuilt) { + return Cache.registerProgramFetch(CacheKey, Program, isBuilt); + }; + + auto BuildResult = + Cache.getOrBuild(GetCachedBuildF, BuildF, EvictFunc); // getOrBuild is not supposed to return nullptr assert(BuildResult != nullptr && "Invalid build result"); @@ -2728,7 +2742,7 @@ device_image_plain ProgramManager::build(const device_image_plain &DeviceImage, } // Change device in the cache key to reduce copying of spec const data. CacheKey.second = Subset; - Cache.getOrBuild(GetCachedBuildF, CacheSubsets); + Cache.getOrBuild(GetCachedBuildF, CacheSubsets, EvictFunc); // getOrBuild is not supposed to return nullptr assert(BuildResult != nullptr && "Invalid build result"); } diff --git a/sycl/unittests/assert/assert.cpp b/sycl/unittests/assert/assert.cpp index b45996238358f..e11184d3a24d2 100644 --- a/sycl/unittests/assert/assert.cpp +++ b/sycl/unittests/assert/assert.cpp @@ -319,6 +319,18 @@ static ur_result_t redefinedProgramGetInfo(void *pParams) { return UR_RESULT_SUCCESS; } + // Required if program cache eviction is enabled. 
+ if (UR_PROGRAM_INFO_BINARY_SIZES == *params.ppropName) { + size_t BinarySize = 1; + + if (*params.ppPropValue) + memcpy(*params.ppPropValue, &BinarySize, sizeof(size_t)); + if (*params.ppPropSizeRet) + **params.ppPropSizeRet = sizeof(size_t); + + return UR_RESULT_SUCCESS; + } + return UR_RESULT_ERROR_UNKNOWN; } diff --git a/sycl/unittests/config/ConfigTests.cpp b/sycl/unittests/config/ConfigTests.cpp index 3022ccbd52e65..756a340c8f82d 100644 --- a/sycl/unittests/config/ConfigTests.cpp +++ b/sycl/unittests/config/ConfigTests.cpp @@ -324,3 +324,66 @@ TEST(ConfigTests, CheckSyclCacheTraceTest) { sycl::detail::SYCLConfig::reset(); TestConfig(0, 0, 0, 0); } + +// SYCL_IN_MEM_CACHE_EVICTION_THRESHOLD accepts an integer that specifies +// the maximum size of the in-memory Program cache. +// Cache eviction is performed when the cache size exceeds the threshold. +// The thresholds are specified in bytes. +// The default value is "0" which means that eviction is disabled. +TEST(ConfigTests, CheckSyclCacheEvictionThresholdTest) { + + using InMemEvicType = + sycl::detail::SYCLConfig; + + // Lambda to test parsing of SYCL_IN_MEM_CACHE_EVICTION_THRESHOLD. + auto TestConfig = [](int expectedProgramCacheSize) { + EXPECT_EQ(expectedProgramCacheSize, InMemEvicType::getProgramCacheSize()); + EXPECT_EQ(expectedProgramCacheSize > 0, + InMemEvicType::isProgramCacheEvictionEnabled()); + }; + + // Lambda to set SYCL_IN_MEM_CACHE_EVICTION_THRESHOLD. + auto SetSyclInMemCacheEvictionThresholdEnv = [](const char *value) { +#ifdef _WIN32 + _putenv_s("SYCL_IN_MEM_CACHE_EVICTION_THRESHOLD", value); +#else + setenv("SYCL_IN_MEM_CACHE_EVICTION_THRESHOLD", value, 1); +#endif + }; + + // Lambda to test invalid inputs. An exception should be thrown + // when parsing invalid values. + auto TestInvalidValues = [&](const char *value, const char *errMsg) { + SetSyclInMemCacheEvictionThresholdEnv(value); + try { + InMemEvicType::reset(); + TestConfig(0); + FAIL() << errMsg; + } catch (...) 
{ + } + }; + + // Test eviction threshold with zero. + SetSyclInMemCacheEvictionThresholdEnv("0"); + sycl::detail::readConfig(true); + TestConfig(0); + + // Test invalid values. + TestInvalidValues("-1", "Should throw exception for negative value"); + TestInvalidValues("a", "Should throw exception for non-integer value"); + + // Test valid values. + SetSyclInMemCacheEvictionThresholdEnv("1024"); + InMemEvicType::reset(); + TestConfig(1024); + + // When SYCL_IN_MEM_CACHE_EVICTION_THRESHOLD is not set, it should default to + // 0 (eviction disabled). +#ifdef _WIN32 + _putenv_s("SYCL_IN_MEM_CACHE_EVICTION_THRESHOLD", ""); +#else + unsetenv("SYCL_IN_MEM_CACHE_EVICTION_THRESHOLD"); +#endif + InMemEvicType::reset(); + TestConfig(0); +} diff --git a/sycl/unittests/kernel-and-program/CMakeLists.txt b/sycl/unittests/kernel-and-program/CMakeLists.txt index 8736f6f60a76a..0d06d2fc29aa0 100644 --- a/sycl/unittests/kernel-and-program/CMakeLists.txt +++ b/sycl/unittests/kernel-and-program/CMakeLists.txt @@ -7,5 +7,6 @@ add_sycl_unittest(KernelAndProgramTests OBJECT PersistentDeviceCodeCache.cpp KernelBuildOptions.cpp OutOfResources.cpp + InMemCacheEviction.cpp ) target_compile_definitions(KernelAndProgramTests PRIVATE -D__SYCL_INTERNAL_API) diff --git a/sycl/unittests/kernel-and-program/InMemCacheEviction.cpp b/sycl/unittests/kernel-and-program/InMemCacheEviction.cpp new file mode 100644 index 0000000000000..70c121053cee9 --- /dev/null +++ b/sycl/unittests/kernel-and-program/InMemCacheEviction.cpp @@ -0,0 +1,225 @@ +//==----- InMemCacheEviction.cpp --- In-memory cache eviction tests -------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// This file contains tests covering eviction in in-memory program cache.
+ +#define SYCL2020_DISABLE_DEPRECATION_WARNINGS + +#include "../thread_safety/ThreadUtils.h" +#include "detail/context_impl.hpp" +#include "detail/kernel_program_cache.hpp" +#include +#include +#include +#include + +#include + +#include + +using namespace sycl; + +class Kernel1; +class Kernel2; +class Kernel3; + +MOCK_INTEGRATION_HEADER(Kernel1) +MOCK_INTEGRATION_HEADER(Kernel2) +MOCK_INTEGRATION_HEADER(Kernel3) + +static sycl::unittest::MockDeviceImage Img[] = { + sycl::unittest::generateDefaultImage({"Kernel1"}), + sycl::unittest::generateDefaultImage({"Kernel2"}), + sycl::unittest::generateDefaultImage({"Kernel3"})}; + +static sycl::unittest::MockDeviceImageArray<3> ImgArray{Img}; + +// Number of times urProgramCreateWithIL is called. This is used to check +// if the program is created or fetched from the cache. +static int NumProgramBuild = 0; + +constexpr int ProgramSize = 10000; + +static ur_result_t redefinedProgramCreateWithIL(void *) { + ++NumProgramBuild; + return UR_RESULT_SUCCESS; +} + +static ur_result_t redefinedProgramGetInfoAfter(void *pParams) { + auto params = *static_cast(pParams); + if (*params.ppropName == UR_PROGRAM_INFO_NUM_DEVICES) { + auto value = reinterpret_cast(*params.ppPropValue); + *value = 1; + } + + if (*params.ppropName == UR_PROGRAM_INFO_BINARY_SIZES) { + auto value = reinterpret_cast(*params.ppPropValue); + value[0] = ProgramSize; + } + + if (*params.ppropName == UR_PROGRAM_INFO_BINARIES) { + auto value = reinterpret_cast(*params.ppPropValue); + value[0] = 0; + } + + return UR_RESULT_SUCCESS; +} + +// Function to set SYCL_IN_MEM_CACHE_EVICTION_THRESHOLD. 
+static void setCacheEvictionEnv(const char *value) { +#ifdef _WIN32 + _putenv_s("SYCL_IN_MEM_CACHE_EVICTION_THRESHOLD", value); +#else + if (value) + setenv("SYCL_IN_MEM_CACHE_EVICTION_THRESHOLD", value, 1); + else + (void)unsetenv("SYCL_IN_MEM_CACHE_EVICTION_THRESHOLD"); +#endif + + sycl::detail::readConfig(true); + sycl::detail::SYCLConfig< + sycl::detail::SYCL_IN_MEM_CACHE_EVICTION_THRESHOLD>::reset(); +} + +// Function to check number of entries in the cache and eviction list. +static inline void +CheckNumberOfEntriesInCacheAndEvictionList(detail::context_impl &CtxImpl, + size_t ExpectedNumEntries) { + auto &KPCache = CtxImpl.getKernelProgramCache(); + EXPECT_EQ(KPCache.acquireCachedPrograms().get().size(), ExpectedNumEntries) + << "Unexpected number of entries in the cache"; + auto EvcList = KPCache.acquireEvictionList(); + EXPECT_EQ(EvcList.get().size(), ExpectedNumEntries) + << "Unexpected number of entries in the eviction list"; +} + +class InMemCacheEvictionTests : public ::testing::Test { +protected: + void TearDown() override { setCacheEvictionEnv(""); } +}; + +TEST(InMemCacheEvictionTests, TestBasicEvictionAndLRU) { + NumProgramBuild = 0; + sycl::unittest::UrMock<> Mock; + mock::getCallbacks().set_before_callback("urProgramCreateWithIL", + &redefinedProgramCreateWithIL); + mock::getCallbacks().set_after_callback("urProgramGetInfo", + &redefinedProgramGetInfoAfter); + + sycl::platform Plt{sycl::platform()}; + sycl::context Ctx{Plt}; + auto CtxImpl = detail::getSyclObjImpl(Ctx); + queue q(Ctx, default_selector_v); + + // One program is of 10000 bytes, so 20005 eviction threshold can + // accommodate two programs. + setCacheEvictionEnv("20005"); + + // Cache is empty, so one urProgramCreateWithIL call. 
+ q.single_task([] {}); + EXPECT_EQ(NumProgramBuild, 1); + CheckNumberOfEntriesInCacheAndEvictionList(*CtxImpl, 1); + + q.single_task([] {}); + EXPECT_EQ(NumProgramBuild, 2); + CheckNumberOfEntriesInCacheAndEvictionList(*CtxImpl, 2); + + // Move first program to end of eviction list. + q.single_task([] {}); + EXPECT_EQ(NumProgramBuild, 2); + + // Calling Kernel3, Kernel2, and Kernel1 in a cyclic manner to + // verify LRU's working. + + // Kernel2's program should have been evicted. + q.single_task([] {}); + EXPECT_EQ(NumProgramBuild, 3); + CheckNumberOfEntriesInCacheAndEvictionList(*CtxImpl, 2); + + // Calling Kernel2 again should trigger urProgramCreateWithIL and + // should evict Kernel1's program. + q.single_task([] {}); + EXPECT_EQ(NumProgramBuild, 4); + CheckNumberOfEntriesInCacheAndEvictionList(*CtxImpl, 2); + + // Calling Kernel1 again should trigger urProgramCreateWithIL and + // should evict Kernel3's program. + q.single_task([] {}); + EXPECT_EQ(NumProgramBuild, 5); + CheckNumberOfEntriesInCacheAndEvictionList(*CtxImpl, 2); +} + +// Test to verify eviction using concurrent kernel invocation. +TEST(InMemCacheEvictionTests, TestConcurrentEvictionSameQueue) { + NumProgramBuild = 0; + sycl::unittest::UrMock<> Mock; + mock::getCallbacks().set_before_callback("urProgramCreateWithIL", + &redefinedProgramCreateWithIL); + mock::getCallbacks().set_after_callback("urProgramGetInfo", + &redefinedProgramGetInfoAfter); + + sycl::platform Plt{sycl::platform()}; + context Ctx{Plt}; + auto CtxImpl = detail::getSyclObjImpl(Ctx); + queue q(Ctx, default_selector_v); + + // One program is of 10000 bytes, so 20005 eviction threshold can + // accommodate two programs.
+ setCacheEvictionEnv("20005"); + + constexpr size_t ThreadCount = 200; + Barrier barrier(ThreadCount); + { + auto ConcurrentInvokeKernels = [&](std::size_t threadId) { + barrier.wait(); + q.single_task([] {}); + q.single_task([] {}); + q.single_task([] {}); + }; + + ThreadPool MPool(ThreadCount, ConcurrentInvokeKernels); + } + q.wait_and_throw(); + + CheckNumberOfEntriesInCacheAndEvictionList(*CtxImpl, 2); +} + +// Test to verify eviction using concurrent kernel invocation when +// cache size is very small so as to trigger immediate eviction. +TEST(InMemCacheEvictionTests, TestConcurrentEvictionSmallCache) { + NumProgramBuild = 0; + sycl::unittest::UrMock<> Mock; + mock::getCallbacks().set_before_callback("urProgramCreateWithIL", + &redefinedProgramCreateWithIL); + mock::getCallbacks().set_after_callback("urProgramGetInfo", + &redefinedProgramGetInfoAfter); + + context Ctx{platform()}; + auto CtxImpl = detail::getSyclObjImpl(Ctx); + queue q(Ctx, default_selector_v); + + // One program is of 10000 bytes, so 100 eviction threshold will + // trigger immediate eviction. + setCacheEvictionEnv("100"); + + // Fetch the same kernel concurrently from multiple threads. + // This should cause some threads to insert a program and other + // threads to evict the same program. + constexpr size_t ThreadCount = 300; + Barrier barrier(ThreadCount); + { + auto ConcurrentInvokeKernels = [&](std::size_t threadId) { + barrier.wait(); + q.single_task([] {}); + }; + + ThreadPool MPool(ThreadCount, ConcurrentInvokeKernels); + } + q.wait_and_throw(); + + CheckNumberOfEntriesInCacheAndEvictionList(*CtxImpl, 0); +} From 023cb2b4ca9f7272388d9968ee89f4ca360a0c19 Mon Sep 17 00:00:00 2001 From: Pietro Ghiglio Date: Wed, 20 Nov 2024 10:37:45 +0100 Subject: [PATCH 24/36] [SYCL][NATIVECPU] Fix header inclusion in shuffle_abi test (#16124) Fixes the headers included in `sycl/test/check_device_code/native_cpu/shuffle_abi.cpp`, using just `sycl/sycl.hpp`.
--- sycl/test/check_device_code/native_cpu/shuffle_abi.cpp | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/sycl/test/check_device_code/native_cpu/shuffle_abi.cpp b/sycl/test/check_device_code/native_cpu/shuffle_abi.cpp index cdbaab90ce65c..8a94745f08100 100644 --- a/sycl/test/check_device_code/native_cpu/shuffle_abi.cpp +++ b/sycl/test/check_device_code/native_cpu/shuffle_abi.cpp @@ -13,9 +13,7 @@ // Tests that sub-group shuffles work even when abi is different to what is // expected -#include -#include -#include +#include static constexpr size_t NumElems = VEC_WIDTH; static constexpr size_t NumWorkItems = 64; From 19cd47bcf42358acbe18f7a0e8b3d7b4d7153468 Mon Sep 17 00:00:00 2001 From: Maosu Zhao Date: Wed, 20 Nov 2024 22:37:13 +0800 Subject: [PATCH 25/36] [DevASAN] Fix build failure caused by circle dependency (#16130) --- llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp | 7 +++++-- llvm/lib/Transforms/Instrumentation/CMakeLists.txt | 1 - 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp index 58d603524cd79..67e98bb67013f 100644 --- a/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp +++ b/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp @@ -64,7 +64,6 @@ #include "llvm/IR/Use.h" #include "llvm/IR/Value.h" #include "llvm/MC/MCSectionMachO.h" -#include "llvm/SYCLLowerIR/DeviceGlobals.h" #include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" @@ -1566,7 +1565,11 @@ static bool isUnsupportedDeviceGlobal(GlobalVariable *G) { // Non image scope device globals are implemented by device USM, and the // out-of-bounds check for them will be done by sanitizer USM part. So we // exclude them here. 
- return (!isDeviceGlobalVariable(*G) || !hasDeviceImageScopeProperty(*G)); + if (!G->hasAttribute("sycl-device-image-scope")) + return true; + + Attribute Attr = G->getAttribute("sycl-device-image-scope"); + return (!Attr.isStringAttribute() || Attr.getValueAsString() == "false"); } static bool isUnsupportedSPIRAccess(Value *Addr, Instruction *Inst) { diff --git a/llvm/lib/Transforms/Instrumentation/CMakeLists.txt b/llvm/lib/Transforms/Instrumentation/CMakeLists.txt index e96faba7cc323..f08d936a5bcba 100644 --- a/llvm/lib/Transforms/Instrumentation/CMakeLists.txt +++ b/llvm/lib/Transforms/Instrumentation/CMakeLists.txt @@ -39,7 +39,6 @@ add_llvm_component_library(LLVMInstrumentation Core Demangle MC - SYCLLowerIR Support TargetParser TransformUtils From 090c9f3b320c2cd3e10115167800ff822e332ca8 Mon Sep 17 00:00:00 2001 From: "Yanfeng, Xiao" Date: Wed, 20 Nov 2024 22:55:49 +0800 Subject: [PATCH 26/36] [CI] Uplift oclcpu/oclfpgaemu from 2024.17.3.0.09 to 2024.18.10.0.08 (#15827) Signed-off-by: Sarnie, Nick Co-authored-by: Sarnie, Nick --- devops/dependencies.json | 36 +++++++++++++++---------------- devops/scripts/install_drivers.sh | 4 ++++ 2 files changed, 22 insertions(+), 18 deletions(-) diff --git a/devops/dependencies.json b/devops/dependencies.json index 944a3ffb7e8d2..755a1f10625fb 100644 --- a/devops/dependencies.json +++ b/devops/dependencies.json @@ -25,21 +25,21 @@ "root": "{DEPS_ROOT}/opencl/runtime/linux/oclgpu" }, "tbb": { - "github_tag": "v2021.12.0", - "version": "2021.12.0", - "url": "https://github.com/oneapi-src/oneTBB/releases/download/v2021.12.0/oneapi-tbb-2021.12.0-lin.tgz", + "github_tag": "v2022.0.0", + "version": "2022.0.0", + "url": "https://github.com/oneapi-src/oneTBB/releases/download/v2022.0.0/oneapi-tbb-2022.0.0-lin.tgz", "root": "{DEPS_ROOT}/tbb/lin" }, "oclcpu": { - "github_tag": "2024-WW25", - "version": "2024.18.6.0.02", - "url": "https://github.com/intel/llvm/releases/download/2024-WW25/oclcpuexp-2024.18.6.0.02_rel.tar.gz", + 
"github_tag": "2024-WW43", + "version": "2024.18.10.0.08", + "url": "https://github.com/intel/llvm/releases/download/2024-WW43/oclcpuexp-2024.18.10.0.08_rel.tar.gz", "root": "{DEPS_ROOT}/opencl/runtime/linux/oclcpu" }, "fpgaemu": { - "github_tag": "2024-WW25", - "version": "2024.18.6.0.02", - "url": "https://github.com/intel/llvm/releases/download/2024-WW25/fpgaemu-2024.18.6.0.02_rel.tar.gz", + "github_tag": "2024-WW43", + "version": "2024.18.10.0.08", + "url": "https://github.com/intel/llvm/releases/download/2024-WW43/fpgaemu-2024.18.10.0.08_rel.tar.gz", "root": "{DEPS_ROOT}/opencl/runtime/linux/oclfpgaemu" }, "fpga": { @@ -53,21 +53,21 @@ "root": "" }, "tbb": { - "github_tag": "v2021.12.0", - "version": "2021.12.0", - "url": "https://github.com/oneapi-src/oneTBB/releases/download/v2021.12.0/oneapi-tbb-2021.12.0-win.zip", + "github_tag": "v2022.0.0", + "version": "2022.0.0", + "url": "https://github.com/oneapi-src/oneTBB/releases/download/v2022.0.0/oneapi-tbb-2022.0.0-win.zip", "root": "{DEPS_ROOT}/tbb/win" }, "oclcpu": { - "github_tag": "2024-WW25", - "version": "2024.18.6.0.02", - "url": "https://github.com/intel/llvm/releases/download/2024-WW25/win-oclcpuexp-2024.18.6.0.02_rel.zip", + "github_tag": "2024-WW43", + "version": "2024.18.10.0.08", + "url": "https://github.com/intel/llvm/releases/download/2024-WW43/win-oclcpuexp-2024.18.10.0.08_rel.zip", "root": "{DEPS_ROOT}/opencl/runtime/linux/oclcpu" }, "fpgaemu": { - "github_tag": "2024-WW25", - "version": "2024.18.6.0.02", - "url": "https://github.com/intel/llvm/releases/download/2024-WW25/win-fpgaemu-2024.18.6.0.02_rel.zip", + "github_tag": "2024-WW43", + "version": "2024.18.10.0.08", + "url": "https://github.com/intel/llvm/releases/download/2024-WW43/win-fpgaemu-2024.18.10.0.08_rel.zip", "root": "{DEPS_ROOT}/opencl/runtime/linux/oclfpgaemu" }, "fpga": { diff --git a/devops/scripts/install_drivers.sh b/devops/scripts/install_drivers.sh index f27c7f9c471e7..6efbc792d9ffc 100755 --- 
a/devops/scripts/install_drivers.sh +++ b/devops/scripts/install_drivers.sh @@ -81,6 +81,10 @@ InstallTBB () { if [ "$TBB_INSTALLED" = false ]; then mkdir -p $INSTALL_LOCATION cd $INSTALL_LOCATION + if [ -d "$INSTALL_LOCATION/oneapi-tbb" ]; then + echo "$INSTALL_LOCATION/oneapi-tbb exists and will be removed!" + rm -Rf $INSTALL_LOCATION/oneapi-tbb; + fi echo "Installing TBB..." echo "TBB version $TBB_TAG" get_release oneapi-src/onetbb $TBB_TAG \ From a4a28bd4584cfc241025c19bfce2fa672cd2d89f Mon Sep 17 00:00:00 2001 From: aelovikov-intel Date: Wed, 20 Nov 2024 07:08:57 -0800 Subject: [PATCH 27/36] [NFC][SYCL][ESIMD] Don't use boost/mp11 in e2e tests (#16127) I believe that was the last explicit usage of it in the project. Can't remove CMake support for it still because we use boost's `unordered_*map` that depends on mp11. --- sycl/test-e2e/InvokeSimd/Spec/simd_mask.cpp | 19 +++++++++++-------- .../InvokeSimd/Spec/simd_mask_merge.cpp | 1 - 2 files changed, 11 insertions(+), 9 deletions(-) diff --git a/sycl/test-e2e/InvokeSimd/Spec/simd_mask.cpp b/sycl/test-e2e/InvokeSimd/Spec/simd_mask.cpp index eeabbe0f52774..9d603ec8289e7 100644 --- a/sycl/test-e2e/InvokeSimd/Spec/simd_mask.cpp +++ b/sycl/test-e2e/InvokeSimd/Spec/simd_mask.cpp @@ -14,7 +14,6 @@ * This test also runs with all types of VISA link time optimizations enabled. */ -#include #include #include #include @@ -141,18 +140,22 @@ int main() { << "\n"; bool passed = true; const bool SupportsDouble = dev.has(aspect::fp64); - using namespace sycl::detail::boost::mp11; using MaskTypes = std::tuple; - tuple_for_each(MaskTypes{}, [&](auto &&x) { - using T = std::remove_reference_t; - if (std::is_same_v && !SupportsDouble) - return; - passed &= !test(q); - }); + std::apply( + [&](auto &&...xs) { + auto f = [&](auto &&x) { + using T = std::remove_reference_t; + if (std::is_same_v && !SupportsDouble) + return; + passed &= !test(q); + }; + ((f(std::forward(xs)), ...)); + }, + MaskTypes{}); std::cout << (passed ? 
"Test passed\n" : "TEST FAILED\n"); return passed ? 0 : 1; } diff --git a/sycl/test-e2e/InvokeSimd/Spec/simd_mask_merge.cpp b/sycl/test-e2e/InvokeSimd/Spec/simd_mask_merge.cpp index 982a8259783b4..245bb5557869b 100644 --- a/sycl/test-e2e/InvokeSimd/Spec/simd_mask_merge.cpp +++ b/sycl/test-e2e/InvokeSimd/Spec/simd_mask_merge.cpp @@ -1,7 +1,6 @@ // Check that full compilation works: // RUN: %{build} -fno-sycl-device-code-split-esimd -Xclang -fsycl-allow-func-ptr -o %t.out // RUN: env IGC_VCSaveStackCallLinkage=1 IGC_VCDirectCallsOnly=1 %{run} %t.out -#include #include #include #include From 10044a3a7536e688e471db40fedb743d8d39ae32 Mon Sep 17 00:00:00 2001 From: aarongreig Date: Wed, 20 Nov 2024 16:44:21 +0000 Subject: [PATCH 28/36] [UR] Use extension version of clGetKernelSubGroupInfo when necessary (#15896) Co-authored-by: Callum Fare --- sycl/cmake/modules/UnifiedRuntimeTag.cmake | 12 ++++++------ sycl/test-e2e/SubGroup/attributes.cpp | 6 ++---- 2 files changed, 8 insertions(+), 10 deletions(-) diff --git a/sycl/cmake/modules/UnifiedRuntimeTag.cmake b/sycl/cmake/modules/UnifiedRuntimeTag.cmake index b7c21f96e389d..95a7b4dc86bce 100644 --- a/sycl/cmake/modules/UnifiedRuntimeTag.cmake +++ b/sycl/cmake/modules/UnifiedRuntimeTag.cmake @@ -1,7 +1,7 @@ -# commit 1675f053176f0860388aa67fd009750a7e03b2c2 -# Merge: 0ea47d7c 22ca5ee2 +# commit 9937d029c7fdcbf101e89f8515f640c145e059c5 +# Merge: 9ac6d5d9 10b0e101 # Author: Callum Fare -# Date: Tue Nov 19 14:41:15 2024 +0000 -# Merge pull request #2337 from aarongreig/aaron/fixCoreFuncMacroWindows -# Fix the CL_CORE_FUNCTION macro on windows. -set(UNIFIED_RUNTIME_TAG 1675f053176f0860388aa67fd009750a7e03b2c2) +# Date: Wed Nov 20 14:49:17 2024 +0000 +# Merge pull request #2258 from aarongreig/aaron/tryUseExtensionSubgroupInfo +# Use extension version of clGetKernelSubGroupInfo when necessary. 
+set(UNIFIED_RUNTIME_TAG 9937d029c7fdcbf101e89f8515f640c145e059c5) diff --git a/sycl/test-e2e/SubGroup/attributes.cpp b/sycl/test-e2e/SubGroup/attributes.cpp index 51a58751784a8..118349321b4b0 100644 --- a/sycl/test-e2e/SubGroup/attributes.cpp +++ b/sycl/test-e2e/SubGroup/attributes.cpp @@ -1,7 +1,5 @@ -// TODO: Despite using a supported required subgroup size compile_sub_group_size -// reports as 0 on fpga emu, cuda and hip -// XFAIL: accelerator -// XFAIL-TRACKER: URT-697 +// TODO: Despite using a supported required subgroup size compile_sub_group_size +// reports as 0 on cuda and hip // XFAIL: cuda || hip // XFAIL-TRACKER: https://github.com/intel/llvm/issues/14357 From 5636a2ea06db293f351ee903512fa5bfa893eb48 Mon Sep 17 00:00:00 2001 From: Hugh Delaney Date: Wed, 20 Nov 2024 17:52:50 +0000 Subject: [PATCH 29/36] [SYCL] Pass OCL_ICD_VENDORS to LIT (#16133) Tests can fail if `OCL_ICD_VENDORS` is needed but not set. --- sycl/test-e2e/lit.cfg.py | 1 + 1 file changed, 1 insertion(+) diff --git a/sycl/test-e2e/lit.cfg.py b/sycl/test-e2e/lit.cfg.py index f76a9935ea108..c0e68f2a051ca 100644 --- a/sycl/test-e2e/lit.cfg.py +++ b/sycl/test-e2e/lit.cfg.py @@ -75,6 +75,7 @@ [ "PATH", "OCL_ICD_FILENAMES", + "OCL_ICD_VENDORS", "CL_CONFIG_DEVICES", "SYCL_DEVICE_ALLOWLIST", "SYCL_CONFIG_FILE_NAME", From d9c7bcb0687a4edc308860c1cc939c7daf50af04 Mon Sep 17 00:00:00 2001 From: Nick Sarnie Date: Thu, 21 Nov 2024 03:06:20 +0900 Subject: [PATCH 30/36] [SYCL][E2E] Remove XFAIL from newly passing FPGA test (#16136) We bumped the fpga driver and this test is passing now. 
Closes: https://github.com/intel/llvm/issues/13887 Signed-off-by: Sarnie, Nick --- sycl/test-e2e/Basic/fpga_tests/fpga_pipes_mixed_usage.cpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/sycl/test-e2e/Basic/fpga_tests/fpga_pipes_mixed_usage.cpp b/sycl/test-e2e/Basic/fpga_tests/fpga_pipes_mixed_usage.cpp index a9817fab44397..060de2e07adb1 100644 --- a/sycl/test-e2e/Basic/fpga_tests/fpga_pipes_mixed_usage.cpp +++ b/sycl/test-e2e/Basic/fpga_tests/fpga_pipes_mixed_usage.cpp @@ -9,8 +9,6 @@ // RUN: %{build} -o %t.out // RUN: %{run} %t.out -// XFAIL: * -// XFAIL-TRACKER: https://github.com/intel/llvm/issues/13887 // If users need to use host pipe feature provided by experimental::pipe, all // pipes in their design should use the experimental::pipe (as a workround). From 8ad42d55b5eb911907b7de70db6e024c3f89e139 Mon Sep 17 00:00:00 2001 From: Dmitry Sidorov Date: Wed, 20 Nov 2024 19:39:30 +0100 Subject: [PATCH 31/36] [SYCL] Fix invalid iterator after removal in SYCLJointMatrixTransform (#16134) Can be exposed in windows debug build on the tests added by the original patch: LLVM::SYCLLowerIR/JointMatrixTransform/access-chain-no-uses.ll Author: Jinsong Ji Signed-off-by: Sidorov, Dmitry --- llvm/lib/SYCLLowerIR/SYCLJointMatrixTransform.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/llvm/lib/SYCLLowerIR/SYCLJointMatrixTransform.cpp b/llvm/lib/SYCLLowerIR/SYCLJointMatrixTransform.cpp index 629b27d61f24b..231ec9a818c19 100644 --- a/llvm/lib/SYCLLowerIR/SYCLJointMatrixTransform.cpp +++ b/llvm/lib/SYCLLowerIR/SYCLJointMatrixTransform.cpp @@ -26,8 +26,9 @@ static constexpr char MATRIX_TYPE[] = "spirv.CooperativeMatrixKHR"; // its users and operands to make LLVM IR more SPIR-V friendly.
bool transformAccessChain(Function *F) { bool ModuleChanged = false; - for (auto I : F->users()) { - auto *CI = dyn_cast(I); + for (auto I = F->user_begin(), E = F->user_end(); I != E;) { + User *U = *I++; + auto *CI = dyn_cast(U); if (!CI) continue; From 42e63c1f6125c353403138fd626bd7ba40023433 Mon Sep 17 00:00:00 2001 From: aelovikov-intel Date: Wed, 20 Nov 2024 10:41:21 -0800 Subject: [PATCH 32/36] [NFCI][SYCL] More `properties`-related refactoring (#16126) * Modify `detail::ConflictingProperties` to accept `properties` list instead of `std::tuple` with individual property values * Remove some "useless" helpers * Change `detail::ValueOrDefault` type-trait to `detail::get_property_or` as it seems a better interface (and can, in theory, work with runtime properties too) --- .../experimental/grf_size_properties.hpp | 27 ++-- .../sycl/ext/intel/experimental/pipes.hpp | 32 +++-- .../ext/intel/experimental/task_sequence.hpp | 35 +++--- .../experimental/annotated_usm/alloc_base.hpp | 34 ++--- .../experimental/annotated_usm/alloc_util.hpp | 57 +-------- .../oneapi/kernel_properties/properties.hpp | 111 +++++++--------- .../sycl/ext/oneapi/properties/properties.hpp | 118 +++++------------- .../ext/oneapi/properties/property_utils.hpp | 22 ---- .../mock_compile_time_properties.hpp | 4 +- 9 files changed, 159 insertions(+), 281 deletions(-) diff --git a/sycl/include/sycl/ext/intel/experimental/grf_size_properties.hpp b/sycl/include/sycl/ext/intel/experimental/grf_size_properties.hpp index 40b36f7bc9383..e63cc02d60b96 100644 --- a/sycl/include/sycl/ext/intel/experimental/grf_size_properties.hpp +++ b/sycl/include/sycl/ext/intel/experimental/grf_size_properties.hpp @@ -56,29 +56,26 @@ template struct ConflictingProperties : std::bool_constant< - ContainsProperty< - sycl::ext::intel::experimental::grf_size_automatic_key, - Properties>::value || - ContainsProperty::value> {}; + Properties::template has_property< + sycl::ext::intel::experimental::grf_size_automatic_key>() || + 
Properties::template has_property< + sycl::detail::register_alloc_mode_key>()> {}; template struct ConflictingProperties< sycl::ext::intel::experimental::grf_size_automatic_key, Properties> - : std::bool_constant< - ContainsProperty::value || - ContainsProperty::value> {}; + : std::bool_constant() || + Properties::template has_property< + sycl::detail::register_alloc_mode_key>()> {}; template struct ConflictingProperties : std::bool_constant< - ContainsProperty::value || - ContainsProperty< - sycl::ext::intel::experimental::grf_size_automatic_key, - Properties>::value> {}; + Properties::template has_property< + sycl::ext::intel::experimental::grf_size_key>() || + Properties::template has_property< + sycl::ext::intel::experimental::grf_size_automatic_key>()> {}; } // namespace ext::oneapi::experimental::detail } // namespace _V1 diff --git a/sycl/include/sycl/ext/intel/experimental/pipes.hpp b/sycl/include/sycl/ext/intel/experimental/pipes.hpp index 3311d7cd66e07..c09ad05547759 100644 --- a/sycl/include/sycl/ext/intel/experimental/pipes.hpp +++ b/sycl/include/sycl/ext/intel/experimental/pipes.hpp @@ -376,21 +376,29 @@ class pipe : public pipe_base { static constexpr int32_t m_Capacity = _min_capacity; static constexpr int32_t m_ready_latency = - oneapi::experimental::detail::ValueOrDefault< - _propertiesT, ready_latency_key>::template get(0); + oneapi::experimental::detail::get_property_or( + ready_latency<0>) + .value; + static constexpr int32_t m_bits_per_symbol = - oneapi::experimental::detail::ValueOrDefault< - _propertiesT, bits_per_symbol_key>::template get(8); + oneapi::experimental::detail::get_property_or( + bits_per_symbol<8>) + .value; static constexpr bool m_uses_valid = - oneapi::experimental::detail::ValueOrDefault< - _propertiesT, uses_valid_key>::template get(true); + oneapi::experimental::detail::get_property_or(uses_valid_on) + .value; static constexpr bool m_first_symbol_in_high_order_bits = - oneapi::experimental::detail::ValueOrDefault< - 
_propertiesT, - first_symbol_in_high_order_bits_key>::template get(0); - static constexpr protocol_name m_protocol = oneapi::experimental::detail:: - ValueOrDefault<_propertiesT, protocol_key>::template get( - protocol_name::avalon_streaming_uses_ready); + oneapi::experimental::detail::get_property_or< + first_symbol_in_high_order_bits_key, _propertiesT>( + first_symbol_in_high_order_bits_off) + .value; + static constexpr protocol_name m_protocol = + oneapi::experimental::detail::get_property_or( + protocol_avalon_streaming_uses_ready) + .value; public: static constexpr struct ConstantPipeStorageExp m_Storage = { diff --git a/sycl/include/sycl/ext/intel/experimental/task_sequence.hpp b/sycl/include/sycl/ext/intel/experimental/task_sequence.hpp index b62fca60fef57..cb3434d377a5f 100644 --- a/sycl/include/sycl/ext/intel/experimental/task_sequence.hpp +++ b/sycl/include/sycl/ext/intel/experimental/task_sequence.hpp @@ -109,23 +109,28 @@ class task_sequence< __spv::__spirv_TaskSequenceINTEL *taskSequence; #endif static constexpr int32_t pipelined = - oneapi::experimental::detail::ValueOrDefault< - property_list_t, pipelined_key>::template get(-1); - static constexpr int32_t fpga_cluster = - has_property() - ? 
static_cast< - typename std::underlying_type::type>( - oneapi::experimental::detail::ValueOrDefault:: - template get( - fpga_cluster_options_enum::stall_free)) - : -1; + oneapi::experimental::detail::get_property_or( + intel::experimental::pipelined<-1>) + .value; + static constexpr int32_t fpga_cluster = []() constexpr { + if constexpr (has_property()) + return static_cast< + typename std::underlying_type::type>( + get_property().value); + else + return -1; + }(); static constexpr uint32_t response_capacity = - oneapi::experimental::detail::ValueOrDefault< - property_list_t, response_capacity_key>::template get(0); + oneapi::experimental::detail::get_property_or( + intel::experimental::response_capacity<0>) + .value; static constexpr uint32_t invocation_capacity = - oneapi::experimental::detail::ValueOrDefault< - property_list_t, invocation_capacity_key>::template get(0); + oneapi::experimental::detail::get_property_or( + intel::experimental::invocation_capacity<0>) + .value; }; } // namespace ext::intel::experimental diff --git a/sycl/include/sycl/ext/oneapi/experimental/annotated_usm/alloc_base.hpp b/sycl/include/sycl/ext/oneapi/experimental/annotated_usm/alloc_base.hpp index 6f2b408e43ffe..bde1b32f602e9 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/annotated_usm/alloc_base.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/annotated_usm/alloc_base.hpp @@ -41,9 +41,8 @@ template ::value, annotated_ptr> -aligned_alloc_annotated(size_t alignment, size_t numBytes, - const device &syclDevice, const context &syclContext, - sycl::usm::alloc kind, +aligned_alloc_annotated(size_t align, size_t numBytes, const device &syclDevice, + const context &syclContext, sycl::usm::alloc kind, const propertyListA &propList = propertyListA{}) { detail::ValidAllocPropertyList::value; @@ -53,12 +52,12 @@ aligned_alloc_annotated(size_t alignment, size_t numBytes, static_cast(propList); constexpr size_t alignFromPropList = - detail::GetAlignFromPropList::value; + 
detail::get_property_or(alignment<0>).value; const property_list &usmPropList = get_usm_property_list(); - if constexpr (detail::HasUsmKind::value) { + if constexpr (propertyListA::template has_property()) { constexpr sycl::usm::alloc usmKind = - detail::GetUsmKindFromPropList::value; + propertyListA::template get_property().value; if (usmKind != kind) { throw sycl::exception( sycl::make_error_code(sycl::errc::invalid), @@ -72,7 +71,7 @@ aligned_alloc_annotated(size_t alignment, size_t numBytes, "Unknown USM allocation kind was specified."); void *rawPtr = - sycl::aligned_alloc(combine_align(alignment, alignFromPropList), numBytes, + sycl::aligned_alloc(combine_align(align, alignFromPropList), numBytes, syclDevice, syclContext, kind, usmPropList); return annotated_ptr(rawPtr); } @@ -83,9 +82,8 @@ template ::value, annotated_ptr> -aligned_alloc_annotated(size_t alignment, size_t count, - const device &syclDevice, const context &syclContext, - sycl::usm::alloc kind, +aligned_alloc_annotated(size_t align, size_t count, const device &syclDevice, + const context &syclContext, sycl::usm::alloc kind, const propertyListA &propList = propertyListA{}) { detail::ValidAllocPropertyList::value; @@ -95,12 +93,12 @@ aligned_alloc_annotated(size_t alignment, size_t count, static_cast(propList); constexpr size_t alignFromPropList = - detail::GetAlignFromPropList::value; + detail::get_property_or(alignment<0>).value; const property_list &usmPropList = get_usm_property_list(); - if constexpr (detail::HasUsmKind::value) { + if constexpr (propertyListA::template has_property()) { constexpr sycl::usm::alloc usmKind = - detail::GetUsmKindFromPropList::value; + propertyListA::template get_property().value; if (usmKind != kind) { throw sycl::exception( sycl::make_error_code(sycl::errc::invalid), @@ -113,7 +111,7 @@ aligned_alloc_annotated(size_t alignment, size_t count, throw sycl::exception(sycl::make_error_code(sycl::errc::invalid), "Unknown USM allocation kind was specified."); - size_t 
combinedAlign = combine_align(alignment, alignFromPropList); + size_t combinedAlign = combine_align(align, alignFromPropList); T *rawPtr = sycl::aligned_alloc(combinedAlign, count, syclDevice, syclContext, kind, usmPropList); return annotated_ptr(rawPtr); @@ -212,7 +210,9 @@ std::enable_if_t< malloc_annotated(size_t numBytes, const device &syclDevice, const context &syclContext, const propertyListA &propList) { constexpr sycl::usm::alloc usmKind = - detail::GetUsmKindFromPropList::value; + detail::get_property_or( + usm_kind) + .value; static_assert(usmKind != sycl::usm::alloc::unknown, "USM kind is not specified. Please specify it as an argument " "or in the input property list."); @@ -228,7 +228,9 @@ std::enable_if_t< malloc_annotated(size_t count, const device &syclDevice, const context &syclContext, const propertyListA &propList) { constexpr sycl::usm::alloc usmKind = - detail::GetUsmKindFromPropList::value; + detail::get_property_or( + usm_kind) + .value; static_assert(usmKind != sycl::usm::alloc::unknown, "USM kind is not specified. 
Please specify it as an argument " "or in the input property list."); diff --git a/sycl/include/sycl/ext/oneapi/experimental/annotated_usm/alloc_util.hpp b/sycl/include/sycl/ext/oneapi/experimental/annotated_usm/alloc_util.hpp index 450f6b087d9c3..9217a9c567299 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/annotated_usm/alloc_util.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/annotated_usm/alloc_util.hpp @@ -25,52 +25,6 @@ namespace detail { // Type traits for USM allocation with property support //// -// Merge a property list with the usm_kind property -template -using MergeUsmKind = - detail::merged_properties_t})>; - -// Check if a property list contains the a certain property -template -struct HasProperty - : std::bool_constant()> {}; - -template -using HasAlign = HasProperty; -template -using HasUsmKind = HasProperty; -template -using HasBufferLocation = HasProperty; - -template -struct GetPropertyValueFromPropList> - : GetPropertyValueFromPropList> {}; - -// Get the value of alignment from a property list -// If alignment is not present in the property list, set to default value 0 -template -using GetAlignFromPropList = - GetPropertyValueFromPropList), - PropertyListT>; -// Get the value of usm_kind from a property list -// The usm_kind is sycl::usm::alloc::unknown by default -template -using GetUsmKindFromPropList = - GetPropertyValueFromPropList), - PropertyListT>; -// Get the value of buffer_location from a property list -// The buffer location is -1 by default -template -using GetBufferLocationFromPropList = GetPropertyValueFromPropList< - buffer_location_key, int, - decltype(sycl::ext::intel::experimental::buffer_location<-1>), - PropertyListT>; - // Check if a runtime property is valid template struct IsRuntimePropertyValid : std::false_type {}; @@ -143,9 +97,10 @@ struct GetAnnotatedPtrPropertiesWithUsmKind::type; - static_assert(!HasUsmKind::value || - GetUsmKindFromPropList::value == Kind, - "Input property list contains conflicting 
USM kind."); + static_assert( + detail::get_property_or(usm_kind) + .value == Kind, + "Input property list contains conflicting USM kind."); using type = detail::merged_properties_t, // runtime). Right now only the `buffer_location` has its corresponding USM // runtime property and is transformable template inline property_list get_usm_property_list() { - if constexpr (detail::HasBufferLocation::value) { + if constexpr (PropertyListT::template has_property()) { return property_list{ sycl::ext::intel::experimental::property::usm::buffer_location( - detail::GetBufferLocationFromPropList::value)}; + PropertyListT::template get_property().value)}; } return {}; } diff --git a/sycl/include/sycl/ext/oneapi/kernel_properties/properties.hpp b/sycl/include/sycl/ext/oneapi/kernel_properties/properties.hpp index 9f91607456dd6..f79f56f698a22 100644 --- a/sycl/include/sycl/ext/oneapi/kernel_properties/properties.hpp +++ b/sycl/include/sycl/ext/oneapi/kernel_properties/properties.hpp @@ -104,6 +104,11 @@ struct property_value, constexpr size_t operator[](int Dim) const { return std::array{Dim0, Dims...}[Dim]; } + +private: + constexpr size_t size() const { return sizeof...(Dims) + 1; } + + template friend struct detail::ConflictingProperties; }; template @@ -190,6 +195,11 @@ struct property_value{Dim0, Dims...}[Dim]; } + +private: + constexpr size_t size() const { return sizeof...(Dims) + 1; } + + template friend struct detail::ConflictingProperties; }; template <> @@ -389,78 +399,53 @@ struct HasKernelPropertiesGetMethod().get(std::declval())); }; -// Trait for property compile-time meta names and values. -template struct WGSizePropertyMetaInfo { - static constexpr std::array WGSize = {}; - static constexpr size_t LinearSize = 0; -}; - -template -struct WGSizePropertyMetaInfo> { - static constexpr std::array WGSize = {Dim0, - Dims...}; - static constexpr size_t LinearSize = (Dim0 * ... 
* Dims); -}; - -template -struct WGSizePropertyMetaInfo> { - static constexpr std::array WGSize = {Dim0, - Dims...}; - static constexpr size_t LinearSize = (Dim0 * ... * Dims); -}; - -// Get the value of a work-group size related property from a property list -template -struct GetWGPropertyFromPropList {}; - -template -struct GetWGPropertyFromPropList> { - using prop_val_t = std::conditional_t< - ContainsProperty>::value, - typename FindCompileTimePropertyValueType< - PropKey, std::tuple>::type, - void>; - static constexpr auto WGSize = - WGSizePropertyMetaInfo>::WGSize; - static constexpr size_t LinearSize = - WGSizePropertyMetaInfo>::LinearSize; -}; - // If work_group_size and max_work_group_size coexist, check that the // dimensionality matches and that the required work-group size doesn't // trivially exceed the maximum size. template -struct ConflictingProperties - : std::false_type { - using WGSizeVal = GetWGPropertyFromPropList; - using MaxWGSizeVal = - GetWGPropertyFromPropList; - // If work_group_size_key doesn't exist in the list of properties, WGSize is - // an empty array and so Dims == 0. 
- static constexpr size_t Dims = WGSizeVal::WGSize.size(); - static_assert( - Dims == 0 || Dims == MaxWGSizeVal::WGSize.size(), - "work_group_size and max_work_group_size dimensionality must match"); - static_assert(Dims < 1 || WGSizeVal::WGSize[0] <= MaxWGSizeVal::WGSize[0], - "work_group_size must not exceed max_work_group_size"); - static_assert(Dims < 2 || WGSizeVal::WGSize[1] <= MaxWGSizeVal::WGSize[1], - "work_group_size must not exceed max_work_group_size"); - static_assert(Dims < 3 || WGSizeVal::WGSize[2] <= MaxWGSizeVal::WGSize[2], - "work_group_size must not exceed max_work_group_size"); +struct ConflictingProperties { + static constexpr bool value = []() constexpr { + if constexpr (Properties::template has_property()) { + constexpr auto wg_size = + Properties::template get_property(); + constexpr auto max_wg_size = + Properties::template get_property(); + static_assert( + wg_size.size() == max_wg_size.size(), + "work_group_size and max_work_group_size dimensionality must match"); + if constexpr (wg_size.size() == max_wg_size.size()) { + constexpr auto Dims = wg_size.size(); + static_assert(Dims < 1 || wg_size[0] <= max_wg_size[0], + "work_group_size must not exceed max_work_group_size"); + static_assert(Dims < 2 || wg_size[1] <= max_wg_size[1], + "work_group_size must not exceed max_work_group_size"); + static_assert(Dims < 3 || wg_size[2] <= max_wg_size[2], + "work_group_size must not exceed max_work_group_size"); + } + } + return false; + }(); }; // If work_group_size and max_linear_work_group_size coexist, check that the // required linear work-group size doesn't trivially exceed the maximum size. 
template -struct ConflictingProperties - : std::false_type { - using WGSizeVal = GetWGPropertyFromPropList; - using MaxLinearWGSizeVal = - GetPropertyValueFromPropList; - static_assert(WGSizeVal::WGSize.empty() || - WGSizeVal::LinearSize <= MaxLinearWGSizeVal::value, - "work_group_size must not exceed max_linear_work_group_size"); +struct ConflictingProperties { + static constexpr bool value = []() constexpr { + if constexpr (Properties::template has_property()) { + constexpr auto wg_size = + Properties::template get_property(); + constexpr auto dims = wg_size.size(); + constexpr auto linear_size = wg_size[0] * (dims > 1 ? wg_size[1] : 1) * + (dims > 2 ? wg_size[2] : 1); + constexpr auto max_linear_wg_size = + Properties::template get_property(); + static_assert( + linear_size < max_linear_wg_size.value, + "work_group_size must not exceed max_linear_work_group_size"); + } + return false; + }(); }; } // namespace detail diff --git a/sycl/include/sycl/ext/oneapi/properties/properties.hpp b/sycl/include/sycl/ext/oneapi/properties/properties.hpp index 12c0af87a0e87..ef31714cb126a 100644 --- a/sycl/include/sycl/ext/oneapi/properties/properties.hpp +++ b/sycl/include/sycl/ext/oneapi/properties/properties.hpp @@ -23,70 +23,6 @@ namespace ext::oneapi::experimental { namespace detail { -// Checks if a tuple of properties contains a property. -template -struct ContainsProperty : std::false_type {}; -template -struct ContainsProperty> - : ContainsProperty> {}; -template -struct ContainsProperty> : std::true_type {}; -template -struct ContainsProperty< - PropT, std::tuple, Rest...>> - : std::true_type {}; - -// Finds the full property_value type of a property in a tuple of properties. -// type is void if the type was not found in the tuple of properties. 
-template -struct FindCompileTimePropertyValueType { - using type = void; -}; -template -struct FindCompileTimePropertyValueType> { - using type = - typename FindCompileTimePropertyValueType>::type; -}; -template -struct FindCompileTimePropertyValueType< - CTPropertyT, - std::tuple, Rest...>> { - using type = property_value; -}; - -template -static constexpr std::enable_if_t< - HasProperty, - typename FindCompileTimePropertyValueType::type> -get_property() { - return {}; -} - -template -static constexpr std::enable_if_t get_property() { - return; -} - -// Get the value of a property from a property list -template -struct GetPropertyValueFromPropList {}; - -template -struct GetPropertyValueFromPropList> { - using prop_val_t = std::conditional_t< - ContainsProperty>::value, - typename FindCompileTimePropertyValueType< - PropKey, std::tuple>::type, - DefaultPropVal>; - static constexpr ConstType value = - PropertyMetaInfo>::value; -}; - template inline constexpr bool properties_are_unique = []() constexpr { if constexpr (sizeof...(property_tys) == 0) { @@ -241,6 +177,10 @@ class __SYCL_EBO } } } + + template static constexpr bool has_property() { + return false; + } }; // NOTE: Meta-function to implement CTAD rules isn't allowed to return @@ -252,9 +192,6 @@ class __SYCL_EBO properties> : private property_tys... { static_assert(detail::properties_are_sorted, "Properties must be sorted!"); - static_assert( - detail::NoConflictingProperties>::value, - "Conflicting properties in property list."); using property_tys::get_property_impl...; template friend class __SYCL_EBO properties; @@ -282,6 +219,9 @@ class __SYCL_EBO properties> detail::property_key_tag{})); public: + // Definition is out-of-class so that `properties` would be complete there and + // its interfaces could be used in `ConflictingProperties`' partial + // specializations. template < typename... 
unsorted_property_tys, typename = std::enable_if_t< @@ -291,8 +231,7 @@ class __SYCL_EBO properties> ...))>, typename = std::enable_if_t< detail::properties_are_unique>> - constexpr properties(unsorted_property_tys... props) - : unsorted_property_tys(props)... {} + constexpr properties(unsorted_property_tys... props); template static constexpr bool has_property() { return std::is_base_of_v, @@ -318,6 +257,17 @@ class __SYCL_EBO properties> } }; +template +template +constexpr properties>::properties( + unsorted_property_tys... props) + : unsorted_property_tys(props)... { + static_assert(((!detail::ConflictingProperties::value && + ...)), + "Conflicting properties in property list."); +} + // Deduction guides template using merged_properties_t = decltype(merge_properties( std::declval(), std::declval())); -template -struct ValueOrDefault { - template static constexpr ValT get(ValT Default) { - return Default; - } -}; - -template -struct ValueOrDefault< - Properties, PropertyKey, - std::enable_if_t && - Properties::template has_property()>> { - template static constexpr ValT get(ValT) { - return Properties::template get_property().value; - } -}; +template +constexpr auto get_property_or(default_t value, const prop_list_t &props) { + if constexpr (prop_list_t::template has_property()) + return props.template get_property(); + else + return value; +} +template +constexpr auto get_property_or(default_t value) { + if constexpr (prop_list_t::template has_property()) + return prop_list_t::template get_property(); + else + return value; +} // helper: check_all_props_are_keys_of template constexpr bool check_all_props_are_keys_of() { diff --git a/sycl/include/sycl/ext/oneapi/properties/property_utils.hpp b/sycl/include/sycl/ext/oneapi/properties/property_utils.hpp index 0fdbc1e82e518..be07fe5d4e9fa 100644 --- a/sycl/include/sycl/ext/oneapi/properties/property_utils.hpp +++ b/sycl/include/sycl/ext/oneapi/properties/property_utils.hpp @@ -160,28 +160,6 @@ struct SizeListToStr : 
SizeListToStrHelper, CharList<>> {}; template struct ConflictingProperties : std::false_type {}; -template -struct NoConflictingPropertiesHelper {}; - -template -struct NoConflictingPropertiesHelper> - : std::true_type {}; - -template -struct NoConflictingPropertiesHelper> - : NoConflictingPropertiesHelper> {}; - -template -struct NoConflictingPropertiesHelper< - Properties, std::tuple, Rest...>> - : std::conditional_t< - ConflictingProperties::value, std::false_type, - NoConflictingPropertiesHelper>> {}; -template -struct NoConflictingProperties - : NoConflictingPropertiesHelper {}; - //****************************************************************************** // Conditional property meta-info //****************************************************************************** diff --git a/sycl/test/extensions/properties/mock_compile_time_properties.hpp b/sycl/test/extensions/properties/mock_compile_time_properties.hpp index ea8d98ffa5e58..6d167d7c8ecf9 100644 --- a/sycl/test/extensions/properties/mock_compile_time_properties.hpp +++ b/sycl/test/extensions/properties/mock_compile_time_properties.hpp @@ -101,11 +101,11 @@ struct is_property_key_of : std::true_type {}; namespace detail { template struct ConflictingProperties - : ContainsProperty {}; + : std::bool_constant()> {}; template struct ConflictingProperties - : ContainsProperty {}; + : std::bool_constant()> {}; } // namespace detail } // namespace experimental From 9e3aabf45db24ee207b71774e4b550b939229d34 Mon Sep 17 00:00:00 2001 From: Nick Sarnie Date: Thu, 21 Nov 2024 06:29:14 +0900 Subject: [PATCH 33/36] [SYCL][ESIMD][E2E] Fix LSC USM store test failure (#16122) `lsc_usm_store_u32.cpp` currently fails in syclos but passes in the internal compiler. The reason is that `rand` returns 0 and when `sycl::bit_cast` to `float`, it ends up as a very very small floating point number, like `1.4e-41`. In the internal compiler, this gets optimized to zero, probably due to unsafe fp math optimizations. 
It is also zero on-device. In syclos the host remains as that small number and ends up screwing up the correctness check because we need 0. Just explicitly return zero when the bit-casted result is below epsilon for the type. Signed-off-by: Sarnie, Nick --- sycl/test-e2e/ESIMD/lsc/Inputs/common.hpp | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/sycl/test-e2e/ESIMD/lsc/Inputs/common.hpp b/sycl/test-e2e/ESIMD/lsc/Inputs/common.hpp index 45c0a99840d93..661594d296e14 100644 --- a/sycl/test-e2e/ESIMD/lsc/Inputs/common.hpp +++ b/sycl/test-e2e/ESIMD/lsc/Inputs/common.hpp @@ -8,6 +8,7 @@ #pragma once +#include #include #include @@ -23,5 +24,7 @@ template T get_rand() { Tuint v = rand(); if constexpr (sizeof(Tuint) > 4) v = (v << 32) | rand(); - return sycl::bit_cast(v); + T bitcast_v = sycl::bit_cast(v); + return bitcast_v <= std::numeric_limits::epsilon() ? static_cast(0) + : bitcast_v; } From 5e61f8fbc14274a63376d1e23ffe67a105839785 Mon Sep 17 00:00:00 2001 From: David Garcia Orozco Date: Wed, 20 Nov 2024 14:50:56 -0700 Subject: [PATCH 34/36] [SYCL][E2E] Use `/clang:` when using MSVC driver on bindless images test (#16140) this should fix the compfail on this test when using clang-cl. 
--- .../bindless_images/dx12_interop/read_write_unsampled.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sycl/test-e2e/bindless_images/dx12_interop/read_write_unsampled.cpp b/sycl/test-e2e/bindless_images/dx12_interop/read_write_unsampled.cpp index 3f4e6e433643f..9985b27a0c4a9 100644 --- a/sycl/test-e2e/bindless_images/dx12_interop/read_write_unsampled.cpp +++ b/sycl/test-e2e/bindless_images/dx12_interop/read_write_unsampled.cpp @@ -1,7 +1,8 @@ // REQUIRES: aspect-ext_oneapi_bindless_images // REQUIRES: windows -// RUN: %{build} -l d3d12 -l dxgi -l dxguid -o %t.out +// DEFINE: %{link-flags}=%if cl_options %{ /clang:-ld3d12 /clang:-ldxgi /clang:-ldxguid %} %else %{ -ld3d12 -ldxgi -ldxguid %} +// RUN: %{build} %{link-flags} -o %t.out // RUN: %{run-unfiltered-devices} env NEOReadDebugKeys=1 UseBindlessMode=1 UseExternalAllocatorForSshAndDsh=1 %t.out #pragma clang diagnostic ignored "-Waddress-of-temporary" From 8b719e96871593bcc03b158f215a739d54e921ba Mon Sep 17 00:00:00 2001 From: Alexey Sachkov Date: Thu, 21 Nov 2024 09:57:22 +0100 Subject: [PATCH 35/36] [SYCL][NFC] Re-enable, fix and upgrade headers testing (#16117) We have a suite to check that every header contains enough `#include` directives and forward declarations so that it can be included standalone and used. However, that suite got accidentally disabled some time ago in #14879 This PR re-enabled the suite and fixed all issues it detected. 
Besides that, the suite is upgraded: - it now scans the source directory, meaning that it won't complain about headers which were removed from the codebase, but left in the build folder - it is now possible to add a custom test instead of an auto-generated one: this is useful when you want to trigger certain template instantiations to make sure that all code paths are covered --- .../sycl/detail/id_queries_fit_in_int.hpp | 2 + sycl/include/sycl/ext/oneapi/backend/hip.hpp | 2 +- .../oneapi/experimental/raw_kernel_arg.hpp | 3 + .../oneapi/experimental/virtual_functions.hpp | 3 + .../oneapi/experimental/work_group_memory.hpp | 12 +++- sycl/test/format.py | 57 ++++++++++++++--- sycl/test/lit.site.cfg.py.in | 1 + sycl/test/self-contained-headers/README.md | 63 ++++++++++++++----- .../sycl/handler.hpp.cpp | 16 +++++ 9 files changed, 136 insertions(+), 23 deletions(-) create mode 100644 sycl/test/self-contained-headers/sycl/handler.hpp.cpp diff --git a/sycl/include/sycl/detail/id_queries_fit_in_int.hpp b/sycl/include/sycl/detail/id_queries_fit_in_int.hpp index d3ce74dfdfc0a..3f12b47bd4296 100644 --- a/sycl/include/sycl/detail/id_queries_fit_in_int.hpp +++ b/sycl/include/sycl/detail/id_queries_fit_in_int.hpp @@ -23,6 +23,8 @@ #ifndef __SYCL_DEVICE_ONLY__ #include +#include +#include #include #include diff --git a/sycl/include/sycl/ext/oneapi/backend/hip.hpp b/sycl/include/sycl/ext/oneapi/backend/hip.hpp index 86f22d74e78d9..0f59dd2f4116a 100644 --- a/sycl/include/sycl/ext/oneapi/backend/hip.hpp +++ b/sycl/include/sycl/ext/oneapi/backend/hip.hpp @@ -8,7 +8,7 @@ #pragma once -#include +#include #include namespace sycl { diff --git a/sycl/include/sycl/ext/oneapi/experimental/raw_kernel_arg.hpp b/sycl/include/sycl/ext/oneapi/experimental/raw_kernel_arg.hpp index e744181906a24..d53095d066e77 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/raw_kernel_arg.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/raw_kernel_arg.hpp @@ -14,6 +14,9 @@ namespace sycl { inline namespace
_V1 { class handler; +namespace detail { +class dynamic_parameter_impl; +} namespace ext::oneapi::experimental { diff --git a/sycl/include/sycl/ext/oneapi/experimental/virtual_functions.hpp b/sycl/include/sycl/ext/oneapi/experimental/virtual_functions.hpp index 4e1d0e13eb623..9b13f6e3ed123 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/virtual_functions.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/virtual_functions.hpp @@ -1,8 +1,11 @@ #pragma once #include +#include #include +#include + namespace sycl { inline namespace _V1 { namespace ext::oneapi::experimental { diff --git a/sycl/include/sycl/ext/oneapi/experimental/work_group_memory.hpp b/sycl/include/sycl/ext/oneapi/experimental/work_group_memory.hpp index 254fd8d877f8e..c03bdef7efceb 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/work_group_memory.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/work_group_memory.hpp @@ -7,10 +7,18 @@ #pragma once +#include +#include +#include +#include + +#include #include namespace sycl { inline namespace _V1 { +class handler; + namespace detail { template struct is_unbounded_array : std::false_type {}; @@ -37,8 +45,10 @@ namespace ext::oneapi::experimental { struct indeterminate_t {}; inline constexpr indeterminate_t indeterminate; - template +class work_group_memory; + +template class __SYCL_SPECIAL_CLASS __SYCL_TYPE(work_group_memory) work_group_memory : sycl::detail::work_group_memory_impl { public: diff --git a/sycl/test/format.py b/sycl/test/format.py index eb3a497ecd004..de957dc87ed53 100644 --- a/sycl/test/format.py +++ b/sycl/test/format.py @@ -5,26 +5,57 @@ import os import re -SUFFIXES = {".hpp"} - - class SYCLHeadersTest(lit.formats.TestFormat): def getTestsForPath( self, testSuite, path_in_suite, filepath, litConfig, localConfig ): + # path_in_suite is a tuple like: + # ('self-contained-headers', 'path/to', 'file.hpp') + test_path = testSuite.getSourcePath(path_in_suite) + ".cpp" + if os.path.exists(test_path): + # We have a dedicated 
special test for a header, let's use a file + # from the suite itself + + # None is a special value we use to distinguish those two cases + filepath = None + # The actual file has .cpp extension as every other test + path_in_suite = path_in_suite[:-1] + (path_in_suite[-1] + ".cpp",) + else: + # We don't have a dedicated special test for a header, therefore we + # fallback to a generalized version of it + + # SYCL headers may depend on some generated files and therefore we + # use headers from the build folder for testing + filepath = os.path.join(localConfig.sycl_include, *path_in_suite[1:]) + yield lit.Test.Test(testSuite, path_in_suite, localConfig, file_path=filepath) def getTestsInDirectory(self, testSuite, path_in_suite, litConfig, localConfig): - # We traverse build/sycl/include/sycl directory - source_path = os.path.join(localConfig.sycl_include, "sycl") + # To respect SYCL_LIB_DUMPS_ONLY mode + if ".cpp" not in localConfig.suffixes: + return + + # As we add more files and folders into 'self-contained-headers', this + # method will be recursivelly called for them by lit discovery. + # However, we don't use the test folder as the source of tests but + # instead we use SYCL_SOURCE_DIR/include/sycl directory. 
+ # Therefore, we exit early from here if `path_in_suite` conatins more + # than one element + assert path_in_suite[0] == "self-contained-headers" + if len(path_in_suite) > 1: + return + + source_path = os.path.join(localConfig.sycl_include_source_dir, "sycl") # Optional filter can be passed through command line options headers_filter = localConfig.sycl_headers_filter for dirpath, _, filenames in os.walk(source_path): - relative_dirpath = dirpath[len(localConfig.sycl_include) + 1 :] + relative_dirpath = dirpath[len(localConfig.sycl_include_source_dir) + 1 :] for filename in filenames: suffix = os.path.splitext(filename)[1] - if suffix not in SUFFIXES or suffix not in litConfig.suffixes: + # We only look at actual header files and not at their .inc/.def + # components + if suffix != ".hpp": continue filepath = os.path.join(dirpath, filename) @@ -46,6 +77,18 @@ def getTestsInDirectory(self, testSuite, path_in_suite, litConfig, localConfig): yield t def execute(self, test, litConfig): + if test.file_path is None: + # It means that we have a special test case for a header and we need + # to execute it as a regular lit sh test + return lit.TestRunner.executeShTest( + test, + litConfig, + False, # execute_external + [], # extra_substitutions + [], # preamble_commands + ) + + # Otherwise we generate the test on the fly command = [ test.config.clang, "-fsycl", diff --git a/sycl/test/lit.site.cfg.py.in b/sycl/test/lit.site.cfg.py.in index 67b3ad912d81d..cc9043f71bf3e 100644 --- a/sycl/test/lit.site.cfg.py.in +++ b/sycl/test/lit.site.cfg.py.in @@ -9,6 +9,7 @@ config.sycl_tools_dir = lit_config.params.get('SYCL_TOOLS_DIR', "@LLVM_TOOLS_DIR config.sycl_include = lit_config.params.get('SYCL_INCLUDE', "@SYCL_INCLUDE@") config.sycl_obj_root = "@SYCL_BINARY_DIR@" config.sycl_source_dir = "@SYCL_SOURCE_DIR@/source" +config.sycl_include_source_dir = "@SYCL_SOURCE_DIR@/include" config.sycl_libs_dir = lit_config.params.get('SYCL_LIBS_DIR', "@LLVM_LIBS_DIR@") config.target_triple = 
"@LLVM_TARGET_TRIPLE@" config.host_triple = "@LLVM_HOST_TRIPLE@" diff --git a/sycl/test/self-contained-headers/README.md b/sycl/test/self-contained-headers/README.md index cddb6c5f930c1..7b510e28c5677 100644 --- a/sycl/test/self-contained-headers/README.md +++ b/sycl/test/self-contained-headers/README.md @@ -14,23 +14,38 @@ still be sure that we haven't accidentally removed a necessary `#include`. meaning that any warnings coming out of them may be turned into errors and will affect test results. This is considered as an extra feature of the suite. +**One more note:** due to templated nature of SYCL headers, not every code path +may be instantiated by a mere `#include` and therefore not every dependency will +be highlighted by a simple test. To overcome this, there is an ability to write +dedicated tests for certain headers which are more exhaustive than a simple +`#include`, see more details below. + ## Implementation There was a couple of iterations on the suite design and its current shape features the following: -- each header in `build/include/sycl` is checked as a separate test -- each such test is generated on the fly dynamically during LIT discovery phase +- each header in `build/include/sycl` is checked as a separate test, unless: + - it doesn't exists in `source/include/sycl`, meaning that it is likely + removed from the codebase, but still resides in `build/` directory + - **TODO:** we also have some auto-generated headers which could be skipped + this way, we need to consider a mechanism to handle them as well + - **TODO:** presence of outdated headers in `build` directory should also be + detected, or otherwise it can lead to compilation issues being hidden in + local setup +- each such test is generated on the fly dynamically during LIT discovery phase, + unless: + - there is a special/dedicated test for a header, more details below That is done to allow for massive parallelism and keep those tests small and quick. 
-Absolute most of the magic is happenning within +Absolute most of the magic is happening within [`sycl/test/format.py`](/sycl/test/format.py): we define a custom test format in there which overrides standard discovery and test execution rules. ## How to use and maintain -Those tests are part of `check-sycl` target and you can pass a regexp acepted +Those tests are part of `check-sycl` target and you can pass a regexp accepted by Python's `re` package as `SYCL_HEADERS_FILTER` parameter to LIT to filter which headers you would like to see checked (only those that match the passed regexp will be used to generate tests). @@ -47,11 +62,38 @@ Documentation for Python's regexp can be found [here][python-3-re]. [python-3-re]: https://docs.python.org/3/library/re.html#regular-expression-syntax -Since there are no dedicated files for each test, `XFAIL`ing them using regular -method is impossible, but it is still supported. To do so, open +Since there are no dedicated files for auto-generated tests, `XFAIL`ing them +using regular method is impossible, but it is still supported. To do so, open [the local config](/sycl/test/self-contained-headers/lit.local.cfg) and modify list of files which should be treated as expected to fail. +### Special tests + +As noted above, to truly ensure that SYCL headers are self-contained, we need +not only include them, but also use them +(read: instantiate all classes and methods). + +To support that, for every SYCL header we have in `source/include/sycl` the tool +first checks if there is a corresponding test file in +`source/test/self-contained-headers` and if so, it is used instead of an +auto-generated one. + +Those special tests should be named and located in certain place to be detected, +or otherwise they will be ignored. For a header +`source/include/sycl/path/to/header.hpp` its special test should be placed under +`source/test/sycl/self-contained-headers/sycl/path/to/header.hpp.cpp`. 
+ +Note a few things: directory structure should exactly match, the filename should +be the same as the header file name, but with `.cpp` extension added on top of +it. + +Those special tests will be treated as any other regular Sh-based tests, i.e. +you should write your regular `RUN` lines in there. It is expected that those +tests will run a compilation under `-fsyntax-only` mode and verify absence of +any compilation errors or warnings through `-Xclang -verify` mechanism. + +Special tests can be `XFAIL`-ed using a regular LIT mechanism. + ## Known issues and quirks ### To launch the suite directly, use `LIT_FILTER` env variable @@ -70,14 +112,7 @@ Instead, the following approach should be used: LIT_FILTER='self-contained-headers' llvm-lit sycl/test ``` -### Old legacy files in build/ area are still checked - -The custom discovery script uses `build/include/sycl/` folder contents to -generate tests for each header it finds there. It means that if some header was -removed from the codebase, it may still be present in `build` folder unless -some cleanup is performed. - -### No OS-specific `XFAIL` mechanism is implemented +### No OS-specific `XFAIL` mechanism is implemented for auto-generated tests `XFAIL` mechanism mentioned in "How to use and maintain" section does not support marking a test as expected to fail only in certain environment, which diff --git a/sycl/test/self-contained-headers/sycl/handler.hpp.cpp b/sycl/test/self-contained-headers/sycl/handler.hpp.cpp new file mode 100644 index 0000000000000..28a23b0dd03cf --- /dev/null +++ b/sycl/test/self-contained-headers/sycl/handler.hpp.cpp @@ -0,0 +1,16 @@ +// RUN: %clangxx -fsycl -fsyntax-only -Xclang -verify %s +// expected-no-diagnostics +// +// The purpose of this test is to ensure that the header containing +// sycl::handler class definition is self-contained, i.e. we can use handler +// and no extra headers are needed. +// +// TODO: the test should be expanded to use various methods of the class. 
Due +// to their template nature we may not test all code paths until we trigger +// instantiation of a corresponding method. + +#include + +class kernel_name; + +void foo(sycl::handler &h) {} From 925ff7656e57d78fcf587e7b93b81dcf9f490128 Mon Sep 17 00:00:00 2001 From: aelovikov-intel Date: Thu, 21 Nov 2024 01:08:31 -0800 Subject: [PATCH 36/36] [NFC][SYCL] More consistent usage of `detail::properties_t` helper (#16142) This also moves `filter|merge_properties` to `properties.hpp` to make this change a tiny bit easier/smaller, but that is a more suitable location for them anyway - it's strange to define operations on `properties` in a header that is included from a header implementing that same `properties` class. --- .../annotated_arg/annotated_arg.hpp | 3 +- .../annotated_ptr/annotated_ptr.hpp | 3 +- .../sycl/ext/oneapi/properties/properties.hpp | 115 +++++++++++++++--- .../ext/oneapi/properties/property_utils.hpp | 83 ------------- .../properties/properties_kernel_negative.cpp | 46 +++---- 5 files changed, 126 insertions(+), 124 deletions(-) diff --git a/sycl/include/sycl/ext/oneapi/experimental/annotated_arg/annotated_arg.hpp b/sycl/include/sycl/ext/oneapi/experimental/annotated_arg/annotated_arg.hpp index 851497b7ead3b..b71f4fc4e0f08 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/annotated_arg/annotated_arg.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/annotated_arg/annotated_arg.hpp @@ -66,8 +66,7 @@ annotated_arg(T, Args...) 
-> annotated_arg::type>; template -annotated_arg(annotated_arg, - properties>) +annotated_arg(annotated_arg, detail::properties_t) -> annotated_arg< T, detail::merged_properties_t>>; diff --git a/sycl/include/sycl/ext/oneapi/experimental/annotated_ptr/annotated_ptr.hpp b/sycl/include/sycl/ext/oneapi/experimental/annotated_ptr/annotated_ptr.hpp index 13db0b377aecc..28318364b33f2 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/annotated_ptr/annotated_ptr.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/annotated_ptr/annotated_ptr.hpp @@ -236,8 +236,7 @@ annotated_ptr(T *, Args...) -> annotated_ptr::type>; template -annotated_ptr(annotated_ptr, - properties>) +annotated_ptr(annotated_ptr, detail::properties_t) -> annotated_ptr< T, detail::merged_properties_t>>; #endif // __cpp_deduction_guides diff --git a/sycl/include/sycl/ext/oneapi/properties/properties.hpp b/sycl/include/sycl/ext/oneapi/properties/properties.hpp index ef31714cb126a..7a840df39f2fc 100644 --- a/sycl/include/sycl/ext/oneapi/properties/properties.hpp +++ b/sycl/include/sycl/ext/oneapi/properties/properties.hpp @@ -21,8 +21,30 @@ namespace sycl { inline namespace _V1 { namespace ext::oneapi::experimental { +template class __SYCL_EBO properties; + namespace detail { +// NOTE: Meta-function to implement CTAD rules isn't allowed to return +// `properties` and it's impossible to return a pack as well. As +// such, we're forced to have an extra level of `detail::properties_type_list` +// for the purpose of providing CTAD rules. +template struct properties_type_list; + +// This is used in a separate `properties` specialization to report friendlier +// errors. +template struct invalid_properties_type_list {}; + +// Helper for reconstructing a properties type. This assumes that +// PropertyValueTs is sorted and contains only valid properties. 
+// +// It also allows us to hide details of `properties` implementation from the +// code that uses/defines them (with the exception of ESIMD which is extremely +// hacky in its own esimd::properties piggybacking on these ones). +template +using properties_t = + properties>; + template inline constexpr bool properties_are_unique = []() constexpr { if constexpr (sizeof...(property_tys) == 0) { @@ -66,9 +88,6 @@ constexpr bool properties_are_valid_for_ctad = []() constexpr { } }(); -template struct properties_type_list; -template struct invalid_properties_type_list {}; - template struct properties_sorter { // Not using "auto" due to MSVC bug in v19.36 and older. v19.37 and later is // able to compile "auto" just fine. See https://godbolt.org/z/eW3rjjs7n. @@ -118,8 +137,6 @@ template <> struct properties_sorter<> { } // namespace detail -template class __SYCL_EBO properties; - // Empty property list. template <> class __SYCL_EBO properties> { template @@ -183,10 +200,6 @@ class __SYCL_EBO } }; -// NOTE: Meta-function to implement CTAD rules isn't allowed to return -// `properties` and it's impossible to return a pack as well. As -// such, we're forced to have an extra level of `detail::properties_type_list` -// for the purpose of providing CTAD rules. template class __SYCL_EBO properties> : private property_tys... { @@ -287,11 +300,85 @@ using empty_properties_t = decltype(properties{}); namespace detail { -// Helper for reconstructing a properties type. This assumes that -// PropertyValueTs is sorted and contains only valid properties. -template -using properties_t = - properties>; +template