Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update the kernel_queue_specific::max_num_work_group query #16051

Open
wants to merge 13 commits into
base: sycl
Choose a base branch
from
2 changes: 1 addition & 1 deletion sycl/cmake/modules/FetchUnifiedRuntime.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -116,7 +116,7 @@ if(SYCL_UR_USE_FETCH_CONTENT)
CACHE PATH "Path to external '${name}' adapter source dir" FORCE)
endfunction()

set(UNIFIED_RUNTIME_REPO "https://github.com/oneapi-src/unified-runtime.git")
set(UNIFIED_RUNTIME_REPO "https://github.com/0x12CC/unified-runtime.git")
include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/modules/UnifiedRuntimeTag.cmake)

set(UMF_BUILD_EXAMPLES OFF CACHE INTERNAL "EXAMPLES")
Expand Down
2 changes: 1 addition & 1 deletion sycl/cmake/modules/UnifiedRuntimeTag.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -4,4 +4,4 @@
# Date: Tue Nov 26 13:32:11 2024 +0000
# Merge pull request #2351 from nrspruit/mcl_1_1
# [L0] Add support for the MCL 1.1 apis thru the spec extensions
set(UNIFIED_RUNTIME_TAG e0f22b54080411537e8a12fe42f06dbac5e7a2d2)
set(UNIFIED_RUNTIME_TAG ca0a0e3e3b3ad933605a35850ebc11bbf5d5f367)
Original file line number Diff line number Diff line change
@@ -1,4 +1 @@
// TODO: Revisit 'max_num_work_group_sync' and align it with the
// 'sycl_ext_oneapi_forward_progress' extension once #7598 is merged.
__SYCL_PARAM_TRAITS_SPEC(ext::oneapi::experimental, kernel_queue_specific, max_num_work_group_sync, size_t,)
__SYCL_PARAM_TRAITS_SPEC(ext::oneapi::experimental, kernel_queue_specific, max_num_work_groups, size_t,)
26 changes: 26 additions & 0 deletions sycl/include/sycl/kernel.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -168,6 +168,32 @@ class __SYCL_EXPORT kernel : public detail::OwnerLessBase<kernel> {
typename detail::is_kernel_queue_specific_info_desc<Param>::return_type
ext_oneapi_get_info(queue Queue) const;

/// Query queue/launch-specific information from a kernel using the
/// info::kernel_queue_specific descriptor for a specific Queue and values.
/// max_num_work_groups is the only valid descriptor for this function.
///
/// \param Queue is a valid SYCL queue.
/// \param WorkGroupSize is the work-group size the number of work-groups is
/// requested for.
/// \return depends on information being queried.
template <typename Param>
typename detail::is_kernel_queue_specific_info_desc<Param>::return_type
ext_oneapi_get_info(queue Queue, const range<1> &WorkGroupSize,
size_t DynamicLocalMemorySize) const;

/// Query queue/launch-specific information from a kernel using the
/// info::kernel_queue_specific descriptor for a specific Queue and values.
/// max_num_work_groups is the only valid descriptor for this function.
///
/// \param Queue is a valid SYCL queue.
/// \param WorkGroupSize is the work-group size the number of work-groups is
/// requested for.
/// \return depends on information being queried.
template <typename Param>
typename detail::is_kernel_queue_specific_info_desc<Param>::return_type
ext_oneapi_get_info(queue Queue, const range<2> &WorkGroupSize,
size_t DynamicLocalMemorySize) const;

/// Query queue/launch-specific information from a kernel using the
/// info::kernel_queue_specific descriptor for a specific Queue and values.
/// max_num_work_groups is the only valid descriptor for this function.
Expand Down
32 changes: 0 additions & 32 deletions sycl/source/detail/kernel_impl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -106,38 +106,6 @@ void kernel_impl::checkIfValidForNumArgsInfoQuery() const {
"interoperability function or to query a device built-in kernel");
}

bool kernel_impl::exceedsOccupancyResourceLimits(
const device &Device, const range<3> &WorkGroupSize,
size_t DynamicLocalMemorySize) const {
// Respect occupancy limits for WorkGroupSize and DynamicLocalMemorySize.
// Generally, exceeding hardware resource limits will yield in an error when
// the kernel is launched.
const size_t MaxWorkGroupSize =
get_info<info::kernel_device_specific::work_group_size>(Device);
const size_t MaxLocalMemorySizeInBytes =
Device.get_info<info::device::local_mem_size>();

if (WorkGroupSize.size() > MaxWorkGroupSize)
return true;

if (DynamicLocalMemorySize > MaxLocalMemorySizeInBytes)
return true;

// It will be impossible to launch a kernel for Cuda when the hardware limit
// for the 32-bit registers page file size is exceeded.
if (Device.get_backend() == backend::ext_oneapi_cuda) {
const uint32_t RegsPerWorkItem =
get_info<info::kernel_device_specific::ext_codeplay_num_regs>(Device);
const uint32_t MaxRegsPerWorkGroup =
Device.get_info<ext::codeplay::experimental::info::device::
max_registers_per_work_group>();
if ((MaxWorkGroupSize * RegsPerWorkItem) > MaxRegsPerWorkGroup)
return true;
}

return false;
}

template <>
typename info::platform::version::return_type
kernel_impl::get_backend_info<info::platform::version>() const {
Expand Down
128 changes: 101 additions & 27 deletions sycl/source/detail/kernel_impl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -123,6 +123,32 @@ class kernel_impl {
template <typename Param>
typename Param::return_type ext_oneapi_get_info(queue Queue) const;

/// Query queue/launch-specific information from a kernel using the
/// info::kernel_queue_specific descriptor for a specific Queue and values.
/// max_num_work_groups is the only valid descriptor for this function.
///
/// \param Queue is a valid SYCL queue.
/// \param WorkGroupSize is the work-group size the number of work-groups is
/// requested for.
/// \return depends on information being queried.
template <typename Param>
typename Param::return_type
ext_oneapi_get_info(queue Queue, const range<1> &MaxWorkGroupSize,
size_t DynamicLocalMemorySize) const;

/// Query queue/launch-specific information from a kernel using the
/// info::kernel_queue_specific descriptor for a specific Queue and values.
/// max_num_work_groups is the only valid descriptor for this function.
///
/// \param Queue is a valid SYCL queue.
/// \param WorkGroupSize is the work-group size the number of work-groups is
/// requested for.
/// \return depends on information being queried.
template <typename Param>
typename Param::return_type
ext_oneapi_get_info(queue Queue, const range<2> &MaxWorkGroupSize,
size_t DynamicLocalMemorySize) const;

/// Query queue/launch-specific information from a kernel using the
/// info::kernel_queue_specific descriptor for a specific Queue and values.
/// max_num_work_groups is the only valid descriptor for this function.
Expand Down Expand Up @@ -193,11 +219,49 @@ class kernel_impl {

/// Check if the occupancy limits are exceeded for the given kernel launch
/// configuration.
template <int Dimensions>
bool exceedsOccupancyResourceLimits(const device &Device,
const range<3> &WorkGroupSize,
const range<Dimensions> &WorkGroupSize,
size_t DynamicLocalMemorySize) const;
template <int Dimensions>
size_t queryMaxNumWorkGroups(queue Queue,
const range<Dimensions> &WorkGroupSize,
size_t DynamicLocalMemorySize) const;
};

template <int Dimensions>
bool kernel_impl::exceedsOccupancyResourceLimits(
const device &Device, const range<Dimensions> &WorkGroupSize,
size_t DynamicLocalMemorySize) const {
// Respect occupancy limits for WorkGroupSize and DynamicLocalMemorySize.
// Generally, exceeding hardware resource limits will yield in an error when
// the kernel is launched.
const size_t MaxWorkGroupSize =
get_info<info::kernel_device_specific::work_group_size>(Device);
const size_t MaxLocalMemorySizeInBytes =
Device.get_info<info::device::local_mem_size>();

if (WorkGroupSize.size() > MaxWorkGroupSize)
return true;

if (DynamicLocalMemorySize > MaxLocalMemorySizeInBytes)
return true;

// It will be impossible to launch a kernel for Cuda when the hardware limit
// for the 32-bit registers page file size is exceeded.
if (Device.get_backend() == backend::ext_oneapi_cuda) {
const uint32_t RegsPerWorkItem =
get_info<info::kernel_device_specific::ext_codeplay_num_regs>(Device);
const uint32_t MaxRegsPerWorkGroup =
Device.get_info<ext::codeplay::experimental::info::device::
max_registers_per_work_group>();
if ((MaxWorkGroupSize * RegsPerWorkItem) > MaxRegsPerWorkGroup)
return true;
}

return false;
}

template <typename Param>
inline typename Param::return_type kernel_impl::get_info() const {
static_assert(is_kernel_info_desc<Param>::value,
Expand Down Expand Up @@ -244,13 +308,11 @@ kernel_impl::get_info(const device &Device,

namespace syclex = ext::oneapi::experimental;

template <>
inline typename syclex::info::kernel_queue_specific::max_num_work_groups::
return_type
kernel_impl::ext_oneapi_get_info<
syclex::info::kernel_queue_specific::max_num_work_groups>(
queue Queue, const range<3> &WorkGroupSize,
size_t DynamicLocalMemorySize) const {
template <int Dimensions>
size_t
kernel_impl::queryMaxNumWorkGroups(queue Queue,
const range<Dimensions> &WorkGroupSize,
size_t DynamicLocalMemorySize) const {
if (WorkGroupSize.size() == 0)
throw exception(sycl::make_error_code(errc::invalid),
"The launch work-group size cannot be zero.");
Expand All @@ -259,12 +321,21 @@ inline typename syclex::info::kernel_queue_specific::max_num_work_groups::
const auto &Handle = getHandleRef();
auto Device = Queue.get_device();

size_t WG[Dimensions];
WG[0] = WorkGroupSize[0];
if constexpr (Dimensions >= 2)
WG[1] = WorkGroupSize[1];
if constexpr (Dimensions == 3)
WG[2] = WorkGroupSize[2];

uint32_t GroupCount{0};
if (auto Result = Adapter->call_nocheck<
UrApiKind::urKernelSuggestMaxCooperativeGroupCountExp>(
Handle, WorkGroupSize.size(), DynamicLocalMemorySize, &GroupCount);
Result != UR_RESULT_ERROR_UNSUPPORTED_FEATURE) {
// The feature is supported. Check for other errors and throw if any.
Handle, Dimensions, WG, DynamicLocalMemorySize, &GroupCount);
Result != UR_RESULT_ERROR_UNSUPPORTED_FEATURE &&
Result != UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE) {
// The feature is supported and the group size is valid. Check for other
// errors and throw if any.
Adapter->checkUrResult(Result);
return GroupCount;
}
Expand All @@ -278,30 +349,33 @@ inline typename syclex::info::kernel_queue_specific::max_num_work_groups::
}

template <>
inline typename syclex::info::kernel_queue_specific::max_num_work_group_sync::
inline typename syclex::info::kernel_queue_specific::max_num_work_groups::
return_type
kernel_impl::ext_oneapi_get_info<
syclex::info::kernel_queue_specific::max_num_work_group_sync>(
queue Queue, const range<3> &WorkGroupSize,
syclex::info::kernel_queue_specific::max_num_work_groups>(
queue Queue, const range<1> &WorkGroupSize,
size_t DynamicLocalMemorySize) const {
return ext_oneapi_get_info<
syclex::info::kernel_queue_specific::max_num_work_groups>(
Queue, WorkGroupSize, DynamicLocalMemorySize);
return queryMaxNumWorkGroups(Queue, WorkGroupSize, DynamicLocalMemorySize);
}

template <>
inline typename syclex::info::kernel_queue_specific::max_num_work_group_sync::
inline typename syclex::info::kernel_queue_specific::max_num_work_groups::
return_type
kernel_impl::ext_oneapi_get_info<
syclex::info::kernel_queue_specific::max_num_work_group_sync>(
queue Queue) const {
auto Device = Queue.get_device();
const auto MaxWorkGroupSize =
get_info<info::kernel_device_specific::work_group_size>(Device);
const sycl::range<3> WorkGroupSize{MaxWorkGroupSize, 1, 1};
return ext_oneapi_get_info<
syclex::info::kernel_queue_specific::max_num_work_group_sync>(
Queue, WorkGroupSize, /* DynamicLocalMemorySize */ 0);
syclex::info::kernel_queue_specific::max_num_work_groups>(
queue Queue, const range<2> &WorkGroupSize,
size_t DynamicLocalMemorySize) const {
return queryMaxNumWorkGroups(Queue, WorkGroupSize, DynamicLocalMemorySize);
}

template <>
inline typename syclex::info::kernel_queue_specific::max_num_work_groups::
return_type
kernel_impl::ext_oneapi_get_info<
syclex::info::kernel_queue_specific::max_num_work_groups>(
queue Queue, const range<3> &WorkGroupSize,
size_t DynamicLocalMemorySize) const {
return queryMaxNumWorkGroups(Queue, WorkGroupSize, DynamicLocalMemorySize);
}

} // namespace detail
Expand Down
29 changes: 22 additions & 7 deletions sycl/source/kernel.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -113,26 +113,41 @@ kernel::ext_oneapi_get_info(queue Queue) const {

template <typename Param>
typename detail::is_kernel_queue_specific_info_desc<Param>::return_type
kernel::ext_oneapi_get_info(queue Queue, const range<3> &WorkGroupSize,
kernel::ext_oneapi_get_info(queue Queue, const range<1> &WorkGroupSize,
size_t DynamicLocalMemorySize) const {
return impl->ext_oneapi_get_info<Param>(Queue, WorkGroupSize,
DynamicLocalMemorySize);
}

template __SYCL_EXPORT typename ext::oneapi::experimental::info::
kernel_queue_specific::max_num_work_group_sync::return_type
kernel::ext_oneapi_get_info<
ext::oneapi::experimental::info::kernel_queue_specific::
max_num_work_group_sync>(queue Queue) const;
template <typename Param>
typename detail::is_kernel_queue_specific_info_desc<Param>::return_type
kernel::ext_oneapi_get_info(queue Queue, const range<2> &WorkGroupSize,
size_t DynamicLocalMemorySize) const {
return impl->ext_oneapi_get_info<Param>(Queue, WorkGroupSize,
DynamicLocalMemorySize);
}

template <typename Param>
typename detail::is_kernel_queue_specific_info_desc<Param>::return_type
kernel::ext_oneapi_get_info(queue Queue, const range<3> &WorkGroupSize,
size_t DynamicLocalMemorySize) const {
return impl->ext_oneapi_get_info<Param>(Queue, WorkGroupSize,
DynamicLocalMemorySize);
}

#define __SYCL_PARAM_TRAITS_SPEC(Namespace, DescType, Desc, ReturnT) \
template __SYCL_EXPORT ReturnT \
kernel::ext_oneapi_get_info<Namespace::info::DescType::Desc>( \
queue, const range<1> &, size_t) const; \
template __SYCL_EXPORT ReturnT \
kernel::ext_oneapi_get_info<Namespace::info::DescType::Desc>( \
queue, const range<2> &, size_t) const; \
template __SYCL_EXPORT ReturnT \
kernel::ext_oneapi_get_info<Namespace::info::DescType::Desc>( \
queue, const range<3> &, size_t) const;
// Not including "ext_oneapi_kernel_queue_specific_traits.def" because not all
// kernel_queue_specific queries require the above-defined get_info interface.
// clang-format off
__SYCL_PARAM_TRAITS_SPEC(ext::oneapi::experimental, kernel_queue_specific, max_num_work_group_sync, size_t)
__SYCL_PARAM_TRAITS_SPEC(ext::oneapi::experimental, kernel_queue_specific, max_num_work_groups, size_t)
// clang-format on
#undef __SYCL_PARAM_TRAITS_SPEC
Expand Down
12 changes: 4 additions & 8 deletions sycl/test-e2e/Basic/launch_queries/max_num_work_groups.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -161,14 +161,10 @@ int test_max_num_work_groups(sycl::queue &q, const sycl::device &dev) {
// It cannot be possible to launch a kernel successfully with a configuration
// that exceeds the available resources as in the above defined workGroupSize.
// workGroupSize is larger than maxWorkGroupSize, hence maxWGs must equal 0.
// Note: Level-Zero currently always returns a non-zero value.
// TODO: Remove the backend condition once the Level-Zero API issue is fixed.
if (dev.get_backend() != sycl::backend::ext_oneapi_level_zero) {
assert(maxWGs == 0 &&
"max_num_work_groups query failed.\n"
"It should return 0 possible groups when the requested resources "
"by the lanuch config exceed those available in the hardware.");
}
assert(maxWGs == 0 &&
"max_num_work_groups query failed.\n"
"It should return 0 possible groups when the requested resources "
"by the lanuch config exceed those available in the hardware.");

// As we ensured that the 'max_num_work_groups' query correctly
// returns 0 possible work-groups, test that the kernel launch will fail.
Expand Down
35 changes: 22 additions & 13 deletions sycl/test-e2e/GroupAlgorithm/root_group.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -27,13 +27,18 @@ void testQueriesAndProperties() {
const auto bundle =
sycl::get_kernel_bundle<sycl::bundle_state::executable>(q.get_context());
const auto kernel = bundle.get_kernel<class QueryKernel>();
const auto maxWGs = kernel.ext_oneapi_get_info<
sycl::ext::oneapi::experimental::info::kernel_queue_specific::
max_num_work_group_sync>(q);
const auto wgRange = sycl::range{WorkGroupSize, 1, 1};
const auto maxWGsWithLimits = kernel.ext_oneapi_get_info<
sycl::ext::oneapi::experimental::info::kernel_queue_specific::
max_num_work_group_sync>(q, wgRange, wgRange.size() * sizeof(int));
const auto local_range = sycl::range<1>(1);
const auto maxWGs =
kernel
.ext_oneapi_get_info<sycl::ext::oneapi::experimental::info::
kernel_queue_specific::max_num_work_groups>(
q, local_range, 0);
const auto wgRange = sycl::range<3>{WorkGroupSize, 1, 1};
const auto maxWGsWithLimits =
kernel
.ext_oneapi_get_info<sycl::ext::oneapi::experimental::info::
kernel_queue_specific::max_num_work_groups>(
q, wgRange, wgRange.size() * sizeof(int));
const auto props = sycl::ext::oneapi::experimental::properties{
sycl::ext::oneapi::experimental::use_root_sync};
q.single_task<class QueryKernel>(props, []() {});
Expand All @@ -52,9 +57,11 @@ void testRootGroup() {
const auto bundle =
sycl::get_kernel_bundle<sycl::bundle_state::executable>(q.get_context());
const auto kernel = bundle.get_kernel<class RootGroupKernel>();
const auto maxWGs = kernel.ext_oneapi_get_info<
sycl::ext::oneapi::experimental::info::kernel_queue_specific::
max_num_work_group_sync>(q);
const auto maxWGs =
kernel
.ext_oneapi_get_info<sycl::ext::oneapi::experimental::info::
kernel_queue_specific::max_num_work_groups>(
q, WorkGroupSize, 0);
const auto props = sycl::ext::oneapi::experimental::properties{
sycl::ext::oneapi::experimental::use_root_sync};
sycl::buffer<int> dataBuf{sycl::range{maxWGs * WorkGroupSize}};
Expand Down Expand Up @@ -94,9 +101,11 @@ void testRootGroupFunctions() {
const auto bundle =
sycl::get_kernel_bundle<sycl::bundle_state::executable>(q.get_context());
const auto kernel = bundle.get_kernel<class RootGroupFunctionsKernel>();
const auto maxWGs = kernel.ext_oneapi_get_info<
sycl::ext::oneapi::experimental::info::kernel_queue_specific::
max_num_work_group_sync>(q);
const auto maxWGs =
kernel
.ext_oneapi_get_info<sycl::ext::oneapi::experimental::info::
kernel_queue_specific::max_num_work_groups>(
q, WorkGroupSize, 0);
const auto props = sycl::ext::oneapi::experimental::properties{
sycl::ext::oneapi::experimental::use_root_sync};

Expand Down
Loading