intel · 0x12CC · Nov 11, 2024 · Nov 21, 2024 · Nov 21, 2024 · Nov 22, 2024
@@ -116,7 +116,7 @@ if(SYCL_UR_USE_FETCH_CONTENT)
       CACHE PATH "Path to external '${name}' adapter source dir" FORCE)
   endfunction()
 
-  set(UNIFIED_RUNTIME_REPO "https://github.com/oneapi-src/unified-runtime.git")
+  set(UNIFIED_RUNTIME_REPO "https://github.com/0x12CC/unified-runtime.git")
   include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/modules/UnifiedRuntimeTag.cmake)
 
   set(UMF_BUILD_EXAMPLES OFF CACHE INTERNAL "EXAMPLES")

@@ -4,4 +4,4 @@
 # Date:   Tue Nov 26 13:32:11 2024 +0000
 #     Merge pull request #2351 from nrspruit/mcl_1_1
 #     [L0] Add support for the MCL 1.1 apis thru the spec extensions
-set(UNIFIED_RUNTIME_TAG e0f22b54080411537e8a12fe42f06dbac5e7a2d2)
+set(UNIFIED_RUNTIME_TAG ca0a0e3e3b3ad933605a35850ebc11bbf5d5f367)
@@ -1,4 +1 @@
-// TODO: Revisit 'max_num_work_group_sync' and align it with the
-// 'sycl_ext_oneapi_forward_progress' extension once #7598 is merged.
-__SYCL_PARAM_TRAITS_SPEC(ext::oneapi::experimental, kernel_queue_specific, max_num_work_group_sync, size_t,)
 __SYCL_PARAM_TRAITS_SPEC(ext::oneapi::experimental, kernel_queue_specific, max_num_work_groups, size_t,)
@@ -168,6 +168,32 @@ class __SYCL_EXPORT kernel : public detail::OwnerLessBase<kernel> {
   typename detail::is_kernel_queue_specific_info_desc<Param>::return_type
   ext_oneapi_get_info(queue Queue) const;
 
+  /// Query queue/launch-specific information from a kernel using the
+  /// info::kernel_queue_specific descriptor for a specific Queue and values.
+  /// max_num_work_groups is the only valid descriptor for this function.
+  ///
+  /// \param Queue is a valid SYCL queue.
+  /// \param WorkGroupSize is the work-group size the number of work-groups is
+  /// requested for.
+  /// \return depends on information being queried.
+  template <typename Param>
+  typename detail::is_kernel_queue_specific_info_desc<Param>::return_type
+  ext_oneapi_get_info(queue Queue, const range<1> &WorkGroupSize,
+                      size_t DynamicLocalMemorySize) const;
+
+  /// Query queue/launch-specific information from a kernel using the
+  /// info::kernel_queue_specific descriptor for a specific Queue and values.
+  /// max_num_work_groups is the only valid descriptor for this function.
+  ///
+  /// \param Queue is a valid SYCL queue.
+  /// \param WorkGroupSize is the work-group size the number of work-groups is
+  /// requested for.
+  /// \return depends on information being queried.
+  template <typename Param>
+  typename detail::is_kernel_queue_specific_info_desc<Param>::return_type
+  ext_oneapi_get_info(queue Queue, const range<2> &WorkGroupSize,
+                      size_t DynamicLocalMemorySize) const;
+
   /// Query queue/launch-specific information from a kernel using the
   /// info::kernel_queue_specific descriptor for a specific Queue and values.
   /// max_num_work_groups is the only valid descriptor for this function.

@@ -106,38 +106,6 @@ void kernel_impl::checkIfValidForNumArgsInfoQuery() const {
       "interoperability function or to query a device built-in kernel");
 }
 
-bool kernel_impl::exceedsOccupancyResourceLimits(
-    const device &Device, const range<3> &WorkGroupSize,
-    size_t DynamicLocalMemorySize) const {
-  // Respect occupancy limits for WorkGroupSize and DynamicLocalMemorySize.
-  // Generally, exceeding hardware resource limits will yield in an error when
-  // the kernel is launched.
-  const size_t MaxWorkGroupSize =
-      get_info<info::kernel_device_specific::work_group_size>(Device);
-  const size_t MaxLocalMemorySizeInBytes =
-      Device.get_info<info::device::local_mem_size>();
-
-  if (WorkGroupSize.size() > MaxWorkGroupSize)
-    return true;
-
-  if (DynamicLocalMemorySize > MaxLocalMemorySizeInBytes)
-    return true;
-
-  // It will be impossible to launch a kernel for Cuda when the hardware limit
-  // for the 32-bit registers page file size is exceeded.
-  if (Device.get_backend() == backend::ext_oneapi_cuda) {
-    const uint32_t RegsPerWorkItem =
-        get_info<info::kernel_device_specific::ext_codeplay_num_regs>(Device);
-    const uint32_t MaxRegsPerWorkGroup =
-        Device.get_info<ext::codeplay::experimental::info::device::
-                            max_registers_per_work_group>();
-    if ((MaxWorkGroupSize * RegsPerWorkItem) > MaxRegsPerWorkGroup)
-      return true;
-  }
-
-  return false;
-}
-
 template <>
 typename info::platform::version::return_type
 kernel_impl::get_backend_info<info::platform::version>() const {

@@ -123,6 +123,32 @@ class kernel_impl {
   template <typename Param>
   typename Param::return_type ext_oneapi_get_info(queue Queue) const;
 
+  /// Query queue/launch-specific information from a kernel using the
+  /// info::kernel_queue_specific descriptor for a specific Queue and values.
+  /// max_num_work_groups is the only valid descriptor for this function.
+  ///
+  /// \param Queue is a valid SYCL queue.
+  /// \param WorkGroupSize is the work-group size the number of work-groups is
+  /// requested for.
+  /// \return depends on information being queried.
+  template <typename Param>
+  typename Param::return_type
+  ext_oneapi_get_info(queue Queue, const range<1> &MaxWorkGroupSize,
+                      size_t DynamicLocalMemorySize) const;
+
+  /// Query queue/launch-specific information from a kernel using the
+  /// info::kernel_queue_specific descriptor for a specific Queue and values.
+  /// max_num_work_groups is the only valid descriptor for this function.
+  ///
+  /// \param Queue is a valid SYCL queue.
+  /// \param WorkGroupSize is the work-group size the number of work-groups is
+  /// requested for.
+  /// \return depends on information being queried.
+  template <typename Param>
+  typename Param::return_type
+  ext_oneapi_get_info(queue Queue, const range<2> &MaxWorkGroupSize,
+                      size_t DynamicLocalMemorySize) const;
+
   /// Query queue/launch-specific information from a kernel using the
   /// info::kernel_queue_specific descriptor for a specific Queue and values.
   /// max_num_work_groups is the only valid descriptor for this function.
@@ -193,11 +219,49 @@ class kernel_impl {
 
   /// Check if the occupancy limits are exceeded for the given kernel launch
   /// configuration.
+  template <int Dimensions>
   bool exceedsOccupancyResourceLimits(const device &Device,
-                                      const range<3> &WorkGroupSize,
+                                      const range<Dimensions> &WorkGroupSize,
                                       size_t DynamicLocalMemorySize) const;
+  template <int Dimensions>
+  size_t queryMaxNumWorkGroups(queue Queue,
+                               const range<Dimensions> &WorkGroupSize,
+                               size_t DynamicLocalMemorySize) const;
 };
 
+template <int Dimensions>
+bool kernel_impl::exceedsOccupancyResourceLimits(
+    const device &Device, const range<Dimensions> &WorkGroupSize,
+    size_t DynamicLocalMemorySize) const {
+  // Respect occupancy limits for WorkGroupSize and DynamicLocalMemorySize.
+  // Generally, exceeding hardware resource limits will yield in an error when
+  // the kernel is launched.
+  const size_t MaxWorkGroupSize =
+      get_info<info::kernel_device_specific::work_group_size>(Device);
+  const size_t MaxLocalMemorySizeInBytes =
+      Device.get_info<info::device::local_mem_size>();
+
+  if (WorkGroupSize.size() > MaxWorkGroupSize)
+    return true;
+
+  if (DynamicLocalMemorySize > MaxLocalMemorySizeInBytes)
+    return true;
+
+  // It will be impossible to launch a kernel for Cuda when the hardware limit
+  // for the 32-bit registers page file size is exceeded.
+  if (Device.get_backend() == backend::ext_oneapi_cuda) {
+    const uint32_t RegsPerWorkItem =
+        get_info<info::kernel_device_specific::ext_codeplay_num_regs>(Device);
+    const uint32_t MaxRegsPerWorkGroup =
+        Device.get_info<ext::codeplay::experimental::info::device::
+                            max_registers_per_work_group>();
+    if ((MaxWorkGroupSize * RegsPerWorkItem) > MaxRegsPerWorkGroup)
+      return true;
+  }
+
+  return false;
+}
+
 template <typename Param>
 inline typename Param::return_type kernel_impl::get_info() const {
   static_assert(is_kernel_info_desc<Param>::value,
@@ -244,13 +308,11 @@ kernel_impl::get_info(const device &Device,
 
 namespace syclex = ext::oneapi::experimental;
 
-template <>
-inline typename syclex::info::kernel_queue_specific::max_num_work_groups::
-    return_type
-    kernel_impl::ext_oneapi_get_info<
-        syclex::info::kernel_queue_specific::max_num_work_groups>(
-        queue Queue, const range<3> &WorkGroupSize,
-        size_t DynamicLocalMemorySize) const {
+template <int Dimensions>
+size_t
+kernel_impl::queryMaxNumWorkGroups(queue Queue,
+                                   const range<Dimensions> &WorkGroupSize,
+                                   size_t DynamicLocalMemorySize) const {
   if (WorkGroupSize.size() == 0)
     throw exception(sycl::make_error_code(errc::invalid),
                     "The launch work-group size cannot be zero.");
@@ -259,12 +321,21 @@ inline typename syclex::info::kernel_queue_specific::max_num_work_groups::
   const auto &Handle = getHandleRef();
   auto Device = Queue.get_device();
 
+  size_t WG[Dimensions];
+  WG[0] = WorkGroupSize[0];
+  if constexpr (Dimensions >= 2)
+    WG[1] = WorkGroupSize[1];
+  if constexpr (Dimensions == 3)
+    WG[2] = WorkGroupSize[2];
+
   uint32_t GroupCount{0};
   if (auto Result = Adapter->call_nocheck<
                     UrApiKind::urKernelSuggestMaxCooperativeGroupCountExp>(
-          Handle, WorkGroupSize.size(), DynamicLocalMemorySize, &GroupCount);
-      Result != UR_RESULT_ERROR_UNSUPPORTED_FEATURE) {
-    // The feature is supported. Check for other errors and throw if any.
+          Handle, Dimensions, WG, DynamicLocalMemorySize, &GroupCount);
+      Result != UR_RESULT_ERROR_UNSUPPORTED_FEATURE &&
+      Result != UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE) {
+    // The feature is supported and the group size is valid. Check for other
+    // errors and throw if any.
     Adapter->checkUrResult(Result);
     return GroupCount;
   }
@@ -278,30 +349,33 @@ inline typename syclex::info::kernel_queue_specific::max_num_work_groups::
 }
 
 template <>
-inline typename syclex::info::kernel_queue_specific::max_num_work_group_sync::
+inline typename syclex::info::kernel_queue_specific::max_num_work_groups::
     return_type
     kernel_impl::ext_oneapi_get_info<
-        syclex::info::kernel_queue_specific::max_num_work_group_sync>(
-        queue Queue, const range<3> &WorkGroupSize,
+        syclex::info::kernel_queue_specific::max_num_work_groups>(
+        queue Queue, const range<1> &WorkGroupSize,
         size_t DynamicLocalMemorySize) const {
-  return ext_oneapi_get_info<
-      syclex::info::kernel_queue_specific::max_num_work_groups>(
-      Queue, WorkGroupSize, DynamicLocalMemorySize);
+  return queryMaxNumWorkGroups(Queue, WorkGroupSize, DynamicLocalMemorySize);
 }
 
 template <>
-inline typename syclex::info::kernel_queue_specific::max_num_work_group_sync::
+inline typename syclex::info::kernel_queue_specific::max_num_work_groups::
     return_type
     kernel_impl::ext_oneapi_get_info<
-        syclex::info::kernel_queue_specific::max_num_work_group_sync>(
-        queue Queue) const {
-  auto Device = Queue.get_device();
-  const auto MaxWorkGroupSize =
-      get_info<info::kernel_device_specific::work_group_size>(Device);
-  const sycl::range<3> WorkGroupSize{MaxWorkGroupSize, 1, 1};
-  return ext_oneapi_get_info<
-      syclex::info::kernel_queue_specific::max_num_work_group_sync>(
-      Queue, WorkGroupSize, /* DynamicLocalMemorySize */ 0);
+        syclex::info::kernel_queue_specific::max_num_work_groups>(
+        queue Queue, const range<2> &WorkGroupSize,
+        size_t DynamicLocalMemorySize) const {
+  return queryMaxNumWorkGroups(Queue, WorkGroupSize, DynamicLocalMemorySize);
+}
+
+template <>
+inline typename syclex::info::kernel_queue_specific::max_num_work_groups::
+    return_type
+    kernel_impl::ext_oneapi_get_info<
+        syclex::info::kernel_queue_specific::max_num_work_groups>(
+        queue Queue, const range<3> &WorkGroupSize,
+        size_t DynamicLocalMemorySize) const {
+  return queryMaxNumWorkGroups(Queue, WorkGroupSize, DynamicLocalMemorySize);
 }
 
 } // namespace detail

@@ -113,26 +113,41 @@ kernel::ext_oneapi_get_info(queue Queue) const {
 
 template <typename Param>
 typename detail::is_kernel_queue_specific_info_desc<Param>::return_type
-kernel::ext_oneapi_get_info(queue Queue, const range<3> &WorkGroupSize,
+kernel::ext_oneapi_get_info(queue Queue, const range<1> &WorkGroupSize,
                             size_t DynamicLocalMemorySize) const {
   return impl->ext_oneapi_get_info<Param>(Queue, WorkGroupSize,
                                           DynamicLocalMemorySize);
 }
 
-template __SYCL_EXPORT typename ext::oneapi::experimental::info::
-    kernel_queue_specific::max_num_work_group_sync::return_type
-    kernel::ext_oneapi_get_info<
-        ext::oneapi::experimental::info::kernel_queue_specific::
-            max_num_work_group_sync>(queue Queue) const;
+template <typename Param>
+typename detail::is_kernel_queue_specific_info_desc<Param>::return_type
+kernel::ext_oneapi_get_info(queue Queue, const range<2> &WorkGroupSize,
+                            size_t DynamicLocalMemorySize) const {
+  return impl->ext_oneapi_get_info<Param>(Queue, WorkGroupSize,
+                                          DynamicLocalMemorySize);
+}
+
+template <typename Param>
+typename detail::is_kernel_queue_specific_info_desc<Param>::return_type
+kernel::ext_oneapi_get_info(queue Queue, const range<3> &WorkGroupSize,
+                            size_t DynamicLocalMemorySize) const {
+  return impl->ext_oneapi_get_info<Param>(Queue, WorkGroupSize,
+                                          DynamicLocalMemorySize);
+}
 
 #define __SYCL_PARAM_TRAITS_SPEC(Namespace, DescType, Desc, ReturnT)           \
   template __SYCL_EXPORT ReturnT                                               \
+  kernel::ext_oneapi_get_info<Namespace::info::DescType::Desc>(                \
+      queue, const range<1> &, size_t) const;                                  \
+  template __SYCL_EXPORT ReturnT                                               \
+  kernel::ext_oneapi_get_info<Namespace::info::DescType::Desc>(                \
+      queue, const range<2> &, size_t) const;                                  \
+  template __SYCL_EXPORT ReturnT                                               \
   kernel::ext_oneapi_get_info<Namespace::info::DescType::Desc>(                \
       queue, const range<3> &, size_t) const;
 // Not including "ext_oneapi_kernel_queue_specific_traits.def" because not all
 // kernel_queue_specific queries require the above-defined get_info interface.
 // clang-format off
-__SYCL_PARAM_TRAITS_SPEC(ext::oneapi::experimental, kernel_queue_specific, max_num_work_group_sync, size_t)
 __SYCL_PARAM_TRAITS_SPEC(ext::oneapi::experimental, kernel_queue_specific, max_num_work_groups, size_t)
 // clang-format on
 #undef __SYCL_PARAM_TRAITS_SPEC

@@ -161,14 +161,10 @@ int test_max_num_work_groups(sycl::queue &q, const sycl::device &dev) {
   // It cannot be possible to launch a kernel successfully with a configuration
   // that exceeds the available resources as in the above defined workGroupSize.
   // workGroupSize is larger than maxWorkGroupSize, hence maxWGs must equal 0.
-  // Note: Level-Zero currently always returns a non-zero value.
-  // TODO: Remove the backend condition once the Level-Zero API issue is fixed.
-  if (dev.get_backend() != sycl::backend::ext_oneapi_level_zero) {
-    assert(maxWGs == 0 &&
-           "max_num_work_groups query failed.\n"
-           "It should return 0 possible groups when the requested resources "
-           "by the lanuch config exceed those available in the hardware.");
-  }
+  assert(maxWGs == 0 &&
+         "max_num_work_groups query failed.\n"
+         "It should return 0 possible groups when the requested resources "
+         "by the lanuch config exceed those available in the hardware.");
 
   // As we ensured that the 'max_num_work_groups' query correctly
   // returns 0 possible work-groups, test that the kernel launch will fail.

@@ -27,13 +27,18 @@ void testQueriesAndProperties() {
   const auto bundle =
       sycl::get_kernel_bundle<sycl::bundle_state::executable>(q.get_context());
   const auto kernel = bundle.get_kernel<class QueryKernel>();
-  const auto maxWGs = kernel.ext_oneapi_get_info<
-      sycl::ext::oneapi::experimental::info::kernel_queue_specific::
-          max_num_work_group_sync>(q);
-  const auto wgRange = sycl::range{WorkGroupSize, 1, 1};
-  const auto maxWGsWithLimits = kernel.ext_oneapi_get_info<
-      sycl::ext::oneapi::experimental::info::kernel_queue_specific::
-          max_num_work_group_sync>(q, wgRange, wgRange.size() * sizeof(int));
+  const auto local_range = sycl::range<1>(1);
+  const auto maxWGs =
+      kernel
+          .ext_oneapi_get_info<sycl::ext::oneapi::experimental::info::
+                                   kernel_queue_specific::max_num_work_groups>(
+              q, local_range, 0);
+  const auto wgRange = sycl::range<3>{WorkGroupSize, 1, 1};
+  const auto maxWGsWithLimits =
+      kernel
+          .ext_oneapi_get_info<sycl::ext::oneapi::experimental::info::
+                                   kernel_queue_specific::max_num_work_groups>(
+              q, wgRange, wgRange.size() * sizeof(int));
   const auto props = sycl::ext::oneapi::experimental::properties{
       sycl::ext::oneapi::experimental::use_root_sync};
   q.single_task<class QueryKernel>(props, []() {});
@@ -52,9 +57,11 @@ void testRootGroup() {
   const auto bundle =
       sycl::get_kernel_bundle<sycl::bundle_state::executable>(q.get_context());
   const auto kernel = bundle.get_kernel<class RootGroupKernel>();
-  const auto maxWGs = kernel.ext_oneapi_get_info<
-      sycl::ext::oneapi::experimental::info::kernel_queue_specific::
-          max_num_work_group_sync>(q);
+  const auto maxWGs =
+      kernel
+          .ext_oneapi_get_info<sycl::ext::oneapi::experimental::info::
+                                   kernel_queue_specific::max_num_work_groups>(
+              q, WorkGroupSize, 0);
   const auto props = sycl::ext::oneapi::experimental::properties{
       sycl::ext::oneapi::experimental::use_root_sync};
   sycl::buffer<int> dataBuf{sycl::range{maxWGs * WorkGroupSize}};
@@ -94,9 +101,11 @@ void testRootGroupFunctions() {
   const auto bundle =
       sycl::get_kernel_bundle<sycl::bundle_state::executable>(q.get_context());
   const auto kernel = bundle.get_kernel<class RootGroupFunctionsKernel>();
-  const auto maxWGs = kernel.ext_oneapi_get_info<
-      sycl::ext::oneapi::experimental::info::kernel_queue_specific::
-          max_num_work_group_sync>(q);
+  const auto maxWGs =
+      kernel
+          .ext_oneapi_get_info<sycl::ext::oneapi::experimental::info::
+                                   kernel_queue_specific::max_num_work_groups>(
+              q, WorkGroupSize, 0);
   const auto props = sycl::ext::oneapi::experimental::properties{
       sycl::ext::oneapi::experimental::use_root_sync};