From 436e687fc40630c1ed81242a70b316f1a001f579 Mon Sep 17 00:00:00 2001
From: Vyacheslav Klochkov <vyacheslav.n.klochkov@intel.com>
Date: Tue, 20 Feb 2024 09:27:34 -0600
Subject: [PATCH 01/30] [ESIMD][NFC][E2E] Fix 570 compilation warnings in ESIMD
 E2E tests (#12748)

Warnings fixed:
- deprecated scatter_rgba
- deprecated get_cl_code
- deprecated lsc_fence
- deprecated uchar type usage
- deprecated get_access on HOST
- deprecated get_pointer
- usage of isfinite with -ffast-math
- deprecated dpas_argument_type::s1
- deprecated gpu_selector()

Also, the memory alloc/free code in the histogram*.cpp tests was
updated to make avoiding potential memory leaks simpler.

Signed-off-by: Klochkov, Vyacheslav N <vyacheslav.n.klochkov@intel.com>
---
 sycl/test-e2e/ESIMD/accessor_global.cpp       |  10 +-
 sycl/test-e2e/ESIMD/accessor_local.cpp        |   8 +-
 .../ESIMD/api/simd_view_copy_move_assign.cpp  |   2 +-
 sycl/test-e2e/ESIMD/dpas/dpas_common.hpp      |  10 +-
 sycl/test-e2e/ESIMD/esimd_test_utils.hpp      |   4 +
 sycl/test-e2e/ESIMD/ext_math.cpp              |   5 +-
 sycl/test-e2e/ESIMD/grf.cpp                   |  16 +--
 sycl/test-e2e/ESIMD/histogram.cpp             |  31 ++---
 sycl/test-e2e/ESIMD/histogram_256_slm.cpp     |  28 ++--
 .../ESIMD/histogram_256_slm_spec_2020.cpp     |  39 +++---
 sycl/test-e2e/ESIMD/histogram_2d.cpp          |  36 ++---
 sycl/test-e2e/ESIMD/histogram_raw_send.cpp    |  23 ++--
 sycl/test-e2e/ESIMD/lsc/Inputs/lsc_surf.hpp   |  15 +--
 .../ESIMD/lsc/lsc_argument_type_deduction.cpp |   3 +-
 sycl/test-e2e/ESIMD/lsc/lsc_predicate.cpp     |   6 +-
 .../ESIMD/lsc/lsc_predicate_stateless.cpp     |   6 +-
 sycl/test-e2e/ESIMD/lsc/lsc_slm.cpp           |  15 +--
 .../ESIMD/lsc/lsc_slm_atomic_smoke.cpp        |   8 ++
 .../ESIMD/named_barriers/exec_in_order.cpp    |  12 +-
 .../named_barriers/exec_in_order_branched.cpp |  26 ++--
 sycl/test-e2e/ESIMD/named_barriers/loop.cpp   |  14 +-
 .../ESIMD/named_barriers/loop_extended.cpp    | 127 +++++++++---------
 .../ESIMD/named_barriers/multiple_wg.cpp      |  10 +-
 .../ESIMD/named_barriers/single_wg.cpp        |  10 +-
 .../ESIMD/noinline_call_from_func.cpp         |   6 +-
 sycl/test-e2e/ESIMD/radix_sort.cpp            |   4 +-
 .../regression/Inputs/complex-lib-sycl.cpp    |   3 +-
 .../ESIMD/regression/Inputs/dgetrf.hpp        |   5 +-
 .../regression/big_const_initializer.cpp      |   5 +-
 sycl/test-e2e/ESIMD/regression/dgetrf_8x8.cpp |   5 +-
 30 files changed, 219 insertions(+), 273 deletions(-)

diff --git a/sycl/test-e2e/ESIMD/accessor_global.cpp b/sycl/test-e2e/ESIMD/accessor_global.cpp
index 238edd08a6a49..83f525dab94d2 100644
--- a/sycl/test-e2e/ESIMD/accessor_global.cpp
+++ b/sycl/test-e2e/ESIMD/accessor_global.cpp
@@ -2,7 +2,7 @@
 // RUN: %{build} -fsycl-esimd-force-stateless-mem -o %t.out
 // RUN: %{run} %t.out
 
-// This test verifies usage of accessor methods operator[] and get_pointer().
+// This test verifies usage of accessor methods operator[] and get_multi_ptr().
 
 #include "esimd_test_utils.hpp"
 
@@ -39,7 +39,8 @@ bool test(queue Q, uint32_t LocalRange, uint32_t GlobalRange) {
            for (int I = 0; I < VL; I++)
              TmpAcc[GID * VL + I] = GID * 100 + I;
          } else {
-           T *Ptr = TmpAcc.get_pointer();
+           T *Ptr =
+               TmpAcc.template get_multi_ptr<access::decorated::yes>().get();
            simd<int, VL> IntValues(GID * 100, 1);
            simd<T, VL> Values = IntValues;
            block_store(Ptr + GID * VL, Values);
@@ -53,12 +54,13 @@ bool test(queue Q, uint32_t LocalRange, uint32_t GlobalRange) {
                for (int I = 0; I < VL; I++)
                  Out[(GID + LID) * VL + I] = TmpAcc[(GID + LID) * VL + I];
              } else {
-               T *Ptr = TmpAcc.get_pointer();
+               T *Ptr = TmpAcc.template get_multi_ptr<access::decorated::yes>()
+                            .get();
                simd<T, VL> Values = block_load<T, VL>(Ptr + (GID + LID) * VL);
                Values.template copy_to(Out + (GID + LID) * VL);
              }
            } // end for (int LID = 0; LID < LocalRange; LID++)
-         }   // end if (LID == 0)
+         } // end if (LID == 0)
        });
      }).wait();
   } catch (sycl::exception const &e) {
diff --git a/sycl/test-e2e/ESIMD/accessor_local.cpp b/sycl/test-e2e/ESIMD/accessor_local.cpp
index 8e65fe5da055b..1c5b469589e9b 100644
--- a/sycl/test-e2e/ESIMD/accessor_local.cpp
+++ b/sycl/test-e2e/ESIMD/accessor_local.cpp
@@ -43,8 +43,10 @@ bool test(queue Q, uint32_t LocalRange, uint32_t GlobalRange) {
        CGH.parallel_for(NDRange, [=](nd_item<1> Item) SYCL_ESIMD_KERNEL {
          uint32_t GID = Item.get_global_id(0);
          uint32_t LID = Item.get_local_id(0);
-         uint32_t LocalAccOffset = static_cast<uint32_t>(
-             reinterpret_cast<std::uintptr_t>(LocalAcc.get_pointer().get()));
+         uint32_t LocalAccOffset =
+             static_cast<uint32_t>(reinterpret_cast<std::uintptr_t>(
+                 LocalAcc.template get_multi_ptr<access::decorated::yes>()
+                     .get()));
          if constexpr (TestSubscript) {
            for (int I = 0; I < VL; I++)
              LocalAcc[LID * VL + I] = GID * 100 + I;
@@ -67,7 +69,7 @@ bool test(queue Q, uint32_t LocalRange, uint32_t GlobalRange) {
                ValuesFromSLM.copy_to(Out + (GID + LID) * VL);
              }
            } // end for (int LID = 0; LID < LocalRange; LID++)
-         }   // end if (LID == 0)
+         } // end if (LID == 0)
        });
      }).wait();
   } catch (sycl::exception const &e) {
diff --git a/sycl/test-e2e/ESIMD/api/simd_view_copy_move_assign.cpp b/sycl/test-e2e/ESIMD/api/simd_view_copy_move_assign.cpp
index c02cca27430a3..8e8e62138c1e1 100644
--- a/sycl/test-e2e/ESIMD/api/simd_view_copy_move_assign.cpp
+++ b/sycl/test-e2e/ESIMD/api/simd_view_copy_move_assign.cpp
@@ -24,7 +24,7 @@ using namespace sycl::ext::intel::esimd;
 template <unsigned VL, class T, class F>
 bool test(queue q, std::string str, F funcUnderTest) {
   std::cout << "Testing " << str << ", VL = " << VL << " ...\n";
-  size_t Size = 4 * VL;
+  constexpr size_t Size = 4 * VL;
   T A[Size];
   T B[Size];
   constexpr unsigned HalfVL = VL > 1 ? (VL / 2) : 1;
diff --git a/sycl/test-e2e/ESIMD/dpas/dpas_common.hpp b/sycl/test-e2e/ESIMD/dpas/dpas_common.hpp
index 4c9e528646545..56d0f282d2920 100644
--- a/sycl/test-e2e/ESIMD/dpas/dpas_common.hpp
+++ b/sycl/test-e2e/ESIMD/dpas/dpas_common.hpp
@@ -51,9 +51,7 @@ std::string toString(dpas_argument_type T) {
     return "bf16";
   case dpas_argument_type::tf32:
     return "tf32";
-  case dpas_argument_type::s1:
-  case dpas_argument_type::u1:
-  case dpas_argument_type::Invalid:
+  default:
     return "UNSUPPORTED";
   }
   return "UNRECOGNIZED";
@@ -127,9 +125,7 @@ template <dpas_argument_type T> constexpr int getBitSize() {
   case dpas_argument_type::tf32:
     return 32;
 
-  case dpas_argument_type::Invalid:
-  case dpas_argument_type::s1:
-  case dpas_argument_type::u1:
+  default:
     break;
   }
   return 0;
@@ -405,7 +401,7 @@ bool test(queue &Q, bool Print) {
                   << ") != expected (" << GoldRes << ")" << std::endl;
       }
     } // end for JJ
-  }   // end for II
+  } // end for II
 
   free(Res, Q);
   free(APacked, Q);
diff --git a/sycl/test-e2e/ESIMD/esimd_test_utils.hpp b/sycl/test-e2e/ESIMD/esimd_test_utils.hpp
index 941005fde942d..36742bc83bc66 100644
--- a/sycl/test-e2e/ESIMD/esimd_test_utils.hpp
+++ b/sycl/test-e2e/ESIMD/esimd_test_utils.hpp
@@ -27,6 +27,10 @@ using namespace sycl;
 
 namespace esimd_test {
 
+template <typename T>
+using shared_allocator = sycl::usm_allocator<T, sycl::usm::alloc::shared>;
+template <typename T> using shared_vector = std::vector<T, shared_allocator<T>>;
+
 // This is the function provided to SYCL runtime by the application to decide
 // on which device to run, or whether to run at all.
 // When selecting a device, SYCL runtime first takes (1) a selector provided by
diff --git a/sycl/test-e2e/ESIMD/ext_math.cpp b/sycl/test-e2e/ESIMD/ext_math.cpp
index a5dd98de8c6aa..f71ffeafb9762 100644
--- a/sycl/test-e2e/ESIMD/ext_math.cpp
+++ b/sycl/test-e2e/ESIMD/ext_math.cpp
@@ -388,7 +388,10 @@ bool test(queue &Q, const std::string &Name, InitF Init = InitNarrow<T>{},
     if constexpr (sizeof(T) <= 2)
       delta = delta + delta;
 
-    bool BothFinite = std::isfinite(Test) && std::isfinite(Gold);
+    bool BothFinite = true;
+#ifndef TEST_FAST_MATH
+    BothFinite = std::isfinite(Test) && std::isfinite(Gold);
+#endif
     if (BothFinite && std::abs(Test - Gold) > delta) {
       if (++ErrCnt < 10) {
         std::cout << "    failed at index " << I << ", " << Test
diff --git a/sycl/test-e2e/ESIMD/grf.cpp b/sycl/test-e2e/ESIMD/grf.cpp
index 976fd2de1eace..1aa4ed6d600cf 100644
--- a/sycl/test-e2e/ESIMD/grf.cpp
+++ b/sycl/test-e2e/ESIMD/grf.cpp
@@ -71,13 +71,10 @@ int main(void) {
     A[i] = i;
   }
 
+  queue q(esimd_test::ESIMDSelector, esimd_test::createExceptionHandler());
+  esimd_test::printTestLabel(q);
   try {
     buffer<float, 1> bufa(A.data(), range<1>(Size));
-    queue q(gpu_selector{}, esimd_test::createExceptionHandler());
-
-    auto dev = q.get_device();
-    std::cout << "Running on " << dev.get_info<info::device::name>() << "\n";
-
     auto e = q.submit([&](handler &cgh) {
       auto PA = bufa.get_access<access::mode::read_write>(cgh);
       cgh.parallel_for<class SyclKernel>(Size,
@@ -98,11 +95,6 @@ int main(void) {
 
   try {
     buffer<float, 1> bufa(A.data(), range<1>(Size));
-    queue q(esimd_test::ESIMDSelector, esimd_test::createExceptionHandler());
-
-    auto dev = q.get_device();
-    std::cout << "Running on " << dev.get_info<info::device::name>() << "\n";
-
     auto e = q.submit([&](handler &cgh) {
       auto PA = bufa.get_access<access::mode::read_write>(cgh);
       cgh.parallel_for<class EsimdKernel>(Size, [=](id<1> i) SYCL_ESIMD_KERNEL {
@@ -128,7 +120,6 @@ int main(void) {
 
   try {
     buffer<float, 1> bufa(A.data(), range<1>(Size));
-    queue q(esimd_test::ESIMDSelector, esimd_test::createExceptionHandler());
 #ifdef USE_AUTO
     sycl::ext::oneapi::experimental::properties prop{grf_size_automatic};
 #elif defined(USE_NEW_API)
@@ -137,9 +128,6 @@ int main(void) {
     sycl::ext::oneapi::experimental::properties prop{
         register_alloc_mode<register_alloc_mode_enum::large>};
 #endif
-    auto dev = q.get_device();
-    std::cout << "Running on " << dev.get_info<info::device::name>() << "\n";
-
     auto e = q.submit([&](handler &cgh) {
       auto PA = bufa.get_access<access::mode::read_write>(cgh);
       cgh.parallel_for<class EsimdKernelSpecifiedGRF>(
diff --git a/sycl/test-e2e/ESIMD/histogram.cpp b/sycl/test-e2e/ESIMD/histogram.cpp
index 19949c22fb4d1..555ea8d3c73f9 100644
--- a/sycl/test-e2e/ESIMD/histogram.cpp
+++ b/sycl/test-e2e/ESIMD/histogram.cpp
@@ -85,15 +85,15 @@ int main(int argc, char *argv[]) {
 
   // Allocate Input Buffer
   queue q = esimd_test::createQueue();
+  esimd_test::printTestLabel(q);
 
-  auto dev = q.get_device();
-  unsigned char *srcY = malloc_shared<unsigned char>(width * height, q);
-  if (srcY == NULL) {
-    std::cerr << "Out of memory\n";
-    exit(1);
-  }
-  unsigned int *bins = malloc_shared<unsigned int>(NUM_BINS, q);
-  std::cout << "Running on " << dev.get_info<info::device::name>() << "\n";
+  esimd_test::shared_vector<uint8_t> srcY_vec(
+      width * height, esimd_test::shared_allocator<uint8_t>{q});
+  esimd_test::shared_vector<unsigned int> bins_vec(
+      NUM_BINS, esimd_test::shared_allocator<unsigned int>{q});
+  uint8_t *srcY = srcY_vec.data();
+  ;
+  unsigned int *bins = bins_vec.data();
 
   uint range_width = width / BLOCK_WIDTH;
   uint range_height = height / BLOCK_HEIGHT;
@@ -106,16 +106,12 @@ int main(int argc, char *argv[]) {
     FILE *f = fopen(input_file, "rb");
     if (f == NULL) {
       std::cerr << "Error opening file " << input_file;
-      free(srcY, q);
-      free(bins, q);
       std::exit(1);
     }
 
     unsigned int cnt = fread(srcY, sizeof(unsigned char), input_size, f);
     if (cnt != input_size) {
       std::cerr << "Error reading input from " << input_file;
-      free(srcY, q);
-      free(bins, q);
       std::exit(1);
     }
   } else {
@@ -171,9 +167,9 @@ int main(int argc, char *argv[]) {
               uint h_pos = (tid % range_width) * BLOCK_WIDTH;
               uint v_pos = (tid / range_width) * BLOCK_HEIGHT;
 
-              // Declare a 8x32 uchar matrix to store the input block pixel
+              // Declare a 8x32 uint8_t matrix to store the input block pixel
               // value
-              simd<unsigned char, 8 * 32> in;
+              simd<uint8_t, 8 * 32> in;
 
               // Declare a vector to store the local histogram
               simd<unsigned int, NUM_BINS> histogram(0);
@@ -181,8 +177,7 @@ int main(int argc, char *argv[]) {
               // Each thread handles BLOCK_HEIGHTxBLOCK_WIDTH pixel block
               for (int y = 0; y < BLOCK_HEIGHT / 8; y++) {
                 // Perform 2D media block read to load 8x32 pixel block
-                in = media_block_load<unsigned char, 8, 32>(readAcc, h_pos,
-                                                            v_pos);
+                in = media_block_load<uint8_t, 8, 32>(readAcc, h_pos, v_pos);
 
             // Accumulate local histogram for each pixel value
 #pragma unroll
@@ -236,8 +231,6 @@ int main(int argc, char *argv[]) {
     // make sure data is given back to the host at the end of this scope
   } catch (sycl::exception const &e) {
     std::cerr << "SYCL exception caught: " << e.what() << '\n';
-    free(srcY, q);
-    free(bins, q);
     return 1;
   }
 
@@ -251,8 +244,6 @@ int main(int argc, char *argv[]) {
   writeHist(cpuHistogram);
   // Checking Histogram
   bool Success = checkHistogram(cpuHistogram, bins);
-  free(srcY, q);
-  free(bins, q);
 
   if (!Success) {
     std::cerr << "FAILED\n";
diff --git a/sycl/test-e2e/ESIMD/histogram_256_slm.cpp b/sycl/test-e2e/ESIMD/histogram_256_slm.cpp
index 8ea5bd16810df..9c2a7d7a0237a 100644
--- a/sycl/test-e2e/ESIMD/histogram_256_slm.cpp
+++ b/sycl/test-e2e/ESIMD/histogram_256_slm.cpp
@@ -45,7 +45,7 @@ ESIMD_INLINE void histogram_atomic(const uint32_t *input_ptr, uint32_t *output,
     auto start_addr = ((unsigned int *)input_ptr) + start_off;
     simd<uint, 32> data;
     data.copy_from(start_addr);
-    auto in = data.bit_cast_view<uchar>();
+    auto in = data.bit_cast_view<uint8_t>();
 
 #pragma unroll
     for (int j = 0; j < BLOCK_WIDTH * sizeof(int); j += 16) {
@@ -75,7 +75,7 @@ void HistogramCPU(unsigned int size, unsigned int *src,
                   unsigned int *cpu_histogram) {
   for (int i = 0; i < size; i++) {
     unsigned int x = src[i];
-    cpu_histogram[(x)&0xFFU] += 1;
+    cpu_histogram[(x) & 0xFFU] += 1;
     cpu_histogram[(x >> 8) & 0xFFU] += 1;
     cpu_histogram[(x >> 16) & 0xFFU] += 1;
     cpu_histogram[(x >> 24) & 0xFFU] += 1;
@@ -104,6 +104,7 @@ int CheckHistogram(unsigned int *cpu_histogram, unsigned int *gpu_histogram) {
 
 int main() {
   queue q = esimd_test::createQueue();
+  esimd_test::printTestLabel(q);
 
   const char *input_file = nullptr;
   unsigned int width = 1024;
@@ -111,7 +112,10 @@ int main() {
 
   // Initializes input.
   unsigned int input_size = width * height;
-  unsigned int *input_ptr = malloc_shared<unsigned int>(input_size, q);
+
+  esimd_test::shared_vector<unsigned int> input_vec(
+      input_size, esimd_test::shared_allocator<unsigned int>{q});
+  unsigned int *input_ptr = input_vec.data();
   printf("Processing %dx%d inputs\n", width, height);
 
   srand(2009);
@@ -124,13 +128,8 @@ int main() {
 
   // Allocates system memory for output buffer.
   int buffer_size = sizeof(unsigned int) * NUM_BINS;
-  unsigned int *hist = new unsigned int[buffer_size];
-  if (hist == nullptr) {
-    free(input_ptr, q);
-    std::cerr << "Out of memory\n";
-    exit(1);
-  }
-  memset(hist, 0, buffer_size);
+  std::vector<unsigned int> hist_vec(buffer_size, 0);
+  unsigned int *hist = hist_vec.data();
 
   // Uses the CPU to calculate the histogram output data.
   unsigned int cpu_histogram[NUM_BINS];
@@ -141,7 +140,9 @@ int main() {
   std::cout << "finish cpu_histogram\n";
 
   // Uses the GPU to calculate the histogram output data.
-  unsigned int *output_surface = malloc_shared<unsigned int>(NUM_BINS, q);
+  esimd_test::shared_vector<unsigned int> output_vec(
+      NUM_BINS, esimd_test::shared_allocator<unsigned int>{q});
+  unsigned int *output_surface = output_vec.data();
 
   unsigned int num_threads;
   num_threads = width * height / (NUM_BLOCKS * BLOCK_WIDTH);
@@ -194,9 +195,6 @@ int main() {
 
   memcpy(hist, output_surface, 4 * NUM_BINS);
 
-  free(output_surface, q);
-  free(input_ptr, q);
-
   // Compares the CPU histogram output data with the
   // GPU histogram output data.
   // If there is no difference, the result is correct.
@@ -207,7 +205,5 @@ int main() {
   else
     std::cout << "FAILED\n";
 
-  delete[] hist;
-
   return res ? 0 : -1;
 }
diff --git a/sycl/test-e2e/ESIMD/histogram_256_slm_spec_2020.cpp b/sycl/test-e2e/ESIMD/histogram_256_slm_spec_2020.cpp
index 5a9b6ea519b69..4252adca8ab3b 100644
--- a/sycl/test-e2e/ESIMD/histogram_256_slm_spec_2020.cpp
+++ b/sycl/test-e2e/ESIMD/histogram_256_slm_spec_2020.cpp
@@ -41,7 +41,7 @@ ESIMD_INLINE void histogram_atomic(const uint32_t *input_ptr, uint32_t *output,
   for (int y = 0; y < num_blocks; y++) {
     auto start_addr = ((unsigned int *)input_ptr) + start_off;
     auto data = block_load<uint, 32>(start_addr);
-    auto in = data.bit_cast_view<uchar>();
+    auto in = data.bit_cast_view<uint8_t>();
 
 #pragma unroll
     for (int j = 0; j < BLOCK_WIDTH * sizeof(int); j += 16) {
@@ -71,7 +71,7 @@ void HistogramCPU(unsigned int size, unsigned int *src,
                   unsigned int *cpu_histogram) {
   for (int i = 0; i < size; i++) {
     unsigned int x = src[i];
-    cpu_histogram[(x)&0xFFU] += 1;
+    cpu_histogram[(x) & 0xFFU] += 1;
     cpu_histogram[(x >> 8) & 0xFFU] += 1;
     cpu_histogram[(x >> 16) & 0xFFU] += 1;
     cpu_histogram[(x >> 24) & 0xFFU] += 1;
@@ -103,8 +103,7 @@ class histogram_slm;
 
 int main(int argc, char **argv) {
   queue q = esimd_test::createQueue();
-  auto dev = q.get_device();
-  auto ctxt = q.get_context();
+  esimd_test::printTestLabel(q);
 
   const char *input_file = nullptr;
   unsigned int width = 1024 * sizeof(unsigned int);
@@ -112,8 +111,10 @@ int main(int argc, char **argv) {
 
   // Initializes input.
   unsigned int input_size = width * height;
-  unsigned int *input_ptr =
-      (unsigned int *)malloc_shared(input_size, dev, ctxt);
+  esimd_test::shared_vector<unsigned int> input_vec(
+      input_size, esimd_test::shared_allocator<unsigned int>{q});
+  unsigned int *input_ptr = input_vec.data();
+
   printf("Processing %dx%d inputs\n", (int)(width / sizeof(unsigned int)),
          height);
 
@@ -128,12 +129,8 @@ int main(int argc, char **argv) {
 
   // Allocates system memory for output buffer.
   int buffer_size = sizeof(unsigned int) * NUM_BINS;
-  unsigned int *hist = new unsigned int[buffer_size];
-  if (hist == nullptr) {
-    std::cerr << "Out of memory\n";
-    exit(1);
-  }
-  memset(hist, 0, buffer_size);
+  std::vector<unsigned int> hist_vec(buffer_size, 0);
+  unsigned int *hist = hist_vec.data();
 
   // Uses the CPU to calculate the histogram output data.
   unsigned int cpu_histogram[NUM_BINS];
@@ -144,9 +141,9 @@ int main(int argc, char **argv) {
   std::cout << "finish cpu_histogram\n";
 
   // Uses the GPU to calculate the histogram output data.
-  unsigned int *output_surface =
-      (uint32_t *)malloc_shared(4 * NUM_BINS, dev, ctxt);
-  memset(output_surface, 0, 4 * NUM_BINS);
+  esimd_test::shared_vector<unsigned int> output_vec(
+      NUM_BINS, esimd_test::shared_allocator<unsigned int>{q});
+  unsigned int *output_surface = output_vec.data();
 
   unsigned int num_blocks{NUM_BLOCKS};
   if (argc == 2) {
@@ -188,11 +185,11 @@ int main(int argc, char **argv) {
       e.wait();
       if (profiling) {
         etime = esimd_test::report_time("kernel time", e, e);
-      if (iter > 0)
-        kernel_times += etime;
+        if (iter > 0)
+          kernel_times += etime;
       }
       if (iter == 0)
-      start = timer.Elapsed();
+        start = timer.Elapsed();
     }
   } catch (sycl::exception const &e) {
     std::cout << "SYCL exception caught: " << e.what() << '\n';
@@ -209,10 +206,6 @@ int main(int argc, char **argv) {
 
   memcpy(hist, output_surface, 4 * NUM_BINS);
 
-  free(output_surface, ctxt);
-
-  free(input_ptr, ctxt);
-
   // Compares the CPU histogram output data with the
   // GPU histogram output data.
   // If there is no difference, the result is correct.
@@ -223,7 +216,5 @@ int main(int argc, char **argv) {
   else
     std::cout << "FAILED\n";
 
-  delete[] hist;
-
   return res ? 0 : -1;
 }
diff --git a/sycl/test-e2e/ESIMD/histogram_2d.cpp b/sycl/test-e2e/ESIMD/histogram_2d.cpp
index edca385e147aa..73001218f19a0 100644
--- a/sycl/test-e2e/ESIMD/histogram_2d.cpp
+++ b/sycl/test-e2e/ESIMD/histogram_2d.cpp
@@ -34,7 +34,7 @@ typedef uint64_t Toffset;
 typedef uint32_t Toffset;
 #endif
 
-void histogram_CPU(unsigned int width, unsigned int height, unsigned char *srcY,
+void histogram_CPU(unsigned int width, unsigned int height, uint8_t *srcY,
                    unsigned int *cpuHistogram) {
   int i;
   for (i = 0; i < width * height; i++) {
@@ -85,16 +85,15 @@ int main(int argc, char *argv[]) {
 
   // Allocate Input Buffer
   queue q = esimd_test::createQueue();
+  esimd_test::printTestLabel(q);
 
-  auto dev = q.get_device();
-  unsigned char *srcY = malloc_shared<unsigned char>(width * height, q);
-  if (srcY == NULL) {
-    std::cerr << "Out of memory\n";
-    exit(1);
-  }
-
-  unsigned int *bins = malloc_shared<unsigned int>(NUM_BINS, q);
-  std::cout << "Running on " << dev.get_info<info::device::name>() << "\n";
+  esimd_test::shared_vector<uint8_t> srcY_vec(
+      width * height, esimd_test::shared_allocator<uint8_t>{q});
+  esimd_test::shared_vector<unsigned int> bins_vec(
+      NUM_BINS, esimd_test::shared_allocator<unsigned int>{q});
+  uint8_t *srcY = srcY_vec.data();
+  ;
+  unsigned int *bins = bins_vec.data();
 
   uint range_width = width / BLOCK_WIDTH;
   uint range_height = height / BLOCK_HEIGHT;
@@ -107,16 +106,12 @@ int main(int argc, char *argv[]) {
     FILE *f = fopen(input_file, "rb");
     if (f == NULL) {
       std::cerr << "Error opening file " << input_file;
-      free(srcY, q);
-      free(bins, q);
       std::exit(1);
     }
 
-    unsigned int cnt = fread(srcY, sizeof(unsigned char), input_size, f);
+    unsigned int cnt = fread(srcY, sizeof(uint8_t), input_size, f);
     if (cnt != input_size) {
       std::cerr << "Error reading input from " << input_file;
-      free(srcY, q);
-      free(bins, q);
       std::exit(1);
     }
   } else {
@@ -173,9 +168,9 @@ int main(int argc, char *argv[]) {
               uint h_pos = ndi.get_group(0) * BLOCK_WIDTH;
               uint v_pos = ndi.get_group(1) * BLOCK_HEIGHT;
 
-              // Declare a 8x32 uchar matrix to store the input block pixel
+              // Declare a 8x32 uint8_t matrix to store the input block pixel
               // value
-              simd<unsigned char, 8 * 32> in;
+              simd<uint8_t, 8 * 32> in;
 
               // Declare a vector to store the local histogram
               simd<unsigned int, NUM_BINS> histogram(0);
@@ -183,8 +178,7 @@ int main(int argc, char *argv[]) {
               // Each thread handles BLOCK_HEIGHTxBLOCK_WIDTH pixel block
               for (int y = 0; y < BLOCK_HEIGHT / 8; y++) {
                 // Perform 2D media block read to load 8x32 pixel block
-                in = media_block_load<unsigned char, 8, 32>(readAcc, h_pos,
-                                                            v_pos);
+                in = media_block_load<uint8_t, 8, 32>(readAcc, h_pos, v_pos);
 
             // Accumulate local histogram for each pixel value
 #pragma unroll
@@ -239,8 +233,6 @@ int main(int argc, char *argv[]) {
     // make sure data is given back to the host at the end of this scope
   } catch (sycl::exception const &e) {
     std::cout << "SYCL exception caught: " << e.what() << '\n';
-    free(srcY, q);
-    free(bins, q);
     return 1;
   }
 
@@ -254,8 +246,6 @@ int main(int argc, char *argv[]) {
   writeHist(cpuHistogram);
   // Checking Histogram
   int result = checkHistogram(cpuHistogram, bins);
-  free(srcY, q);
-  free(bins, q);
   if (result) {
     std::cerr << "PASSED\n";
     return 0;
diff --git a/sycl/test-e2e/ESIMD/histogram_raw_send.cpp b/sycl/test-e2e/ESIMD/histogram_raw_send.cpp
index 5300bded17d32..3f574225b0973 100644
--- a/sycl/test-e2e/ESIMD/histogram_raw_send.cpp
+++ b/sycl/test-e2e/ESIMD/histogram_raw_send.cpp
@@ -36,7 +36,7 @@ using namespace sycl;
 #define BLOCK_WIDTH 32
 #define BLOCK_HEIGHT 64
 
-void histogram_CPU(unsigned int width, unsigned int height, unsigned char *srcY,
+void histogram_CPU(unsigned int width, unsigned int height, uint8_t *srcY,
                    unsigned int *cpuHistogram) {
   int i;
   for (i = 0; i < width * height; i++) {
@@ -124,24 +124,19 @@ int main(int argc, char *argv[]) {
 
   // Allocate Input Buffer
   queue q = esimd_test::createQueue();
+  esimd_test::printTestLabel(q);
 
-  auto dev = q.get_device();
-  auto ctxt = q.get_context();
-  unsigned char *srcY =
-      static_cast<unsigned char *>(malloc_shared(width * height, dev, ctxt));
-  unsigned int *bins = static_cast<unsigned int *>(
-      malloc_shared(NUM_BINS * sizeof(unsigned int), dev, ctxt));
-  std::cout << "Running on " << dev.get_info<sycl::info::device::name>()
-            << "\n";
+  esimd_test::shared_vector<uint8_t> srcY_vec(
+      width * height, esimd_test::shared_allocator<uint8_t>{q});
+  esimd_test::shared_vector<unsigned int> bins_vec(
+      NUM_BINS, esimd_test::shared_allocator<unsigned int>{q});
+  uint8_t *srcY = srcY_vec.data();
+  ;
+  unsigned int *bins = bins_vec.data();
 
   uint range_width = width / BLOCK_WIDTH;
   uint range_height = height / BLOCK_HEIGHT;
 
-  if (srcY == NULL) {
-    std::cerr << "Out of memory\n";
-    exit(1);
-  }
-
   // Initializes input.
   unsigned int input_size = width * height;
   std::cerr << "Processing inputs\n";
diff --git a/sycl/test-e2e/ESIMD/lsc/Inputs/lsc_surf.hpp b/sycl/test-e2e/ESIMD/lsc/Inputs/lsc_surf.hpp
index 5a953950b5666..b4750840af837 100644
--- a/sycl/test-e2e/ESIMD/lsc/Inputs/lsc_surf.hpp
+++ b/sycl/test-e2e/ESIMD/lsc/Inputs/lsc_surf.hpp
@@ -37,13 +37,13 @@ int main() {
   std::iota(vec_2.begin(), vec_2.end(), 0);
   std::iota(vec_3.begin(), vec_3.end(), 0);
   std::iota(vec_4.begin(), vec_4.end(), 0);
-  auto buf_0 = buffer{vec_0};
-  auto buf_1 = buffer{vec_1};
-  auto buf_2 = buffer{vec_2};
-  auto buf_3 = buffer{vec_3};
-  auto buf_4 = buffer{vec_4};
 
   try {
+    auto buf_0 = buffer{vec_0};
+    auto buf_1 = buffer{vec_1};
+    auto buf_2 = buffer{vec_2};
+    auto buf_3 = buffer{vec_3};
+    auto buf_4 = buffer{vec_4};
     q.submit([&](handler &h) {
       auto access_0 = buf_0.template get_access<access::mode::read_write>(h);
       auto access_1 = buf_1.template get_access<access::mode::read_write>(h);
@@ -80,11 +80,6 @@ int main() {
           });
     });
     q.wait();
-    buf_0.template get_access<access::mode::read_write>();
-    buf_1.template get_access<access::mode::read_write>();
-    buf_2.template get_access<access::mode::read_write>();
-    buf_3.template get_access<access::mode::read_write>();
-    buf_4.template get_access<access::mode::read_write>();
   } catch (sycl::exception e) {
     std::cout << "SYCL exception caught: " << e.what();
     return 1;
diff --git a/sycl/test-e2e/ESIMD/lsc/lsc_argument_type_deduction.cpp b/sycl/test-e2e/ESIMD/lsc/lsc_argument_type_deduction.cpp
index e61e016ef61fc..237f76235e009 100644
--- a/sycl/test-e2e/ESIMD/lsc/lsc_argument_type_deduction.cpp
+++ b/sycl/test-e2e/ESIMD/lsc/lsc_argument_type_deduction.cpp
@@ -28,9 +28,9 @@ template <unsigned SIMDSize> int testAccessor(queue q) {
   auto vec_0 = std::vector<int>(size);
 
   std::iota(vec_0.begin(), vec_0.end(), 0);
-  auto buf_0 = buffer{vec_0};
 
   try {
+    auto buf_0 = buffer{vec_0};
     q.submit([&](handler &h) {
       auto access_0 = buf_0.template get_access<access::mode::read_write>(h);
 
@@ -44,7 +44,6 @@ template <unsigned SIMDSize> int testAccessor(queue q) {
           });
     });
     q.wait();
-    buf_0.template get_access<access::mode::read_write>();
   } catch (sycl::exception e) {
     std::cout << "SYCL exception caught: " << e.what();
     return 1;
diff --git a/sycl/test-e2e/ESIMD/lsc/lsc_predicate.cpp b/sycl/test-e2e/ESIMD/lsc/lsc_predicate.cpp
index 4abfc98389c6c..3dd3c8d93d0e9 100644
--- a/sycl/test-e2e/ESIMD/lsc/lsc_predicate.cpp
+++ b/sycl/test-e2e/ESIMD/lsc/lsc_predicate.cpp
@@ -30,10 +30,10 @@ template <unsigned SIMDSize> int testAccessor(queue q) {
 
   std::iota(vec_0.begin(), vec_0.end(), 0);
   std::iota(vec_2.begin(), vec_2.end(), 0);
-  auto buf_0 = buffer{vec_0};
-  auto buf_2 = buffer{vec_2};
 
   try {
+    auto buf_0 = buffer{vec_0};
+    auto buf_2 = buffer{vec_2};
     q.submit([&](handler &h) {
       auto access_0 = buf_0.template get_access<access::mode::read_write>(h);
       auto access_2 = buf_2.template get_access<access::mode::read_write>(h);
@@ -58,8 +58,6 @@ template <unsigned SIMDSize> int testAccessor(queue q) {
           });
     });
     q.wait();
-    buf_0.template get_access<access::mode::read_write>();
-    buf_2.template get_access<access::mode::read_write>();
   } catch (sycl::exception e) {
     std::cout << "SYCL exception caught: " << e.what();
     return 1;
diff --git a/sycl/test-e2e/ESIMD/lsc/lsc_predicate_stateless.cpp b/sycl/test-e2e/ESIMD/lsc/lsc_predicate_stateless.cpp
index 8a71debbfe3a7..e389043ee3245 100644
--- a/sycl/test-e2e/ESIMD/lsc/lsc_predicate_stateless.cpp
+++ b/sycl/test-e2e/ESIMD/lsc/lsc_predicate_stateless.cpp
@@ -32,10 +32,10 @@ template <unsigned SIMDSize> int testAccessor(queue q) {
 
   std::iota(vec_0.begin(), vec_0.end(), 0);
   std::iota(vec_2.begin(), vec_2.end(), 0);
-  auto buf_0 = buffer{vec_0};
-  auto buf_2 = buffer{vec_2};
 
   try {
+    auto buf_0 = buffer{vec_0};
+    auto buf_2 = buffer{vec_2};
     q.submit([&](handler &h) {
       auto access_0 = buf_0.template get_access<access::mode::read_write>(h);
       auto access_2 = buf_2.template get_access<access::mode::read_write>(h);
@@ -60,8 +60,6 @@ template <unsigned SIMDSize> int testAccessor(queue q) {
           });
     });
     q.wait();
-    buf_0.template get_access<access::mode::read_write>();
-    buf_2.template get_access<access::mode::read_write>();
   } catch (sycl::exception e) {
     std::cout << "SYCL exception caught: " << e.what();
     return 1;
diff --git a/sycl/test-e2e/ESIMD/lsc/lsc_slm.cpp b/sycl/test-e2e/ESIMD/lsc/lsc_slm.cpp
index 7bcefa32b6a7c..becb0265bfd5c 100644
--- a/sycl/test-e2e/ESIMD/lsc/lsc_slm.cpp
+++ b/sycl/test-e2e/ESIMD/lsc/lsc_slm.cpp
@@ -35,13 +35,13 @@ int main() {
   auto vec_2 = std::vector<int>(size);
   auto vec_3 = std::vector<int>(size);
   auto vec_4 = std::vector<int>(size);
-  auto buf_0 = buffer{vec_0};
-  auto buf_1 = buffer{vec_1};
-  auto buf_2 = buffer{vec_2};
-  auto buf_3 = buffer{vec_3};
-  auto buf_4 = buffer{vec_4};
 
   try {
+    auto buf_0 = buffer{vec_0};
+    auto buf_1 = buffer{vec_1};
+    auto buf_2 = buffer{vec_2};
+    auto buf_3 = buffer{vec_3};
+    auto buf_4 = buffer{vec_4};
     q.submit([&](handler &h) {
       auto access_0 = buf_0.template get_access<access::mode::read_write>(h);
       auto access_1 = buf_1.template get_access<access::mode::read_write>(h);
@@ -86,11 +86,6 @@ int main() {
           });
     });
     q.wait();
-    buf_0.template get_access<access::mode::read_write>();
-    buf_1.template get_access<access::mode::read_write>();
-    buf_2.template get_access<access::mode::read_write>();
-    buf_3.template get_access<access::mode::read_write>();
-    buf_4.template get_access<access::mode::read_write>();
   } catch (sycl::exception e) {
     std::cout << "SYCL exception caught: " << e.what();
     return 1;
diff --git a/sycl/test-e2e/ESIMD/lsc/lsc_slm_atomic_smoke.cpp b/sycl/test-e2e/ESIMD/lsc/lsc_slm_atomic_smoke.cpp
index 4fde6446bdff1..7ad7fdc63d298 100644
--- a/sycl/test-e2e/ESIMD/lsc/lsc_slm_atomic_smoke.cpp
+++ b/sycl/test-e2e/ESIMD/lsc/lsc_slm_atomic_smoke.cpp
@@ -41,10 +41,16 @@ template <class, int, template <class, int> class> class TestID;
 
 const char *to_string(LSCAtomicOp op) {
   switch (op) {
+  case LSCAtomicOp::predec:
+    return "lsc::predec";
   case LSCAtomicOp::add:
     return "lsc::add";
   case LSCAtomicOp::sub:
     return "lsc::sub";
+  case LSCAtomicOp::fadd:
+    return "lsc::fadd";
+  case LSCAtomicOp::fsub:
+    return "lsc::fsub";
   case LSCAtomicOp::inc:
     return "lsc::inc";
   case LSCAtomicOp::dec:
@@ -53,6 +59,8 @@ const char *to_string(LSCAtomicOp op) {
     return "lsc::umin";
   case LSCAtomicOp::umax:
     return "lsc::umax";
+  case LSCAtomicOp::xchg:
+    return "lsc::xchg";
   case LSCAtomicOp::cmpxchg:
     return "lsc::cmpxchg";
   case LSCAtomicOp::bit_and:
diff --git a/sycl/test-e2e/ESIMD/named_barriers/exec_in_order.cpp b/sycl/test-e2e/ESIMD/named_barriers/exec_in_order.cpp
index b9fdc062349a8..8d5ba1ecc013b 100644
--- a/sycl/test-e2e/ESIMD/named_barriers/exec_in_order.cpp
+++ b/sycl/test-e2e/ESIMD/named_barriers/exec_in_order.cpp
@@ -15,10 +15,11 @@
 // stores data to addresses that partially overlap with addresses used by
 // previous thread.
 
+#include <iostream>
 #include <sycl/ext/intel/esimd.hpp>
 #include <sycl/sycl.hpp>
 
-#include <iostream>
+#include "../esimd_test_utils.hpp"
 
 using namespace sycl;
 using namespace sycl::ext::intel::esimd;
@@ -100,7 +101,7 @@ bool test(QueueTY q) {
             else
               lsc_block_store<int, VL>(acc, off, val);
 
-            lsc_fence();
+            fence();
 
             // idx == 0 arrives here first and signals barrier 1
             // idx == 1 arrives here next and signals barrier 2
@@ -143,11 +144,8 @@ bool test(QueueTY q) {
 }
 
 int main() {
-  auto GPUSelector = gpu_selector{};
-  auto q = queue{GPUSelector};
-  auto dev = q.get_device();
-  std::cout << "Running on " << dev.get_info<sycl::info::device::name>()
-            << "\n";
+  queue q(esimd_test::ESIMDSelector, esimd_test::createExceptionHandler());
+  esimd_test::printTestLabel(q);
 
   bool passed = true;
 
diff --git a/sycl/test-e2e/ESIMD/named_barriers/exec_in_order_branched.cpp b/sycl/test-e2e/ESIMD/named_barriers/exec_in_order_branched.cpp
index 7ac6c53fb5933..cda6e1aef39c8 100644
--- a/sycl/test-e2e/ESIMD/named_barriers/exec_in_order_branched.cpp
+++ b/sycl/test-e2e/ESIMD/named_barriers/exec_in_order_branched.cpp
@@ -16,10 +16,11 @@
 // previous thread. Same as "exec_in_order.cpp", but each thread in separate
 // 'if' branch.
 
+#include <iostream>
 #include <sycl/ext/intel/esimd.hpp>
 #include <sycl/sycl.hpp>
 
-#include <iostream>
+#include "../esimd_test_utils.hpp"
 
 using namespace sycl;
 using namespace sycl::ext::intel::esimd;
@@ -89,9 +90,9 @@ bool test(QueueTY q) {
               if constexpr (UseSLM) {
                 lsc_slm_block_store<int, VL>(off, val);
               } else {
-                lsc_fence();
+                fence();
                 lsc_block_store<int, VL>(acc, off, val);
-                lsc_fence();
+                fence();
               }
 
               // T0 signals barrier 1 and locks
@@ -108,9 +109,9 @@ bool test(QueueTY q) {
               if constexpr (UseSLM) {
                 lsc_slm_block_store<int, VL>(off, val);
               } else {
-                lsc_fence();
+                fence();
                 lsc_block_store<int, VL>(acc, off, val);
-                lsc_fence();
+                fence();
               }
 
               // T1 signals barrier 2 and locks
@@ -128,9 +129,9 @@ bool test(QueueTY q) {
               if constexpr (UseSLM) {
                 lsc_slm_block_store<int, VL>(off, val);
               } else {
-                lsc_fence();
+                fence();
                 lsc_block_store<int, VL>(acc, off, val);
-                lsc_fence();
+                fence();
               }
 
               // T2 signals barrier 3 and locks, waiting for signal from T3
@@ -147,9 +148,9 @@ bool test(QueueTY q) {
               if constexpr (UseSLM) {
                 lsc_slm_block_store<int, VL>(off, val);
               } else {
-                lsc_fence();
+                fence();
                 lsc_block_store<int, VL>(acc, off, val);
-                lsc_fence();
+                fence();
               }
             }
 
@@ -184,11 +185,8 @@ bool test(QueueTY q) {
 }
 
 int main() {
-  auto GPUSelector = gpu_selector{};
-  auto q = queue{GPUSelector};
-  auto dev = q.get_device();
-  std::cout << "Running on " << dev.get_info<sycl::info::device::name>()
-            << "\n";
+  queue q(esimd_test::ESIMDSelector, esimd_test::createExceptionHandler());
+  esimd_test::printTestLabel(q);
 
   bool passed = true;
 
diff --git a/sycl/test-e2e/ESIMD/named_barriers/loop.cpp b/sycl/test-e2e/ESIMD/named_barriers/loop.cpp
index 916eaff8cb649..9d59cedbbb939 100644
--- a/sycl/test-e2e/ESIMD/named_barriers/loop.cpp
+++ b/sycl/test-e2e/ESIMD/named_barriers/loop.cpp
@@ -15,10 +15,11 @@
 // Each iteration has 1 barrier and 1 producer. Producer stores data to SLM,
 // then all threads read SLM and store data to surface.
 
+#include <iostream>
 #include <sycl/ext/intel/esimd.hpp>
 #include <sycl/sycl.hpp>
 
-#include <iostream>
+#include "../esimd_test_utils.hpp"
 
 using namespace sycl;
 using namespace sycl::ext::intel::esimd;
@@ -97,9 +98,9 @@ bool test(QueueTY q) {
               auto val = lsc_slm_block_load<int, VL>(off);
               // and storing it to output surface
               unsigned int store_off = off + i * SlmSize * sizeof(int);
-              lsc_fence();
+              fence();
               lsc_block_store<int, VL>(acc, store_off, val);
-              lsc_fence();
+              fence();
             }
           });
     });
@@ -134,11 +135,8 @@ bool test(QueueTY q) {
 }
 
 int main() {
-  auto GPUSelector = gpu_selector{};
-  auto q = queue{GPUSelector};
-  auto dev = q.get_device();
-  std::cout << "Running on " << dev.get_info<sycl::info::device::name>()
-            << "\n";
+  queue q(esimd_test::ESIMDSelector, esimd_test::createExceptionHandler());
+  esimd_test::printTestLabel(q);
 
   bool passed = true;
 
diff --git a/sycl/test-e2e/ESIMD/named_barriers/loop_extended.cpp b/sycl/test-e2e/ESIMD/named_barriers/loop_extended.cpp
index a6e476c16d95a..f30caaacf4cba 100644
--- a/sycl/test-e2e/ESIMD/named_barriers/loop_extended.cpp
+++ b/sycl/test-e2e/ESIMD/named_barriers/loop_extended.cpp
@@ -15,10 +15,11 @@
 // producers. Producer stores data to SLM, then all threads read SLM and store
 // data to surface.
 
+#include <iostream>
 #include <sycl/ext/intel/esimd.hpp>
 #include <sycl/sycl.hpp>
 
-#include <iostream>
+#include "../esimd_test_utils.hpp"
 
 using namespace sycl;
 using namespace sycl::ext::intel::esimd;
@@ -47,63 +48,66 @@ bool test(QueueTY q) {
 
     auto e = q.submit([&](handler &cgh) {
       auto acc = buf.get_access<access::mode::write>(cgh);
-      cgh.parallel_for<KernelID<
-          case_num>>(Range, [=](sycl::nd_item<1> ndi) SYCL_ESIMD_KERNEL {
-        // 2 named barriers, id 0 reserved for unnamed
-        constexpr unsigned bnum = 3;
-
-        // SLM size is half of output surface size so
-        // content of SLM can be copied to out buffer on each iteration
-        constexpr unsigned SlmSize = Size / 2;     // 32
-        constexpr unsigned VL = SlmSize / Threads; // 4
-
-        named_barrier_init<bnum>();
-
-        unsigned int idx = ndi.get_local_id(0);
-        unsigned int off = idx * VL * sizeof(int);
-
-        // 2 producers on first iteration, 1 producer on second
-        unsigned int indexes[2][2] = {{1, 2}, {3, 3}}; // local ids of producers
-        unsigned int prods[2] = {2, 1};                // number of producers
-
-        slm_init(SlmSize * sizeof(int));
-        lsc_slm_block_store<int, VL>(off, simd<int, VL>(0));
-        barrier();
-
-        for (int b = bnum - 1; b > 0; b--) {
-          int j = bnum - b - 1; // iteration index
-
-          bool is_producer = idx == indexes[j][0] || idx == indexes[j][1];
-          bool is_consumer = !is_producer;
-          // only-consumer or only-producer modes
-          unsigned int flag = is_producer ? 0x1 : 0x2;
-
-          unsigned int producers = prods[j];
-          unsigned int consumers = Threads - producers;
-
-          if (is_producer) {
-            unsigned int p_off = j * sizeof(int) * SlmSize / 4;
-            // second iteration store partialy overlaps first iteration stores
-            unsigned int dx = producers == 2 ? (idx - 1) : 0;
-            p_off += dx * sizeof(int) * SlmSize / 2;
-            int v = 0xdead0000 + idx;
-            simd<int, SlmSize / 2> init(v);
-            // producer stores to SLM
-            lsc_slm_block_store<int, SlmSize / 2>(p_off, init);
-          }
-
-          named_barrier_signal(b, flag, producers, consumers);
-
-          if (is_consumer)
-            named_barrier_wait(b); // consumers waiting for signal
-
-          auto val = lsc_slm_block_load<int, VL>(off); // reading SLM
-          // and storing it to output surface
-          lsc_fence();
-          lsc_block_store<int, VL>(acc, off + j * SlmSize * sizeof(int), val);
-          lsc_fence();
-        }
-      });
+      cgh.parallel_for<KernelID<case_num>>(
+          Range, [=](sycl::nd_item<1> ndi) SYCL_ESIMD_KERNEL {
+            // 2 named barriers, id 0 reserved for unnamed
+            constexpr unsigned bnum = 3;
+
+            // SLM size is half of output surface size so
+            // content of SLM can be copied to out buffer on each iteration
+            constexpr unsigned SlmSize = Size / 2;     // 32
+            constexpr unsigned VL = SlmSize / Threads; // 4
+
+            named_barrier_init<bnum>();
+
+            unsigned int idx = ndi.get_local_id(0);
+            unsigned int off = idx * VL * sizeof(int);
+
+            // 2 producers on first iteration, 1 producer on second
+            unsigned int indexes[2][2] = {{1, 2},
+                                          {3, 3}}; // local ids of producers
+            unsigned int prods[2] = {2, 1};        // number of producers
+
+            slm_init(SlmSize * sizeof(int));
+            lsc_slm_block_store<int, VL>(off, simd<int, VL>(0));
+            barrier();
+
+            for (int b = bnum - 1; b > 0; b--) {
+              int j = bnum - b - 1; // iteration index
+
+              bool is_producer = idx == indexes[j][0] || idx == indexes[j][1];
+              bool is_consumer = !is_producer;
+              // only-consumer or only-producer modes
+              unsigned int flag = is_producer ? 0x1 : 0x2;
+
+              unsigned int producers = prods[j];
+              unsigned int consumers = Threads - producers;
+
+              if (is_producer) {
+                unsigned int p_off = j * sizeof(int) * SlmSize / 4;
+                // second iteration store partially overlaps first iteration
+                // stores
+                unsigned int dx = producers == 2 ? (idx - 1) : 0;
+                p_off += dx * sizeof(int) * SlmSize / 2;
+                int v = 0xdead0000 + idx;
+                simd<int, SlmSize / 2> init(v);
+                // producer stores to SLM
+                lsc_slm_block_store<int, SlmSize / 2>(p_off, init);
+              }
+
+              named_barrier_signal(b, flag, producers, consumers);
+
+              if (is_consumer)
+                named_barrier_wait(b); // consumers waiting for signal
+
+              auto val = lsc_slm_block_load<int, VL>(off); // reading SLM
+              // and storing it to output surface
+              fence();
+              lsc_block_store<int, VL>(acc, off + j * SlmSize * sizeof(int),
+                                       val);
+              fence();
+            }
+          });
     });
     e.wait();
   } catch (sycl::exception const &e) {
@@ -136,11 +140,8 @@ bool test(QueueTY q) {
 }
 
 int main() {
-  auto GPUSelector = gpu_selector{};
-  auto q = queue{GPUSelector};
-  auto dev = q.get_device();
-  std::cout << "Running on " << dev.get_info<sycl::info::device::name>()
-            << "\n";
+  queue q(esimd_test::ESIMDSelector, esimd_test::createExceptionHandler());
+  esimd_test::printTestLabel(q);
 
   bool passed = true;
 
diff --git a/sycl/test-e2e/ESIMD/named_barriers/multiple_wg.cpp b/sycl/test-e2e/ESIMD/named_barriers/multiple_wg.cpp
index 1eeeb8f67d478..55a88c3b48fa9 100644
--- a/sycl/test-e2e/ESIMD/named_barriers/multiple_wg.cpp
+++ b/sycl/test-e2e/ESIMD/named_barriers/multiple_wg.cpp
@@ -17,9 +17,10 @@
 // Producers store to SLM; consumers read SLM and store data to surface.
 
 #include <CL/sycl.hpp>
+#include <iostream>
 #include <sycl/ext/intel/esimd.hpp>
 
-#include <iostream>
+#include "../esimd_test_utils.hpp"
 
 using namespace sycl;
 using namespace sycl::ext::intel::esimd;
@@ -134,11 +135,8 @@ bool test(QueueTY q) {
 }
 
 int main() {
-  auto GPUSelector = gpu_selector{};
-  auto q = queue{GPUSelector};
-  auto dev = q.get_device();
-  std::cout << "Running on " << dev.get_info<sycl::info::device::name>()
-            << "\n";
+  queue q(esimd_test::ESIMDSelector, esimd_test::createExceptionHandler());
+  esimd_test::printTestLabel(q);
 
   bool passed = true;
 
diff --git a/sycl/test-e2e/ESIMD/named_barriers/single_wg.cpp b/sycl/test-e2e/ESIMD/named_barriers/single_wg.cpp
index 1e875d2e18a2a..d65dbfdf95649 100644
--- a/sycl/test-e2e/ESIMD/named_barriers/single_wg.cpp
+++ b/sycl/test-e2e/ESIMD/named_barriers/single_wg.cpp
@@ -16,10 +16,11 @@
 // Producers store data to SLM, then all threads read SLM and store data to
 // surface.
 
+#include <iostream>
 #include <sycl/ext/intel/esimd.hpp>
 #include <sycl/sycl.hpp>
 
-#include <iostream>
+#include "../esimd_test_utils.hpp"
 
 using namespace sycl;
 using namespace sycl::ext::intel::esimd;
@@ -116,11 +117,8 @@ bool test(QueueTY q) {
 }
 
 int main() {
-  auto GPUSelector = gpu_selector{};
-  auto q = queue{GPUSelector};
-  auto dev = q.get_device();
-  std::cout << "Running on " << dev.get_info<sycl::info::device::name>()
-            << "\n";
+  queue q(esimd_test::ESIMDSelector, esimd_test::createExceptionHandler());
+  esimd_test::printTestLabel(q);
 
   bool passed = true;
 
diff --git a/sycl/test-e2e/ESIMD/noinline_call_from_func.cpp b/sycl/test-e2e/ESIMD/noinline_call_from_func.cpp
index 731a5123d03f2..9f892b1afbbd7 100644
--- a/sycl/test-e2e/ESIMD/noinline_call_from_func.cpp
+++ b/sycl/test-e2e/ESIMD/noinline_call_from_func.cpp
@@ -34,9 +34,7 @@ template <typename AccTy> ESIMD_NOINLINE void test(AccTy acc, int A, int B) {
 
 int main(int argc, char **argv) {
   queue q(esimd_test::ESIMDSelector, esimd_test::createExceptionHandler());
-
-  auto dev = q.get_device();
-  std::cout << "Running on " << dev.get_info<info::device::name>() << "\n";
+  esimd_test::printTestLabel(q);
 
   int result = 0;
   int *output = &result;
@@ -56,7 +54,7 @@ int main(int argc, char **argv) {
     });
   } catch (sycl::exception const &e) {
     std::cout << "SYCL exception caught: " << e.what() << std::endl;
-    return e.get_cl_code();
+    return 1;
   }
 
   if (result != (in1 + in2)) {
diff --git a/sycl/test-e2e/ESIMD/radix_sort.cpp b/sycl/test-e2e/ESIMD/radix_sort.cpp
index 93b3edac387a9..67d6ac5114de2 100644
--- a/sycl/test-e2e/ESIMD/radix_sort.cpp
+++ b/sycl/test-e2e/ESIMD/radix_sort.cpp
@@ -299,7 +299,7 @@ void cmk_prefix_iterative(unsigned *buf, unsigned h_pos,
     if (i == n_iter - 1)
       cnt_table.column(31) -= cnt_table.column(30);
 
-    scatter_rgba<unsigned int, 32, GATHER_SCATTER_MASK>(buf, element_offset, S);
+    scatter_rgba<GATHER_SCATTER_MASK>(buf, element_offset, S);
 
     element_offset += stride_elems * TUPLE_SZ * sizeof(unsigned) * 32;
     prev = cnt_table.column(31);
@@ -397,7 +397,7 @@ void cmk_radix_count(
   simd<unsigned, N_WI> elem_offset =
       (init * N_ELEM_WI + offset) * sizeof(unsigned); // byte offset
 
-  simd<unsigned, RADIX *N_WI> V = 0;
+  simd<unsigned, RADIX * N_WI> V = 0;
   auto counters = V.bit_cast_view<unsigned, RADIX, N_WI>();
 
   // each WI process N_ELEM_WI. each iteration reads in 4 elements (gather_rgba)
diff --git a/sycl/test-e2e/ESIMD/regression/Inputs/complex-lib-sycl.cpp b/sycl/test-e2e/ESIMD/regression/Inputs/complex-lib-sycl.cpp
index 0eaf35299de5d..27eb6ba456656 100644
--- a/sycl/test-e2e/ESIMD/regression/Inputs/complex-lib-sycl.cpp
+++ b/sycl/test-e2e/ESIMD/regression/Inputs/complex-lib-sycl.cpp
@@ -4,7 +4,8 @@ sycl::event iota(size_t n, sycl::buffer<int, 1> &buf, sycl::queue &Q) {
   auto HK = [&](sycl::handler &H) {
     sycl::accessor acc_y{buf, H, sycl::write_only};
     auto K = [=](sycl::id<1> id) {
-      int *y = acc_y.get_pointer();
+      int *y =
+          acc_y.template get_multi_ptr<sycl::access::decorated::yes>().get();
       size_t i = id.get(0);
       y[i] = static_cast<int>(i);
     };
diff --git a/sycl/test-e2e/ESIMD/regression/Inputs/dgetrf.hpp b/sycl/test-e2e/ESIMD/regression/Inputs/dgetrf.hpp
index abab1e935b47d..015c7cdc3068e 100644
--- a/sycl/test-e2e/ESIMD/regression/Inputs/dgetrf.hpp
+++ b/sycl/test-e2e/ESIMD/regression/Inputs/dgetrf.hpp
@@ -18,6 +18,8 @@
 #include <sycl/ext/intel/esimd.hpp>
 #include <sycl/sycl.hpp>
 
+#include "../../esimd_test_utils.hpp"
+
 #define ABS(x) ((x) >= 0 ? (x) : -(x))
 #define MIN(x, y) ((x) <= (y) ? (x) : (y))
 #define MAX(x, y) ((x) >= (y) ? (x) : (y))
@@ -445,7 +447,8 @@ void dgetrfnp_batch_strided_c(int64_t m, int64_t n, double *a, int64_t lda,
                               int64_t *info);
 
 int main(int argc, char *argv[]) {
-  queue queue((gpu_selector()));
+  queue queue(esimd_test::ESIMDSelector, esimd_test::createExceptionHandler());
+  esimd_test::printTestLabel(queue);
 
   if (!queue.get_device().has(aspect::fp64))
     return 0;
diff --git a/sycl/test-e2e/ESIMD/regression/big_const_initializer.cpp b/sycl/test-e2e/ESIMD/regression/big_const_initializer.cpp
index 0fb3b6a762e8c..80b81a98ffafc 100644
--- a/sycl/test-e2e/ESIMD/regression/big_const_initializer.cpp
+++ b/sycl/test-e2e/ESIMD/regression/big_const_initializer.cpp
@@ -64,7 +64,7 @@ int main(int argc, char **argv) {
     std::cout << "*** EXCEPTION caught: " << e.what() << "\n";
     return 1;
   }
-  auto acc = r.template get_access<sycl::access::mode::read>();
+  auto acc = r.template get_host_access(sycl::read_only);
   for (int i = 0; i < N_PRINT; i++) {
     std::cout << acc[i] << " ";
   }
@@ -78,8 +78,7 @@ int main(int argc, char **argv) {
     if (test != gold) {
       if (++err_cnt < 10) {
         std::cout << "failed at index " << i << ", " << test << " != " << gold
-                  << " (expected)"
-                  << "\n";
+                  << " (expected)" << "\n";
       }
     }
   }
diff --git a/sycl/test-e2e/ESIMD/regression/dgetrf_8x8.cpp b/sycl/test-e2e/ESIMD/regression/dgetrf_8x8.cpp
index 4624733f21675..83ec0760555a9 100644
--- a/sycl/test-e2e/ESIMD/regression/dgetrf_8x8.cpp
+++ b/sycl/test-e2e/ESIMD/regression/dgetrf_8x8.cpp
@@ -19,6 +19,8 @@
 #include <sycl/ext/intel/esimd.hpp>
 #include <sycl/sycl.hpp>
 
+#include "../esimd_test_utils.hpp"
+
 #define ABS(x) ((x) >= 0 ? (x) : -(x))
 #define MIN(x, y) ((x) <= (y) ? (x) : (y))
 #define MAX(x, y) ((x) >= (y) ? (x) : (y))
@@ -266,7 +268,8 @@ static int dgetrfnp_batch_strided_check(int64_t m, int64_t n, double *a_in,
 }
 
 int main(int argc, char *argv[]) {
-  queue queue((gpu_selector()));
+  queue queue(esimd_test::ESIMDSelector, esimd_test::createExceptionHandler());
+  esimd_test::printTestLabel(queue);
 
   int exit_status = 0;
   constexpr int64_t m = 8, n = 8, lda = 8;

From 6863dfcd502e54e4db60debe074b088823e5e231 Mon Sep 17 00:00:00 2001
From: Buildbot for SYCL <bb-sycl@intel.com>
Date: Wed, 21 Feb 2024 00:10:25 +0800
Subject: [PATCH 02/30] [GHA] Uplift Linux GPU RT version to 24.05.28454.6
 (#12764)

Scheduled drivers uplift

Co-authored-by: GitHub Actions <actions@github.com>
---
 devops/dependencies.json | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/devops/dependencies.json b/devops/dependencies.json
index e8bd3ef2bf97e..9306396ab6aaf 100644
--- a/devops/dependencies.json
+++ b/devops/dependencies.json
@@ -1,15 +1,15 @@
 {
   "linux": {
     "compute_runtime": {
-      "github_tag": "23.52.28202.14",
-      "version": "23.52.28202.14",
-      "url": "https://github.com/intel/compute-runtime/releases/tag/23.52.28202.14",
+      "github_tag": "24.05.28454.6",
+      "version": "24.05.28454.6",
+      "url": "https://github.com/intel/compute-runtime/releases/tag/24.05.28454.6",
       "root": "{DEPS_ROOT}/opencl/runtime/linux/oclgpu"
     },
     "igc": {
-      "github_tag": "igc-1.0.15770.11",
-      "version": "1.0.15770.11",
-      "url": "https://github.com/intel/intel-graphics-compiler/releases/tag/igc-1.0.15770.11",
+      "github_tag": "igc-1.0.15985.7",
+      "version": "1.0.15985.7",
+      "url": "https://github.com/intel/intel-graphics-compiler/releases/tag/igc-1.0.15985.7",
       "root": "{DEPS_ROOT}/opencl/runtime/linux/oclgpu"
     },
     "cm": {
@@ -19,9 +19,9 @@
       "root": "{DEPS_ROOT}/opencl/runtime/linux/oclgpu"
     },
     "level_zero": {
-      "github_tag": "v1.15.13",
-      "version": "v1.15.13",
-      "url": "https://github.com/oneapi-src/level-zero/releases/tag/v1.15.13",
+      "github_tag": "v1.16.1",
+      "version": "v1.16.1",
+      "url": "https://github.com/oneapi-src/level-zero/releases/tag/v1.16.1",
       "root": "{DEPS_ROOT}/opencl/runtime/linux/oclgpu"
     },
     "tbb": {

From 8f182cda75d39a99344164c88c0ad3ae1a93c04f Mon Sep 17 00:00:00 2001
From: wangdi4 <101905226+wangdi4@users.noreply.github.com>
Date: Tue, 20 Feb 2024 11:23:20 -0800
Subject: [PATCH 03/30] [SYCL] Insert annotation in `annotated_ptr::get()`
 (#12343)

When properties like alignment are specified in an `annotated_ptr` type,
certain operators (like `[]`, `+=`, `++`) are disabled. As a result, loop
code has to be written as follows:
```
annotated_ptr<int, decltype(properties{...alignment<8>...})> ann_ptr;
...
int *p = ann_ptr.get();     // ann_ptr cannot be used in the for loop directly
for (int i = 0; i < n; i++) {
    p[i] = i;
}
```
When getting the underlying pointer, the annotation gets lost, and so
does the possible optimization of the for-loop enabled by the
annotated_ptr properties.

This PR includes changes to the spec, the header and the clang compiler:
1. In `annotated_ptr` spec, update the spec for the `get()` function
2. In the `annotated_ptr` header, update the `get()` function by
inserting `llvm.ptr.annotation`, so that on the target machines like
FPGA for which clang FE only performs O0 optimization, the annotation
inserted can be preserved for the corresponding backends to perform
platform-specific optimizations. For the example above, the `alignment`
information can help the FPGA compiler to build aligned loads/stores.
3. In the clang compiler, the pass `CompileTimePropertiesPass` used to
always drop `alignment` from the annotation string. This PR changes this
behavior to dropping `alignment` only when the compiler finds
load/store/MemIntrinsics in the users of `llvm.ptr.annotation` and
applies the alignment to these instructions.
---
 .../SYCLLowerIR/CompileTimePropertiesPass.h   |  2 +-
 .../SYCLLowerIR/CompileTimePropertiesPass.cpp | 29 ++++---
 ...dstore.ll => sycl-properties-alignment.ll} | 22 +++++-
 .../sycl_ext_oneapi_annotated_ptr.asciidoc    |  4 +-
 .../annotated_ptr/annotated_ptr.hpp           | 79 +++++++++++--------
 .../properties.hpp                            |  5 ++
 .../annotated_ptr/annotation_insertion.cpp    | 48 +++++++++++
 .../properties/properties_cache_control.cpp   |  2 -
 8 files changed, 139 insertions(+), 52 deletions(-)
 rename llvm/test/SYCLLowerIR/CompileTimePropertiesPass/{sycl-properties-alignment-loadstore.ll => sycl-properties-alignment.ll} (79%)
 create mode 100644 sycl/test/extensions/annotated_ptr/annotation_insertion.cpp

diff --git a/llvm/include/llvm/SYCLLowerIR/CompileTimePropertiesPass.h b/llvm/include/llvm/SYCLLowerIR/CompileTimePropertiesPass.h
index 5cfa27e61cbf4..f2c1f96b65d35 100644
--- a/llvm/include/llvm/SYCLLowerIR/CompileTimePropertiesPass.h
+++ b/llvm/include/llvm/SYCLLowerIR/CompileTimePropertiesPass.h
@@ -40,7 +40,7 @@ class CompileTimePropertiesPass
       Module &M, IntrinsicInst *IntrInst,
       SmallVectorImpl<IntrinsicInst *> &RemovableAnnotations);
 
-  void parseAlignmentAndApply(Module &M, IntrinsicInst *IntrInst);
+  bool parseAlignmentAndApply(Module &M, IntrinsicInst *IntrInst);
 
   // Map for keeping track of global variables generated for annotation strings.
   // This allows reuse for annotations with the same generated annotation
diff --git a/llvm/lib/SYCLLowerIR/CompileTimePropertiesPass.cpp b/llvm/lib/SYCLLowerIR/CompileTimePropertiesPass.cpp
index 32a476a8ac893..63065e76faf6d 100644
--- a/llvm/lib/SYCLLowerIR/CompileTimePropertiesPass.cpp
+++ b/llvm/lib/SYCLLowerIR/CompileTimePropertiesPass.cpp
@@ -685,7 +685,7 @@ PreservedAnalyses CompileTimePropertiesPass::run(Module &M,
                                   : PreservedAnalyses::all();
 }
 
-void CompileTimePropertiesPass::parseAlignmentAndApply(
+bool CompileTimePropertiesPass::parseAlignmentAndApply(
     Module &M, IntrinsicInst *IntrInst) {
   // Get the global variable with the annotation string.
   const GlobalVariable *AnnotStrArgGV = nullptr;
@@ -695,11 +695,11 @@ void CompileTimePropertiesPass::parseAlignmentAndApply(
   else if (auto *GEP = dyn_cast<GEPOperator>(IntrAnnotStringArg))
     AnnotStrArgGV = dyn_cast<GlobalVariable>(GEP->getOperand(0));
   if (!AnnotStrArgGV)
-    return;
+    return false;
 
   std::optional<StringRef> AnnotStr = getGlobalVariableString(AnnotStrArgGV);
   if (!AnnotStr)
-    return;
+    return false;
 
   // parse properties string to decoration-value pairs
   auto Properties = parseSYCLPropertiesString(M, IntrInst);
@@ -710,6 +710,7 @@ void CompileTimePropertiesPass::parseAlignmentAndApply(
   getUserListIgnoringCast<StoreInst>(IntrInst, TargetedInstList);
   getUserListIgnoringCast<MemTransferInst>(IntrInst, TargetedInstList);
 
+  bool AlignApplied = false;
   for (auto &Property : Properties) {
     auto DecorStr = Property.first->str();
     auto DecorValue = Property.second;
@@ -733,18 +734,26 @@ void CompileTimePropertiesPass::parseAlignmentAndApply(
         auto Op_num = Pair.second;
         if (auto *LInst = dyn_cast<LoadInst>(Inst)) {
           LInst->setAlignment(Align_val);
+          AlignApplied = true;
         } else if (auto *SInst = dyn_cast<StoreInst>(Inst)) {
-          if (Op_num == 1)
+          if (Op_num == 1) {
             SInst->setAlignment(Align_val);
+            AlignApplied = true;
+          }
         } else if (auto *MI = dyn_cast<MemTransferInst>(Inst)) {
-          if (Op_num == 0)
+          if (Op_num == 0) {
             MI->setDestAlignment(Align_val);
-          else if (Op_num == 1)
+            AlignApplied = true;
+          } else if (Op_num == 1) {
             MI->setSourceAlignment(Align_val);
+            AlignApplied = true;
+          }
         }
       }
     }
   }
+
+  return AlignApplied;
 }
 
 // Returns true if the transformation changed IntrInst.
@@ -773,7 +782,7 @@ bool CompileTimePropertiesPass::transformSYCLPropertiesAnnotation(
     return false;
 
   // check alignment annotation and apply it to load/store
-  parseAlignmentAndApply(M, IntrInst);
+  bool AlignApplied = parseAlignmentAndApply(M, IntrInst);
 
   // Read the annotation values and create new annotation strings.
   std::string NewAnnotString = "";
@@ -782,9 +791,9 @@ bool CompileTimePropertiesPass::transformSYCLPropertiesAnnotation(
   bool CacheProp = false;
   bool FPGAProp = false;
   for (const auto &[PropName, PropVal] : Properties) {
-    // sycl-alignment is converted to align on
-    // previous parseAlignmentAndApply(), dropping here
-    if (PropName == "sycl-alignment")
+    // If sycl-alignment was already applied as an alignment on IR
+    // constructs by parseAlignmentAndApply(), drop it here.
+    if (PropName == "sycl-alignment" && AlignApplied)
       continue;
 
     auto DecorIt = SpirvDecorMap.find(*PropName);
diff --git a/llvm/test/SYCLLowerIR/CompileTimePropertiesPass/sycl-properties-alignment-loadstore.ll b/llvm/test/SYCLLowerIR/CompileTimePropertiesPass/sycl-properties-alignment.ll
similarity index 79%
rename from llvm/test/SYCLLowerIR/CompileTimePropertiesPass/sycl-properties-alignment-loadstore.ll
rename to llvm/test/SYCLLowerIR/CompileTimePropertiesPass/sycl-properties-alignment.ll
index 11b595b748d3e..535b96648286e 100644
--- a/llvm/test/SYCLLowerIR/CompileTimePropertiesPass/sycl-properties-alignment-loadstore.ll
+++ b/llvm/test/SYCLLowerIR/CompileTimePropertiesPass/sycl-properties-alignment.ll
@@ -1,7 +1,7 @@
 ; RUN: opt -passes=compile-time-properties -S %s -o %t.ll
 ; RUN: FileCheck %s -input-file=%t.ll
 ;
-; Tests the translation of "sycl-alignment" to alignment attributes on load/store
+; Tests the translation of "sycl-alignment" to alignment attributes on load/store/non-memory instructions
 
 target triple = "spir64_fpga-unknown-unknown"
 
@@ -11,13 +11,14 @@ target triple = "spir64_fpga-unknown-unknown"
 $_ZN7ann_refIiEC2EPi = comdat any
 $_ZN7ann_refIiEcvRiEv = comdat any
 $_ZN7ann_refIiEC2EPi1= comdat any
+$no_load_store = comdat any
 
 @.str = private unnamed_addr addrspace(1) constant [16 x i8] c"sycl-properties\00", section "llvm.metadata"
 @.str.1 = private unnamed_addr addrspace(1) constant [9 x i8] c"main.cpp\00", section "llvm.metadata"
 @.str.2 = private unnamed_addr addrspace(1) constant [15 x i8] c"sycl-alignment\00", section "llvm.metadata"
 @.str.3 = private unnamed_addr addrspace(1) constant [3 x i8] c"64\00", section "llvm.metadata"
-@.args = private unnamed_addr addrspace(1) constant { ptr addrspace(1), ptr addrspace(1) } { ptr addrspace(1) @.str.2, ptr addrspace(1) @.str.3 }, section "llvm.met
-adata"
+@.args = private unnamed_addr addrspace(1) constant { ptr addrspace(1), ptr addrspace(1) } { ptr addrspace(1) @.str.2, ptr addrspace(1) @.str.3 }, section "llvm.metadata"
+; CHECK: @[[AnnoStr:.*]] = private unnamed_addr addrspace(1) constant [10 x i8] c"{44:\2264\22}\00"
 
 ; Function Attrs: nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: readwrite)
 declare ptr addrspace(4) @llvm.ptr.annotation.p4.p1(ptr addrspace(4), ptr addrspace(1), ptr addrspace(1), i32, ptr addrspace(1)) #5
@@ -77,4 +78,19 @@ entry:
   ret void
 }
 
+; Function Attrs: convergent mustprogress noinline norecurse nounwind optnone
+define linkonce_odr dso_local spir_func noundef ptr addrspace(4) @no_load_store(ptr addrspace(4) noundef %ptr) comdat align 2 {
+entry:
+  %retval = alloca ptr addrspace(4), align 8
+  %ptr.addr = alloca ptr addrspace(4), align 8
+  %retval.ascast = addrspacecast ptr %retval to ptr addrspace(4)
+  %ptr.addr.ascast = addrspacecast ptr %ptr.addr to ptr addrspace(4)
+  store ptr addrspace(4) %ptr, ptr addrspace(4) %ptr.addr.ascast, align 8
+  %0 = load ptr addrspace(4), ptr addrspace(4) %ptr.addr.ascast, align 8
+  ; CHECK: %[[AnnoPtr:.*]] = call ptr addrspace(4) @llvm.ptr.annotation.p4.p1(ptr addrspace(4) %0, ptr addrspace(1) @[[AnnoStr]]
+  ; CHECK: ret ptr addrspace(4) %[[AnnoPtr]]
+  %1 = call ptr addrspace(4) @llvm.ptr.annotation.p4.p1(ptr addrspace(4) %0, ptr addrspace(1) @.str, ptr addrspace(1) @.str.1, i32 73, ptr addrspace(1) @.args)
+  ret ptr addrspace(4) %1
+}
+
 declare void @llvm.memcpy.p4.p4.i32(ptr addrspace(4), ptr addrspace(4), i32, i1)
diff --git a/sycl/doc/extensions/experimental/sycl_ext_oneapi_annotated_ptr.asciidoc b/sycl/doc/extensions/experimental/sycl_ext_oneapi_annotated_ptr.asciidoc
index b85904e975587..162cf9a8bbe08 100644
--- a/sycl/doc/extensions/experimental/sycl_ext_oneapi_annotated_ptr.asciidoc
+++ b/sycl/doc/extensions/experimental/sycl_ext_oneapi_annotated_ptr.asciidoc
@@ -476,8 +476,8 @@ a|
 T* get() const noexcept;
 ----
 |
-Returns the underlying raw pointer. The raw pointer will not retain the
-annotations.
+Returns the underlying raw pointer. Implementations are free to propagate information from properties of
+an annotated_ptr to the raw pointer.
 
 // --- ROW BREAK ---
 a|
diff --git a/sycl/include/sycl/ext/oneapi/experimental/annotated_ptr/annotated_ptr.hpp b/sycl/include/sycl/ext/oneapi/experimental/annotated_ptr/annotated_ptr.hpp
index e3ccb5cee5364..4f32802ae548b 100644
--- a/sycl/include/sycl/ext/oneapi/experimental/annotated_ptr/annotated_ptr.hpp
+++ b/sycl/include/sycl/ext/oneapi/experimental/annotated_ptr/annotated_ptr.hpp
@@ -53,8 +53,41 @@ template <typename... Ts>
 using contains_alignment =
     detail::ContainsProperty<alignment_key, std::tuple<Ts...>>;
 
+// properties filter
+template <typename property_list, template <class...> typename filter>
+using PropertiesFilter =
+    sycl::detail::boost::mp11::mp_copy_if<property_list, filter>;
+
+// filter properties that are applied on annotations
+template <typename... Props>
+using annotation_filter = properties<
+    PropertiesFilter<std::tuple<Props...>, propagateToPtrAnnotation>>;
 } // namespace detail
 
+template <typename I, typename P> struct annotationHelper {};
+
+// unpack properties to variadic template
+template <typename I, typename... P>
+struct annotationHelper<I, detail::properties_t<P...>> {
+  static I *annotate(I *ptr) {
+    return __builtin_intel_sycl_ptr_annotation(
+        ptr, detail::PropertyMetaInfo<P>::name...,
+        detail::PropertyMetaInfo<P>::value...);
+  }
+
+  static I load(I *ptr) {
+    return *__builtin_intel_sycl_ptr_annotation(
+        ptr, detail::PropertyMetaInfo<P>::name...,
+        detail::PropertyMetaInfo<P>::value...);
+  }
+
+  template <class O> static I store(I *ptr, O &&Obj) {
+    return *__builtin_intel_sycl_ptr_annotation(
+               ptr, detail::PropertyMetaInfo<P>::name...,
+               detail::PropertyMetaInfo<P>::value...) = std::forward<O>(Obj);
+  }
+};
+
 template <typename T, typename... Props>
 class annotated_ref<T, detail::properties_t<Props...>> {
   using property_list_t = detail::properties_t<Props...>;
@@ -67,44 +100,14 @@ class annotated_ref<T, detail::properties_t<Props...>> {
   T *m_Ptr;
   explicit annotated_ref(T *Ptr) : m_Ptr(Ptr) {}
 
-  // properties filter
-  template <typename property_list, template <class...> typename filter>
-  using PropertiesFilter =
-      sycl::detail::boost::mp11::mp_copy_if<property_list, filter>;
-
-  template <typename p>
-  using annotation_filter = propagateToPtrAnnotation<typename p::key_t>;
-
-  // filter properties that are applied on annotations
-  using property_tuple_t = std::tuple<Props...>;
-  using annotation_props =
-      properties<PropertiesFilter<property_tuple_t, annotation_filter>>;
-
-  template <typename I, typename P> struct annotationHelper {};
-
-  // unpack properties to varadic template
-  template <typename I, typename... P>
-  struct annotationHelper<I, detail::properties_t<P...>> {
-    static I load(I *ptr) {
-      return *__builtin_intel_sycl_ptr_annotation(
-          ptr, detail::PropertyMetaInfo<P>::name...,
-          detail::PropertyMetaInfo<P>::value...);
-    }
-
-    template <class O> static I store(I *ptr, O &&Obj) {
-      return *__builtin_intel_sycl_ptr_annotation(
-                 ptr, detail::PropertyMetaInfo<P>::name...,
-                 detail::PropertyMetaInfo<P>::value...) = std::forward<O>(Obj);
-    }
-  };
-
 public:
   annotated_ref(const annotated_ref &) = delete;
 
   // implicit conversion with annotaion
   operator T() const {
 #ifdef __SYCL_DEVICE_ONLY__
-    return annotationHelper<T, annotation_props>::load(m_Ptr);
+    return annotationHelper<T, detail::annotation_filter<Props...>>::load(
+        m_Ptr);
 #else
     return *m_Ptr;
 #endif
@@ -114,7 +117,8 @@ class annotated_ref<T, detail::properties_t<Props...>> {
   template <class O, typename = std::enable_if_t<!detail::is_ann_ref_v<O>>>
   T operator=(O &&Obj) const {
 #ifdef __SYCL_DEVICE_ONLY__
-    return annotationHelper<T, annotation_props>::store(m_Ptr, Obj);
+    return annotationHelper<T, detail::annotation_filter<Props...>>::store(
+        m_Ptr, Obj);
 #else
     return *m_Ptr = std::forward<O>(Obj);
 #endif
@@ -376,7 +380,14 @@ __SYCL_TYPE(annotated_ptr) annotated_ptr<T, detail::properties_t<Props...>> {
 
   operator T *() const noexcept = delete;
 
-  T *get() const noexcept { return m_Ptr; }
+  T *get() const noexcept {
+#ifdef __SYCL_DEVICE_ONLY__
+    return annotationHelper<T, detail::annotation_filter<Props...>>::annotate(
+        m_Ptr);
+#else
+    return m_Ptr;
+#endif
+  }
 
   // When the properties contain alignment, operator '[]', '+', '++' and '--'
   // (both post- and prefix) are disabled. Calling these operators when
diff --git a/sycl/include/sycl/ext/oneapi/experimental/common_annotated_properties/properties.hpp b/sycl/include/sycl/ext/oneapi/experimental/common_annotated_properties/properties.hpp
index 9864e916bc475..1a73da026df8f 100644
--- a/sycl/include/sycl/ext/oneapi/experimental/common_annotated_properties/properties.hpp
+++ b/sycl/include/sycl/ext/oneapi/experimental/common_annotated_properties/properties.hpp
@@ -58,6 +58,11 @@ struct check_property_list<T, Prop, Props...>
 
 template <typename PropTy> struct propagateToPtrAnnotation : std::false_type {};
 
+// Partial specialization for property_value
+template <typename PropKeyT, typename... PropValuesTs>
+struct propagateToPtrAnnotation<property_value<PropKeyT, PropValuesTs...>>
+    : propagateToPtrAnnotation<PropKeyT> {};
+
 //===----------------------------------------------------------------------===//
 //        Common properties of annotated_arg/annotated_ptr
 //===----------------------------------------------------------------------===//
diff --git a/sycl/test/extensions/annotated_ptr/annotation_insertion.cpp b/sycl/test/extensions/annotated_ptr/annotation_insertion.cpp
new file mode 100644
index 0000000000000..78d20a0f7f5d7
--- /dev/null
+++ b/sycl/test/extensions/annotated_ptr/annotation_insertion.cpp
@@ -0,0 +1,48 @@
+// RUN: %clangxx -fsycl-device-only -fsycl-targets=spir64_fpga -S -emit-llvm %s -o - | FileCheck %s
+
+// Tests that `@llvm.ptr.annotation` is inserted when calling
+// `annotated_ptr::get()`
+
+#include "sycl/sycl.hpp"
+#include <sycl/ext/intel/fpga_extensions.hpp>
+
+#include <iostream>
+
+// clang-format on
+
+using namespace sycl;
+using namespace ext::oneapi::experimental;
+using namespace ext::intel::experimental;
+
+// CHECK: @[[AnnStr:.*]] = private unnamed_addr addrspace(1) constant [19 x i8] c"{5921:\220\22}{44:\228\22}\00"
+
+using ann_ptr_t1 =
+    annotated_ptr<int, decltype(properties(buffer_location<0>, alignment<8>))>;
+
+struct MyIP {
+  ann_ptr_t1 a;
+
+  MyIP(int *a_) : a(a_) {}
+
+  void operator()() const {
+    // CHECK: %ptr.addr = alloca ptr addrspace(4), align 8
+    // CHECK: store ptr addrspace(4) %ptr, ptr %ptr.addr, align 8
+    // CHECK: %[[LoadPtr:.*]] = load ptr addrspace(4), ptr %ptr.addr, align 8
+    // CHECK: %[[AnnPtr:.*]] = call ptr addrspace(4) @llvm.ptr.annotation.p4.p1(ptr addrspace(4) %[[LoadPtr]], ptr addrspace(1) @[[AnnStr]]
+    // CHECK: ret ptr addrspace(4) %[[AnnPtr]]
+    int *ptr = a.get(); // llvm.ptr.annotation is inserted
+    *ptr = 15;
+  }
+};
+
+void TestVectorAddWithAnnotatedMMHosts() {
+  sycl::queue q;
+  auto raw = malloc_shared<int>(5, q);
+  q.submit([&](handler &h) { h.single_task(MyIP{raw}); }).wait();
+  free(raw, q);
+}
+
+int main() {
+  TestVectorAddWithAnnotatedMMHosts();
+  return 0;
+}
diff --git a/sycl/test/extensions/properties/properties_cache_control.cpp b/sycl/test/extensions/properties/properties_cache_control.cpp
index 273079334036e..cf853d2a6c7ac 100755
--- a/sycl/test/extensions/properties/properties_cache_control.cpp
+++ b/sycl/test/extensions/properties/properties_cache_control.cpp
@@ -9,14 +9,12 @@ using namespace ext::intel::experimental;
 
 using load_hint = annotated_ptr<
     float, decltype(properties(
-               alignment<8>,
                read_hint<cache_control<cache_mode::cached, cache_level::L1>,
                          cache_control<cache_mode::uncached, cache_level::L2,
                                        cache_level::L3>>))>;
 using load_assertion = annotated_ptr<
     int,
     decltype(properties(
-        alignment<8>,
         read_assertion<cache_control<cache_mode::constant, cache_level::L1>,
                        cache_control<cache_mode::invalidate, cache_level::L2,
                                      cache_level::L3>>))>;

From 39639f6b8bbd32d90d53466dd7e16bf6552243ba Mon Sep 17 00:00:00 2001
From: aelovikov-intel <andrei.elovikov@intel.com>
Date: Tue, 20 Feb 2024 13:34:07 -0800
Subject: [PATCH 04/30] [NFC][SYCL] Use get_elem_type_t instead of
 TryToGetElementType<T>::type (#12738)

---
 sycl/include/sycl/accessor.hpp                |  4 +---
 .../sycl/detail/generic_type_traits.hpp       | 24 +++++++++----------
 .../sycl/detail/image_accessor_util.hpp       |  6 ++---
 .../Basic/image/image_accessor_readwrite.cpp  |  6 ++---
 4 files changed, 18 insertions(+), 22 deletions(-)

diff --git a/sycl/include/sycl/accessor.hpp b/sycl/include/sycl/accessor.hpp
index 51d3172e33e5b..be0f6cde70d01 100644
--- a/sycl/include/sycl/accessor.hpp
+++ b/sycl/include/sycl/accessor.hpp
@@ -1121,9 +1121,7 @@ class __image_array_slice__ {
 
   constexpr static int AdjustedDims = (Dimensions == 2) ? 4 : Dimensions + 1;
 
-  template <typename CoordT,
-            typename CoordElemType =
-                typename detail::TryToGetElementType<CoordT>::type>
+  template <typename CoordT, typename CoordElemType = get_elem_type_t<CoordT>>
   sycl::vec<CoordElemType, AdjustedDims>
   getAdjustedCoords(const CoordT &Coords) const {
     CoordElemType LastCoord = 0;
diff --git a/sycl/include/sycl/detail/generic_type_traits.hpp b/sycl/include/sycl/detail/generic_type_traits.hpp
index 4fffdbaf234f0..33931102c40a8 100644
--- a/sycl/include/sycl/detail/generic_type_traits.hpp
+++ b/sycl/include/sycl/detail/generic_type_traits.hpp
@@ -301,18 +301,6 @@ template <typename T>
 using make_unsinged_integer_t =
     make_type_t<T, gtl::scalar_unsigned_integer_list>;
 
-// TryToGetElementType<T>::type is T::element_type or T::value_type if those
-// exist, otherwise T.
-template <typename T> class TryToGetElementType {
-  static T check(...);
-  template <typename A> static typename A::element_type check(const A &);
-  template <typename A> static typename A::value_type check(const A &);
-
-public:
-  using type = decltype(check(T()));
-  static constexpr bool value = !std::is_same_v<T, type>;
-};
-
 // select_apply_cl_scalar_t selects from T8/T16/T32/T64 basing on
 // sizeof(IN).  expected to handle scalar types.
 template <typename T, typename T8, typename T16, typename T32, typename T64>
@@ -525,6 +513,18 @@ template <typename T, typename Enable = void> struct RelConverter {
   static R apply(value_t value) { return value; }
 };
 
+// TryToGetElementType<T>::type is T::element_type or T::value_type if those
+// exist, otherwise T.
+template <typename T> class TryToGetElementType {
+  static T check(...);
+  template <typename A> static typename A::element_type check(const A &);
+  template <typename A> static typename A::value_type check(const A &);
+
+public:
+  using type = decltype(check(T()));
+  static constexpr bool value = !std::is_same_v<T, type>;
+};
+
 template <typename T>
 struct RelConverter<T,
                     typename std::enable_if_t<TryToGetElementType<T>::value>> {
diff --git a/sycl/include/sycl/detail/image_accessor_util.hpp b/sycl/include/sycl/detail/image_accessor_util.hpp
index f237ed5cd74a0..1de4e5808a7da 100644
--- a/sycl/include/sycl/detail/image_accessor_util.hpp
+++ b/sycl/include/sycl/detail/image_accessor_util.hpp
@@ -773,8 +773,6 @@ void imageWriteHostImpl(const CoordT &Coords, const WriteDataT &Color,
     break;
   case image_channel_type::fp16:
     writePixel(
-        // convertWriteDataToHalf<typename
-        // TryToGetElementType<WriteDataT>::type>(
         convertWriteData<half>(Color, ImgChannelType),
         reinterpret_cast<half *>(Ptr), ImgChannelOrder, ImgChannelType);
     break;
@@ -915,7 +913,7 @@ DataT getColor(const int4 PixelCoord, const addressing_mode SmplAddrMode,
   DataT RetData;
   if (isOutOfRange(PixelCoord, SmplAddrMode, ImgRange)) {
     float4 BorderColor = getBorderColor(ImgChannelOrder);
-    RetData = BorderColor.convert<typename TryToGetElementType<DataT>::type>();
+    RetData = BorderColor.convert<get_elem_type_t<DataT>>();
   } else {
     RetData = ReadPixelData<DataT>(PixelCoord, ImgPitch, ImgChannelType,
                                    ImgChannelOrder, BasePtr, ElementSize);
@@ -984,7 +982,7 @@ DataT ReadPixelDataLinearFiltMode(const int8 CoordValues, const float4 abc,
   //           (1 – a) * b * Ci0j1 + a * b * Ci1j1;
   // For 1D image: j0 = 0, j1 = 0, k0 = 0, k1 = 0, b = 0.5, c = 0.5.
   // RetData = (1 – a) * Ci0 + a * Ci1;
-  return RetData.convert<typename TryToGetElementType<DataT>::type>();
+  return RetData.convert<get_elem_type_t<DataT>>();
 }
 
 // imageReadSamplerHostImpl method is called by the read API in image accessors
diff --git a/sycl/test-e2e/Basic/image/image_accessor_readwrite.cpp b/sycl/test-e2e/Basic/image/image_accessor_readwrite.cpp
index 126c66de4978d..7694ccfc5059d 100644
--- a/sycl/test-e2e/Basic/image/image_accessor_readwrite.cpp
+++ b/sycl/test-e2e/Basic/image/image_accessor_readwrite.cpp
@@ -27,10 +27,10 @@ template <typename WriteDataT, int ImgType, int read_write> class kernel_class;
 
 template <typename ReadDataT,
           typename = typename std::enable_if<
-              (!(std::is_same<ReadDataT, s::cl_float4>::value) &&
-               !(std::is_same<ReadDataT, s::cl_half4>::value))>::type>
+              (!(std::is_same_v<ReadDataT, s::cl_float4>) &&
+               !(std::is_same_v<ReadDataT, s::cl_half4>))>::type>
 void check_read_data(ReadDataT ReadData, ReadDataT ExpectedColor) {
-  using ReadDataType = typename s::detail::TryToGetElementType<ReadDataT>::type;
+  using ReadDataType = typename ReadDataT::element_type;
   bool CorrectData = false;
   if ((ReadData.x() == ExpectedColor.x()) &&
       (ReadData.y() == ExpectedColor.y()) &&

From db07a26bc5882ee4421c117f92a5f4b82feaf06c Mon Sep 17 00:00:00 2001
From: aelovikov-intel <andrei.elovikov@intel.com>
Date: Tue, 20 Feb 2024 16:22:47 -0800
Subject: [PATCH 05/30] [SYCL][E2E] Remove XFAIL from
 Basic/accessor/empty_zero_dim_accessor.cpp (#12773)

The behavior seems to have changed after
https://github.com/intel/llvm/pull/12764.
---
 sycl/test-e2e/Basic/accessor/empty_zero_dim_accessor.cpp | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/sycl/test-e2e/Basic/accessor/empty_zero_dim_accessor.cpp b/sycl/test-e2e/Basic/accessor/empty_zero_dim_accessor.cpp
index 8cdc02f736d68..62f61136f024f 100644
--- a/sycl/test-e2e/Basic/accessor/empty_zero_dim_accessor.cpp
+++ b/sycl/test-e2e/Basic/accessor/empty_zero_dim_accessor.cpp
@@ -1,9 +1,6 @@
 // RUN: %{build} -o %t.out
 // RUN: %{run} %t.out
 
-// https://github.com/intel/llvm/issues/11434
-// XFAIL: gpu-intel-dg2
-
 // Tests the size and iterator members of an empty zero-dimensional accessor.
 
 #include <sycl/sycl.hpp>

From 16e06ffb4ec47ba3ff5c291a01b4f9fc8fe692a6 Mon Sep 17 00:00:00 2001
From: Srividya Sundaram <srividya.sundaram@intel.com>
Date: Tue, 20 Feb 2024 23:30:53 -0800
Subject: [PATCH 06/30] [Driver] Save PTX files for SYCL kernels in the user
 input directory. (#12422)

Save `PTX` files generated (for `SYCL` kernels) during PTX target
processing for CUDA backend using the `-fsycl-dump-device-code` option.

Example usage:
`clang++ -fsycl -fsycl-targets=nvptx64-nvidia-cuda
-fsycl-dump-device-code=/path/to/ptx syclfile.cpp`

The `PTX` files (`.s` files for each kernel) will be saved under
`/path/to/ptx`
---
 clang/lib/Driver/Compilation.cpp     | 15 ++++++----
 clang/lib/Driver/ToolChains/SYCL.cpp | 20 +++++++++++++
 clang/test/Driver/save-ptx-files.cpp | 43 ++++++++++++++++++++++++++++
 3 files changed, 73 insertions(+), 5 deletions(-)
 create mode 100644 clang/test/Driver/save-ptx-files.cpp

diff --git a/clang/lib/Driver/Compilation.cpp b/clang/lib/Driver/Compilation.cpp
index 6c728f316fc89..9562a2cc2cc83 100644
--- a/clang/lib/Driver/Compilation.cpp
+++ b/clang/lib/Driver/Compilation.cpp
@@ -185,17 +185,22 @@ bool Compilation::CleanupFile(const char *File, bool IssueErrors) const {
   // able to remove), or non-regular files. Underlying tools may have
   // intentionally not overwritten them.
 
-  // Save the device code files(spv files) only if -fsycl-dump-device-code
-  // option is enabled.
+  // Save the device code files if -fsycl-dump-device-code option is enabled.
   if (TheDriver.isDumpDeviceCodeEnabled()) {
     Arg *DumpDeviceCodeArg =
         getArgs().getLastArg(options::OPT_fsycl_dump_device_code_EQ);
     std::string ExpectedDir =
         DumpDeviceCodeArg ? DumpDeviceCodeArg->getValue() : "";
     std::string ActualFile(File);
-    if (ActualFile.find(ExpectedDir) != std::string::npos &&
-        llvm::sys::path::extension(ActualFile).equals(".spv"))
-      return false;
+
+    if (ActualFile.find(ExpectedDir) != std::string::npos) {
+      // Save PTX files generated by LLVM NVPTX Back-End,
+      // when the nvptx*-nvidia-cuda is passed to -fsycl-targets.
+      if (DefaultToolChain.getTriple().isNVPTX())
+        return false;
+      if (llvm::sys::path::extension(ActualFile).equals(".spv"))
+        return false;
+    }
   }
 
   if (!llvm::sys::fs::can_write(File) || !llvm::sys::fs::is_regular_file(File))
diff --git a/clang/lib/Driver/ToolChains/SYCL.cpp b/clang/lib/Driver/ToolChains/SYCL.cpp
index 257a85cda126c..b3545c7241346 100644
--- a/clang/lib/Driver/ToolChains/SYCL.cpp
+++ b/clang/lib/Driver/ToolChains/SYCL.cpp
@@ -111,6 +111,26 @@ void SYCL::constructLLVMForeachCommand(Compilation &C, const JobAction &JA,
         C.getArgs().MakeArgString("--out-dir=" + OutputDirName));
   }
 
+  // If -fsycl-dump-device-code is passed, put the PTX files
+  // into the path provided in -fsycl-dump-device-code.
+  if (T->getToolChain().getTriple().isNVPTX() &&
+      C.getDriver().isDumpDeviceCodeEnabled() && Ext.equals("s")) {
+    SmallString<128> OutputDir;
+
+    Arg *DumpDeviceCodeArg =
+        C.getArgs().getLastArg(options::OPT_fsycl_dump_device_code_EQ);
+
+    OutputDir = (DumpDeviceCodeArg ? DumpDeviceCodeArg->getValue() : "");
+
+    // If the output directory path is empty, put the PTX files in the
+    // current directory.
+    if (OutputDir.empty())
+      llvm::sys::path::native(OutputDir = "./");
+    else
+      OutputDir.append(llvm::sys::path::get_separator());
+    ForeachArgs.push_back(C.getArgs().MakeArgString("--out-dir=" + OutputDir));
+  }
+
   ForeachArgs.push_back(C.getArgs().MakeArgString("--"));
   ForeachArgs.push_back(
       C.getArgs().MakeArgString(InputCommand->getExecutable()));
diff --git a/clang/test/Driver/save-ptx-files.cpp b/clang/test/Driver/save-ptx-files.cpp
new file mode 100644
index 0000000000000..d9545d16ef890
--- /dev/null
+++ b/clang/test/Driver/save-ptx-files.cpp
@@ -0,0 +1,43 @@
+// Save PTX files during PTX target processing using -fsycl-dump-device-code option.
+
+// Verify that -fsycl-dump-device-code saves PTX files in the user provided directory
+// while targeting CUDA enabled GPUs.
+
+// Linux
+// RUN: %clang -### -fsycl -fsycl-targets=nvptx64-nvidia-cuda,spir64-unknown-unknown -target x86_64-unknown-linux-gnu --cuda-path=%S/Inputs/CUDA/usr/local/cuda -fsycl-dump-device-code=/user/input/path %s 2>&1 \
+// RUN: | FileCheck %s --check-prefixes=CHECK-PTX-FILES,CHECK-SPIRV-FILES
+
+// clang --driver-mode=g++
+// RUN: %clangxx -### -fsycl  -fsycl-targets=nvptx64-nvidia-cuda -target x86_64-unknown-linux-gnu --cuda-path=%S/Inputs/CUDA/usr/local/cuda -fsycl-dump-device-code=/user/input/path %s 2>&1 \
+// RUN: | FileCheck %s --check-prefixes=CHECK-PTX-FILES
+
+// RUN: %clang -### -fsycl -fsycl-targets=nvptx64-nvidia-cuda,spir64-unknown-unknown -target x86_64-unknown-linux-gnu --cuda-path=%S/Inputs/CUDA/usr/local/cuda -fsycl-dump-device-code= %s 2>&1 \
+// RUN: | FileCheck %s --check-prefixes=CHECK-PTX-FILES-CWD,CHECK-SPIRV-FILES-CWD
+
+// CHECK-PTX-FILES: llvm-foreach{{.*}} "--out-ext=s"{{.*}} "--out-dir=/user/input/path{{(/|\\\\)}}" "--" "{{.*}}clang{{.*}}" {{.*}} "-fsycl-is-device" {{.*}}.s{{.*}}
+// CHECK-SPIRV-FILES: llvm-foreach{{.*}} "--out-dir=/user/input/path{{(/|\\\\)}}" "--" "{{.*}}llvm-spirv"
+// CHECK-PTX-FILES-CWD: llvm-foreach{{.*}} "--out-ext=s"{{.*}} "--out-dir=.{{(/|\\\\)}}" "--" "{{.*}}clang{{.*}}" {{.*}} "-fsycl-is-device"
+// CHECK-SPIRV-FILES-CWD: llvm-foreach{{.*}} "--out-dir=.{{(/|\\\\)}}" "--" "{{.*}}llvm-spirv"
+
+// Windows - Check if PTX files are saved in the user provided path.
+// RUN: %clang_cl -### -fsycl \
+// RUN: -fsycl-targets=nvptx64-nvidia-cuda --cuda-path=%S/Inputs/CUDA/usr/local/cuda \
+// RUN: -fsycl-dump-device-code=/user/input/path %s 2>&1 \
+// RUN: | FileCheck -check-prefix=CHECK-PTX-WIN %s
+
+// Windows - Check if PTX and SPV files are saved in user provided path.
+// RUN: %clang_cl -### -fsycl \
+// RUN: -fsycl-targets=nvptx64-nvidia-cuda,spir64-unknown-unknown --cuda-path=%S/Inputs/CUDA/usr/local/cuda \
+// RUN: -fsycl-dump-device-code=/user/input/path %s 2>&1 \
+// RUN: | FileCheck -check-prefixes=CHECK-PTX-WIN,CHECK-SPV-WIN %s
+
+// Windows - Check PTX files saved in current working directory when -fsycl-dump-device-code
+// is empty. 
+// RUN: %clang_cl -### -fsycl \
+// RUN: -fsycl-targets=nvptx64-nvidia-cuda --cuda-path=%S/Inputs/CUDA/usr/local/cuda \
+// RUN: -fsycl-dump-device-code= %s 2>&1 \
+// RUN: | FileCheck -check-prefix=CHECK-PTX-WIN-CWD %s
+
+// CHECK-PTX-WIN: llvm-foreach{{.*}} "--out-ext=s"{{.*}} "--out-dir=/user/input/path{{(/|\\\\)}}" "--" "{{.*}}clang{{.*}}" {{.*}} "-fsycl-is-device" {{.*}}.asm{{.*}}
+// CHECK-PTX-WIN-CWD: llvm-foreach{{.*}} "--out-ext=s"{{.*}} "--out-dir=.{{(/|\\\\)}}" "--" "{{.*}}clang{{.*}}" {{.*}} "-fsycl-is-device" {{.*}}.asm{{.*}}
+// CHECK-SPV-WIN:  llvm-foreach{{.*}} "--out-dir=/user/input/path{{(/|\\\\)}}" "--" "{{.*}}llvm-spirv"

From 04a222f7bb3022f3623ad40c9de70fd97579061a Mon Sep 17 00:00:00 2001
From: Dounia Khaldi <dounia.khaldi@intel.com>
Date: Wed, 21 Feb 2024 03:26:31 -0600
Subject: [PATCH 07/30] [SYCL][Matrix spec] Add joint_matrix_prefetch and
 overloads of load/store with annotated_ptr (#11473)

---
 .../sycl_ext_intel_matrix.asciidoc            | 19 +++-
 .../sycl_ext_oneapi_matrix.asciidoc           | 98 +++++++++++++++++++
 2 files changed, 115 insertions(+), 2 deletions(-)

diff --git a/sycl/doc/extensions/experimental/sycl_ext_matrix/sycl_ext_intel_matrix.asciidoc b/sycl/doc/extensions/experimental/sycl_ext_matrix/sycl_ext_intel_matrix.asciidoc
index 06784706ea157..b76a8b2292f78 100644
--- a/sycl/doc/extensions/experimental/sycl_ext_matrix/sycl_ext_intel_matrix.asciidoc
+++ b/sycl/doc/extensions/experimental/sycl_ext_matrix/sycl_ext_intel_matrix.asciidoc
@@ -148,14 +148,28 @@ template <typename Group, typename T, size_t Rows, size_t Cols,
           access::decorated IsDecorated>
 void joint_matrix_store(Group g,
     const joint_matrix<Group, T, use::a, Rows, Cols, Layout> &res,
-    multi_ptr<T, Space, IsDecorated> src, size_t stride);
+    multi_ptr<T, Space, IsDecorated> dest, size_t stride);
 
 template <typename Group, typename T, size_t Rows, size_t Cols,
           layout Layout, access::address_space Space,
           access::decorated IsDecorated>
 void joint_matrix_store(Group g,
     const joint_matrix<Group, T, use::b, Rows, Cols, Layout> &res,
-    multi_ptr<T, Space, IsDecorated> src, size_t stride);
+    multi_ptr<T, Space, IsDecorated> dest, size_t stride);
+
+template <typename Group, typename T, size_t Rows, size_t Cols,
+          layout Layout, typename PropertyListT>
+void joint_matrix_store(Group g,
+    const joint_matrix<Group, T, use::a, Rows, Cols, Layout> &res,
+    ext::oneapi::experimental::annotated_ptr<T, PropertyListT> dest,
+    size_t stride);
+
+template <typename Group, typename T, size_t Rows, size_t Cols,
+          layout Layout, typename PropertyListT>
+void joint_matrix_store(Group g,
+    const joint_matrix<Group, T, use::b, Rows, Cols, Layout> &res,
+    ext::oneapi::experimental::annotated_ptr<T, PropertyListT> dest,
+    size_t stride);
 
 } // namespace sycl::ext::intel::experimental::matrix
 ```
@@ -327,6 +341,7 @@ q.submit([&](sycl::handler& cgh) {
 });
 q.wait();
 ```
+
 == Revision History
 
 [frame="none",options="header"]
diff --git a/sycl/doc/extensions/experimental/sycl_ext_matrix/sycl_ext_oneapi_matrix.asciidoc b/sycl/doc/extensions/experimental/sycl_ext_matrix/sycl_ext_oneapi_matrix.asciidoc
index f7d916a05144f..1af6f7a72de88 100644
--- a/sycl/doc/extensions/experimental/sycl_ext_matrix/sycl_ext_oneapi_matrix.asciidoc
+++ b/sycl/doc/extensions/experimental/sycl_ext_matrix/sycl_ext_oneapi_matrix.asciidoc
@@ -228,6 +228,23 @@ void joint_matrix_load(Group g,
     joint_matrix<Group, T1, Use, Rows, Cols, Layout> &res,
     multi_ptr<T2, Space, IsDecorated> src, size_t stride);
 
+// Only available when std::is_same_v<T1, std::remove_const_t<T2>>
+template <typename Group, typename T1, typename T2,
+          size_t Rows, size_t Cols,
+          typename PropertyListT>
+void joint_matrix_load(Group g,
+    joint_matrix<Group, T1, use::accumulator, Rows, Cols, layout::dynamic> &res,
+    annotated_ptr<T2, PropertyListT> src, size_t stride, layout Layout);
+
+// Only available when Layout != layout::dynamic
+// and when std::is_same_v<T1, std::remove_const_t<T2>>
+template <typename Group, typename T1, typename T2,
+          size_t Rows, size_t Cols, use Use, layout Layout,
+          typename PropertyListT>
+void joint_matrix_load(Group g,
+    joint_matrix<Group, T1, Use, Rows, Cols, Layout> &res,
+    annotated_ptr<T2, PropertyListT> src, size_t stride);
+
 } // namespace sycl::ext::oneapi::experimental::matrix
 ```
 
@@ -248,6 +265,33 @@ fashion. `stride` describes the number of elements between consecutive
 rows for the row major layout, or between columns for the column major
 layout.
 
+The last two overloads of `joint_matrix_load` take
+`sycl::ext::oneapi::experimental::annotated_ptr` as argument instead
+of `sycl::multi_ptr`. The property list associated with the
+`annotated_ptr` argument represents the compile-time constant
+properties for cache control included in the SYCL extension
+link:../../proposed/sycl_ext_intel_cache_controls.asciidoc[sycl_ext_intel_cache_controls]
+as illustrated in the example below.
+
+```c++
+namespace syclex = sycl::ext::oneapi::experimental;
+namespace syclintelex = sycl::ext::intel::experimental;
+
+auto A_ptr = syclex::annotated_ptr{A,
+               syclex::properties{syclintelex::read_hint<
+                   syclintelex::cache_control<syclintelex::cache_mode::cached,
+                                              syclex::cache_level::L2>>}};
+q.parallel_for(nd_range<2>(G, L), [=](nd_item<2> it) {
+  sub_group sg = it.get_sub_group();
+  joint_matrix<sub_group, bfloat16, use::a, tM, tK, layout::row_major> tA;
+  for (int k = 0; k < K; k += tileK) {
+    // User specifies that this load will be cached to L2
+    joint_matrix_load(sg, tA, A_ptr + sg_startx * tM * K + k, K);
+    ...
+  }
+});
+```
+
 ==== Store
 ```c++
 namespace sycl::ext::oneapi::experimental::matrix {
@@ -259,6 +303,12 @@ void joint_matrix_store(Group g,
    const joint_matrix<Group, T1, use::accumulator, Rows, Cols, layout::dynamic> &res,
    multi_ptr<T2, Space, IsDecorated> dest, size_t stride, layout Layout);
 
+template <typename Group, typename T1, typename T2, size_t Rows, size_t Cols,
+          typename PropertyListT>
+void joint_matrix_store(Group g,
+   const joint_matrix<Group, T1, use::accumulator, Rows, Cols, layout::dynamic> &res,
+   annotated_ptr<T2, PropertyListT> dest, size_t stride, layout Layout);
+
 } // namespace sycl::ext::oneapi::experimental::matrix
 ```
 This function stores the data in the accumulator matrix from the
@@ -270,6 +320,11 @@ written in a row (`row_major`), column major (`col_major`)
 fashion. `stride` describes the number of elements between consecutive
 rows for the row major layout, or between columns for the column major layout.
 
+The second overload of `joint_matrix_store` takes
+`sycl::ext::oneapi::experimental::annotated_ptr` as argument instead
+of `sycl::multi_ptr`. The property list associated with the
+`annotated_ptr` argument represents the compile-time constant
+properties for cache control included in the SYCL extension link:../../proposed/sycl_ext_intel_cache_controls.asciidoc[sycl_ext_intel_cache_controls].
 
 ==== Multiply and Add
 
@@ -372,6 +427,47 @@ joint_matrix_apply(sg, C, [=](T &x) {
 });
 ```
 
+==== Prefetch
+
+```c++
+namespace sycl::ext::oneapi::experimental::matrix {
+
+template <size_t Rows, size_t Cols, typename Group, typename T,
+          typename Properties = empty_properties_t>
+void joint_matrix_prefetch(Group g, T* ptr, size_t stride, layout Layout,
+                           Properties properties = {});
+
+} // namespace sycl::ext::oneapi::experimental::matrix
+```
+
+`joint_matrix_prefetch` allows groups of work-items to cooperatively
+prefetch `Rows x Cols` elements in a 2d manner. This function is a group
+function, as defined in Section 4.17.3 of the core SYCL
+specification.
+
+The level of cache targeted by `joint_matrix_prefetch` is specified
+in the last argument using the compile-time properties defined in the
+SYCL extension
+link:../../proposed/sycl_ext_oneapi_prefetch.asciidoc[sycl_ext_oneapi_prefetch]
+as illustrated in the example below. When no cache levels are
+specified, the default behavior is to prefetch into the lowest level
+cache (i.e. L1).
+
+```c++
+namespace syclex = sycl::ext::oneapi::experimental;
+
+bfloat16 *memA = malloc_shared<bfloat16>(M*K, q);
+q.parallel_for(nd_range<2>(G, L), [=](nd_item<2> it) {
+  sub_group sg = it.get_sub_group();
+  for (int k = 0; k < K; k += tileK) {
+    syclex::matrix::joint_matrix_prefetch<tM, tK>(sg, memA + tM * K + tK, K,
+                                  layout::row_major,
+                                  syclex::properties{syclex::prefetch_hint_L2});
+    ...
+  }
+});
+```
+
 === Support for Machine Learning Types
 Some devices support special matrix element types that are commonly
 used in machine learning algorithms.
@@ -1035,4 +1131,6 @@ and Intel XMX
 |8   |2023-10-05 |Mahmoud Moadeli |Add AMD Matrix Core supported combinations
 |9   |2023-11-13 |Dounia Khaldi |Add Granite Rapids Intel AMX
 supported combinations
+|10  |2023-12-04 |Dounia Khaldi |Add prefetch and `annotated_ptr`
+load/store overloads
 |======================

From 3031733b28ad378bc80406cd5dde4736f5f6681c Mon Sep 17 00:00:00 2001
From: Sergey Semenov <sergey.semenov@intel.com>
Date: Wed, 21 Feb 2024 03:46:56 -0800
Subject: [PATCH 08/30] [SYCL] Fix a stream related in-order queue hang
 (#12761)

Now that submitting a task to an in-order queue can lock its main mutex,
the vector of stream service events (which can be accessed during that
same submission) should use a different mutex to avoid deadlocks.
---
 sycl/source/detail/queue_impl.cpp         |  2 +-
 sycl/source/detail/queue_impl.hpp         |  3 ++-
 sycl/test-e2e/Basic/stream/auto_flush.cpp | 16 ++++++++++++----
 3 files changed, 15 insertions(+), 6 deletions(-)

diff --git a/sycl/source/detail/queue_impl.cpp b/sycl/source/detail/queue_impl.cpp
index a24905d4214da..eb0e274a191fb 100644
--- a/sycl/source/detail/queue_impl.cpp
+++ b/sycl/source/detail/queue_impl.cpp
@@ -510,7 +510,7 @@ void queue_impl::wait(const detail::code_location &CodeLoc) {
 
   std::vector<EventImplPtr> StreamsServiceEvents;
   {
-    std::lock_guard<std::mutex> Lock(MMutex);
+    std::lock_guard<std::mutex> Lock(MStreamsServiceEventsMutex);
     StreamsServiceEvents.swap(MStreamsServiceEvents);
   }
   for (const EventImplPtr &Event : StreamsServiceEvents)
diff --git a/sycl/source/detail/queue_impl.hpp b/sycl/source/detail/queue_impl.hpp
index 0fe4242cc9472..12f440c5fc9c9 100644
--- a/sycl/source/detail/queue_impl.hpp
+++ b/sycl/source/detail/queue_impl.hpp
@@ -689,7 +689,7 @@ class queue_impl {
 #endif
 
   void registerStreamServiceEvent(const EventImplPtr &Event) {
-    std::lock_guard<std::mutex> Lock(MMutex);
+    std::lock_guard<std::mutex> Lock(MStreamsServiceEventsMutex);
     MStreamsServiceEvents.push_back(Event);
   }
 
@@ -945,6 +945,7 @@ class queue_impl {
   const bool MIsInorder;
 
   std::vector<EventImplPtr> MStreamsServiceEvents;
+  std::mutex MStreamsServiceEventsMutex;
 
   // All member variable defined here  are needed for the SYCL instrumentation
   // layer. Do not guard these variables below with XPTI_ENABLE_INSTRUMENTATION
diff --git a/sycl/test-e2e/Basic/stream/auto_flush.cpp b/sycl/test-e2e/Basic/stream/auto_flush.cpp
index 13ac298c39acd..e48811d792656 100644
--- a/sycl/test-e2e/Basic/stream/auto_flush.cpp
+++ b/sycl/test-e2e/Basic/stream/auto_flush.cpp
@@ -15,17 +15,25 @@
 
 using namespace sycl;
 
-int main() {
-  queue Queue;
+// Test that data is flushed to the buffer at the end of kernel execution even
+// without explicit flush
 
-  // Test that data is flushed to the buffer at the end of kernel execution even
-  // without explicit flush
+void test(queue &Queue) {
   Queue.submit([&](handler &CGH) {
     stream Out(1024, 80, CGH);
     CGH.parallel_for<class auto_flush1>(
         range<1>(2), [=](id<1> i) { Out << "Hello World!\n"; });
   });
   Queue.wait();
+}
+int main() {
+  queue Queue;
+  test(Queue);
+  // CHECK: Hello World!
+  // CHECK-NEXT: Hello World!
+
+  queue InOrderQueue{{sycl::property::queue::in_order()}};
+  test(InOrderQueue);
   // CHECK: Hello World!
   // CHECK-NEXT: Hello World!
 

From 6b5cb127274d1d4b35563a6afcac4c128d0397fc Mon Sep 17 00:00:00 2001
From: Sergey Semenov <sergey.semenov@intel.com>
Date: Wed, 21 Feb 2024 03:51:40 -0800
Subject: [PATCH 09/30] [SYCL] Make queue hash dependent on its unique ID
 (#12578)

Using implementation pointer as the hash can lead to situations where
queues consistently get assigned the same hash as a queue that has been
destroyed (due to being allocated at the same address). In some cases,
this can cause consistent hash collisions if a library uses these hashes
as a fast way to detect a different queue being passed to it.
---
 sycl/include/sycl/queue.hpp             |  7 +--
 sycl/source/ld-version-script.txt       |  3 +
 sycl/source/queue.cpp                   |  7 +++
 sycl/test/abi/sycl_symbols_linux.dump   |  1 +
 sycl/test/abi/sycl_symbols_windows.dump | 79 +++++++++++++------------
 sycl/unittests/queue/CMakeLists.txt     |  1 +
 sycl/unittests/queue/Hash.cpp           | 14 +++++
 7 files changed, 69 insertions(+), 43 deletions(-)
 create mode 100644 sycl/unittests/queue/Hash.cpp

diff --git a/sycl/include/sycl/queue.hpp b/sycl/include/sycl/queue.hpp
index b693cbaae62c8..506618f945c4a 100644
--- a/sycl/include/sycl/queue.hpp
+++ b/sycl/include/sycl/queue.hpp
@@ -2983,11 +2983,8 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase<queue> {
 } // namespace sycl
 
 namespace std {
-template <> struct hash<sycl::queue> {
-  size_t operator()(const sycl::queue &Q) const {
-    return std::hash<std::shared_ptr<sycl::detail::queue_impl>>()(
-        sycl::detail::getSyclObjImpl(Q));
-  }
+template <> struct __SYCL_EXPORT hash<sycl::queue> {
+  size_t operator()(const sycl::queue &Q) const;
 };
 } // namespace std
 
diff --git a/sycl/source/ld-version-script.txt b/sycl/source/ld-version-script.txt
index a169f320a627f..61e2e6a874406 100644
--- a/sycl/source/ld-version-script.txt
+++ b/sycl/source/ld-version-script.txt
@@ -27,6 +27,9 @@
     __sycl_register_lib;
     __sycl_unregister_lib;
 
+    /* Export std::hash specializations */
+    _ZNKSt4hashIN4sycl3_V15queueEEclERKS2_;
+
   local:
     *;
 };
diff --git a/sycl/source/queue.cpp b/sycl/source/queue.cpp
index 1b877a31da4e0..2dd794dcd40ea 100644
--- a/sycl/source/queue.cpp
+++ b/sycl/source/queue.cpp
@@ -360,3 +360,10 @@ void queue::ext_oneapi_set_external_event(const event &external_event) {
 
 } // namespace _V1
 } // namespace sycl
+
+size_t std::hash<sycl::queue>::operator()(const sycl::queue &Q) const {
+  // Compared to using the impl pointer, the unique ID helps avoid hash
+  // collisions with previously destroyed queues.
+  return std::hash<unsigned long long>()(
+      sycl::detail::getSyclObjImpl(Q)->getQueueID());
+}
diff --git a/sycl/test/abi/sycl_symbols_linux.dump b/sycl/test/abi/sycl_symbols_linux.dump
index edc3d9b41a00d..c6f7422a4808e 100644
--- a/sycl/test/abi/sycl_symbols_linux.dump
+++ b/sycl/test/abi/sycl_symbols_linux.dump
@@ -4792,5 +4792,6 @@ _ZNK4sycl3_V19exception4codeEv
 _ZNK4sycl3_V19exception4whatEv
 _ZNK4sycl3_V19exception8categoryEv
 _ZNK4sycl3_V19kernel_id8get_nameEv
+_ZNKSt4hashIN4sycl3_V15queueEEclERKS2_
 __sycl_register_lib
 __sycl_unregister_lib
diff --git a/sycl/test/abi/sycl_symbols_windows.dump b/sycl/test/abi/sycl_symbols_windows.dump
index 98be6da4c5f37..1c4b29549b5c1 100644
--- a/sycl/test/abi/sycl_symbols_windows.dump
+++ b/sycl/test/abi/sycl_symbols_windows.dump
@@ -705,6 +705,8 @@
 ??4?$OwnerLessBase@Vqueue@_V1@sycl@@@detail@_V1@sycl@@QEAAAEAV0123@AEBV0123@@Z
 ??4?$OwnerLessBase@Vstream@_V1@sycl@@@detail@_V1@sycl@@QEAAAEAV0123@$$QEAV0123@@Z
 ??4?$OwnerLessBase@Vstream@_V1@sycl@@@detail@_V1@sycl@@QEAAAEAV0123@AEBV0123@@Z
+??4?$hash@Vqueue@_V1@sycl@@@std@@QEAAAEAU01@$$QEAU01@@Z
+??4?$hash@Vqueue@_V1@sycl@@@std@@QEAAAEAU01@AEBU01@@Z
 ??4AccessorBaseHost@detail@_V1@sycl@@QEAAAEAV0123@$$QEAV0123@@Z
 ??4AccessorBaseHost@detail@_V1@sycl@@QEAAAEAV0123@AEBV0123@@Z
 ??4AccessorImplHost@detail@_V1@sycl@@QEAAAEAV0123@AEBV0123@@Z
@@ -810,6 +812,7 @@
 ??Fhalf@host_half_impl@detail@_V1@sycl@@QEAA?AV01234@H@Z
 ??Fhalf@host_half_impl@detail@_V1@sycl@@QEAAAEAV01234@XZ
 ??Ghalf@host_half_impl@detail@_V1@sycl@@QEAAAEAV01234@XZ
+??R?$hash@Vqueue@_V1@sycl@@@std@@QEBA_KAEBVqueue@_V1@sycl@@@Z
 ??Raccelerator_selector@_V1@sycl@@UEBAHAEBVdevice@12@@Z
 ??Rcpu_selector@_V1@sycl@@UEBAHAEBVdevice@12@@Z
 ??Rdefault_selector@_V1@sycl@@UEBAHAEBVdevice@12@@Z
@@ -1011,6 +1014,7 @@
 ?ext_codeplay_supports_fusion@queue@_V1@sycl@@QEBA_NXZ
 ?ext_intel_read_host_pipe@handler@_V1@sycl@@AEAAXAEBV?$basic_string@DU?$char_traits@D@std@@V?$allocator@D@2@@std@@PEAX_K_N@Z
 ?ext_intel_write_host_pipe@handler@_V1@sycl@@AEAAXAEBV?$basic_string@DU?$char_traits@D@std@@V?$allocator@D@2@@std@@PEAX_K_N@Z
+?ext_oneapi_advise_usm_cmd_buffer@MemoryManager@detail@_V1@sycl@@SAXV?$shared_ptr@Vcontext_impl@detail@_V1@sycl@@@std@@PEAU_pi_ext_command_buffer@@PEBX_KW4_pi_mem_advice@@V?$vector@IV?$allocator@I@std@@@6@PEAI@Z
 ?ext_oneapi_architecture_is@device@_V1@sycl@@QEAA_NW4architecture@experimental@oneapi@ext@23@@Z
 ?ext_oneapi_barrier@handler@_V1@sycl@@QEAAXAEBV?$vector@Vevent@_V1@sycl@@V?$allocator@Vevent@_V1@sycl@@@std@@@std@@@Z
 ?ext_oneapi_barrier@handler@_V1@sycl@@QEAAXXZ
@@ -1043,8 +1047,6 @@
 ?ext_oneapi_copyD2H_cmd_buffer@MemoryManager@detail@_V1@sycl@@SAXV?$shared_ptr@Vcontext_impl@detail@_V1@sycl@@@std@@PEAU_pi_ext_command_buffer@@PEAVSYCLMemObjI@234@PEAXIV?$range@$02@34@4V?$id@$02@34@IPEADI45IV?$vector@IV?$allocator@I@std@@@6@PEAI@Z
 ?ext_oneapi_copyH2D_cmd_buffer@MemoryManager@detail@_V1@sycl@@SAXV?$shared_ptr@Vcontext_impl@detail@_V1@sycl@@@std@@PEAU_pi_ext_command_buffer@@PEAVSYCLMemObjI@234@PEADIV?$range@$02@34@V?$id@$02@34@IPEAXI445IV?$vector@IV?$allocator@I@std@@@6@PEAI@Z
 ?ext_oneapi_copy_usm_cmd_buffer@MemoryManager@detail@_V1@sycl@@SAXV?$shared_ptr@Vcontext_impl@detail@_V1@sycl@@@std@@PEBXPEAU_pi_ext_command_buffer@@_KPEAXV?$vector@IV?$allocator@I@std@@@6@PEAI@Z
-?ext_oneapi_prefetch_usm_cmd_buffer@MemoryManager@detail@_V1@sycl@@SAXV?$shared_ptr@Vcontext_impl@detail@_V1@sycl@@@std@@PEAU_pi_ext_command_buffer@@PEAX_KV?$vector@IV?$allocator@I@std@@@6@PEAI@Z
-?ext_oneapi_advise_usm_cmd_buffer@MemoryManager@detail@_V1@sycl@@SAXV?$shared_ptr@Vcontext_impl@detail@_V1@sycl@@@std@@PEAU_pi_ext_command_buffer@@PEBX_KW4_pi_mem_advice@@V?$vector@IV?$allocator@I@std@@@6@PEAI@Z
 ?ext_oneapi_disable_peer_access@device@_V1@sycl@@QEAAXAEBV123@@Z
 ?ext_oneapi_empty@queue@_V1@sycl@@QEBA_NXZ
 ?ext_oneapi_enable_peer_access@device@_V1@sycl@@QEAAXAEBV123@@Z
@@ -1053,9 +1055,9 @@
 ?ext_oneapi_fill_usm_cmd_buffer@MemoryManager@detail@_V1@sycl@@SAXV?$shared_ptr@Vcontext_impl@detail@_V1@sycl@@@std@@PEAU_pi_ext_command_buffer@@PEAX_KHV?$vector@IV?$allocator@I@std@@@6@PEAI@Z
 ?ext_oneapi_get_composite_devices@platform@_V1@sycl@@QEBA?AV?$vector@Vdevice@_V1@sycl@@V?$allocator@Vdevice@_V1@sycl@@@std@@@std@@XZ
 ?ext_oneapi_get_default_context@platform@_V1@sycl@@QEBA?AVcontext@23@XZ
+?ext_oneapi_get_graph@queue@_V1@sycl@@QEBA?AV?$command_graph@$0A@@experimental@oneapi@ext@23@XZ
 ?ext_oneapi_get_kernel@kernel_bundle_plain@detail@_V1@sycl@@QEAA?AVkernel@34@AEBV?$basic_string@DU?$char_traits@D@std@@V?$allocator@D@2@@std@@@Z
 ?ext_oneapi_get_last_event@queue@_V1@sycl@@QEBA?AVevent@23@XZ
-?ext_oneapi_get_graph@queue@_V1@sycl@@QEBA?AV?$command_graph@$0A@@experimental@oneapi@ext@23@XZ
 ?ext_oneapi_get_state@queue@_V1@sycl@@QEBA?AW4queue_state@experimental@oneapi@ext@23@XZ
 ?ext_oneapi_graph@handler@_V1@sycl@@QEAAXV?$command_graph@$00@experimental@oneapi@ext@23@@Z
 ?ext_oneapi_graph@queue@_V1@sycl@@QEAA?AVevent@23@V?$command_graph@$00@experimental@oneapi@ext@23@AEBUcode_location@detail@23@@Z
@@ -1080,6 +1082,7 @@
 ?ext_oneapi_owner_before@?$OwnerLessBase@Vqueue@_V1@sycl@@@detail@_V1@sycl@@QEBA_NAEBVqueue@34@@Z
 ?ext_oneapi_owner_before@?$OwnerLessBase@Vstream@_V1@sycl@@@detail@_V1@sycl@@QEBA_NAEBV?$weak_object_base@Vstream@_V1@sycl@@@2oneapi@ext@34@@Z
 ?ext_oneapi_owner_before@?$OwnerLessBase@Vstream@_V1@sycl@@@detail@_V1@sycl@@QEBA_NAEBVstream@34@@Z
+?ext_oneapi_prefetch_usm_cmd_buffer@MemoryManager@detail@_V1@sycl@@SAXV?$shared_ptr@Vcontext_impl@detail@_V1@sycl@@@std@@PEAU_pi_ext_command_buffer@@PEAX_KV?$vector@IV?$allocator@I@std@@@6@PEAI@Z
 ?ext_oneapi_set_external_event@queue@_V1@sycl@@QEAAXAEBVevent@23@@Z
 ?ext_oneapi_signal_external_semaphore@handler@_V1@sycl@@QEAAXUinterop_semaphore_handle@experimental@oneapi@ext@23@@Z
 ?ext_oneapi_signal_external_semaphore@queue@_V1@sycl@@QEAA?AVevent@23@Uinterop_semaphore_handle@experimental@oneapi@ext@23@AEBUcode_location@detail@23@@Z
@@ -1278,38 +1281,38 @@
 ?get_max_statement_size@stream@_V1@sycl@@QEBA_KXZ
 ?get_max_statement_size@stream_impl@detail@_V1@sycl@@QEBA_KXZ
 ?get_mip_level_mem_handle@experimental@oneapi@ext@_V1@sycl@@YA?AUimage_mem_handle@12345@U612345@IAEBVdevice@45@AEBVcontext@45@@Z
-?get_mip_level_mem_handle@experimental@oneapi@ext@_V1@sycl@@YA?AUimage_mem_handle@12345@U612345@IAEBVqueue@45@@Z
-?get_mip_level_mem_handle@image_mem@experimental@oneapi@ext@_V1@sycl@@QEBA?AUimage_mem_handle@23456@I@Z
-?get_name@kernel_id@_V1@sycl@@QEBAPEBDXZ
-?get_node_from_event@node@experimental@oneapi@ext@_V1@sycl@@SA?AV123456@Vevent@56@@Z
-?get_nodes@modifiable_command_graph@detail@experimental@oneapi@ext@_V1@sycl@@QEBA?AV?$vector@Vnode@experimental@oneapi@ext@_V1@sycl@@V?$allocator@Vnode@experimental@oneapi@ext@_V1@sycl@@@std@@@std@@XZ
-?get_num_channels@image_mem@experimental@oneapi@ext@_V1@sycl@@QEBAIXZ
-?get_pipe_name@pipe_base@experimental@intel@ext@_V1@sycl@@KA?AV?$basic_string@DU?$char_traits@D@std@@V?$allocator@D@2@@std@@PEBX@Z
-?get_pitch@image_impl@detail@_V1@sycl@@QEBA?AV?$range@$01@34@XZ
+?get_mip_level_mem_handle@experimental@oneapi@ext@_V1@sycl@@YA?AUimage_mem_handle@12345@U612345@IAEBVqueue@45@@Z
+?get_mip_level_mem_handle@image_mem@experimental@oneapi@ext@_V1@sycl@@QEBA?AUimage_mem_handle@23456@I@Z
+?get_name@kernel_id@_V1@sycl@@QEBAPEBDXZ
+?get_node_from_event@node@experimental@oneapi@ext@_V1@sycl@@SA?AV123456@Vevent@56@@Z
+?get_nodes@modifiable_command_graph@detail@experimental@oneapi@ext@_V1@sycl@@QEBA?AV?$vector@Vnode@experimental@oneapi@ext@_V1@sycl@@V?$allocator@Vnode@experimental@oneapi@ext@_V1@sycl@@@std@@@std@@XZ
+?get_num_channels@image_mem@experimental@oneapi@ext@_V1@sycl@@QEBAIXZ
+?get_pipe_name@pipe_base@experimental@intel@ext@_V1@sycl@@KA?AV?$basic_string@DU?$char_traits@D@std@@V?$allocator@D@2@@std@@PEBX@Z
+?get_pitch@image_impl@detail@_V1@sycl@@QEBA?AV?$range@$01@34@XZ
 ?get_pitch@image_plain@detail@_V1@sycl@@IEBA?AV?$range@$01@34@XZ
 ?get_platform@context@_V1@sycl@@QEBA?AVplatform@23@XZ
 ?get_platform@device@_V1@sycl@@QEBA?AVplatform@23@XZ
 ?get_platforms@platform@_V1@sycl@@SA?AV?$vector@Vplatform@_V1@sycl@@V?$allocator@Vplatform@_V1@sycl@@@std@@@std@@XZ
-?get_pointer_device@_V1@sycl@@YA?AVdevice@12@PEBXAEBVcontext@12@@Z
-?get_pointer_type@_V1@sycl@@YA?AW4alloc@usm@12@PEBXAEBVcontext@12@@Z
-?get_precision@stream@_V1@sycl@@QEBA_KXZ
-?get_predecessors@node@experimental@oneapi@ext@_V1@sycl@@QEBA?AV?$vector@Vnode@experimental@oneapi@ext@_V1@sycl@@V?$allocator@Vnode@experimental@oneapi@ext@_V1@sycl@@@std@@@std@@XZ
-?get_queue@fusion_wrapper@experimental@codeplay@ext@_V1@sycl@@QEBA?AVqueue@56@XZ
-?get_range@image_impl@detail@_V1@sycl@@QEBA?AV?$range@$02@34@XZ
-?get_range@image_mem@experimental@oneapi@ext@_V1@sycl@@QEBA?AV?$range@$02@56@XZ
-?get_range@image_plain@detail@_V1@sycl@@IEBA?AV?$range@$02@34@XZ
-?get_root_nodes@modifiable_command_graph@detail@experimental@oneapi@ext@_V1@sycl@@QEBA?AV?$vector@Vnode@experimental@oneapi@ext@_V1@sycl@@V?$allocator@Vnode@experimental@oneapi@ext@_V1@sycl@@@std@@@std@@XZ
-?get_size@image_plain@detail@_V1@sycl@@IEBA_KXZ
-?get_size@stream@_V1@sycl@@QEBA_KXZ
-?get_size@stream_impl@detail@_V1@sycl@@QEBA_KXZ
-?get_specialization_constant_impl@kernel_bundle_plain@detail@_V1@sycl@@IEBAXPEBDPEAX@Z
-?get_stream_mode@stream@_V1@sycl@@QEBA?AW4stream_manipulator@23@XZ
-?get_successors@node@experimental@oneapi@ext@_V1@sycl@@QEBA?AV?$vector@Vnode@experimental@oneapi@ext@_V1@sycl@@V?$allocator@Vnode@experimental@oneapi@ext@_V1@sycl@@@std@@@std@@XZ
-?get_type@image_mem@experimental@oneapi@ext@_V1@sycl@@QEBA?AW4image_type@23456@XZ
-?get_type@node@experimental@oneapi@ext@_V1@sycl@@QEBA?AW4node_type@23456@XZ
-?get_wait_list@event@_V1@sycl@@QEAA?AV?$vector@Vevent@_V1@sycl@@V?$allocator@Vevent@_V1@sycl@@@std@@@std@@XZ
-?get_width@stream@_V1@sycl@@QEBA_KXZ
-?get_work_item_buffer_size@stream@_V1@sycl@@QEBA_KXZ
+?get_pointer_device@_V1@sycl@@YA?AVdevice@12@PEBXAEBVcontext@12@@Z
+?get_pointer_type@_V1@sycl@@YA?AW4alloc@usm@12@PEBXAEBVcontext@12@@Z
+?get_precision@stream@_V1@sycl@@QEBA_KXZ
+?get_predecessors@node@experimental@oneapi@ext@_V1@sycl@@QEBA?AV?$vector@Vnode@experimental@oneapi@ext@_V1@sycl@@V?$allocator@Vnode@experimental@oneapi@ext@_V1@sycl@@@std@@@std@@XZ
+?get_queue@fusion_wrapper@experimental@codeplay@ext@_V1@sycl@@QEBA?AVqueue@56@XZ
+?get_range@image_impl@detail@_V1@sycl@@QEBA?AV?$range@$02@34@XZ
+?get_range@image_mem@experimental@oneapi@ext@_V1@sycl@@QEBA?AV?$range@$02@56@XZ
+?get_range@image_plain@detail@_V1@sycl@@IEBA?AV?$range@$02@34@XZ
+?get_root_nodes@modifiable_command_graph@detail@experimental@oneapi@ext@_V1@sycl@@QEBA?AV?$vector@Vnode@experimental@oneapi@ext@_V1@sycl@@V?$allocator@Vnode@experimental@oneapi@ext@_V1@sycl@@@std@@@std@@XZ
+?get_size@image_plain@detail@_V1@sycl@@IEBA_KXZ
+?get_size@stream@_V1@sycl@@QEBA_KXZ
+?get_size@stream_impl@detail@_V1@sycl@@QEBA_KXZ
+?get_specialization_constant_impl@kernel_bundle_plain@detail@_V1@sycl@@IEBAXPEBDPEAX@Z
+?get_stream_mode@stream@_V1@sycl@@QEBA?AW4stream_manipulator@23@XZ
+?get_successors@node@experimental@oneapi@ext@_V1@sycl@@QEBA?AV?$vector@Vnode@experimental@oneapi@ext@_V1@sycl@@V?$allocator@Vnode@experimental@oneapi@ext@_V1@sycl@@@std@@@std@@XZ
+?get_type@image_mem@experimental@oneapi@ext@_V1@sycl@@QEBA?AW4image_type@23456@XZ
+?get_type@node@experimental@oneapi@ext@_V1@sycl@@QEBA?AW4node_type@23456@XZ
+?get_wait_list@event@_V1@sycl@@QEAA?AV?$vector@Vevent@_V1@sycl@@V?$allocator@Vevent@_V1@sycl@@@std@@@std@@XZ
+?get_width@stream@_V1@sycl@@QEBA_KXZ
+?get_work_item_buffer_size@stream@_V1@sycl@@QEBA_KXZ
 ?get_work_item_buffer_size@stream_impl@detail@_V1@sycl@@QEBA_KXZ
 ?gpu_selector_v@_V1@sycl@@YAHAEBVdevice@12@@Z
 ?handleHostData@SYCLMemObjT@detail@_V1@sycl@@QEAAXAEBV?$function@$$A6AXPEAX@Z@std@@_K_N@Z
@@ -1493,13 +1496,13 @@
 ?setLocalAccessorArgHelper@handler@_V1@sycl@@AEAAXHAEAVLocalAccessorBaseHost@detail@23@@Z
 ?setPitches@image_impl@detail@_V1@sycl@@AEAAXAEBV?$range@$01@34@@Z
 ?setPitches@image_impl@detail@_V1@sycl@@AEAAXXZ
-?setStateExplicitKernelBundle@handler@_V1@sycl@@AEAAXXZ
-?setStateSpecConstSet@handler@_V1@sycl@@AEAAXXZ
-?setType@handler@_V1@sycl@@AEAAXW4CGTYPE@CG@detail@23@@Z
-?setUserFacingNodeType@handler@_V1@sycl@@AEAAXW4node_type@experimental@oneapi@ext@23@@Z
-?set_final_data@SYCLMemObjT@detail@_V1@sycl@@QEAAX$$T@Z
-?set_final_data@SYCLMemObjT@detail@_V1@sycl@@QEAAXAEBV?$function@$$A6AXAEBV?$function@$$A6AXPEAX@Z@std@@@Z@std@@@Z
-?set_final_data_from_storage@SYCLMemObjT@detail@_V1@sycl@@QEAAXXZ
+?setStateExplicitKernelBundle@handler@_V1@sycl@@AEAAXXZ
+?setStateSpecConstSet@handler@_V1@sycl@@AEAAXXZ
+?setType@handler@_V1@sycl@@AEAAXW4CGTYPE@CG@detail@23@@Z
+?setUserFacingNodeType@handler@_V1@sycl@@AEAAXW4node_type@experimental@oneapi@ext@23@@Z
+?set_final_data@SYCLMemObjT@detail@_V1@sycl@@QEAAX$$T@Z
+?set_final_data@SYCLMemObjT@detail@_V1@sycl@@QEAAXAEBV?$function@$$A6AXAEBV?$function@$$A6AXPEAX@Z@std@@@Z@std@@@Z
+?set_final_data_from_storage@SYCLMemObjT@detail@_V1@sycl@@QEAAXXZ
 ?set_final_data_internal@buffer_plain@detail@_V1@sycl@@IEAAXAEBV?$function@$$A6AXAEBV?$function@$$A6AXPEAX@Z@std@@@Z@std@@@Z
 ?set_final_data_internal@buffer_plain@detail@_V1@sycl@@IEAAXXZ
 ?set_final_data_internal@image_plain@detail@_V1@sycl@@IEAAXAEBV?$function@$$A6AXAEBV?$function@$$A6AXPEAX@Z@std@@@Z@std@@@Z
diff --git a/sycl/unittests/queue/CMakeLists.txt b/sycl/unittests/queue/CMakeLists.txt
index 4092a144d4fcd..bf2819c69833c 100644
--- a/sycl/unittests/queue/CMakeLists.txt
+++ b/sycl/unittests/queue/CMakeLists.txt
@@ -1,6 +1,7 @@
 add_sycl_unittest(QueueTests OBJECT
   DeviceCheck.cpp
   EventClear.cpp
+  Hash.cpp
   USM.cpp
   Wait.cpp
   GetProfilingInfo.cpp
diff --git a/sycl/unittests/queue/Hash.cpp b/sycl/unittests/queue/Hash.cpp
new file mode 100644
index 0000000000000..000850743c882
--- /dev/null
+++ b/sycl/unittests/queue/Hash.cpp
@@ -0,0 +1,14 @@
+#include <detail/queue_impl.hpp>
+#include <gtest/gtest.h>
+#include <helpers/PiMock.hpp>
+#include <sycl/queue.hpp>
+
+using namespace sycl;
+
+// Checks that the queue hash uses its unique ID.
+TEST(QueueHash, QueueHashUsesID) {
+  unittest::PiMock Mock;
+  queue Q;
+  unsigned long long ID = detail::getSyclObjImpl(Q)->getQueueID();
+  ASSERT_EQ(std::hash<unsigned long long>{}(ID), std::hash<queue>{}(Q));
+}

From ef6887f8221faf814e912fcce362099614c7c614 Mon Sep 17 00:00:00 2001
From: Udit Agarwal <16324601+uditagarwal97@users.noreply.github.com>
Date: Wed, 21 Feb 2024 08:08:04 -0800
Subject: [PATCH 10/30] [SYCL] Make 'supportAcc' a template variable in
 getSyclDeviceTypeMap (#12775)

The output of getSyclDeviceTypeMap() is different for
SYCL_DEVICE_ALLOWLIST and ONEAPI_DEVICE_SELECTOR. However, currently, we
return a reference to a 'static const' type, which causes calls to
getSyclDeviceTypeMap() to return the same value, irrespective of
'supportAcc'. This results in several test failures.

This PR makes 'supportAcc' a template parameter instead.
---
 sycl/source/detail/allowlist.cpp            |  4 ++--
 sycl/source/detail/config.cpp               | 17 -----------------
 sycl/source/detail/config.hpp               | 16 ++++++++++++++--
 sycl/source/detail/device_filter.cpp        | 16 ++++++++--------
 sycl/unittests/allowlist/ParseAllowList.cpp |  2 +-
 5 files changed, 25 insertions(+), 30 deletions(-)

diff --git a/sycl/source/detail/allowlist.cpp b/sycl/source/detail/allowlist.cpp
index 83309ec9f2d92..d607090b185d3 100644
--- a/sycl/source/detail/allowlist.cpp
+++ b/sycl/source/detail/allowlist.cpp
@@ -167,7 +167,7 @@ AllowListParsedT parseAllowList(const std::string &AllowListRaw) {
         // described in SyclBeMap
         ValidateEnumValues(BackendNameKeyName, getSyclBeMap());
         ValidateEnumValues(DeviceTypeKeyName,
-                           getSyclDeviceTypeMap(true /*Enable 'acc'*/));
+                           getSyclDeviceTypeMap<true /*Enable 'acc'*/>());
 
         if (Key == DeviceVendorIdKeyName) {
           // DeviceVendorId should have hex format
@@ -382,7 +382,7 @@ void applyAllowList(std::vector<sycl::detail::pi::PiDevice> &PiDevices,
         &PiDevType, nullptr);
     sycl::info::device_type DeviceType = pi::cast<info::device_type>(PiDevType);
     for (const auto &SyclDeviceType :
-         getSyclDeviceTypeMap(true /*Enable 'acc'*/)) {
+         getSyclDeviceTypeMap<true /*Enable 'acc'*/>()) {
       if (SyclDeviceType.second == DeviceType) {
         const auto &DeviceTypeValue = SyclDeviceType.first;
         DeviceDesc[DeviceTypeKeyName] = DeviceTypeValue;
diff --git a/sycl/source/detail/config.cpp b/sycl/source/detail/config.cpp
index 7ae96d42e220d..f632e3c94c234 100644
--- a/sycl/source/detail/config.cpp
+++ b/sycl/source/detail/config.cpp
@@ -161,23 +161,6 @@ void dumpConfig() {
 #undef CONFIG
 }
 
-// Array is used by SYCL_DEVICE_ALLOWLIST and ONEAPI_DEVICE_SELECTOR.
-// TODO: host device type will be removed once sycl_ext_oneapi_filter_selector
-// is removed.
-const std::array<std::pair<std::string, info::device_type>, 6> &
-getSyclDeviceTypeMap(bool supportAcc) {
-  static const std::array<std::pair<std::string, info::device_type>, 6>
-      SyclDeviceTypeMap = {
-          {{"host", info::device_type::host},
-           {"cpu", info::device_type::cpu},
-           {"gpu", info::device_type::gpu},
-           /* Duplicate entries are fine as this map is one-directional.*/
-           {supportAcc ? "acc" : "fpga", info::device_type::accelerator},
-           {"fpga", info::device_type::accelerator},
-           {"*", info::device_type::all}}};
-  return SyclDeviceTypeMap;
-}
-
 // Array is used by SYCL_DEVICE_FILTER and SYCL_DEVICE_ALLOWLIST and
 // ONEAPI_DEVICE_SELECTOR
 // TODO: Remove esimd_emulator in the next ABI breaking window.
diff --git a/sycl/source/detail/config.hpp b/sycl/source/detail/config.hpp
index 388192300585c..1fdf229860022 100644
--- a/sycl/source/detail/config.hpp
+++ b/sycl/source/detail/config.hpp
@@ -234,8 +234,20 @@ template <> class SYCLConfig<SYCL_PARALLEL_FOR_RANGE_ROUNDING_PARAMS> {
 // Array is used by SYCL_DEVICE_ALLOWLIST and ONEAPI_DEVICE_SELECTOR.
 // The 'supportAcc' parameter is used by SYCL_DEVICE_ALLOWLIST which,
 // unlike ONEAPI_DEVICE_SELECTOR, also accepts 'acc' as a valid device type.
+template <bool supportAcc = false>
 const std::array<std::pair<std::string, info::device_type>, 6> &
-getSyclDeviceTypeMap(bool supportAcc = false);
+getSyclDeviceTypeMap() {
+  static const std::array<std::pair<std::string, info::device_type>, 6>
+      SyclDeviceTypeMap = {
+          {{"host", info::device_type::host},
+           {"cpu", info::device_type::cpu},
+           {"gpu", info::device_type::gpu},
+           /* Duplicate entries are fine as this map is one-directional.*/
+           {supportAcc ? "acc" : "fpga", info::device_type::accelerator},
+           {"fpga", info::device_type::accelerator},
+           {"*", info::device_type::all}}};
+  return SyclDeviceTypeMap;
+}
 
 // Array is used by SYCL_DEVICE_FILTER and SYCL_DEVICE_ALLOWLIST and
 // ONEAPI_DEVICE_SELECTOR
@@ -512,7 +524,7 @@ template <> class SYCLConfig<SYCL_REDUCTION_PREFERRED_WORKGROUP_SIZE> {
       return Result;
 
     std::string ValueStr{ValueRaw};
-    auto DeviceTypeMap = getSyclDeviceTypeMap(true /*Enable 'acc'*/);
+    auto DeviceTypeMap = getSyclDeviceTypeMap<true /*Enable 'acc'*/>();
 
     // Iterate over all configurations.
     size_t Start = 0, End = 0;
diff --git a/sycl/source/detail/device_filter.cpp b/sycl/source/detail/device_filter.cpp
index eb3d0f83ed26e..11072eae9e610 100644
--- a/sycl/source/detail/device_filter.cpp
+++ b/sycl/source/detail/device_filter.cpp
@@ -93,12 +93,12 @@ static void Parse_ODS_Device(ods_target &Target,
   std::string_view TopDeviceStr = DeviceSubTuple[0];
 
   // Handle explicit device type (e.g. 'gpu').
-  auto DeviceTypeMap = getSyclDeviceTypeMap(
+  auto DeviceTypeMap = getSyclDeviceTypeMap<
 #ifndef __INTEL_PREVIEW_BREAKING_CHANGES
       true /*Enable 'acc'*/
 #endif
-  ); // <-- std::array<std::pair<std::string,
-     // info::device::type>>
+      >(); // <-- std::array<std::pair<std::string,
+           // info::device::type>>
 
   auto It =
       std::find_if(std::begin(DeviceTypeMap), std::end(DeviceTypeMap),
@@ -266,11 +266,11 @@ Parse_ONEAPI_DEVICE_SELECTOR(const std::string &envString) {
 std::ostream &operator<<(std::ostream &Out, const ods_target &Target) {
   Out << Target.Backend;
   if (Target.DeviceType) {
-    auto DeviceTypeMap = getSyclDeviceTypeMap(
+    auto DeviceTypeMap = getSyclDeviceTypeMap<
 #ifndef __INTEL_PREVIEW_BREAKING_CHANGES
         true /*Enable 'acc'*/
 #endif
-    );
+        >();
     auto Match = std::find_if(
         DeviceTypeMap.begin(), DeviceTypeMap.end(),
         [&](auto Pair) { return (Pair.second == Target.DeviceType); });
@@ -344,11 +344,11 @@ device_filter::device_filter(const std::string &FilterString) {
     DeviceType = info::device_type::all;
   } else {
     auto Iter = std::find_if(
-        std::begin(getSyclDeviceTypeMap(true /*Enable 'acc'*/)),
-        std::end(getSyclDeviceTypeMap(true /*Enable 'acc'*/)), FindElement);
+        std::begin(getSyclDeviceTypeMap<true /*Enable 'acc'*/>()),
+        std::end(getSyclDeviceTypeMap<true /*Enable 'acc'*/>()), FindElement);
     // If no match is found, set device_type 'all',
     // which actually means 'any device_type' will be a match.
-    if (Iter == getSyclDeviceTypeMap(true /*Enable 'acc'*/).end())
+    if (Iter == getSyclDeviceTypeMap<true /*Enable 'acc'*/>().end())
       DeviceType = info::device_type::all;
     else {
       DeviceType = Iter->second;
diff --git a/sycl/unittests/allowlist/ParseAllowList.cpp b/sycl/unittests/allowlist/ParseAllowList.cpp
index 40fbceb76616e..543436a50f96e 100644
--- a/sycl/unittests/allowlist/ParseAllowList.cpp
+++ b/sycl/unittests/allowlist/ParseAllowList.cpp
@@ -179,7 +179,7 @@ TEST(ParseAllowListTests, CheckAllValidBackendNameValuesAreProcessed) {
 TEST(ParseAllowListTests, CheckAllValidDeviceTypeValuesAreProcessed) {
   std::string AllowList;
   for (const auto &SyclDeviceType :
-       sycl::detail::getSyclDeviceTypeMap(true /*Enable 'acc'*/)) {
+       sycl::detail::getSyclDeviceTypeMap<true /*Enable 'acc'*/>()) {
     if (!AllowList.empty())
       AllowList += "|";
     AllowList += "DeviceType:" + SyclDeviceType.first;

From a4201f702b2a7e189232e0a660f592d721c4910d Mon Sep 17 00:00:00 2001
From: przemektmalon <przemek.malon@codeplay.com>
Date: Wed, 21 Feb 2024 16:28:11 +0000
Subject: [PATCH 11/30] [Bindless][Exp][NFC] Deprecate `read_image` for more
 descriptive naming (#12756)

- The `read_image` and `read_mipmap` APIs have been deprecated
- They are replaced with the more descriptive `fetch_image`,
`sample_image`, and `sample_mipmap` (for unsampled reads, sampled reads,
and mipmap sampled reads, respectively).
- This change is made in preparation for future functionality of
fetching data from sampled images.
- The reason behind this change is to avoid determining the underlying
image read operation based on the coordinate type passed, and instead
making it more transparent for the user which operation is performed
based on the name of the function.
- The extension document, bindless images headers, and all bindless
images tests have all been updated.
- The specification revision history has been updated to include a
missed changelog entry for PR https://github.com/intel/llvm/pull/12581
---
 .../sycl_ext_oneapi_bindless_images.asciidoc  |  86 ++++----
 .../sycl/ext/oneapi/bindless_images.hpp       | 195 ++++++++++++------
 .../bindless_images/mipmap/mipmap_read_1D.cpp |   8 +-
 .../bindless_images/mipmap/mipmap_read_2D.cpp |   7 +-
 .../bindless_images/mipmap/mipmap_read_3D.cpp |  10 +-
 sycl/test-e2e/bindless_images/read_1D.cpp     |   6 +-
 sycl/test-e2e/bindless_images/read_2D.cpp     |   6 +-
 .../bindless_images/read_2D_dynamic.cpp       |   7 +-
 sycl/test-e2e/bindless_images/read_3D.cpp     |   6 +-
 .../bindless_images/read_norm_types.cpp       |   6 +-
 .../test-e2e/bindless_images/read_sampled.cpp |   2 +-
 .../bindless_images/read_write_1D.cpp         |   6 +-
 .../read_write_1D_subregion.cpp               |   6 +-
 .../bindless_images/read_write_2D.cpp         |   6 +-
 .../read_write_2D_subregion.cpp               |   6 +-
 .../bindless_images/read_write_3D.cpp         |   6 +-
 .../read_write_3D_subregion.cpp               |   6 +-
 .../bindless_images/read_write_unsampled.cpp  |  24 +--
 sycl/test-e2e/bindless_images/sampling_1D.cpp |   4 +-
 sycl/test-e2e/bindless_images/sampling_2D.cpp |   6 +-
 .../sampling_2D_USM_shared.cpp                |   4 +-
 .../bindless_images/sampling_2D_half.cpp      |   4 +-
 sycl/test-e2e/bindless_images/sampling_3D.cpp |   4 +-
 .../sampling_unique_addr_modes.cpp            |   4 +-
 .../user_types/mipmap_read_user_type_2D.cpp   |   4 +-
 .../user_types/read_write_user_type.cpp       |   6 +-
 .../vulkan_interop/sampled_images.cpp         |   8 +-
 .../vulkan_interop/unsampled_images.cpp       |  20 +-
 sycl/test/extensions/bindless_images.cpp      |   2 +-
 29 files changed, 272 insertions(+), 193 deletions(-)

diff --git a/sycl/doc/extensions/experimental/sycl_ext_oneapi_bindless_images.asciidoc b/sycl/doc/extensions/experimental/sycl_ext_oneapi_bindless_images.asciidoc
index 28ceead32b48b..7616890d25674 100644
--- a/sycl/doc/extensions/experimental/sycl_ext_oneapi_bindless_images.asciidoc
+++ b/sycl/doc/extensions/experimental/sycl_ext_oneapi_bindless_images.asciidoc
@@ -986,11 +986,11 @@ listed above caused the failure.
 namespace sycl::ext::oneapi::experimental {
 
 template <typename DataT, typename HintT = DataT, typename CoordT>
-DataT read_image(const unsampled_image_handle &ImageHandle,
-                 const CoordT &Coords);
+DataT fetch_image(const unsampled_image_handle &ImageHandle,
+                  const CoordT &Coords);
 template <typename DataT, typename HintT = DataT, typename CoordT>
-DataT read_image(const sampled_image_handle &ImageHandle, 
-                 const CoordT &Coords);
+DataT sample_image(const sampled_image_handle &ImageHandle, 
+                   const CoordT &Coords);
 
 template <typename DataT, typename CoordT>
 void write_image(unsampled_image_handle &ImageHandle,
@@ -998,27 +998,29 @@ void write_image(unsampled_image_handle &ImageHandle,
 }
 ```
 
-Inside a kernel, it's possible to read an image via `read_image`, passing 
-the image handle. For the form that takes `unsampled_image_handle`, image data 
-will be fetched exactly as is in device memory. For the form that takes a 
-`sampled_image_handle`, the image will be sampled according to the 
+Inside a kernel, it's possible to retrieve data from an image via `fetch_image` 
+or `sample_image`, passing the appropriate image handle. The `fetch_image` API 
+is only applicable to unsampled images, and the data will be fetched exactly as 
+is in device memory. The `sample_image` API is only applicable to sampled 
+images, and the image data will be sampled according to the 
 `bindless_image_sampler` that was passed to the image upon construction.
 
 The user is required to pass a `DataT` template parameter, which specifies the
-return type of the `read_image` function. If `DataT` is not a recognized 
-standard type, as defined in <<recognized_standard_types>>, and instead a 
-user-defined type, the user must provide a `HintT` template parameter to the 
-`read_image` function, to allow the backend to select the correct device 
-intrinsic to fetch or sample their data.
+return type of the `fetch_image` and `sample_image` functions. If `DataT` is 
+not a recognized standard type, as defined in <<recognized_standard_types>>, 
+and instead a user-defined type, the user must provide a `HintT` template 
+parameter to the `fetch_image` and `sample_image` functions, to allow the 
+backend to select the correct device intrinsic to fetch or sample their data.
+
 `HintT` must be one of the the <<recognized_standard_types>>, and must be the 
 same size as `DataT`.
 If `DataT` is a recognized standard type, and `HintT` is also passed, `HintT` 
 will be ignored.
 
-When reading a texture backed by a normalized integer channel type, either 
-`DataT` must be a 32-bit or 16-bit floating point value, a `sycl::vec` of 
-32-bit or 16-bit floating point values, or, in the case `DataT` is not one of 
-the above, then `HintT` must be one of the above, and be of the same size as 
+When fetching or sampling an image backed by a normalized integer channel type, 
+either `DataT` must be a 32-bit or 16-bit floating point value, a `sycl::vec` 
+of 32-bit or 16-bit floating point values, or, in the case `DataT` is not one 
+of the above, then `HintT` must be one of the above, and be of the same size as 
 `DataT`.
 
 It's possible to write to an unsampled image via `write_image` passing the 
@@ -1029,8 +1031,8 @@ of the <<recognized_standard_types>>.
 
 Sampled images cannot be written to using `write_image`.
 
-For reading and writing of unsampled images, coordinates are specified by `int`, 
-`sycl::vec<int, 2>`, and `sycl::vec<int, 3>` for 1D, 2D, and 3D images, 
+For fetching and writing of unsampled images, coordinates are specified by 
+`int`, `sycl::vec<int, 2>`, and `sycl::vec<int, 3>` for 1D, 2D, and 3D images, 
 respectively.
 
 Sampled image reads take `float`, `sycl::vec<float, 2>`, and 
@@ -1046,8 +1048,8 @@ kernel must be submitted for the written data to be accessible.
 
 [NOTE]
 ====
-Attempting to read an image with `read_mipmap` or any other defined read 
-function will result in undefined behaviour.
+Attempting to sample a standard sampled image with `sample_mipmap` or any other 
+defined sampling function will result in undefined behaviour.
 ====
 
 === Recognized standard types [[recognized_standard_types]]
@@ -1057,7 +1059,8 @@ standard types.
 
 * All POD types (`char`, `short`, `int`, `float`, etc.) excluding `double`
 * `sycl::half`
-* Variants of `sycl::vec<T, N>` where `T` is one of the above, and `N` is `1`, `2`, or `3`
+* Variants of `sycl::vec<T, N>` where `T` is one of the above, and `N` is `1`, 
+  `2`, or `3`
 
 Any other types are classified as user-defined types.
 
@@ -1168,26 +1171,26 @@ level of a given top-level descriptor.
 
 === Reading a mipmap
 
-Inside the kernel, it's possible to read a mipmap via `read_mipmap`, passing the 
-`sampled_image_handle`, the coordinates, and either the level or anisotropic 
-gradient values.
+Inside the kernel, it's possible to sample a mipmap via `sample_mipmap`, 
+passing the `sampled_image_handle`, the coordinates, and either the level or 
+anisotropic gradient values.
 
-The method of sampling a mipmap is different based on which `read_mipmap` 
+The method of sampling a mipmap is different based on which `sample_mipmap` 
 function is used, and the sampler attributes passed upon creation of the 
 mipmap.
 
 ```c++
 // Nearest/linear filtering between mip levels
 template <typename DataT, typename HintT = DataT, typename CoordT>
-DataT read_mipmap(const sampled_image_handle &ImageHandle,
-                  const CoordT &Coords,
-                  const float Level);
+DataT sample_mipmap(const sampled_image_handle &ImageHandle,
+                    const CoordT &Coords,
+                    const float Level);
 
 // Anisotropic filtering
 template <typename DataT, typename HintT = DataT, typename CoordT>
-DataT read_mipmap(const sampled_image_handle &ImageHandle,
-                  const CoordT &Coords,
-                  const CoordT &Dx, const CoordT &Dy);
+DataT sample_mipmap(const sampled_image_handle &ImageHandle,
+                    const CoordT &Coords,
+                    const CoordT &Dx, const CoordT &Dy);
 ```
 
 Reading a mipmap follows the same restrictions on what coordinate types may be 
@@ -1199,8 +1202,8 @@ the restrictions as laid out in <<reading_writing_inside_kernel>>.
 
 [NOTE]
 ====
-Attempting to read a mipmap with `read_image` or any other defined read function 
-will result in undefined behaviour.
+Attempting to sample a mipmap with `sample_image` or any other defined sample 
+function will result in undefined behaviour.
 ====
 
 == Interoperability
@@ -1544,7 +1547,7 @@ try {
 
     cgh.parallel_for(width, [=](sycl::id<1> id) {
       // Extension: read image data from handle
-      float pixel = sycl::ext::oneapi::experimental::read_image<float>(
+      float pixel = sycl::ext::oneapi::experimental::fetch_image<float>(
           imgIn, int(id[0]));
 
       // Extension: write to image data using handle
@@ -1646,7 +1649,7 @@ try {
           float sum = 0;
           for (int i = 0; i < numImages; i++) {
             // Extension: read image data from handle
-            sum += (sycl::ext::oneapi::experimental::read_image<float>(
+            sum += (sycl::ext::oneapi::experimental::fetch_image<float>(
                 imgHandleAcc[i], sycl::vec<int, 2>(dim0, dim1)));
           }
           outAcc[sycl::id{dim1, dim0}] = sum;
@@ -1736,9 +1739,9 @@ try {
       float x = (static_cast<float>(id[0]) + 0.5f) / static_cast<float>(width);
       // Read mipmap level 0 with anisotropic filtering
       // and level 1 with level filtering
-      float px1 = sycl::ext::oneapi::experimental::read_mipmap<float>(
+      float px1 = sycl::ext::oneapi::experimental::sample_mipmap<float>(
           mipHandle, x, 0.0f, 0.0f);
-      float px2 = sycl::ext::oneapi::experimental::read_mipmap<float>(
+      float px2 = sycl::ext::oneapi::experimental::sample_mipmap<float>(
           mipHandle, x, 1.0f);
 
       sum = px1 + px2;
@@ -1874,7 +1877,7 @@ try {
 
           // Extension: read image data from handle to imported image
           uint32_t pixel =
-              sycl::ext::oneapi::experimental::read_image<uint32_t>(
+              sycl::ext::oneapi::experimental::fetch_image<uint32_t>(
                   img_input, sycl::vec<int, 2>(dim0, dim1));
 
           // Modify the data before writing back
@@ -2076,4 +2079,9 @@ These features still need to be handled:
                    user-defined type.
 |5.1|2023-12-06| - Added unique addressing modes per dimension to the 
                    `bindless_image_sampler`
+|5.2|2024-02-14| - Image read and write functions now accept 3-component 
+                   coordinates for 3D reads, instead of 4-component coordinates.
+|5.3|2024-02-16| - Replaced the `read_image` and `read_mipmap` APIs with the 
+                   more descriptively named `fetch_image`, `sample_image`, and
+                   `sample_mipmap`.
 |======================
diff --git a/sycl/include/sycl/ext/oneapi/bindless_images.hpp b/sycl/include/sycl/ext/oneapi/bindless_images.hpp
index c68e0992bb670..07a0ccb261dde 100644
--- a/sycl/include/sycl/ext/oneapi/bindless_images.hpp
+++ b/sycl/include/sycl/ext/oneapi/bindless_images.hpp
@@ -759,14 +759,14 @@ template <typename DataT> constexpr bool is_recognized_standard_type() {
 } // namespace detail
 
 /**
- *  @brief   Read an unsampled image using its handle
+ *  @brief   [Deprecated] Read an unsampled image using its handle
  *
  *  @tparam  DataT The return type
  *  @tparam  HintT A hint type that can be used to select for a specialized
  *           backend intrinsic when a user-defined type is passed as `DataT`.
  *           HintT should be a `sycl::vec` type, `sycl::half` type, or POD type.
  *           HintT must also have the same size as DataT.
- *  @tparam  CoordT The input coordinate type. e.g. int, int2, or int4 for
+ *  @tparam  CoordT The input coordinate type. e.g. int, int2, or int3 for
  *           1D, 2D, and 3D, respectively
  *  @param   imageHandle The image handle
  *  @param   coords The coordinates at which to fetch image data
@@ -779,8 +779,36 @@ template <typename DataT> constexpr bool is_recognized_standard_type() {
  *             another
  */
 template <typename DataT, typename HintT = DataT, typename CoordT>
+__SYCL_DEPRECATED("read_image for standard unsampled images is deprecated. "
+                  "Instead use fetch_image.")
 DataT read_image(const unsampled_image_handle &imageHandle [[maybe_unused]],
                  const CoordT &coords [[maybe_unused]]) {
+  return fetch_image(imageHandle, coords);
+}
+
+/**
+ *  @brief   Fetch data from an unsampled image using its handle
+ *
+ *  @tparam  DataT The return type
+ *  @tparam  HintT A hint type that can be used to select for a specialized
+ *           backend intrinsic when a user-defined type is passed as `DataT`.
+ *           HintT should be a `sycl::vec` type, `sycl::half` type, or POD type.
+ *           HintT must also have the same size as DataT.
+ *  @tparam  CoordT The input coordinate type. e.g. int, int2, or int3 for
+ *           1D, 2D, and 3D, respectively
+ *  @param   imageHandle The image handle
+ *  @param   coords The coordinates at which to fetch image data
+ *  @return  Image data
+ *
+ *  __NVPTX__: Name mangling info
+ *             Cuda surfaces require integer coords (by bytes)
+ *             Cuda textures require float coords (by element or normalized)
+ *             The name mangling should therefore not interfere with one
+ *             another
+ */
+template <typename DataT, typename HintT = DataT, typename CoordT>
+DataT fetch_image(const unsampled_image_handle &imageHandle [[maybe_unused]],
+                  const CoordT &coords [[maybe_unused]]) {
   detail::assert_unsampled_coords<CoordT>();
   constexpr size_t coordSize = detail::coord_size<CoordT>();
   static_assert(coordSize == 1 || coordSize == 2 || coordSize == 3,
@@ -805,17 +833,17 @@ DataT read_image(const unsampled_image_handle &imageHandle [[maybe_unused]],
 }
 
 /**
- *  @brief   Read a sampled image using its handle
+ *  @brief   [Deprecated] Read a sampled image using its handle
  *
  *  @tparam  DataT The return type
  *  @tparam  HintT A hint type that can be used to select for a specialized
  *           backend intrinsic when a user-defined type is passed as `DataT`.
  *           HintT should be a `sycl::vec` type, `sycl::half` type, or POD type.
  *           HintT must also have the same size as DataT.
- *  @tparam  CoordT The input coordinate type. e.g. float, float2, or float4 for
+ *  @tparam  CoordT The input coordinate type. e.g. float, float2, or float3 for
  *           1D, 2D, and 3D, respectively
  *  @param   imageHandle The image handle
- *  @param   coords The coordinates at which to fetch image data
+ *  @param   coords The coordinates at which to sample image data
  *  @return  Sampled image data
  *
  *  __NVPTX__: Name mangling info
@@ -825,8 +853,36 @@ DataT read_image(const unsampled_image_handle &imageHandle [[maybe_unused]],
  *             another
  */
 template <typename DataT, typename HintT = DataT, typename CoordT>
+__SYCL_DEPRECATED("read_image for standard sampled images is deprecated. "
+                  "Instead use sample_image.")
 DataT read_image(const sampled_image_handle &imageHandle [[maybe_unused]],
                  const CoordT &coords [[maybe_unused]]) {
+  return sample_image(imageHandle, coords);
+}
+
+/**
+ *  @brief   Sample data from a sampled image using its handle
+ *
+ *  @tparam  DataT The return type
+ *  @tparam  HintT A hint type that can be used to select for a specialized
+ *           backend intrinsic when a user-defined type is passed as `DataT`.
+ *           HintT should be a `sycl::vec` type, `sycl::half` type, or POD type.
+ *           HintT must also have the same size as DataT.
+ *  @tparam  CoordT The input coordinate type. e.g. float, float2, or float3 for
+ *           1D, 2D, and 3D, respectively
+ *  @param   imageHandle The image handle
+ *  @param   coords The coordinates at which to sample image data
+ *  @return  Sampled image data
+ *
+ *  __NVPTX__: Name mangling info
+ *             Cuda surfaces require integer coords (by bytes)
+ *             Cuda textures require float coords (by element or normalized)
+ *             The name mangling should therefore not interfere with one
+ *             another
+ */
+template <typename DataT, typename HintT = DataT, typename CoordT>
+DataT sample_image(const sampled_image_handle &imageHandle [[maybe_unused]],
+                   const CoordT &coords [[maybe_unused]]) {
   detail::assert_sampled_coords<CoordT>();
   constexpr size_t coordSize = detail::coord_size<CoordT>();
   static_assert(coordSize == 1 || coordSize == 2 || coordSize == 3,
@@ -851,24 +907,76 @@ DataT read_image(const sampled_image_handle &imageHandle [[maybe_unused]],
 }
 
 /**
- *  @brief   Read a mipmap image using its handle with LOD filtering
+ *  @brief   [Deprecated] Read a mipmap image using its handle with LOD
+ *           filtering
  *
  *  @tparam  DataT The return type
  *  @tparam  HintT A hint type that can be used to select for a specialized
  *           backend intrinsic when a user-defined type is passed as `DataT`.
  *           HintT should be a `sycl::vec` type, `sycl::half` type, or POD type.
  *           HintT must also have the same size as DataT.
- *  @tparam  CoordT The input coordinate type. e.g. float, float2, or float4 for
+ *  @tparam  CoordT The input coordinate type. e.g. float, float2, or float3 for
  *           1D, 2D, and 3D, respectively
  *  @param   imageHandle The mipmap image handle
- *  @param   coords The coordinates at which to fetch mipmap image data
+ *  @param   coords The coordinates at which to sample mipmap image data
  *  @param   level The mipmap level at which to sample
  *  @return  Mipmap image data with LOD filtering
  */
 template <typename DataT, typename HintT = DataT, typename CoordT>
+__SYCL_DEPRECATED("read_mipmap has been deprecated. "
+                  "Instead use sample_mipmap.")
 DataT read_mipmap(const sampled_image_handle &imageHandle [[maybe_unused]],
                   const CoordT &coords [[maybe_unused]],
                   const float level [[maybe_unused]]) {
+  return sample_mipmap(imageHandle, coords, level);
+}
+
+/**
+ *  @brief   [Deprecated] Read a mipmap image using its handle with anisotropic
+ *           filtering
+ *
+ *  @tparam  DataT The return type
+ *  @tparam  HintT A hint type that can be used to select for a specialized
+ *           backend intrinsic when a user-defined type is passed as `DataT`.
+ *           HintT should be a `sycl::vec` type, `sycl::half` type, or POD type.
+ *           HintT must also have the same size as DataT.
+ *  @tparam  CoordT The input coordinate type. e.g. float, float2, or float3 for
+ *           1D, 2D, and 3D, respectively
+ *  @param   imageHandle The mipmap image handle
+ *  @param   coords The coordinates at which to sample mipmap image data
+ *  @param   dX Screen space gradient in the x dimension
+ *  @param   dY Screen space gradient in the y dimension
+ *  @return  Mipmap image data with anisotropic filtering
+ */
+template <typename DataT, typename HintT = DataT, typename CoordT>
+__SYCL_DEPRECATED("read_mipmap has been deprecated. "
+                  "Instead use sample_mipmap.")
+DataT read_mipmap(const sampled_image_handle &imageHandle [[maybe_unused]],
+                  const CoordT &coords [[maybe_unused]],
+                  const CoordT &dX [[maybe_unused]],
+                  const CoordT &dY [[maybe_unused]]) {
+  return sample_mipmap(imageHandle, coords, dX, dY);
+}
+
+/**
+ *  @brief   Sample a mipmap image using its handle with LOD filtering
+ *
+ *  @tparam  DataT The return type
+ *  @tparam  HintT A hint type that can be used to select for a specialized
+ *           backend intrinsic when a user-defined type is passed as `DataT`.
+ *           HintT should be a `sycl::vec` type, `sycl::half` type, or POD type.
+ *           HintT must also have the same size as DataT.
+ *  @tparam  CoordT The input coordinate type. e.g. float, float2, or float3 for
+ *           1D, 2D, and 3D, respectively
+ *  @param   imageHandle The mipmap image handle
+ *  @param   coords The coordinates at which to sample mipmap image data
+ *  @param   level The mipmap level at which to sample
+ *  @return  Mipmap image data with LOD filtering
+ */
+template <typename DataT, typename HintT = DataT, typename CoordT>
+DataT sample_mipmap(const sampled_image_handle &imageHandle [[maybe_unused]],
+                    const CoordT &coords [[maybe_unused]],
+                    const float level [[maybe_unused]]) {
   detail::assert_sampled_coords<CoordT>();
   constexpr size_t coordSize = detail::coord_size<CoordT>();
   static_assert(coordSize == 1 || coordSize == 2 || coordSize == 3,
@@ -893,26 +1001,26 @@ DataT read_mipmap(const sampled_image_handle &imageHandle [[maybe_unused]],
 }
 
 /**
- *  @brief   Read a mipmap image using its handle with anisotropic filtering
+ *  @brief   Sample a mipmap image using its handle with anisotropic filtering
  *
  *  @tparam  DataT The return type
  *  @tparam  HintT A hint type that can be used to select for a specialized
  *           backend intrinsic when a user-defined type is passed as `DataT`.
  *           HintT should be a `sycl::vec` type, `sycl::half` type, or POD type.
  *           HintT must also have the same size as DataT.
- *  @tparam  CoordT The input coordinate type. e.g. float, float2, or float4 for
+ *  @tparam  CoordT The input coordinate type. e.g. float, float2, or float3 for
  *           1D, 2D, and 3D, respectively
  *  @param   imageHandle The mipmap image handle
- *  @param   coords The coordinates at which to fetch mipmap image data
+ *  @param   coords The coordinates at which to sample mipmap image data
  *  @param   dX Screen space gradient in the x dimension
  *  @param   dY Screen space gradient in the y dimension
  *  @return  Mipmap image data with anisotropic filtering
  */
 template <typename DataT, typename HintT = DataT, typename CoordT>
-DataT read_mipmap(const sampled_image_handle &imageHandle [[maybe_unused]],
-                  const CoordT &coords [[maybe_unused]],
-                  const CoordT &dX [[maybe_unused]],
-                  const CoordT &dY [[maybe_unused]]) {
+DataT sample_mipmap(const sampled_image_handle &imageHandle [[maybe_unused]],
+                    const CoordT &coords [[maybe_unused]],
+                    const CoordT &dX [[maybe_unused]],
+                    const CoordT &dY [[maybe_unused]]) {
   detail::assert_sampled_coords<CoordT>();
   constexpr size_t coordSize = detail::coord_size<CoordT>();
   static_assert(coordSize == 1 || coordSize == 2 || coordSize == 3,
@@ -946,40 +1054,20 @@ DataT read_mipmap(const sampled_image_handle &imageHandle [[maybe_unused]],
  *           backend intrinsic when a user-defined type is passed as `DataT`.
  *           HintT should be a `sycl::vec` type, `sycl::half` type, or POD type.
  *           HintT must also have the same size as DataT.
- *  @tparam  CoordT The input coordinate type. e.g. float, float2, or float4 for
+ *  @tparam  CoordT The input coordinate type. e.g. float, float2, or float3 for
  *           1D, 2D, and 3D, respectively
  *  @param   imageHandle The mipmap image handle
- *  @param   coords The coordinates at which to fetch mipmap image data
+ *  @param   coords The coordinates at which to sample mipmap image data
  *  @param   level The mipmap level at which to sample
  *  @return  Mipmap image data with LOD filtering
  */
 template <typename DataT, typename HintT = DataT, typename CoordT>
 __SYCL_DEPRECATED("read_image for mipmaps is deprecated. "
-                  "Instead use read_mipmap.")
+                  "Instead use sample_mipmap.")
 DataT read_image(const sampled_image_handle &imageHandle [[maybe_unused]],
                  const CoordT &coords [[maybe_unused]],
                  const float level [[maybe_unused]]) {
-  detail::assert_sampled_coords<CoordT>();
-  constexpr size_t coordSize = detail::coord_size<CoordT>();
-  static_assert(coordSize == 1 || coordSize == 2 || coordSize == 3,
-                "Expected input coordinate to be have 1, 2, or 3 components "
-                "for 1D, 2D and 3D images, respectively.");
-
-#ifdef __SYCL_DEVICE_ONLY__
-  if constexpr (detail::is_recognized_standard_type<DataT>()) {
-    return __invoke__ImageReadLod<DataT>(imageHandle.raw_handle, coords, level);
-  } else {
-    static_assert(sizeof(HintT) == sizeof(DataT),
-                  "When trying to read a user-defined type, HintT must be of "
-                  "the same size as the user-defined DataT.");
-    static_assert(detail::is_recognized_standard_type<HintT>(),
-                  "HintT must always be a recognized standard type");
-    return sycl::bit_cast<DataT>(
-        __invoke__ImageReadLod<HintT>(imageHandle.raw_handle, coords, level));
-  }
-#else
-  assert(false); // Bindless images not yet implemented on host
-#endif
+  return sample_mipmap<DataT, HintT>(imageHandle, coords, level);
 }
 
 /**
@@ -991,7 +1079,7 @@ DataT read_image(const sampled_image_handle &imageHandle [[maybe_unused]],
  *           backend intrinsic when a user-defined type is passed as `DataT`.
  *           HintT should be a `sycl::vec` type, `sycl::half` type, or POD type.
  *           HintT must also have the same size as DataT.
- *  @tparam  CoordT The input coordinate type. e.g. float, float2, or float4 for
+ *  @tparam  CoordT The input coordinate type. e.g. float, float2, or float3 for
  *           1D, 2D, and 3D, respectively
  *  @param   imageHandle The mipmap image handle
  *  @param   coords The coordinates at which to fetch mipmap image data
@@ -1001,40 +1089,19 @@ DataT read_image(const sampled_image_handle &imageHandle [[maybe_unused]],
  */
 template <typename DataT, typename HintT = DataT, typename CoordT>
 __SYCL_DEPRECATED("read_image for mipmaps is deprecated. "
-                  "Instead use read_mipmap.")
+                  "Instead use sample_mipmap.")
 DataT read_image(const sampled_image_handle &imageHandle [[maybe_unused]],
                  const CoordT &coords [[maybe_unused]],
                  const CoordT &dX [[maybe_unused]],
                  const CoordT &dY [[maybe_unused]]) {
-  detail::assert_sampled_coords<CoordT>();
-  constexpr size_t coordSize = detail::coord_size<CoordT>();
-  static_assert(coordSize == 1 || coordSize == 2 || coordSize == 3,
-                "Expected input coordinates and gradients to have 1, 2, or 3 "
-                "components for 1D, 2D, and 3D images, respectively.");
-
-#ifdef __SYCL_DEVICE_ONLY__
-  if constexpr (detail::is_recognized_standard_type<DataT>()) {
-    return __invoke__ImageReadGrad<DataT>(imageHandle.raw_handle, coords, dX,
-                                          dY);
-  } else {
-    static_assert(sizeof(HintT) == sizeof(DataT),
-                  "When trying to read a user-defined type, HintT must be of "
-                  "the same size as the user-defined DataT.");
-    static_assert(detail::is_recognized_standard_type<HintT>(),
-                  "HintT must always be a recognized standard type");
-    return sycl::bit_cast<DataT>(
-        __invoke__ImageReadGrad<HintT>(imageHandle.raw_handle, coords, dX, dY));
-  }
-#else
-  assert(false); // Bindless images not yet implemented on host
-#endif
+  return sample_mipmap<DataT, HintT>(imageHandle, coords, dX, dY);
 }
 
 /**
  *  @brief   Write to an unsampled image using its handle
  *
  *  @tparam  DataT The data type to write
- *  @tparam  CoordT The input coordinate type. e.g. int, int2, or int4 for
+ *  @tparam  CoordT The input coordinate type. e.g. int, int2, or int3 for
  *           1D, 2D, and 3D, respectively
  *  @param   imageHandle The image handle
  *  @param   coords The coordinates at which to write image data
diff --git a/sycl/test-e2e/bindless_images/mipmap/mipmap_read_1D.cpp b/sycl/test-e2e/bindless_images/mipmap/mipmap_read_1D.cpp
index 0ea0419e12dbf..cc896e36dd0c5 100644
--- a/sycl/test-e2e/bindless_images/mipmap/mipmap_read_1D.cpp
+++ b/sycl/test-e2e/bindless_images/mipmap/mipmap_read_1D.cpp
@@ -95,11 +95,11 @@ template <typename DType, sycl::image_channel_type CType> bool runTest() {
       cgh.parallel_for<kernel<DType, CType>>(N, [=](sycl::id<1> id) {
         DType sum = 0;
         float x = float(id[0] + 0.5f) / (float)N;
-        // Extension: read mipmap level 0 with anisotropic filtering and level 1
-        // with LOD
-        VecType px1 = sycl::ext::oneapi::experimental::read_mipmap<VecType>(
+        // Extension: sample mipmap levels 0 and 1 using the LOD (level)
+        // overload of sample_mipmap
+        VecType px1 = sycl::ext::oneapi::experimental::sample_mipmap<VecType>(
             mipHandle, x, 0.0f);
-        VecType px2 = sycl::ext::oneapi::experimental::read_mipmap<VecType>(
+        VecType px2 = sycl::ext::oneapi::experimental::sample_mipmap<VecType>(
             mipHandle, x, 1.0f);
 
         sum = px1[0] + px2[0];
diff --git a/sycl/test-e2e/bindless_images/mipmap/mipmap_read_2D.cpp b/sycl/test-e2e/bindless_images/mipmap/mipmap_read_2D.cpp
index 3cd12f0e1bf3c..5b24e5a170c72 100644
--- a/sycl/test-e2e/bindless_images/mipmap/mipmap_read_2D.cpp
+++ b/sycl/test-e2e/bindless_images/mipmap/mipmap_read_2D.cpp
@@ -110,9 +110,10 @@ template <typename DType, sycl::image_channel_type CType> bool runTest() {
             float fdim0 = float(dim0 + 0.5f) / (float)width;
             float fdim1 = float(dim1 + 0.5f) / (float)height;
 
-            // Extension: read mipmap level 1 with LOD
-            VecType px2 = sycl::ext::oneapi::experimental::read_mipmap<VecType>(
-                mipHandle, sycl::float2(fdim0, fdim1), 1.0f);
+            // Extension: sample mipmap level 1 with LOD
+            VecType px2 =
+                sycl::ext::oneapi::experimental::sample_mipmap<VecType>(
+                    mipHandle, sycl::float2(fdim0, fdim1), 1.0f);
 
             outAcc[sycl::id<2>{dim1, dim0}] = px2[0];
           });
diff --git a/sycl/test-e2e/bindless_images/mipmap/mipmap_read_3D.cpp b/sycl/test-e2e/bindless_images/mipmap/mipmap_read_3D.cpp
index fd08eaa729987..289f1d7c95486 100644
--- a/sycl/test-e2e/bindless_images/mipmap/mipmap_read_3D.cpp
+++ b/sycl/test-e2e/bindless_images/mipmap/mipmap_read_3D.cpp
@@ -99,11 +99,13 @@ template <typename DType, sycl::image_channel_type CType> bool runTest() {
             float fdim1 = float(dim1 + 0.5f) / (float)height;
             float fdim2 = float(dim2 + 0.5f) / (float)depth;
 
-            // Extension: read mipmap with anisotropic filtering with zero
+            // Extension: sample mipmap with anisotropic filtering with zero
             // viewing gradients
-            VecType px1 = sycl::ext::oneapi::experimental::read_mipmap<VecType>(
-                mipHandle, sycl::float3(fdim0, fdim1, fdim2),
-                sycl::float3(0.0f, 0.0f, 0.0f), sycl::float3(0.0f, 0.0f, 0.0f));
+            VecType px1 =
+                sycl::ext::oneapi::experimental::sample_mipmap<VecType>(
+                    mipHandle, sycl::float3(fdim0, fdim1, fdim2),
+                    sycl::float3(0.0f, 0.0f, 0.0f),
+                    sycl::float3(0.0f, 0.0f, 0.0f));
 
             outAcc[sycl::id<3>{dim2, dim1, dim0}] = px1[0];
           });
diff --git a/sycl/test-e2e/bindless_images/read_1D.cpp b/sycl/test-e2e/bindless_images/read_1D.cpp
index 00292587b069e..95260f6daa773 100644
--- a/sycl/test-e2e/bindless_images/read_1D.cpp
+++ b/sycl/test-e2e/bindless_images/read_1D.cpp
@@ -93,12 +93,12 @@ int main() {
 
       cgh.parallel_for<image_addition>(width, [=](sycl::id<1> id) {
         float sum = 0;
-        // Extension: read image data from handle
+        // Extension: fetch image data from handle
         sycl::float4 px1 =
-            sycl::ext::oneapi::experimental::read_image<sycl::float4>(
+            sycl::ext::oneapi::experimental::fetch_image<sycl::float4>(
                 imgHandle1, int(id[0]));
         sycl::float4 px2 =
-            sycl::ext::oneapi::experimental::read_image<sycl::float4>(
+            sycl::ext::oneapi::experimental::fetch_image<sycl::float4>(
                 imgHandle2, int(id[0]));
 
         sum = px1[0] + px2[0];
diff --git a/sycl/test-e2e/bindless_images/read_2D.cpp b/sycl/test-e2e/bindless_images/read_2D.cpp
index 0ac2d2aec2350..835a83cb7a28a 100644
--- a/sycl/test-e2e/bindless_images/read_2D.cpp
+++ b/sycl/test-e2e/bindless_images/read_2D.cpp
@@ -73,12 +73,12 @@ int main() {
             size_t dim0 = it.get_local_id(0);
             size_t dim1 = it.get_local_id(1);
             float sum = 0;
-            // Extension: read image data from handle
+            // Extension: fetch image data from handle
             sycl::float4 px1 =
-                sycl::ext::oneapi::experimental::read_image<sycl::float4>(
+                sycl::ext::oneapi::experimental::fetch_image<sycl::float4>(
                     imgHandle1, sycl::int2(dim0, dim1));
             sycl::float4 px2 =
-                sycl::ext::oneapi::experimental::read_image<sycl::float4>(
+                sycl::ext::oneapi::experimental::fetch_image<sycl::float4>(
                     imgHandle2, sycl::int2(dim0, dim1));
 
             sum = px1[0] + px2[0];
diff --git a/sycl/test-e2e/bindless_images/read_2D_dynamic.cpp b/sycl/test-e2e/bindless_images/read_2D_dynamic.cpp
index 2451a59f229b4..e7e3b436bd677 100644
--- a/sycl/test-e2e/bindless_images/read_2D_dynamic.cpp
+++ b/sycl/test-e2e/bindless_images/read_2D_dynamic.cpp
@@ -83,9 +83,10 @@ int main() {
             // Sum each image by reading their handle
             float sum = 0;
             for (int i = 0; i < numImages; i++) {
-              // Extension: read image data from handle
-              sum += (sycl::ext::oneapi::experimental::read_image<sycl::float4>(
-                  imgHandleAcc[i], sycl::int2(dim0, dim1)))[0];
+              // Extension: fetch image data from handle
+              sum +=
+                  (sycl::ext::oneapi::experimental::fetch_image<sycl::float4>(
+                      imgHandleAcc[i], sycl::int2(dim0, dim1)))[0];
             }
             outAcc[sycl::id<2>{dim1, dim0}] = sum;
           });
diff --git a/sycl/test-e2e/bindless_images/read_3D.cpp b/sycl/test-e2e/bindless_images/read_3D.cpp
index 2b45aadba3a1c..009ec83351cd1 100644
--- a/sycl/test-e2e/bindless_images/read_3D.cpp
+++ b/sycl/test-e2e/bindless_images/read_3D.cpp
@@ -73,12 +73,12 @@ int main() {
             size_t dim1 = it.get_global_id(1);
             size_t dim2 = it.get_global_id(2);
             float sum = 0;
-            // Extension: read image data from handle
+            // Extension: fetch image data from handle
             sycl::float4 px1 =
-                sycl::ext::oneapi::experimental::read_image<sycl::float4>(
+                sycl::ext::oneapi::experimental::fetch_image<sycl::float4>(
                     imgHandle1, sycl::int3(dim0, dim1, dim2));
             sycl::float4 px2 =
-                sycl::ext::oneapi::experimental::read_image<sycl::float4>(
+                sycl::ext::oneapi::experimental::fetch_image<sycl::float4>(
                     imgHandle2, sycl::int3(dim0, dim1, dim2));
 
             sum = px1[0] + px2[0];
diff --git a/sycl/test-e2e/bindless_images/read_norm_types.cpp b/sycl/test-e2e/bindless_images/read_norm_types.cpp
index 9c0ce065114c0..646fe6918b694 100644
--- a/sycl/test-e2e/bindless_images/read_norm_types.cpp
+++ b/sycl/test-e2e/bindless_images/read_norm_types.cpp
@@ -63,14 +63,14 @@ bool run_test(sycl::range<NDims> globalSize, sycl::range<NDims> localSize) {
 
             if constexpr (NDims == 1) {
               OutputType pixel =
-                  syclexp::read_image<OutputType>(imgIn, float(dim0));
+                  syclexp::sample_image<OutputType>(imgIn, float(dim0));
               syclexp::write_image(imgOut, int(dim0), pixel);
             } else if constexpr (NDims == 2) {
-              OutputType pixel = syclexp::read_image<OutputType>(
+              OutputType pixel = syclexp::sample_image<OutputType>(
                   imgIn, sycl::float2(dim0, dim1));
               syclexp::write_image(imgOut, sycl::int2(dim0, dim1), pixel);
             } else if constexpr (NDims == 3) {
-              OutputType pixel = syclexp::read_image<OutputType>(
+              OutputType pixel = syclexp::sample_image<OutputType>(
                   imgIn, sycl::float3(dim0, dim1, dim2));
               syclexp::write_image(imgOut, sycl::int3(dim0, dim1, dim2), pixel);
             }
diff --git a/sycl/test-e2e/bindless_images/read_sampled.cpp b/sycl/test-e2e/bindless_images/read_sampled.cpp
index 237aba1510eaf..f48846cc3fc15 100644
--- a/sycl/test-e2e/bindless_images/read_sampled.cpp
+++ b/sycl/test-e2e/bindless_images/read_sampled.cpp
@@ -795,7 +795,7 @@ runNDimTestDevice(sycl::queue &q, sycl::range<NDims> globalSize,
               accessorCoords[i] = it.get_global_id(NDims - i - 1);
             }
 
-            VecType px1 = syclexp::read_image<VecType>(inputImage, coords);
+            VecType px1 = syclexp::sample_image<VecType>(inputImage, coords);
 
             outAcc[accessorCoords] = px1;
           });
diff --git a/sycl/test-e2e/bindless_images/read_write_1D.cpp b/sycl/test-e2e/bindless_images/read_write_1D.cpp
index eabf71e9248d8..e1ab02ba9f995 100644
--- a/sycl/test-e2e/bindless_images/read_write_1D.cpp
+++ b/sycl/test-e2e/bindless_images/read_write_1D.cpp
@@ -62,12 +62,12 @@ int main() {
     q.submit([&](sycl::handler &cgh) {
       cgh.parallel_for<image_addition>(width, [=](sycl::id<1> id) {
         float sum = 0;
-        // Extension: read image data from handle
+        // Extension: fetch image data from handle
         sycl::float4 px1 =
-            sycl::ext::oneapi::experimental::read_image<sycl::float4>(
+            sycl::ext::oneapi::experimental::fetch_image<sycl::float4>(
                 imgIn1, int(id[0]));
         sycl::float4 px2 =
-            sycl::ext::oneapi::experimental::read_image<sycl::float4>(
+            sycl::ext::oneapi::experimental::fetch_image<sycl::float4>(
                 imgIn2, int(id[0]));
 
         sum = px1[0] + px2[0];
diff --git a/sycl/test-e2e/bindless_images/read_write_1D_subregion.cpp b/sycl/test-e2e/bindless_images/read_write_1D_subregion.cpp
index 3691a397f1942..3c1a9e964c775 100644
--- a/sycl/test-e2e/bindless_images/read_write_1D_subregion.cpp
+++ b/sycl/test-e2e/bindless_images/read_write_1D_subregion.cpp
@@ -73,10 +73,10 @@ int main() {
     q.submit([&](sycl::handler &cgh) {
       cgh.parallel_for<image_addition>(width, [=](sycl::id<1> id) {
         float sum = 0;
-        // Extension: read image data from handle
-        float px1 = sycl::ext::oneapi::experimental::read_image<float>(
+        // Extension: fetch image data from handle
+        float px1 = sycl::ext::oneapi::experimental::fetch_image<float>(
             imgHandle1, int(id[0]));
-        float px2 = sycl::ext::oneapi::experimental::read_image<float>(
+        float px2 = sycl::ext::oneapi::experimental::fetch_image<float>(
             imgHandle2, int(id[0]));
 
         sum = px1 + px2;
diff --git a/sycl/test-e2e/bindless_images/read_write_2D.cpp b/sycl/test-e2e/bindless_images/read_write_2D.cpp
index dbc5f4cc219f9..6320288e8f373 100644
--- a/sycl/test-e2e/bindless_images/read_write_2D.cpp
+++ b/sycl/test-e2e/bindless_images/read_write_2D.cpp
@@ -68,12 +68,12 @@ int main() {
             size_t dim0 = it.get_local_id(0);
             size_t dim1 = it.get_local_id(1);
             float sum = 0;
-            // Extension: read image data from handle
+            // Extension: fetch image data from handle
             sycl::float4 px1 =
-                sycl::ext::oneapi::experimental::read_image<sycl::float4>(
+                sycl::ext::oneapi::experimental::fetch_image<sycl::float4>(
                     imgIn1, sycl::int2(dim0, dim1));
             sycl::float4 px2 =
-                sycl::ext::oneapi::experimental::read_image<sycl::float4>(
+                sycl::ext::oneapi::experimental::fetch_image<sycl::float4>(
                     imgIn2, sycl::int2(dim0, dim1));
 
             sum = px1[0] + px2[0];
diff --git a/sycl/test-e2e/bindless_images/read_write_2D_subregion.cpp b/sycl/test-e2e/bindless_images/read_write_2D_subregion.cpp
index 55e43eb3e7fe6..49def4bce9d4a 100644
--- a/sycl/test-e2e/bindless_images/read_write_2D_subregion.cpp
+++ b/sycl/test-e2e/bindless_images/read_write_2D_subregion.cpp
@@ -90,10 +90,10 @@ int main() {
             size_t dim0 = it.get_local_id(0);
             size_t dim1 = it.get_local_id(1);
             float sum = 0;
-            // Extension: read image data from handle
-            float px1 = sycl::ext::oneapi::experimental::read_image<float>(
+            // Extension: fetch image data from handle
+            float px1 = sycl::ext::oneapi::experimental::fetch_image<float>(
                 imgHandle1, sycl::int2(dim0, dim1));
-            float px2 = sycl::ext::oneapi::experimental::read_image<float>(
+            float px2 = sycl::ext::oneapi::experimental::fetch_image<float>(
                 imgHandle2, sycl::int2(dim0, dim1));
 
             sum = px1 + px2;
diff --git a/sycl/test-e2e/bindless_images/read_write_3D.cpp b/sycl/test-e2e/bindless_images/read_write_3D.cpp
index 3e362b09e9135..75f9f8e916df7 100644
--- a/sycl/test-e2e/bindless_images/read_write_3D.cpp
+++ b/sycl/test-e2e/bindless_images/read_write_3D.cpp
@@ -73,12 +73,12 @@ int main() {
             size_t dim1 = it.get_local_id(1);
             size_t dim2 = it.get_local_id(2);
             float sum = 0;
-            // Extension: read image data from handle
+            // Extension: fetch image data from handle
             sycl::float4 px1 =
-                sycl::ext::oneapi::experimental::read_image<sycl::float4>(
+                sycl::ext::oneapi::experimental::fetch_image<sycl::float4>(
                     imgIn1, sycl::int3(dim0, dim1, dim2));
             sycl::float4 px2 =
-                sycl::ext::oneapi::experimental::read_image<sycl::float4>(
+                sycl::ext::oneapi::experimental::fetch_image<sycl::float4>(
                     imgIn2, sycl::int3(dim0, dim1, dim2));
 
             sum = px1[0] + px2[0];
diff --git a/sycl/test-e2e/bindless_images/read_write_3D_subregion.cpp b/sycl/test-e2e/bindless_images/read_write_3D_subregion.cpp
index 706a3ef64065e..dd00fdc26e2b2 100644
--- a/sycl/test-e2e/bindless_images/read_write_3D_subregion.cpp
+++ b/sycl/test-e2e/bindless_images/read_write_3D_subregion.cpp
@@ -109,10 +109,10 @@ int main() {
             size_t dim1 = it.get_global_id(1);
             size_t dim2 = it.get_global_id(2);
             float sum = 0;
-            // Extension: read image data from handle
-            float px1 = sycl::ext::oneapi::experimental::read_image<float>(
+            // Extension: fetch image data from handle
+            float px1 = sycl::ext::oneapi::experimental::fetch_image<float>(
                 imgHandle1, sycl::int3(dim0, dim1, dim2));
-            float px2 = sycl::ext::oneapi::experimental::read_image<float>(
+            float px2 = sycl::ext::oneapi::experimental::fetch_image<float>(
                 imgHandle2, sycl::int3(dim0, dim1, dim2));
 
             sum = px1 + px2;
diff --git a/sycl/test-e2e/bindless_images/read_write_unsampled.cpp b/sycl/test-e2e/bindless_images/read_write_unsampled.cpp
index 38adcac3294d0..bd290e92bd3bf 100644
--- a/sycl/test-e2e/bindless_images/read_write_unsampled.cpp
+++ b/sycl/test-e2e/bindless_images/read_write_unsampled.cpp
@@ -98,10 +98,10 @@ struct util {
 
               if constexpr (NChannels >= 1) {
                 VecType px1 =
-                    sycl::ext::oneapi::experimental::read_image<VecType>(
+                    sycl::ext::oneapi::experimental::fetch_image<VecType>(
                         input_0, sycl::int3(dim0, dim1, dim2));
                 VecType px2 =
-                    sycl::ext::oneapi::experimental::read_image<VecType>(
+                    sycl::ext::oneapi::experimental::fetch_image<VecType>(
                         input_1, sycl::int3(dim0, dim1, dim2));
 
                 auto sum =
@@ -109,9 +109,9 @@ struct util {
                 sycl::ext::oneapi::experimental::write_image<VecType>(
                     output, sycl::int3(dim0, dim1, dim2), VecType(sum));
               } else {
-                DType px1 = sycl::ext::oneapi::experimental::read_image<DType>(
+                DType px1 = sycl::ext::oneapi::experimental::fetch_image<DType>(
                     input_0, sycl::int3(dim0, dim1, dim2));
-                DType px2 = sycl::ext::oneapi::experimental::read_image<DType>(
+                DType px2 = sycl::ext::oneapi::experimental::fetch_image<DType>(
                     input_1, sycl::int3(dim0, dim1, dim2));
 
                 auto sum = DType(util::add_kernel<DType, NChannels>(px1, px2));
@@ -148,10 +148,10 @@ struct util {
 
               if constexpr (NChannels >= 1) {
                 VecType px1 =
-                    sycl::ext::oneapi::experimental::read_image<VecType>(
+                    sycl::ext::oneapi::experimental::fetch_image<VecType>(
                         input_0, sycl::int2(dim0, dim1));
                 VecType px2 =
-                    sycl::ext::oneapi::experimental::read_image<VecType>(
+                    sycl::ext::oneapi::experimental::fetch_image<VecType>(
                         input_1, sycl::int2(dim0, dim1));
 
                 auto sum =
@@ -159,9 +159,9 @@ struct util {
                 sycl::ext::oneapi::experimental::write_image<VecType>(
                     output, sycl::int2(dim0, dim1), VecType(sum));
               } else {
-                DType px1 = sycl::ext::oneapi::experimental::read_image<DType>(
+                DType px1 = sycl::ext::oneapi::experimental::fetch_image<DType>(
                     input_0, sycl::int2(dim0, dim1));
-                DType px2 = sycl::ext::oneapi::experimental::read_image<DType>(
+                DType px2 = sycl::ext::oneapi::experimental::fetch_image<DType>(
                     input_1, sycl::int2(dim0, dim1));
 
                 auto sum = DType(util::add_kernel<DType, NChannels>(px1, px2));
@@ -197,10 +197,10 @@ struct util {
 
               if constexpr (NChannels >= 1) {
                 VecType px1 =
-                    sycl::ext::oneapi::experimental::read_image<VecType>(
+                    sycl::ext::oneapi::experimental::fetch_image<VecType>(
                         input_0, int(dim0));
                 VecType px2 =
-                    sycl::ext::oneapi::experimental::read_image<VecType>(
+                    sycl::ext::oneapi::experimental::fetch_image<VecType>(
                         input_1, int(dim0));
 
                 auto sum =
@@ -208,9 +208,9 @@ struct util {
                 sycl::ext::oneapi::experimental::write_image<VecType>(
                     output, int(dim0), VecType(sum));
               } else {
-                DType px1 = sycl::ext::oneapi::experimental::read_image<DType>(
+                DType px1 = sycl::ext::oneapi::experimental::fetch_image<DType>(
                     input_0, int(dim0));
-                DType px2 = sycl::ext::oneapi::experimental::read_image<DType>(
+                DType px2 = sycl::ext::oneapi::experimental::fetch_image<DType>(
                     input_1, int(dim0));
 
                 auto sum = DType(util::add_kernel<DType, NChannels>(px1, px2));
diff --git a/sycl/test-e2e/bindless_images/sampling_1D.cpp b/sycl/test-e2e/bindless_images/sampling_1D.cpp
index 554ba98e13333..f02e7eef26059 100644
--- a/sycl/test-e2e/bindless_images/sampling_1D.cpp
+++ b/sycl/test-e2e/bindless_images/sampling_1D.cpp
@@ -68,9 +68,9 @@ int main() {
       cgh.parallel_for<image_addition>(N, [=](sycl::id<1> id) {
         // Normalize coordinate -- +0.5 to look towards centre of pixel
         float x = float(id[0] + 0.5f) / (float)N;
-        // Extension: read image data from handle
+        // Extension: sample image data from handle
         float px1 =
-            sycl::ext::oneapi::experimental::read_image<float>(imgHandle, x);
+            sycl::ext::oneapi::experimental::sample_image<float>(imgHandle, x);
 
         outAcc[id] = px1;
       });
diff --git a/sycl/test-e2e/bindless_images/sampling_2D.cpp b/sycl/test-e2e/bindless_images/sampling_2D.cpp
index 026d6a15c6647..ac340a654f135 100644
--- a/sycl/test-e2e/bindless_images/sampling_2D.cpp
+++ b/sycl/test-e2e/bindless_images/sampling_2D.cpp
@@ -98,12 +98,12 @@ int main() {
             float fdim0 = float(dim0 + 0.5f) / (float)width;
             float fdim1 = float(dim1 + 0.5f) / (float)height;
 
-            // Extension: read image data from handle
+            // Extension: sample image data from handle
             sycl::float4 px1 =
-                sycl::ext::oneapi::experimental::read_image<sycl::float4>(
+                sycl::ext::oneapi::experimental::sample_image<sycl::float4>(
                     imgHandle1, sycl::float2(fdim0, fdim1));
             sycl::float4 px2 =
-                sycl::ext::oneapi::experimental::read_image<sycl::float4>(
+                sycl::ext::oneapi::experimental::sample_image<sycl::float4>(
                     imgHandle2, sycl::float2(fdim0, fdim1));
 
             outAcc[sycl::id<2>{dim1, dim0}] = px1[0] + px2[0];
diff --git a/sycl/test-e2e/bindless_images/sampling_2D_USM_shared.cpp b/sycl/test-e2e/bindless_images/sampling_2D_USM_shared.cpp
index b4bb46acfe39f..bd7401598dd3e 100644
--- a/sycl/test-e2e/bindless_images/sampling_2D_USM_shared.cpp
+++ b/sycl/test-e2e/bindless_images/sampling_2D_USM_shared.cpp
@@ -97,8 +97,8 @@ int main() {
             float fdim0 = float(dim0 + 0.5f) / (float)width;
             float fdim1 = float(dim1 + 0.5f) / (float)height;
 
-            // Extension: read image data from handle
-            float px = sycl::ext::oneapi::experimental::read_image<float>(
+            // Extension: sample image data from handle
+            float px = sycl::ext::oneapi::experimental::sample_image<float>(
                 imgHandle, sycl::float2(fdim0, fdim1));
 
             outAcc[sycl::id<2>{dim1, dim0}] = px;
diff --git a/sycl/test-e2e/bindless_images/sampling_2D_half.cpp b/sycl/test-e2e/bindless_images/sampling_2D_half.cpp
index 5365ea22f7021..07d364229702c 100644
--- a/sycl/test-e2e/bindless_images/sampling_2D_half.cpp
+++ b/sycl/test-e2e/bindless_images/sampling_2D_half.cpp
@@ -82,9 +82,9 @@ int main() {
             float fdim0 = float(dim0 + 0.5f) / (float)width;
             float fdim1 = float(dim1 + 0.5f) / (float)height;
 
-            // Extension: read image data from handle
+            // Extension: sample image data from handle
             sycl::half4 px1 =
-                sycl::ext::oneapi::experimental::read_image<sycl::half4>(
+                sycl::ext::oneapi::experimental::sample_image<sycl::half4>(
                     imgHandle, sycl::float2(fdim0, fdim1));
 
             outAcc[sycl::id<2>{dim1, dim0}] = px1[0];
diff --git a/sycl/test-e2e/bindless_images/sampling_3D.cpp b/sycl/test-e2e/bindless_images/sampling_3D.cpp
index 192fa1f556927..63dd1d930157e 100644
--- a/sycl/test-e2e/bindless_images/sampling_3D.cpp
+++ b/sycl/test-e2e/bindless_images/sampling_3D.cpp
@@ -77,9 +77,9 @@ int main() {
             float fdim1 = float(dim1 + 0.5f) / (float)height;
             float fdim2 = float(dim2 + 0.5f) / (float)depth;
 
-            // Extension: read image data from handle
+            // Extension: sample image data from handle
             sycl::float4 px1 =
-                sycl::ext::oneapi::experimental::read_image<sycl::float4>(
+                sycl::ext::oneapi::experimental::sample_image<sycl::float4>(
                     imgHandle, sycl::float3(fdim0, fdim1, fdim2));
 
             outAcc[sycl::id<3>{dim2, dim1, dim0}] = px1[0];
diff --git a/sycl/test-e2e/bindless_images/sampling_unique_addr_modes.cpp b/sycl/test-e2e/bindless_images/sampling_unique_addr_modes.cpp
index f17e96a6c6e55..431e22869d055 100644
--- a/sycl/test-e2e/bindless_images/sampling_unique_addr_modes.cpp
+++ b/sycl/test-e2e/bindless_images/sampling_unique_addr_modes.cpp
@@ -88,8 +88,8 @@ int main() {
             float fdim1 = float(dim1 + height + 0.5) / (float)height;
             float fdim2 = float(dim2 + depth + 0.5) / (float)depth;
 
-            // Extension: read image data from handle
-            float px1 = syclexp::read_image<float>(
+            // Extension: sample image data from handle
+            float px1 = syclexp::sample_image<float>(
                 imgHandle, sycl::float3(fdim0, fdim1, fdim2));
 
             outAcc[sycl::id<3>{dim2, dim1, dim0}] = px1;
diff --git a/sycl/test-e2e/bindless_images/user_types/mipmap_read_user_type_2D.cpp b/sycl/test-e2e/bindless_images/user_types/mipmap_read_user_type_2D.cpp
index b146e38bc5cde..56ce6b3e63c04 100644
--- a/sycl/test-e2e/bindless_images/user_types/mipmap_read_user_type_2D.cpp
+++ b/sycl/test-e2e/bindless_images/user_types/mipmap_read_user_type_2D.cpp
@@ -119,9 +119,9 @@ bool run_test() {
             float fdim0 = float(dim0 + 0.5f) / (float)width;
             float fdim1 = float(dim1 + 0.5f) / (float)height;
 
-            // Extension: read mipmap level 1 with LOD
+            // Extension: sample mipmap level 1 with LOD
             MyType pixel =
-                sycl::ext::oneapi::experimental::read_mipmap<MyType, OutType>(
+                sycl::ext::oneapi::experimental::sample_mipmap<MyType, OutType>(
                     mipHandle, sycl::float2(fdim0, fdim1), 1.0f);
 
             outAcc[sycl::id<2>{dim1, dim0}] = pixel;
diff --git a/sycl/test-e2e/bindless_images/user_types/read_write_user_type.cpp b/sycl/test-e2e/bindless_images/user_types/read_write_user_type.cpp
index c250d4a54aa58..1ea050376d019 100644
--- a/sycl/test-e2e/bindless_images/user_types/read_write_user_type.cpp
+++ b/sycl/test-e2e/bindless_images/user_types/read_write_user_type.cpp
@@ -72,12 +72,12 @@ bool run_test() {
 
         MyType myPixel{};
 
-        // Unsampled read
-        myPixel = syclexp::read_image<MyType, OutType>(unsampledImgIn, coords);
+        // Unsampled fetch
+        myPixel = syclexp::fetch_image<MyType, OutType>(unsampledImgIn, coords);
 
         // Sampled read
         myPixel +=
-            syclexp::read_image<MyType, OutType>(sampledImgIn, floatCoords);
+            syclexp::sample_image<MyType, OutType>(sampledImgIn, floatCoords);
 
         syclexp::write_image(imgOut, coords, myPixel);
       });
diff --git a/sycl/test-e2e/bindless_images/vulkan_interop/sampled_images.cpp b/sycl/test-e2e/bindless_images/vulkan_interop/sampled_images.cpp
index e3d28b202d213..db48fb0341f61 100644
--- a/sycl/test-e2e/bindless_images/vulkan_interop/sampled_images.cpp
+++ b/sycl/test-e2e/bindless_images/vulkan_interop/sampled_images.cpp
@@ -147,9 +147,9 @@ bool run_sycl(sycl::range<NDims> globalSize, sycl::range<NDims> localSize,
               float fdim1 = float(dim1 + 0.5f) / (float)height;
               float fdim2 = float(dim2 + 0.5f) / (float)depth;
 
-              // Extension: read image data from handle (Vulkan imported)
+              // Extension: sample image data from handle (Vulkan imported)
               VecType pixel;
-              pixel = syclexp::read_image<
+              pixel = syclexp::sample_image<
                   std::conditional_t<NChannels == 1, DType, VecType>>(
                   handles.imgInput, sycl::float3(fdim0, fdim1, fdim2));
 
@@ -163,8 +163,8 @@ bool run_sycl(sycl::range<NDims> globalSize, sycl::range<NDims> localSize,
               float fdim0 = float(dim0 + 0.5f) / (float)width;
               float fdim1 = float(dim1 + 0.5f) / (float)height;
 
-              // Extension: read image data from handle (Vulkan imported)
-              VecType pixel = syclexp::read_image<
+              // Extension: sample image data from handle (Vulkan imported)
+              VecType pixel = syclexp::sample_image<
                   std::conditional_t<NChannels == 1, DType, VecType>>(
                   handles.imgInput, sycl::float2(fdim0, fdim1));
 
diff --git a/sycl/test-e2e/bindless_images/vulkan_interop/unsampled_images.cpp b/sycl/test-e2e/bindless_images/vulkan_interop/unsampled_images.cpp
index c1d16567fad57..f9e23244350a4 100644
--- a/sycl/test-e2e/bindless_images/vulkan_interop/unsampled_images.cpp
+++ b/sycl/test-e2e/bindless_images/vulkan_interop/unsampled_images.cpp
@@ -185,9 +185,9 @@ void run_ndim_test(sycl::range<NDims> global_size,
 
             if constexpr (NDims == 2) {
               if constexpr (NChannels > 1) {
-                VecType px1 = syclexp::read_image<VecType>(
+                VecType px1 = syclexp::fetch_image<VecType>(
                     handles.input_1, sycl::int2(dim0, dim1));
-                VecType px2 = syclexp::read_image<VecType>(
+                VecType px2 = syclexp::fetch_image<VecType>(
                     handles.input_2, sycl::int2(dim0, dim1));
 
                 auto sum =
@@ -195,10 +195,10 @@ void run_ndim_test(sycl::range<NDims> global_size,
                 syclexp::write_image<VecType>(
                     handles.output, sycl::int2(dim0, dim1), VecType(sum));
               } else {
-                DType px1 = syclexp::read_image<DType>(handles.input_1,
-                                                       sycl::int2(dim0, dim1));
-                DType px2 = syclexp::read_image<DType>(handles.input_2,
-                                                       sycl::int2(dim0, dim1));
+                DType px1 = syclexp::fetch_image<DType>(handles.input_1,
+                                                        sycl::int2(dim0, dim1));
+                DType px2 = syclexp::fetch_image<DType>(handles.input_2,
+                                                        sycl::int2(dim0, dim1));
 
                 auto sum = DType(util::add_kernel<DType, NChannels>(px1, px2));
                 syclexp::write_image<DType>(handles.output,
@@ -208,9 +208,9 @@ void run_ndim_test(sycl::range<NDims> global_size,
               size_t dim2 = it.get_global_id(2);
 
               if constexpr (NChannels > 1) {
-                VecType px1 = syclexp::read_image<VecType>(
+                VecType px1 = syclexp::fetch_image<VecType>(
                     handles.input_1, sycl::int3(dim0, dim1, dim2));
-                VecType px2 = syclexp::read_image<VecType>(
+                VecType px2 = syclexp::fetch_image<VecType>(
                     handles.input_2, sycl::int3(dim0, dim1, dim2));
 
                 auto sum =
@@ -218,9 +218,9 @@ void run_ndim_test(sycl::range<NDims> global_size,
                 syclexp::write_image<VecType>(
                     handles.output, sycl::int3(dim0, dim1, dim2), VecType(sum));
               } else {
-                DType px1 = syclexp::read_image<DType>(
+                DType px1 = syclexp::fetch_image<DType>(
                     handles.input_1, sycl::int3(dim0, dim1, dim2));
-                DType px2 = syclexp::read_image<DType>(
+                DType px2 = syclexp::fetch_image<DType>(
                     handles.input_2, sycl::int3(dim0, dim1, dim2));
 
                 auto sum = DType(util::add_kernel<DType, NChannels>(px1, px2));
diff --git a/sycl/test/extensions/bindless_images.cpp b/sycl/test/extensions/bindless_images.cpp
index b73449b5428cc..0f7f72e60771b 100644
--- a/sycl/test/extensions/bindless_images.cpp
+++ b/sycl/test/extensions/bindless_images.cpp
@@ -35,7 +35,7 @@ int main() {
       auto outAcc = buf.get_access<sycl::access_mode::write>(cgh, width);
 
       cgh.parallel_for<image_read>(width, [=](sycl::id<1> id) {
-        sycl::float4 px1 = read_image<sycl::float4>(imgHandle1, int(id[0]));
+        sycl::float4 px1 = fetch_image<sycl::float4>(imgHandle1, int(id[0]));
         outAcc[id] = px1[0];
       });
     });

From feb77220764b2f978f5e1e9cbcc9b832710c7636 Mon Sep 17 00:00:00 2001
From: aelovikov-intel <andrei.elovikov@intel.com>
Date: Wed, 21 Feb 2024 08:36:17 -0800
Subject: [PATCH 12/30] [SYCL] Sort platforms in platform::get_platforms()
 (#12719)

So that the first one contains the most preferable device, in case a
SYCL application/library performs manual device selection and defaults
to the first available device.
---
 sycl/source/detail/platform_impl.cpp   | 25 +++++++++++++++++++++++++
 sycl/test-e2e/Basic/get_backend.cpp    |  7 +++----
 sycl/test-e2e/Config/select_device.cpp |  6 +++++-
 3 files changed, 33 insertions(+), 5 deletions(-)

diff --git a/sycl/source/detail/platform_impl.cpp b/sycl/source/detail/platform_impl.cpp
index 57b4a2f48030b..1506cfdf0c28f 100644
--- a/sycl/source/detail/platform_impl.cpp
+++ b/sycl/source/detail/platform_impl.cpp
@@ -203,6 +203,31 @@ std::vector<platform> platform_impl::get_platforms() {
   // may be initialized after.
   GlobalHandler::registerDefaultContextReleaseHandler();
 
+  // Some applications/libraries prefer to implement their own device selection
+  // and default to just providing the first available device. Make sure that
+  // the first platform has the most preferable device.
+  auto GetPlatformScore = [](const platform &p) {
+    auto devs = p.get_devices();
+    auto it =
+        std::max_element(devs.begin(), devs.end(), [](auto lhs, auto rhs) {
+          return default_selector_v(lhs) < default_selector_v(rhs);
+        });
+    return default_selector_v(*it);
+  };
+
+  std::vector<std::pair<platform, int>> PlatformScores;
+  PlatformScores.reserve(Platforms.size());
+  for (auto &p : Platforms)
+    PlatformScores.emplace_back(p, GetPlatformScore(p));
+
+  std::stable_sort(PlatformScores.begin(), PlatformScores.end(), [&](auto lhs, auto rhs) {
+    return lhs.second > rhs.second;
+  });
+
+  Platforms.clear();
+  for (auto &e : PlatformScores )
+    Platforms.push_back(e.first);
+
   return Platforms;
 }
 
diff --git a/sycl/test-e2e/Basic/get_backend.cpp b/sycl/test-e2e/Basic/get_backend.cpp
index 975b3a7c7456a..c3930b607b78a 100644
--- a/sycl/test-e2e/Basic/get_backend.cpp
+++ b/sycl/test-e2e/Basic/get_backend.cpp
@@ -1,8 +1,7 @@
-// Sporadic fails on DG2
-// TODO: Reenable when internal ticket is resolved
-// UNSUPPORTED: gpu-intel-dg2
 // RUN: %{build} -o %t.out
-// RUN: %{run-unfiltered-devices} %t.out
+// FPGA RT returns random CL_INVALID_CONTEXT in some configurations, tracked
+// internally. Avoid FPGA devices until that is fixed.
+// RUN: env ONEAPI_DEVICE_SELECTOR="*:gpu;*:cpu" %{run-unfiltered-devices} %t.out
 //
 //==----------------- get_backend.cpp ------------------------==//
 // This is a test of get_backend().
diff --git a/sycl/test-e2e/Config/select_device.cpp b/sycl/test-e2e/Config/select_device.cpp
index 16d1e24340c1e..970e3088b3e71 100644
--- a/sycl/test-e2e/Config/select_device.cpp
+++ b/sycl/test-e2e/Config/select_device.cpp
@@ -123,7 +123,11 @@ static std::vector<DevDescT> getAllowListDesc(std::string allowList) {
         throw std::runtime_error("Malformed device allowlist");
       }
       decDescs.back().devDriverVer = allowList.substr(start, pos - start);
-      pos = pos + 3;
+      pos = pos + 2;
+
+      if (allowList[pos] == ',') {
+        pos++;
+      }
     }
 
     else if ((allowList.compare(pos, platformName.size(), platformName)) == 0) {

From 7f51d9288bb33b8170446deb5709071dec72660a Mon Sep 17 00:00:00 2001
From: Udit Agarwal <16324601+uditagarwal97@users.noreply.github.com>
Date: Wed, 21 Feb 2024 09:29:34 -0800
Subject: [PATCH 13/30] [SYCL] Return nullptr when allocation size is zero in
 usm allocator (#12765)

Currently, usm allocator throws when the allocation size is zero.
However, this behavior is not aligned with that of std::allocator.
Refer to https://github.com/KhronosGroup/SYCL-Docs/issues/355 for the
discussion regarding this. The spec says that the allocation functions
must succeed when the size is zero. The value returned in this case is
unspecified (it can either be a NULL pointer or a non-NULL pointer).

This PR makes USM allocator return a null pointer when the allocation
size is zero.
---
 sycl/include/sycl/usm/usm_allocator.hpp       |  3 ++
 .../USM/usm_allocator_zero_allocation.cpp     | 30 +++++++++++++++++++
 2 files changed, 33 insertions(+)
 create mode 100644 sycl/test-e2e/USM/usm_allocator_zero_allocation.cpp

diff --git a/sycl/include/sycl/usm/usm_allocator.hpp b/sycl/include/sycl/usm/usm_allocator.hpp
index 6ae1da5a3e350..31d9c5ec9a955 100644
--- a/sycl/include/sycl/usm/usm_allocator.hpp
+++ b/sycl/include/sycl/usm/usm_allocator.hpp
@@ -71,6 +71,9 @@ class usm_allocator {
   T *allocate(size_t NumberOfElements, const detail::code_location CodeLoc =
                                            detail::code_location::current()) {
 
+    if (!NumberOfElements)
+      return nullptr;
+
     auto Result = reinterpret_cast<T *>(
         aligned_alloc(getAlignment(), NumberOfElements * sizeof(value_type),
                       MDevice, MContext, AllocKind, MPropList, CodeLoc));
diff --git a/sycl/test-e2e/USM/usm_allocator_zero_allocation.cpp b/sycl/test-e2e/USM/usm_allocator_zero_allocation.cpp
new file mode 100644
index 0000000000000..b8aace8933c28
--- /dev/null
+++ b/sycl/test-e2e/USM/usm_allocator_zero_allocation.cpp
@@ -0,0 +1,30 @@
+// RUN: %{build} -o %t.out
+// RUN: %{run} %t.out
+
+#include <sycl/sycl.hpp>
+
+using namespace sycl;
+
+template <usm::alloc alloc_kind> void test(queue &q) {
+  sycl::usm_allocator<int, alloc_kind> ua(q);
+  int *p = ua.allocate(0);
+
+  assert(!p && "Our implementation of usm_allocator is expected to return a "
+               "null pointer when allocation size is zero.");
+
+  ua.deallocate(p, 0);
+}
+
+int main() {
+  queue q;
+  auto dev = q.get_device();
+
+  if (dev.has(aspect::usm_host_allocations)) {
+    test<usm::alloc::host>(q);
+  }
+  if (dev.has(aspect::usm_shared_allocations)) {
+    test<usm::alloc::shared>(q);
+  }
+
+  return 0;
+}

From 69d233b80399963c87fa0c728cd52be944346db3 Mon Sep 17 00:00:00 2001
From: Udit Agarwal <16324601+uditagarwal97@users.noreply.github.com>
Date: Wed, 21 Feb 2024 09:53:53 -0800
Subject: [PATCH 14/30] [CI] Enable FPGA on postcommit for Linux (#12673)

This PR enables testing on opencl:fpga for Linux in postcommit.
I have disabled the tests that were failing on the FPGA for now and
created a separate GitHub issue to track them:
https://github.com/intel/llvm/issues/12683. I am not sure whether the
failures are caused by an implementation bug or whether that feature is
simply not supported on FPGA.
---
 .github/workflows/sycl-linux-run-tests.yml                   | 2 +-
 .github/workflows/sycl-post-commit.yml                       | 1 +
 sycl/test-e2e/Assert/assert_in_kernels.cpp                   | 2 +-
 sycl/test-e2e/Assert/assert_in_multiple_tus.cpp              | 2 +-
 sycl/test-e2e/Assert/assert_in_multiple_tus_one_ndebug.cpp   | 2 +-
 sycl/test-e2e/Assert/assert_in_one_kernel.cpp                | 2 +-
 sycl/test-e2e/Assert/assert_in_simultaneous_kernels.cpp      | 2 +-
 .../Assert/assert_in_simultaneously_multiple_tus.cpp         | 2 +-
 sycl/test-e2e/DeviceLib/string_test.cpp                      | 5 ++++-
 .../joint_matrix_opt_kernel_feature_unsupported_hw.cpp       | 2 +-
 sycl/test-e2e/syclcompat/atomic/atomic_arith.cpp             | 4 +++-
 sycl/test-e2e/syclcompat/atomic/atomic_comp_exchange.cpp     | 4 +++-
 12 files changed, 19 insertions(+), 11 deletions(-)

diff --git a/.github/workflows/sycl-linux-run-tests.yml b/.github/workflows/sycl-linux-run-tests.yml
index 639c788c7fbd3..59ffe39325b8e 100644
--- a/.github/workflows/sycl-linux-run-tests.yml
+++ b/.github/workflows/sycl-linux-run-tests.yml
@@ -100,7 +100,7 @@ on:
         options:
           - 'opencl:cpu'
           - 'opencl:gpu'
-          - 'opencl:acc'
+          - 'opencl:fpga'
           - 'ext_oneapi_level_zero:gpu'
           - 'ext_oneapi_hip:gpu'
       tests_selector:
diff --git a/.github/workflows/sycl-post-commit.yml b/.github/workflows/sycl-post-commit.yml
index dc21752d712bf..aaf36263a93a9 100644
--- a/.github/workflows/sycl-post-commit.yml
+++ b/.github/workflows/sycl-post-commit.yml
@@ -45,6 +45,7 @@ jobs:
           - name: Intel GEN12 Graphics with Level Zero
             runner: '["Linux", "gen12"]'
             extra_lit_opts: --param gpu-intel-gen12=True
+            target_devices: ext_oneapi_level_zero:gpu;opencl:fpga
           - name: Intel Arc A-Series Graphics with Level Zero
             runner: '["Linux", "arc"]'
             extra_lit_opts: --param matrix-xmx8=True --param gpu-intel-dg2=True
diff --git a/sycl/test-e2e/Assert/assert_in_kernels.cpp b/sycl/test-e2e/Assert/assert_in_kernels.cpp
index aec66106d7bba..6321896701fb9 100644
--- a/sycl/test-e2e/Assert/assert_in_kernels.cpp
+++ b/sycl/test-e2e/Assert/assert_in_kernels.cpp
@@ -8,7 +8,7 @@
 
 // RUN: %{build} -DSYCL_FALLBACK_ASSERT=1 -o %t.out
 // Shouldn't fail on ACC as fallback assert isn't enqueued there
-// RUN: %{run} %t.out &> %t.txt ; FileCheck %s --input-file %t.txt %if acc %{ --check-prefix=CHECK-ACC %}
+// RUN: %{run} %t.out &> %t.txt ; FileCheck %s --input-file %t.txt %if fpga %{ --check-prefix=CHECK-ACC %}
 //
 // CHECK-NOT:  One shouldn't see this message
 // CHECK:      {{.*}}assert_in_kernels.hpp:25: void kernelFunc2(int *, int): {{.*}} [{{[0,2]}},0,0], {{.*}} [0,0,0]
diff --git a/sycl/test-e2e/Assert/assert_in_multiple_tus.cpp b/sycl/test-e2e/Assert/assert_in_multiple_tus.cpp
index 062f5952236ad..0ea8069a36245 100644
--- a/sycl/test-e2e/Assert/assert_in_multiple_tus.cpp
+++ b/sycl/test-e2e/Assert/assert_in_multiple_tus.cpp
@@ -10,7 +10,7 @@
 // XFAIL: (opencl && gpu)
 
 // RUN: %{build} -DSYCL_FALLBACK_ASSERT=1 -I %S/Inputs %S/Inputs/kernels_in_file2.cpp -o %t.out
-// RUN: %{run} %t.out &> %t.txt ; FileCheck %s --input-file %t.txt %if acc %{ --check-prefix=CHECK-ACC %}
+// RUN: %{run} %t.out &> %t.txt ; FileCheck %s --input-file %t.txt %if fpga %{ --check-prefix=CHECK-ACC %}
 // Shouldn't fail on ACC as fallback assert isn't enqueued there
 //
 // CUDA uses block/thread vs global/local id for SYCL, also it shows the
diff --git a/sycl/test-e2e/Assert/assert_in_multiple_tus_one_ndebug.cpp b/sycl/test-e2e/Assert/assert_in_multiple_tus_one_ndebug.cpp
index 1bf2ecdc98418..017d0ba462322 100644
--- a/sycl/test-e2e/Assert/assert_in_multiple_tus_one_ndebug.cpp
+++ b/sycl/test-e2e/Assert/assert_in_multiple_tus_one_ndebug.cpp
@@ -11,7 +11,7 @@
 
 // RUN: %clangxx -DSYCL_FALLBACK_ASSERT=1 -fsycl -fsycl-targets=%{sycl_triple} -DDEFINE_NDEBUG_INFILE2 -I %S/Inputs %S/assert_in_multiple_tus.cpp %S/Inputs/kernels_in_file2.cpp -o %t.out
 // Shouldn't fail on ACC as fallback assert isn't enqueued there
-// RUN: %{run} %t.out &> %t.txt ; FileCheck %s --input-file %t.txt %if acc %{ --check-prefix=CHECK-ACC %}
+// RUN: %{run} %t.out &> %t.txt ; FileCheck %s --input-file %t.txt %if fpga %{ --check-prefix=CHECK-ACC %}
 //
 // CHECK-NOT:  this message from calculus
 // CUDA uses block/thread vs global/local id for SYCL, also it shows the
diff --git a/sycl/test-e2e/Assert/assert_in_one_kernel.cpp b/sycl/test-e2e/Assert/assert_in_one_kernel.cpp
index 4ab0c382e8b43..cf5b5875f77d3 100644
--- a/sycl/test-e2e/Assert/assert_in_one_kernel.cpp
+++ b/sycl/test-e2e/Assert/assert_in_one_kernel.cpp
@@ -8,7 +8,7 @@
 
 // RUN: %{build} -DSYCL_FALLBACK_ASSERT=1 -o %t.out
 // Shouldn't fail on ACC as fallback assert isn't enqueued there
-// RUN: %{run} %t.out &> %t.txt ; FileCheck %s --input-file %t.txt %if acc %{ --check-prefix=CHECK-ACC %}
+// RUN: %{run} %t.out &> %t.txt ; FileCheck %s --input-file %t.txt %if fpga %{ --check-prefix=CHECK-ACC %}
 //
 // CHECK:      {{.*}}assert_in_one_kernel.hpp:10: void kernelFunc(int *, int): {{.*}} [{{[0-3]}},0,0], {{.*}} [0,0,0]
 // CHECK-SAME: Assertion `Buf[wiID] != 0 && "from assert statement"` failed
diff --git a/sycl/test-e2e/Assert/assert_in_simultaneous_kernels.cpp b/sycl/test-e2e/Assert/assert_in_simultaneous_kernels.cpp
index f6edc754960d9..49dd89f0a2a9f 100644
--- a/sycl/test-e2e/Assert/assert_in_simultaneous_kernels.cpp
+++ b/sycl/test-e2e/Assert/assert_in_simultaneous_kernels.cpp
@@ -17,7 +17,7 @@
 // DEFINE: %{gpu_env} = env SYCL_PI_LEVEL_ZERO_TRACK_INDIRECT_ACCESS_MEMORY=1 SYCL_PI_SUPPRESS_ERROR_MESSAGE=1
 
 // Shouldn't fail on ACC as fallback assert isn't enqueued there
-// RUN: %if gpu %{ %{gpu_env} %} %{run} %t.out &> %t.txt ; FileCheck %s --input-file %t.txt %if acc %{ --check-prefix=CHECK-ACC %}
+// RUN: %if gpu %{ %{gpu_env} %} %{run} %t.out &> %t.txt ; FileCheck %s --input-file %t.txt %if fpga %{ --check-prefix=CHECK-ACC %}
 //
 // CHECK:      {{.*}}assert_in_simultaneous_kernels.hpp:13: void assertFunc(): {{.*}}[9,7,0], {{.*}}[0,0,0]
 // CHECK-SAME: Assertion `false && "from assert statement"` failed
diff --git a/sycl/test-e2e/Assert/assert_in_simultaneously_multiple_tus.cpp b/sycl/test-e2e/Assert/assert_in_simultaneously_multiple_tus.cpp
index 582dc0cfbe66e..e95be2322c3f2 100644
--- a/sycl/test-e2e/Assert/assert_in_simultaneously_multiple_tus.cpp
+++ b/sycl/test-e2e/Assert/assert_in_simultaneously_multiple_tus.cpp
@@ -17,7 +17,7 @@
 // DEFINE: %{gpu_env} = env SYCL_PI_LEVEL_ZERO_TRACK_INDIRECT_ACCESS_MEMORY=1 SYCL_PI_SUPPRESS_ERROR_MESSAGE=1
 
 // Shouldn't fail on ACC as fallback assert isn't enqueued there
-// RUN: %if gpu %{ %{gpu_env} %} %{run} %t.out &> %t.txt ; FileCheck %s --input-file %t.txt %if acc %{ --check-prefix=CHECK-ACC %}
+// RUN: %if gpu %{ %{gpu_env} %} %{run} %t.out &> %t.txt ; FileCheck %s --input-file %t.txt %if fpga %{ --check-prefix=CHECK-ACC %}
 //
 // CHECK:      {{this message from file1|this message from file2}}
 // CHECK-NOT:  The test ended.
diff --git a/sycl/test-e2e/DeviceLib/string_test.cpp b/sycl/test-e2e/DeviceLib/string_test.cpp
index be4e7ed38ca27..9d7627dbfe0f5 100644
--- a/sycl/test-e2e/DeviceLib/string_test.cpp
+++ b/sycl/test-e2e/DeviceLib/string_test.cpp
@@ -1,7 +1,10 @@
 // UNSUPPORTED: hip
 // RUN: %{build} -fno-builtin -o %t.out
 // RUN: %{run} %t.out
-
+// TODO: Remove unsupported after fixing
+// https://github.com/intel/llvm/issues/12683
+// UNSUPPORTED: accelerator
+//
 // RUN: %{build} -fno-builtin -fsycl-device-lib-jit-link -o %t.out
 // RUN: %if !gpu %{ %{run} %t.out %}
 
diff --git a/sycl/test-e2e/Matrix/joint_matrix_opt_kernel_feature_unsupported_hw.cpp b/sycl/test-e2e/Matrix/joint_matrix_opt_kernel_feature_unsupported_hw.cpp
index b20ee36448ef0..c041616e7db84 100644
--- a/sycl/test-e2e/Matrix/joint_matrix_opt_kernel_feature_unsupported_hw.cpp
+++ b/sycl/test-e2e/Matrix/joint_matrix_opt_kernel_feature_unsupported_hw.cpp
@@ -1,4 +1,4 @@
-// REQUIRES: gpu-intel-gen12
+// REQUIRES: gpu-intel-gen12, gpu
 
 // RUN: %{build} -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=4
 // RUN: %{run} %t.out
diff --git a/sycl/test-e2e/syclcompat/atomic/atomic_arith.cpp b/sycl/test-e2e/syclcompat/atomic/atomic_arith.cpp
index 002621de9bfeb..bbd406a35e69e 100644
--- a/sycl/test-e2e/syclcompat/atomic/atomic_arith.cpp
+++ b/sycl/test-e2e/syclcompat/atomic/atomic_arith.cpp
@@ -32,7 +32,9 @@
 
 // UNSUPPORTED: hip
 
-// RUN: %clangxx -std=c++20 -fsycl -fsycl-targets=%{sycl_triple} %s -o %t.out
+// FIXME: Remove "-fsycl-device-code-split=per_kernel" option after fixing
+// https://github.com/intel/llvm/issues/12743.
+// RUN: %clangxx -std=c++20 -fsycl -fsycl-targets=%{sycl_triple} -fsycl-device-code-split=per_kernel %s -o %t.out
 // RUN: %{run} %t.out
 
 #include <cstddef>
diff --git a/sycl/test-e2e/syclcompat/atomic/atomic_comp_exchange.cpp b/sycl/test-e2e/syclcompat/atomic/atomic_comp_exchange.cpp
index 9dc513ae5a86a..b315816034a51 100644
--- a/sycl/test-e2e/syclcompat/atomic/atomic_comp_exchange.cpp
+++ b/sycl/test-e2e/syclcompat/atomic/atomic_comp_exchange.cpp
@@ -32,7 +32,9 @@
 
 // UNSUPPORTED: hip
 
-// RUN: %clangxx -std=c++20 -fsycl -fsycl-targets=%{sycl_triple} %s -o %t.out
+// FIXME: Remove "-fsycl-device-code-split=per_kernel" option after fixing
+// https://github.com/intel/llvm/issues/12743.
+// RUN: %clangxx -std=c++20 -fsycl -fsycl-targets=%{sycl_triple} -fsycl-device-code-split=per_kernel %s -o %t.out
 // RUN: %{run} %t.out
 
 #include <type_traits>

From 358843a8436593684c2bc0a7299480696b2d60a8 Mon Sep 17 00:00:00 2001
From: stdale-intel <stewart.t.dale@intel.com>
Date: Wed, 21 Feb 2024 13:09:48 -0800
Subject: [PATCH 15/30] [CI] Enable OSSF scorecard workflow to run on
 intel/llvm (#12779)

Enable the OSSF (https://github.com/ossf) scorecard workflow to run and
generate the repo security score for tracking current repo security
issues.

The workflow is currently enabled to run nightly while we resolve any open
issues; it will move to a weekly schedule once runs are clean.
---
 .github/workflows/scorecard.yml | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/scorecard.yml b/.github/workflows/scorecard.yml
index 17132c9270a09..35e746eb7069d 100644
--- a/.github/workflows/scorecard.yml
+++ b/.github/workflows/scorecard.yml
@@ -2,7 +2,7 @@
 # by a third-party and are governed by separate terms of service, privacy
 # policy, and support documentation.
 
-# Check current LLVM-Project results here: https://securityscorecards.dev/viewer/?uri=github.com/llvm/llvm-project
+# Check current intel/llvm results here: https://securityscorecards.dev/viewer/?uri=github.com/intel/llvm
 
 name: Scorecard supply-chain security
 on:
@@ -12,7 +12,7 @@ on:
   # To guarantee Maintained check is occasionally updated. See
   # https://github.com/ossf/scorecard/blob/main/docs/checks.md#maintained
   schedule:
-    - cron: '38 20 * * *'
+    - cron: '30 20 * * *'
 
 # Declare default permissions as read only.
 permissions:
@@ -22,7 +22,7 @@ jobs:
   analysis:
     name: Scorecard analysis
     runs-on: ubuntu-latest
-    if: github.repository == 'llvm/llvm-project'
+    if: github.repository == 'intel/llvm'
     permissions:
       # Needed to upload the results to code-scanning dashboard.
       security-events: write

From 4bc97454d5362cabb6ff4ac68953c1b41e8c9fa1 Mon Sep 17 00:00:00 2001
From: Steffen Larsen <steffen.larsen@intel.com>
Date: Thu, 22 Feb 2024 10:02:54 +0100
Subject: [PATCH 16/30] [SYCL] Fix ballot_group when the sub-group is not full
 size (#12737)

Not all sub-groups necessarily have the maximum sub-group size of the
kernel invocation, so non-uniform groups must handle smaller sub-groups
properly. However, because of how ballot_group builds the mask for the
false-predicate group, that group assumes a full 32-element size no
matter how big the actual sub-group is. This commit fixes this issue.

---------

Signed-off-by: Larsen, Steffen <steffen.larsen@intel.com>
---
 .../ext/oneapi/experimental/ballot_group.hpp  |  8 +-
 .../oneapi/experimental/fixed_size_group.hpp  |  2 +-
 .../NonUniformGroups/ballot_group.cpp         | 82 +++++++++--------
 .../NonUniformGroups/fixed_size_group.cpp     | 71 ++++++++-------
 .../NonUniformGroups/opportunistic_group.cpp  | 82 +++++++++--------
 .../NonUniformGroups/tangle_group.cpp         | 88 ++++++++++---------
 6 files changed, 186 insertions(+), 147 deletions(-)

diff --git a/sycl/include/sycl/ext/oneapi/experimental/ballot_group.hpp b/sycl/include/sycl/ext/oneapi/experimental/ballot_group.hpp
index 985620152e4ae..079a637580b93 100644
--- a/sycl/include/sycl/ext/oneapi/experimental/ballot_group.hpp
+++ b/sycl/include/sycl/ext/oneapi/experimental/ballot_group.hpp
@@ -153,7 +153,13 @@ get_ballot_group(Group group, bool predicate) {
   if (predicate) {
     return ballot_group<sycl::sub_group>(mask, predicate);
   } else {
-    return ballot_group<sycl::sub_group>(~mask, predicate);
+    // To negate the mask for the false-predicate group, we also need to exclude
+    // all parts of the mask that are not part of the group.
+    sub_group_mask::BitsType participant_filter =
+        (~sub_group_mask::BitsType{0}) >>
+        (sub_group_mask::max_bits - group.get_local_linear_range());
+    return ballot_group<sycl::sub_group>((~mask) & participant_filter,
+                                         predicate);
   }
 #endif
 #else
diff --git a/sycl/include/sycl/ext/oneapi/experimental/fixed_size_group.hpp b/sycl/include/sycl/ext/oneapi/experimental/fixed_size_group.hpp
index c5543989998a2..3c7ef6b257d0a 100644
--- a/sycl/include/sycl/ext/oneapi/experimental/fixed_size_group.hpp
+++ b/sycl/include/sycl/ext/oneapi/experimental/fixed_size_group.hpp
@@ -64,7 +64,7 @@ template <size_t PartitionSize, typename ParentGroup> class fixed_size_group {
 
   range_type get_group_range() const {
 #ifdef __SYCL_DEVICE_ONLY__
-    return __spirv_SubgroupMaxSize() / PartitionSize;
+    return __spirv_SubgroupSize() / PartitionSize;
 #else
     throw runtime_error("Non-uniform groups are not supported on host device.",
                         PI_ERROR_INVALID_DEVICE);
diff --git a/sycl/test-e2e/NonUniformGroups/ballot_group.cpp b/sycl/test-e2e/NonUniformGroups/ballot_group.cpp
index 2a4eba90a68e8..7f21d55bcc3a4 100644
--- a/sycl/test-e2e/NonUniformGroups/ballot_group.cpp
+++ b/sycl/test-e2e/NonUniformGroups/ballot_group.cpp
@@ -20,43 +20,51 @@ int main() {
     return 0;
   }
 
-  sycl::buffer<bool, 1> MatchBuf{sycl::range{32}};
-  sycl::buffer<bool, 1> LeaderBuf{sycl::range{32}};
-
-  const auto NDR = sycl::nd_range<1>{32, 32};
-  Q.submit([&](sycl::handler &CGH) {
-    sycl::accessor MatchAcc{MatchBuf, CGH, sycl::write_only};
-    sycl::accessor LeaderAcc{LeaderBuf, CGH, sycl::write_only};
-    const auto KernelFunc =
-        [=](sycl::nd_item<1> item) [[sycl::reqd_sub_group_size(32)]] {
-          auto WI = item.get_global_id();
-          auto SG = item.get_sub_group();
-
-          // Split into odd and even work-items.
-          bool Predicate = WI % 2 == 0;
-          auto BallotGroup = syclex::get_ballot_group(SG, Predicate);
-
-          // Check function return values match Predicate.
-          // NB: Test currently uses exactly one sub-group, but we use SG
-          //     below in case this changes in future.
-          bool Match = true;
-          auto GroupID = (Predicate) ? 1 : 0;
-          auto LocalID = SG.get_local_id() / 2;
-          Match &= (BallotGroup.get_group_id() == GroupID);
-          Match &= (BallotGroup.get_local_id() == LocalID);
-          Match &= (BallotGroup.get_group_range() == 2);
-          Match &= (BallotGroup.get_local_range() == 16);
-          MatchAcc[WI] = Match;
-          LeaderAcc[WI] = BallotGroup.leader();
-        };
-    CGH.parallel_for<TestKernel>(NDR, KernelFunc);
-  });
-
-  sycl::host_accessor MatchAcc{MatchBuf, sycl::read_only};
-  sycl::host_accessor LeaderAcc{LeaderBuf, sycl::read_only};
-  for (int WI = 0; WI < 32; ++WI) {
-    assert(MatchAcc[WI] == true);
-    assert(LeaderAcc[WI] == (WI < 2));
+  // Test for both the full sub-group size and a case with less work than a full
+  // sub-group.
+  for (size_t WGS : std::array<size_t, 2>{32, 16}) {
+    std::cout << "Testing for work size " << WGS << std::endl;
+
+    sycl::buffer<bool, 1> MatchBuf{sycl::range{WGS}};
+    sycl::buffer<bool, 1> LeaderBuf{sycl::range{WGS}};
+
+    const auto NDR = sycl::nd_range<1>{WGS, WGS};
+    Q.submit([&](sycl::handler &CGH) {
+      sycl::accessor MatchAcc{MatchBuf, CGH, sycl::write_only};
+      sycl::accessor LeaderAcc{LeaderBuf, CGH, sycl::write_only};
+      const auto KernelFunc =
+          [=](sycl::nd_item<1> item) [[sycl::reqd_sub_group_size(32)]] {
+            auto WI = item.get_global_id();
+            auto SG = item.get_sub_group();
+
+            // Split into odd and even work-items.
+            bool Predicate = WI % 2 == 0;
+            auto BallotGroup = syclex::get_ballot_group(SG, Predicate);
+
+            // Check function return values match Predicate.
+            // NB: Test currently uses exactly one sub-group, but we use SG
+            //     below in case this changes in future.
+            bool Match = true;
+            auto GroupID = (Predicate) ? 1 : 0;
+            auto LocalID = SG.get_local_id() / 2;
+            Match &= (BallotGroup.get_group_id() == GroupID);
+            Match &= (BallotGroup.get_local_id() == LocalID);
+            Match &= (BallotGroup.get_group_range() == 2);
+            Match &= (BallotGroup.get_local_range() ==
+                      SG.get_local_linear_range() / 2);
+            MatchAcc[WI] = Match;
+            LeaderAcc[WI] = BallotGroup.leader();
+          };
+      CGH.parallel_for<TestKernel>(NDR, KernelFunc);
+    });
+
+    sycl::host_accessor MatchAcc{MatchBuf, sycl::read_only};
+    sycl::host_accessor LeaderAcc{LeaderBuf, sycl::read_only};
+    for (int WI = 0; WI < WGS; ++WI) {
+      assert(MatchAcc[WI] == true);
+      assert(LeaderAcc[WI] == (WI < 2));
+    }
   }
+
   return 0;
 }
diff --git a/sycl/test-e2e/NonUniformGroups/fixed_size_group.cpp b/sycl/test-e2e/NonUniformGroups/fixed_size_group.cpp
index 2f043c5bed711..29567639ff442 100644
--- a/sycl/test-e2e/NonUniformGroups/fixed_size_group.cpp
+++ b/sycl/test-e2e/NonUniformGroups/fixed_size_group.cpp
@@ -14,36 +14,47 @@ template <size_t PartitionSize> class TestKernel;
 template <size_t PartitionSize> void test() {
   sycl::queue Q;
 
-  sycl::buffer<bool, 1> MatchBuf{sycl::range{32}};
-  sycl::buffer<bool, 1> LeaderBuf{sycl::range{32}};
-
-  const auto NDR = sycl::nd_range<1>{32, 32};
-  Q.submit([&](sycl::handler &CGH) {
-    sycl::accessor MatchAcc{MatchBuf, CGH, sycl::write_only};
-    sycl::accessor LeaderAcc{LeaderBuf, CGH, sycl::write_only};
-    const auto KernelFunc =
-        [=](sycl::nd_item<1> item) [[sycl::reqd_sub_group_size(32)]] {
-          auto WI = item.get_global_id();
-          auto SG = item.get_sub_group();
-
-          auto Partition = syclex::get_fixed_size_group<PartitionSize>(SG);
-
-          bool Match = true;
-          Match &= (Partition.get_group_id() == (WI / PartitionSize));
-          Match &= (Partition.get_local_id() == (WI % PartitionSize));
-          Match &= (Partition.get_group_range() == (32 / PartitionSize));
-          Match &= (Partition.get_local_range() == PartitionSize);
-          MatchAcc[WI] = Match;
-          LeaderAcc[WI] = Partition.leader();
-        };
-    CGH.parallel_for<TestKernel<PartitionSize>>(NDR, KernelFunc);
-  });
-
-  sycl::host_accessor MatchAcc{MatchBuf, sycl::read_only};
-  sycl::host_accessor LeaderAcc{LeaderBuf, sycl::read_only};
-  for (int WI = 0; WI < 32; ++WI) {
-    assert(MatchAcc[WI] == true);
-    assert(LeaderAcc[WI] == ((WI % PartitionSize) == 0));
+  // Test for both the full sub-group size and a case with less work than a full
+  // sub-group.
+  for (size_t WGS : std::array<size_t, 2>{32, 16}) {
+    if (WGS < PartitionSize)
+      continue;
+
+    std::cout << "Testing for work size " << WGS << " and partition size "
+              << PartitionSize << std::endl;
+
+    sycl::buffer<bool, 1> MatchBuf{sycl::range{WGS}};
+    sycl::buffer<bool, 1> LeaderBuf{sycl::range{WGS}};
+
+    const auto NDR = sycl::nd_range<1>{WGS, WGS};
+    Q.submit([&](sycl::handler &CGH) {
+      sycl::accessor MatchAcc{MatchBuf, CGH, sycl::write_only};
+      sycl::accessor LeaderAcc{LeaderBuf, CGH, sycl::write_only};
+      const auto KernelFunc =
+          [=](sycl::nd_item<1> item) [[sycl::reqd_sub_group_size(32)]] {
+            auto WI = item.get_global_id();
+            auto SG = item.get_sub_group();
+            auto SGS = SG.get_local_linear_range();
+
+            auto Partition = syclex::get_fixed_size_group<PartitionSize>(SG);
+
+            bool Match = true;
+            Match &= (Partition.get_group_id() == (WI / PartitionSize));
+            Match &= (Partition.get_local_id() == (WI % PartitionSize));
+            Match &= (Partition.get_group_range() == (SGS / PartitionSize));
+            Match &= (Partition.get_local_range() == PartitionSize);
+            MatchAcc[WI] = Match;
+            LeaderAcc[WI] = Partition.leader();
+          };
+      CGH.parallel_for<TestKernel<PartitionSize>>(NDR, KernelFunc);
+    });
+
+    sycl::host_accessor MatchAcc{MatchBuf, sycl::read_only};
+    sycl::host_accessor LeaderAcc{LeaderBuf, sycl::read_only};
+    for (int WI = 0; WI < WGS; ++WI) {
+      assert(MatchAcc[WI] == true);
+      assert(LeaderAcc[WI] == ((WI % PartitionSize) == 0));
+    }
   }
 }
 
diff --git a/sycl/test-e2e/NonUniformGroups/opportunistic_group.cpp b/sycl/test-e2e/NonUniformGroups/opportunistic_group.cpp
index 292d26859cbef..c926a8643f31d 100644
--- a/sycl/test-e2e/NonUniformGroups/opportunistic_group.cpp
+++ b/sycl/test-e2e/NonUniformGroups/opportunistic_group.cpp
@@ -20,50 +20,56 @@ int main() {
     return 0;
   }
 
-  sycl::buffer<bool, 1> MatchBuf{sycl::range{32}};
-  sycl::buffer<bool, 1> LeaderBuf{sycl::range{32}};
+  // Test for both the full sub-group size and a case with less work than a full
+  // sub-group.
+  for (size_t WGS : std::array<size_t, 2>{32, 16}) {
+    std::cout << "Testing for work size " << WGS << std::endl;
 
-  const auto NDR = sycl::nd_range<1>{32, 32};
-  Q.submit([&](sycl::handler &CGH) {
-    sycl::accessor MatchAcc{MatchBuf, CGH, sycl::write_only};
-    sycl::accessor LeaderAcc{LeaderBuf, CGH, sycl::write_only};
-    const auto KernelFunc =
-        [=](sycl::nd_item<1> item) [[sycl::reqd_sub_group_size(32)]] {
-          auto WI = item.get_global_id();
-          auto SG = item.get_sub_group();
+    sycl::buffer<bool, 1> MatchBuf{sycl::range{WGS}};
+    sycl::buffer<bool, 1> LeaderBuf{sycl::range{WGS}};
 
-          // Due to the unpredictable runtime behavior of opportunistic groups,
-          // some values may change from run to run. Check they're in expected
-          // ranges and consistent with other groups.
-          if (item.get_global_id() % 2 == 0) {
-            auto OpportunisticGroup =
-                syclex::this_kernel::get_opportunistic_group();
+    const auto NDR = sycl::nd_range<1>{WGS, WGS};
+    Q.submit([&](sycl::handler &CGH) {
+      sycl::accessor MatchAcc{MatchBuf, CGH, sycl::write_only};
+      sycl::accessor LeaderAcc{LeaderBuf, CGH, sycl::write_only};
+      const auto KernelFunc =
+          [=](sycl::nd_item<1> item) [[sycl::reqd_sub_group_size(32)]] {
+            auto WI = item.get_global_id();
+            auto SG = item.get_sub_group();
 
-            bool Match = true;
-            Match &= (OpportunisticGroup.get_group_id() == 0);
-            Match &= (OpportunisticGroup.get_local_id() <
-                      OpportunisticGroup.get_local_range());
-            Match &= (OpportunisticGroup.get_group_range() == 1);
-            Match &= (OpportunisticGroup.get_local_linear_range() <=
-                      SG.get_local_linear_range());
-            MatchAcc[WI] = Match;
-            LeaderAcc[WI] = OpportunisticGroup.leader();
-          }
-        };
-    CGH.parallel_for<TestKernel>(NDR, KernelFunc);
-  });
+            // Due to the unpredictable runtime behavior of opportunistic
+            // groups, some values may change from run to run. Check they're in
+            // expected ranges and consistent with other groups.
+            if (item.get_global_id() % 2 == 0) {
+              auto OpportunisticGroup =
+                  syclex::this_kernel::get_opportunistic_group();
 
-  sycl::host_accessor MatchAcc{MatchBuf, sycl::read_only};
-  sycl::host_accessor LeaderAcc{LeaderBuf, sycl::read_only};
-  uint32_t NumLeaders = 0;
-  for (int WI = 0; WI < 32; ++WI) {
-    if (WI % 2 == 0) {
-      assert(MatchAcc[WI] == true);
-      if (LeaderAcc[WI]) {
-        NumLeaders++;
+              bool Match = true;
+              Match &= (OpportunisticGroup.get_group_id() == 0);
+              Match &= (OpportunisticGroup.get_local_id() <
+                        OpportunisticGroup.get_local_range());
+              Match &= (OpportunisticGroup.get_group_range() == 1);
+              Match &= (OpportunisticGroup.get_local_linear_range() <=
+                        SG.get_local_linear_range());
+              MatchAcc[WI] = Match;
+              LeaderAcc[WI] = OpportunisticGroup.leader();
+            }
+          };
+      CGH.parallel_for<TestKernel>(NDR, KernelFunc);
+    });
+
+    sycl::host_accessor MatchAcc{MatchBuf, sycl::read_only};
+    sycl::host_accessor LeaderAcc{LeaderBuf, sycl::read_only};
+    uint32_t NumLeaders = 0;
+    for (int WI = 0; WI < WGS; ++WI) {
+      if (WI % 2 == 0) {
+        assert(MatchAcc[WI] == true);
+        if (LeaderAcc[WI]) {
+          NumLeaders++;
+        }
       }
     }
+    assert(NumLeaders > 0);
   }
-  assert(NumLeaders > 0);
   return 0;
 }
diff --git a/sycl/test-e2e/NonUniformGroups/tangle_group.cpp b/sycl/test-e2e/NonUniformGroups/tangle_group.cpp
index 80132d6aa3e30..a5fb9a0d31dc6 100644
--- a/sycl/test-e2e/NonUniformGroups/tangle_group.cpp
+++ b/sycl/test-e2e/NonUniformGroups/tangle_group.cpp
@@ -20,51 +20,59 @@ int main() {
     return 0;
   }
 
-  sycl::buffer<bool, 1> MatchBuf{sycl::range{32}};
-  sycl::buffer<bool, 1> LeaderBuf{sycl::range{32}};
+  // Test for both the full sub-group size and a case with less work than a full
+  // sub-group.
+  for (size_t WGS : std::array<size_t, 2>{32, 16}) {
+    std::cout << "Testing for work size " << WGS << std::endl;
 
-  const auto NDR = sycl::nd_range<1>{32, 32};
-  Q.submit([&](sycl::handler &CGH) {
-    sycl::accessor MatchAcc{MatchBuf, CGH, sycl::write_only};
-    sycl::accessor LeaderAcc{LeaderBuf, CGH, sycl::write_only};
-    const auto KernelFunc =
-        [=](sycl::nd_item<1> item) [[sycl::reqd_sub_group_size(32)]] {
-          auto WI = item.get_global_id();
-          auto SG = item.get_sub_group();
+    sycl::buffer<bool, 1> MatchBuf{sycl::range{WGS}};
+    sycl::buffer<bool, 1> LeaderBuf{sycl::range{WGS}};
 
-          // Split into odd and even work-items via control flow.
-          // Branches deliberately duplicated to test impact of optimizations.
-          // This only reliably works with optimizations disabled right now.
-          if (item.get_global_id() % 2 == 0) {
-            auto TangleGroup = syclex::get_tangle_group(SG);
+    const auto NDR = sycl::nd_range<1>{WGS, WGS};
+    Q.submit([&](sycl::handler &CGH) {
+      sycl::accessor MatchAcc{MatchBuf, CGH, sycl::write_only};
+      sycl::accessor LeaderAcc{LeaderBuf, CGH, sycl::write_only};
+      const auto KernelFunc =
+          [=](sycl::nd_item<1> item) [[sycl::reqd_sub_group_size(32)]] {
+            auto WI = item.get_global_id();
+            auto SG = item.get_sub_group();
 
-            bool Match = true;
-            Match &= (TangleGroup.get_group_id() == 0);
-            Match &= (TangleGroup.get_local_id() == SG.get_local_id() / 2);
-            Match &= (TangleGroup.get_group_range() == 1);
-            Match &= (TangleGroup.get_local_range() == 16);
-            MatchAcc[WI] = Match;
-            LeaderAcc[WI] = TangleGroup.leader();
-          } else {
-            auto TangleGroup = syclex::get_tangle_group(SG);
+            // Split into odd and even work-items via control flow.
+            // Branches deliberately duplicated to test impact of optimizations.
+            // This only reliably works with optimizations disabled right now.
+            if (item.get_global_id() % 2 == 0) {
+              auto TangleGroup = syclex::get_tangle_group(SG);
 
-            bool Match = true;
-            Match &= (TangleGroup.get_group_id() == 0);
-            Match &= (TangleGroup.get_local_id() == SG.get_local_id() / 2);
-            Match &= (TangleGroup.get_group_range() == 1);
-            Match &= (TangleGroup.get_local_range() == 16);
-            MatchAcc[WI] = Match;
-            LeaderAcc[WI] = TangleGroup.leader();
-          }
-        };
-    CGH.parallel_for<TestKernel>(NDR, KernelFunc);
-  });
+              bool Match = true;
+              Match &= (TangleGroup.get_group_id() == 0);
+              Match &= (TangleGroup.get_local_id() == SG.get_local_id() / 2);
+              Match &= (TangleGroup.get_group_range() == 1);
+              Match &= (TangleGroup.get_local_range() ==
+                        SG.get_local_linear_range() / 2);
+              MatchAcc[WI] = Match;
+              LeaderAcc[WI] = TangleGroup.leader();
+            } else {
+              auto TangleGroup = syclex::get_tangle_group(SG);
 
-  sycl::host_accessor MatchAcc{MatchBuf, sycl::read_only};
-  sycl::host_accessor LeaderAcc{LeaderBuf, sycl::read_only};
-  for (int WI = 0; WI < 32; ++WI) {
-    assert(MatchAcc[WI] == true);
-    assert(LeaderAcc[WI] == (WI < 2));
+              bool Match = true;
+              Match &= (TangleGroup.get_group_id() == 0);
+              Match &= (TangleGroup.get_local_id() == SG.get_local_id() / 2);
+              Match &= (TangleGroup.get_group_range() == 1);
+              Match &= (TangleGroup.get_local_range() ==
+                        SG.get_local_linear_range() / 2);
+              MatchAcc[WI] = Match;
+              LeaderAcc[WI] = TangleGroup.leader();
+            }
+          };
+      CGH.parallel_for<TestKernel>(NDR, KernelFunc);
+    });
+
+    sycl::host_accessor MatchAcc{MatchBuf, sycl::read_only};
+    sycl::host_accessor LeaderAcc{LeaderBuf, sycl::read_only};
+    for (int WI = 0; WI < WGS; ++WI) {
+      assert(MatchAcc[WI] == true);
+      assert(LeaderAcc[WI] == (WI < 2));
+    }
   }
   return 0;
 }

From 343d9531c637a27fdb7c4de2d8273bb7c7c6b0d1 Mon Sep 17 00:00:00 2001
From: Neil Spruit <neil.r.spruit@intel.com>
Date: Thu, 22 Feb 2024 04:42:38 -0800
Subject: [PATCH 17/30] [UR][L0] Fix the multi device event cache to allocate
 lists as pointers (#12778)

- Fix a multi-device crash where a large number of devices caused a stack
overflow. Per-device event caches are now explicitly allocated.

- pre-commit PR for
https://github.com/oneapi-src/unified-runtime/pull/1366

---------

Signed-off-by: Spruit, Neil R <neil.r.spruit@intel.com>
Co-authored-by: Kenneth Benzie (Benie) <k.benzie@codeplay.com>
---
 sycl/plugins/unified_runtime/CMakeLists.txt | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/sycl/plugins/unified_runtime/CMakeLists.txt b/sycl/plugins/unified_runtime/CMakeLists.txt
index 2c28c7682d5a2..c362246b6f77d 100644
--- a/sycl/plugins/unified_runtime/CMakeLists.txt
+++ b/sycl/plugins/unified_runtime/CMakeLists.txt
@@ -57,13 +57,13 @@ if(SYCL_PI_UR_USE_FETCH_CONTENT)
   include(FetchContent)
 
   set(UNIFIED_RUNTIME_REPO "https://github.com/oneapi-src/unified-runtime.git")
-  # commit 24078c26ab871572067f520d856f4c65271cb9e5
-  # Merge: 227a5edf 89a66af7
+  # commit b4150ad1512476eb6ea0f2ede3bd29a6e3fd2b9e
+  # Merge: 4814e717 123c00f1
   # Author: Kenneth Benzie (Benie) <k.benzie@codeplay.com>
-  # Date:   Mon Feb 19 11:51:49 2024 +0100
-  #     Merge pull request #1299 from rafbiels/rafbiels/fix-cuda-maxreg-check
-  #     [CUDA] Fix MaxRegsPerBlock check in setKernelParams
-  set(UNIFIED_RUNTIME_TAG 24078c26ab871572067f520d856f4c65271cb9e5)
+  # Date:   Thu Feb 22 10:42:39 2024 +0000
+  #     Merge pull request #1366 from nrspruit/fix_multidevice_event_cache
+  #     [L0] Fix the multi device event cache to allocate lists as pointers
+  set(UNIFIED_RUNTIME_TAG b4150ad1512476eb6ea0f2ede3bd29a6e3fd2b9e)
 
   if(SYCL_PI_UR_OVERRIDE_FETCH_CONTENT_REPO)
     set(UNIFIED_RUNTIME_REPO "${SYCL_PI_UR_OVERRIDE_FETCH_CONTENT_REPO}")

From b0f584c675f9e90c12a4153a13b2821bcbe8cd0b Mon Sep 17 00:00:00 2001
From: Steffen Larsen <steffen.larsen@intel.com>
Date: Thu, 22 Feb 2024 13:43:30 +0100
Subject: [PATCH 18/30] [SYCL][Docs] Make external events wait with queue
 (#12766)

This commit makes the event set through ext_oneapi_set_external_event
also be waited on when the queue is waited on.

---------

Signed-off-by: Larsen, Steffen <steffen.larsen@intel.com>
---
 ..._ext_oneapi_in_order_queue_events.asciidoc |  5 ++
 sycl/source/detail/queue_impl.cpp             |  5 ++
 .../InOrderEventsExt/set_external_event.cpp   | 62 ++++++++++++++++---
 3 files changed, 64 insertions(+), 8 deletions(-)

diff --git a/sycl/doc/extensions/experimental/sycl_ext_oneapi_in_order_queue_events.asciidoc b/sycl/doc/extensions/experimental/sycl_ext_oneapi_in_order_queue_events.asciidoc
index 012e9efd7de3f..d04247b7bf6a7 100644
--- a/sycl/doc/extensions/experimental/sycl_ext_oneapi_in_order_queue_events.asciidoc
+++ b/sycl/doc/extensions/experimental/sycl_ext_oneapi_in_order_queue_events.asciidoc
@@ -140,6 +140,11 @@ This is equivalent to calling `handler::depends_on()` in a command submission
 with the `externalEvent` from the most recent call to this member function since
 the previous command submission to the same queue.
 
+If `queue::wait()` or `queue::wait_and_throw()` is called prior to any command
+submission following a call to this member function, `externalEvent.wait()` is
+called and `externalEvent` will not be a dependency on the next command
+submitted to the queue.
+
 Calls to this member function throw a `sycl::exception` with `errc::invalid` if
 the queue does not have the `property::queue::in_order` property.
 
diff --git a/sycl/source/detail/queue_impl.cpp b/sycl/source/detail/queue_impl.cpp
index eb0e274a191fb..bc8c26ca88cb0 100644
--- a/sycl/source/detail/queue_impl.cpp
+++ b/sycl/source/detail/queue_impl.cpp
@@ -516,6 +516,11 @@ void queue_impl::wait(const detail::code_location &CodeLoc) {
   for (const EventImplPtr &Event : StreamsServiceEvents)
     Event->wait(Event);
 
+  // If there is an external event set, we need to wait on it.
+  std::optional<event> ExternalEvent = popExternalEvent();
+  if (ExternalEvent)
+    ExternalEvent->wait();
+
 #ifdef XPTI_ENABLE_INSTRUMENTATION
   instrumentationEpilog(TelemetryEvent, Name, StreamID, IId);
 #endif
diff --git a/sycl/test-e2e/InOrderEventsExt/set_external_event.cpp b/sycl/test-e2e/InOrderEventsExt/set_external_event.cpp
index 61987d5b5b3cb..f1d2dfb47ddbf 100644
--- a/sycl/test-e2e/InOrderEventsExt/set_external_event.cpp
+++ b/sycl/test-e2e/InOrderEventsExt/set_external_event.cpp
@@ -8,16 +8,13 @@
 
 constexpr size_t N = 1024;
 
-int main() {
-  sycl::context Ctx;
-  sycl::device Dev = Ctx.get_devices()[0];
-
-  sycl::queue Q1{Ctx, Dev, {sycl::property::queue::in_order{}}};
-  sycl::queue Q2{Ctx, Dev, {sycl::property::queue::in_order{}}};
+int check_work(sycl::queue &Q1, sycl::queue &Q2) {
+  std::cout << "Checking ext_oneapi_set_external_event for a workload."
+            << std::endl;
 
   sycl::buffer<int> DevDataBuf{sycl::range{N}};
   sycl::accessor DevData{DevDataBuf};
-  int *HostData = (int *)malloc(N * sizeof(int) * 10);
+  int *HostData = new int[N * 10];
 
   for (size_t I = 0; I < 10; ++I) {
     Q1.fill(DevData, 0);
@@ -52,6 +49,55 @@ int main() {
       }
     }
   }
-  free(HostData);
+  delete[] HostData;
+  return Failures;
+}
+
+int check_wait(sycl::queue &Q1, sycl::queue &Q2) {
+  std::cout << "Checking ext_oneapi_set_external_event with wait on queue."
+            << std::endl;
+
+  sycl::buffer<int> DevDataBuf{sycl::range{N}};
+  sycl::accessor DevData{DevDataBuf};
+  int *HostData = new int[N];
+
+  Q1.fill(DevData, 0);
+  for (size_t I = 0; I < 10; ++I) {
+    Q1.submit([&](sycl::handler &h) {
+      h.require(DevData);
+      h.parallel_for(N, [=](sycl::item<1> Idx) { ++DevData[Idx]; });
+    });
+  }
+  sycl::event E = Q1.copy(DevData, HostData);
+
+  Q2.ext_oneapi_set_external_event(E);
+  Q2.wait_and_throw();
+
+  int Failures = 0;
+  for (size_t I = 0; I < N; ++I) {
+    int Expected = 10;
+    int Actual = HostData[I];
+    if (Expected != Actual) {
+      std::cout << "Result not matching the expected value at index " << I
+                << ": " << Expected << " != " << Actual << std::endl;
+      ++Failures;
+    }
+  }
+  delete[] HostData;
+  return Failures;
+}
+
+int main() {
+  sycl::context Ctx;
+  sycl::device Dev = Ctx.get_devices()[0];
+
+  sycl::queue Q1{Ctx, Dev, {sycl::property::queue::in_order{}}};
+  sycl::queue Q2{Ctx, Dev, {sycl::property::queue::in_order{}}};
+
+  int Failures = 0;
+
+  Failures += check_work(Q1, Q2);
+  Failures += check_wait(Q1, Q2);
+
   return Failures;
 }

From 0fc5129a5d0b87e39d66a9fb8ca697ccde546602 Mon Sep 17 00:00:00 2001
From: Vyacheslav Levytskyy
 <89994100+VyacheslavLevytskyy@users.noreply.github.com>
Date: Thu, 22 Feb 2024 14:17:50 +0100
Subject: [PATCH 19/30] [SYCL][Matrix] Update builtin name for the checked
 matrix construct SPIR-V instruction (#12787)

In order to support OpCooperativeMatrixConstructCheckedINTEL instruction
the builtin name must be updated to the corresponding spelling:
__spirv_CooperativeMatrixConstructCheckedINTEL
---
 sycl/include/CL/__spirv/spirv_ops.hpp                | 6 +++---
 sycl/include/sycl/ext/oneapi/matrix/matrix-intel.hpp | 2 +-
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/sycl/include/CL/__spirv/spirv_ops.hpp b/sycl/include/CL/__spirv/spirv_ops.hpp
index 9af5b7e75ae38..323f1328cbff5 100644
--- a/sycl/include/CL/__spirv/spirv_ops.hpp
+++ b/sycl/include/CL/__spirv/spirv_ops.hpp
@@ -51,9 +51,9 @@ template <typename T, typename Tp, std::size_t R, std::size_t C,
           __spv::Scope::Flag S = __spv::Scope::Flag::Subgroup>
 extern __DPCPP_SYCL_EXTERNAL
     __spv::__spirv_JointMatrixINTEL<Tp, R, C, L, S, U> *
-    __spirv_CompositeConstructCheckedINTEL(const T Value, size_t Height,
-                                           size_t Stride, size_t Width,
-                                           size_t CoordX, size_t CoordY);
+    __spirv_CooperativeMatrixConstructCheckedINTEL(const T Value, size_t Height,
+                                                   size_t Stride, size_t Width,
+                                                   size_t CoordX, size_t CoordY);
 
 template <typename T, typename Tp, std::size_t R, std::size_t C,
           __spv::MatrixUse U,
diff --git a/sycl/include/sycl/ext/oneapi/matrix/matrix-intel.hpp b/sycl/include/sycl/ext/oneapi/matrix/matrix-intel.hpp
index 05626d5972574..1ef69d7cbb5ab 100644
--- a/sycl/include/sycl/ext/oneapi/matrix/matrix-intel.hpp
+++ b/sycl/include/sycl/ext/oneapi/matrix/matrix-intel.hpp
@@ -616,7 +616,7 @@ inline __SYCL_ALWAYS_INLINE void joint_matrix_fill_checked(
   using storage_element_type =
       typename oneapi::detail::jm_type_interpretation_helper_trait<
           T>::storage_element_type;
-  Res.spvm = __spirv_CompositeConstructCheckedINTEL<
+  Res.spvm = __spirv_CooperativeMatrixConstructCheckedINTEL<
       storage_element_type, T, NumRows, NumCols,
       spv_matrix_use_traits<Use>::value,
       spv_matrix_layout_traits<Layout>::value>(

From d6eecfa0e709ec280db6c15fa84d9f573e6a9d42 Mon Sep 17 00:00:00 2001
From: Artur Gainullin <artur.gainullin@intel.com>
Date: Thu, 22 Feb 2024 07:17:12 -0800
Subject: [PATCH 20/30] [SYCL][Driver] Link with sycl libs at link step of
 clang-cl -fsycl (#12793)

This PR addresses the following scenario:

1. clang-cl -I[path to sycl headers] sycl_program.cpp
   The program is compiled, but without the sycl libs.
2. clang-cl -fsycl sycl_program.obj
   The user expects the sycl libs to be linked automatically here.

Without this fix this scenario fails at link step because sycl libraries
are not pulled in.

This scenario already works for clang driver, so only clang-cl needs a
fix.
---
 clang/lib/Driver/ToolChains/MSVC.cpp | 13 +++++++++----
 clang/test/Driver/sycl-offload.c     | 15 ++++++++++++++-
 2 files changed, 23 insertions(+), 5 deletions(-)

diff --git a/clang/lib/Driver/ToolChains/MSVC.cpp b/clang/lib/Driver/ToolChains/MSVC.cpp
index 52f811621ba82..5e08965320004 100644
--- a/clang/lib/Driver/ToolChains/MSVC.cpp
+++ b/clang/lib/Driver/ToolChains/MSVC.cpp
@@ -130,18 +130,23 @@ void visualstudio::Linker::ConstructJob(Compilation &C, const JobAction &JA,
     CmdArgs.push_back("-defaultlib:oldnames");
   }
 
-  if ((!C.getDriver().IsCLMode() && Args.hasArg(options::OPT_fsycl) &&
+  if ((Args.hasArg(options::OPT_fsycl) &&
        !Args.hasArg(options::OPT_nolibsycl)) ||
       Args.hasArg(options::OPT_fsycl_host_compiler_EQ)) {
     CmdArgs.push_back(Args.MakeArgString(std::string("-libpath:") +
                                          TC.getDriver().Dir + "/../lib"));
-    // When msvcrtd is added via --dependent-lib, we add the sycld
-    // equivalent.  Do not add the -defaultlib as it conflicts.
-    if (!isDependentLibAdded(Args, "msvcrtd")) {
+    if (!Args.hasArg(options::OPT__SLASH_MDd) &&
+        !isDependentLibAdded(Args, "msvcrtd")) {
       if (Args.hasArg(options::OPT_fpreview_breaking_changes))
         CmdArgs.push_back("-defaultlib:sycl" SYCL_MAJOR_VERSION "-preview.lib");
       else
         CmdArgs.push_back("-defaultlib:sycl" SYCL_MAJOR_VERSION ".lib");
+    } else {
+      if (Args.hasArg(options::OPT_fpreview_breaking_changes))
+        CmdArgs.push_back("-defaultlib:sycl" SYCL_MAJOR_VERSION
+                          "-previewd.lib");
+      else
+        CmdArgs.push_back("-defaultlib:sycl" SYCL_MAJOR_VERSION "d.lib");
     }
     CmdArgs.push_back("-defaultlib:sycl-devicelib-host.lib");
   }
diff --git a/clang/test/Driver/sycl-offload.c b/clang/test/Driver/sycl-offload.c
index 729c36ef5aebc..ee2bd1db9ed16 100644
--- a/clang/test/Driver/sycl-offload.c
+++ b/clang/test/Driver/sycl-offload.c
@@ -363,7 +363,7 @@
 // RUN: %clang -fsycl -target x86_64-unknown-windows-msvc %s -o %t -### 2>&1 | FileCheck -check-prefix=CHECK-LINK-SYCL %s
 // RUN: %clang_cl -fsycl %s -o %t -### 2>&1 | FileCheck -check-prefix=CHECK-LINK-SYCL-CL %s
 // CHECK-LINK-SYCL-CL: "--dependent-lib=sycl{{[0-9]*}}"
-// CHECK-LINK-SYCL-CL-NOT: "-defaultlib:sycl{{[0-9]*}}.lib"
+// CHECK-LINK-SYCL-CL: "-defaultlib:sycl{{[0-9]*}}.lib"
 // CHECK-LINK-SYCL: "-defaultlib:sycl{{[0-9]*}}.lib"
 
 /// Check no SYCL runtime is linked with -nolibsycl
@@ -688,3 +688,16 @@
 // FSYCL-PREVIEW-BREAKING-CHANGES-DEBUG-CHECK: --dependent-lib=sycl{{[0-9]*}}-previewd
 // FSYCL-PREVIEW-BREAKING-CHANGES-DEBUG-CHECK-NOT: -defaultlib:sycl{{[0-9]*}}.lib
 // FSYCL-PREVIEW-BREAKING-CHANGES-DEBUG-CHECK-NOT: -defaultlib:sycl{{[0-9]*}}-preview.lib
+
+
+/// Check that at the link step of "clang-cl -fsycl" we pull in sycl.lib even if the sycl libraries were not provided at the compilation step (this is possible if the user compiles manually without -fsycl by providing paths to the headers).
+// RUN: %clang_cl -### -fsycl -nolibsycl -target x86_64-unknown-windows-msvc -c %s 2>&1 | FileCheck -check-prefix FSYCL-CL-COMPILE-NOLIBS-CHECK %s
+// RUN: %clang_cl -### -fsycl %s 2>&1 | FileCheck -check-prefix FSYCL-CL-LINK-CHECK %s
+// FSYCL-CL-COMPILE-NOLIBS-CHECK-NOT: "--dependent-lib=sycl{{[0-9]*}}"
+// FSYCL-CL-LINK-CHECK: "-defaultlib:sycl{{[0-9]*}}.lib"
+
+/// Check that at the link step of "clang-cl -fsycl /MDd" we pull in sycld.lib even if the sycl libraries were not provided at the compilation step (this is possible if the user compiles manually without -fsycl by providing paths to the headers).
+// RUN: %clang_cl -### -fsycl -nolibsycl /MDd -target x86_64-unknown-windows-msvc -c %s 2>&1 | FileCheck -check-prefix FSYCL-CL-COMPILE-NOLIBS-MDd-CHECK %s
+// RUN: %clang_cl -### -fsycl /MDd %s 2>&1 | FileCheck -check-prefix FSYCL-CL-LINK--MDd-CHECK %s
+// FSYCL-CL-COMPILE-NOLIBS-MDd-CHECK-NOT: "--dependent-lib=sycl{{[0-9]*}}d"
+// FSYCL-CL-LINK--MDd-CHECK: "-defaultlib:sycl{{[0-9]*}}d.lib"

From 66d35e2c7885946d246925d9357eaac43bf69099 Mon Sep 17 00:00:00 2001
From: 0x12CC <68250218+0x12CC@users.noreply.github.com>
Date: Thu, 22 Feb 2024 10:25:08 -0500
Subject: [PATCH 21/30] [SYCL] Use PI APIs for cooperative kernels (#12367)

This change updates the SYCL runtime to use
`piextKernelSuggestMaxCooperativeGroupCount` and
`piextEnqueueCooperativeKernelLaunch` for cooperative kernels. These
functions are used to implement the query and launch kernels as
described in the sycl_ext_oneapi_root_group extension.

---------

Signed-off-by: Michael Aziz <michael.aziz@intel.com>
Co-authored-by: Kenneth Benzie (Benie) <k.benzie@codeplay.com>
Co-authored-by: Artur Gainullin <artur.gainullin@intel.com>
Co-authored-by: Steffen Larsen <steffen.larsen@intel.com>
---
 sycl/include/sycl/detail/cg.hpp               |  8 ++--
 sycl/include/sycl/detail/pi.def               |  2 +
 sycl/include/sycl/detail/pi.h                 | 22 +++++++++-
 sycl/include/sycl/detail/spirv.hpp            |  6 +++
 sycl/include/sycl/detail/type_traits.hpp      |  6 +--
 .../ext/oneapi/experimental/root_group.hpp    | 40 ++++++++----------
 sycl/include/sycl/handler.hpp                 |  7 ++++
 sycl/include/sycl/sycl.hpp                    |  1 +
 sycl/plugins/cuda/pi_cuda.cpp                 | 17 ++++++++
 sycl/plugins/hip/pi_hip.cpp                   | 17 ++++++++
 sycl/plugins/level_zero/pi_level_zero.cpp     | 17 ++++++++
 sycl/plugins/opencl/pi_opencl.cpp             | 17 ++++++++
 sycl/plugins/unified_runtime/pi2ur.hpp        | 37 +++++++++++++++++
 sycl/source/detail/graph_impl.cpp             |  2 +-
 sycl/source/detail/handler_impl.hpp           |  2 +
 sycl/source/detail/jit_compiler.cpp           |  2 +-
 sycl/source/detail/kernel_impl.hpp            | 20 +++++++++
 sycl/source/detail/scheduler/commands.cpp     | 37 +++++++++++------
 sycl/source/detail/scheduler/commands.hpp     |  3 +-
 .../source/detail/scheduler/graph_builder.cpp |  9 +++-
 sycl/source/feature_test.hpp.in               |  1 +
 sycl/source/handler.cpp                       | 41 ++++++++++++++-----
 sycl/source/kernel.cpp                        | 12 ++++++
 .../KernelFusion/cooperative_kernel.cpp       | 30 ++++++++++++++
 .../NonUniformGroups/is_fixed_topology.cpp    |  4 +-
 sycl/test/abi/pi_cuda_symbol_check.dump       |  2 +
 sycl/test/abi/pi_hip_symbol_check.dump        |  2 +
 sycl/test/abi/pi_level_zero_symbol_check.dump |  2 +
 sycl/test/abi/pi_opencl_symbol_check.dump     |  2 +
 sycl/test/abi/sycl_symbols_linux.dump         |  2 +
 sycl/test/abi/sycl_symbols_windows.dump       |  2 +
 sycl/unittests/helpers/PiMockPlugin.hpp       | 17 ++++++++
 .../arg_mask/EliminatedArgMask.cpp            |  2 +-
 .../scheduler/SchedulerTestUtils.hpp          |  2 +-
 .../scheduler/StreamInitDependencyOnHost.cpp  |  2 +-
 35 files changed, 334 insertions(+), 61 deletions(-)
 create mode 100644 sycl/test-e2e/KernelFusion/cooperative_kernel.cpp

diff --git a/sycl/include/sycl/detail/cg.hpp b/sycl/include/sycl/detail/cg.hpp
index 23a005bfcfcce..5c2ffdc712c00 100644
--- a/sycl/include/sycl/detail/cg.hpp
+++ b/sycl/include/sycl/detail/cg.hpp
@@ -125,7 +125,7 @@ class CG {
   CG(CG &&CommandGroup) = default;
   CG(const CG &CommandGroup) = default;
 
-  CGTYPE getType() { return MType; }
+  CGTYPE getType() const { return MType; }
 
   std::vector<std::vector<char>> &getArgsStorage() {
     return MData.MArgsStorage;
@@ -176,6 +176,7 @@ class CGExecKernel : public CG {
   std::vector<std::shared_ptr<detail::stream_impl>> MStreams;
   std::vector<std::shared_ptr<const void>> MAuxiliaryResources;
   sycl::detail::pi::PiKernelCacheConfig MKernelCacheConfig;
+  bool MKernelIsCooperative = false;
 
   CGExecKernel(NDRDescT NDRDesc, std::shared_ptr<HostKernelBase> HKernel,
                std::shared_ptr<detail::kernel_impl> SyclKernel,
@@ -186,14 +187,15 @@ class CGExecKernel : public CG {
                std::vector<std::shared_ptr<const void>> AuxiliaryResources,
                CGTYPE Type,
                sycl::detail::pi::PiKernelCacheConfig KernelCacheConfig,
-               detail::code_location loc = {})
+               bool KernelIsCooperative, detail::code_location loc = {})
       : CG(Type, std::move(CGData), std::move(loc)),
         MNDRDesc(std::move(NDRDesc)), MHostKernel(std::move(HKernel)),
         MSyclKernel(std::move(SyclKernel)),
         MKernelBundle(std::move(KernelBundle)), MArgs(std::move(Args)),
         MKernelName(std::move(KernelName)), MStreams(std::move(Streams)),
         MAuxiliaryResources(std::move(AuxiliaryResources)),
-        MKernelCacheConfig(std::move(KernelCacheConfig)) {
+        MKernelCacheConfig(std::move(KernelCacheConfig)),
+        MKernelIsCooperative(KernelIsCooperative) {
     assert(getType() == Kernel && "Wrong type of exec kernel CG.");
   }
 
diff --git a/sycl/include/sycl/detail/pi.def b/sycl/include/sycl/detail/pi.def
index d5709aac9dbac..c6b962b8b0f48 100644
--- a/sycl/include/sycl/detail/pi.def
+++ b/sycl/include/sycl/detail/pi.def
@@ -87,6 +87,7 @@ _PI_API(piextKernelSetArgPointer)
 _PI_API(piKernelSetExecInfo)
 _PI_API(piextKernelCreateWithNativeHandle)
 _PI_API(piextKernelGetNativeHandle)
+_PI_API(piextKernelSuggestMaxCooperativeGroupCount)
 // Event
 _PI_API(piEventCreate)
 _PI_API(piEventGetInfo)
@@ -105,6 +106,7 @@ _PI_API(piSamplerRetain)
 _PI_API(piSamplerRelease)
 // Queue commands
 _PI_API(piEnqueueKernelLaunch)
+_PI_API(piextEnqueueCooperativeKernelLaunch)
 _PI_API(piEnqueueEventsWait)
 _PI_API(piEnqueueEventsWaitWithBarrier)
 _PI_API(piEnqueueMemBufferRead)
diff --git a/sycl/include/sycl/detail/pi.h b/sycl/include/sycl/detail/pi.h
index 5059125da7646..56fdeb7a1051b 100644
--- a/sycl/include/sycl/detail/pi.h
+++ b/sycl/include/sycl/detail/pi.h
@@ -152,9 +152,11 @@
 // 15.43 Changed the signature of piextMemGetNativeHandle to also take a
 // pi_device
 // 15.44 Add coarse-grain memory advice flag for HIP.
+// 15.45 Added piextKernelSuggestMaxCooperativeGroupCount and
+//       piextEnqueueCooperativeKernelLaunch.
 
 #define _PI_H_VERSION_MAJOR 15
-#define _PI_H_VERSION_MINOR 44
+#define _PI_H_VERSION_MINOR 45
 
 #define _PI_STRING_HELPER(a) #a
 #define _PI_CONCAT(a, b) _PI_STRING_HELPER(a.b)
@@ -1670,6 +1672,18 @@ __SYCL_EXPORT pi_result piextKernelCreateWithNativeHandle(
 __SYCL_EXPORT pi_result
 piextKernelGetNativeHandle(pi_kernel kernel, pi_native_handle *nativeHandle);
 
+/// Gets the max work group count for a cooperative kernel.
+///
+/// \param kernel is the PI kernel being queried.
+/// \param local_work_size is the number of work items in a work group that will
+/// be used when the kernel is launched.
+/// \param dynamic_shared_memory_size is the size of dynamic shared memory, for
+/// each work group, in bytes, that will be used when the kernel is launched.
+/// \param group_count_ret is a pointer to where the query result is stored.
+__SYCL_EXPORT pi_result piextKernelSuggestMaxCooperativeGroupCount(
+    pi_kernel kernel, size_t local_work_size, size_t dynamic_shared_memory_size,
+    pi_uint32 *group_count_ret);
+
 //
 // Events
 //
@@ -1752,6 +1766,12 @@ __SYCL_EXPORT pi_result piEnqueueKernelLaunch(
     const size_t *local_work_size, pi_uint32 num_events_in_wait_list,
     const pi_event *event_wait_list, pi_event *event);
 
+__SYCL_EXPORT pi_result piextEnqueueCooperativeKernelLaunch(
+    pi_queue queue, pi_kernel kernel, pi_uint32 work_dim,
+    const size_t *global_work_offset, const size_t *global_work_size,
+    const size_t *local_work_size, pi_uint32 num_events_in_wait_list,
+    const pi_event *event_wait_list, pi_event *event);
+
 __SYCL_EXPORT pi_result piEnqueueEventsWait(pi_queue command_queue,
                                             pi_uint32 num_events_in_wait_list,
                                             const pi_event *event_wait_list,
diff --git a/sycl/include/sycl/detail/spirv.hpp b/sycl/include/sycl/detail/spirv.hpp
index 54bb7d229c372..586ccd4e6fce6 100644
--- a/sycl/include/sycl/detail/spirv.hpp
+++ b/sycl/include/sycl/detail/spirv.hpp
@@ -23,6 +23,7 @@ struct sub_group;
 namespace experimental {
 template <typename ParentGroup> class ballot_group;
 template <size_t PartitionSize, typename ParentGroup> class fixed_size_group;
+template <int Dimensions> class root_group;
 template <typename ParentGroup> class tangle_group;
 class opportunistic_group;
 } // namespace experimental
@@ -51,6 +52,11 @@ namespace spirv {
 
 template <typename Group> struct group_scope {};
 
+template <int Dimensions>
+struct group_scope<sycl::ext::oneapi::experimental::root_group<Dimensions>> {
+  static constexpr __spv::Scope::Flag value = __spv::Scope::Flag::Device;
+};
+
 template <int Dimensions> struct group_scope<group<Dimensions>> {
   static constexpr __spv::Scope::Flag value = __spv::Scope::Flag::Workgroup;
 };
diff --git a/sycl/include/sycl/detail/type_traits.hpp b/sycl/include/sycl/detail/type_traits.hpp
index df4ab8f37e17a..ed824f44bcdca 100644
--- a/sycl/include/sycl/detail/type_traits.hpp
+++ b/sycl/include/sycl/detail/type_traits.hpp
@@ -45,9 +45,9 @@ template <class T>
 inline constexpr bool is_fixed_topology_group_v =
     is_fixed_topology_group<T>::value;
 
-#ifdef SYCL_EXT_ONEAPI_ROOT_GROUP
-template <> struct is_fixed_topology_group<root_group> : std::true_type {};
-#endif
+template <int Dimensions> class root_group;
+template <int Dimensions>
+struct is_fixed_topology_group<root_group<Dimensions>> : std::true_type {};
 
 template <int Dimensions>
 struct is_fixed_topology_group<sycl::group<Dimensions>> : std::true_type {};
diff --git a/sycl/include/sycl/ext/oneapi/experimental/root_group.hpp b/sycl/include/sycl/ext/oneapi/experimental/root_group.hpp
index 8cbc88ccf6194..f21099737b97f 100644
--- a/sycl/include/sycl/ext/oneapi/experimental/root_group.hpp
+++ b/sycl/include/sycl/ext/oneapi/experimental/root_group.hpp
@@ -10,10 +10,10 @@
 
 #include <sycl/builtins.hpp>
 #include <sycl/ext/oneapi/properties/properties.hpp>
+#include <sycl/group.hpp>
 #include <sycl/memory_enums.hpp>
-#include <sycl/queue.hpp>
-
-#define SYCL_EXT_ONEAPI_ROOT_GROUP 1
+#include <sycl/nd_item.hpp>
+#include <sycl/sub_group.hpp>
 
 namespace sycl {
 inline namespace _V1 {
@@ -106,31 +106,25 @@ template <int Dimensions> root_group<Dimensions> get_root_group() {
 
 } // namespace ext::oneapi::experimental
 
-template <>
-typename ext::oneapi::experimental::info::kernel_queue_specific::
-    max_num_work_group_sync::return_type
-    kernel::ext_oneapi_get_info<
-        ext::oneapi::experimental::info::kernel_queue_specific::
-            max_num_work_group_sync>(const queue &q) const {
-  // TODO: query the backend to return a value >= 1.
-  return 1;
-}
-
 template <int dimensions>
 void group_barrier(ext::oneapi::experimental::root_group<dimensions> G,
                    memory_scope FenceScope = decltype(G)::fence_scope) {
-  (void)G;
-  (void)FenceScope;
 #ifdef __SYCL_DEVICE_ONLY__
-  // TODO: Change __spv::Scope::Workgroup to __spv::Scope::Device once backends
-  // support device scope. __spv::Scope::Workgroup is only valid when
-  // max_num_work_group_sync is 1, so that all work items in a root group will
-  // also be in the same work group.
-  __spirv_ControlBarrier(__spv::Scope::Workgroup, __spv::Scope::Workgroup,
-                         __spv::MemorySemanticsMask::SubgroupMemory |
-                             __spv::MemorySemanticsMask::WorkgroupMemory |
-                             __spv::MemorySemanticsMask::CrossWorkgroupMemory);
+  // Root group barrier synchronizes using a work group barrier if there's only
+  // one work group. This allows backends to ignore the ControlBarrier with
+  // Device scope if their maximum number of work groups is 1. This is a
+  // workaround that's not intended to reduce the bar for SPIR-V modules
+  // acceptance, but rather make a pessimistic case work until we have full
+  // support for the device barrier built-in from backends.
+  const auto ChildGroup = ext::oneapi::experimental::get_child_group(G);
+  if (ChildGroup.get_group_linear_range() == 1) {
+    group_barrier(ChildGroup);
+  } else {
+    detail::spirv::ControlBarrier(G, FenceScope, memory_order::seq_cst);
+  }
 #else
+  (void)G;
+  (void)FenceScope;
   throw sycl::runtime_error("Barriers are not supported on host device",
                             PI_ERROR_INVALID_DEVICE);
 #endif
diff --git a/sycl/include/sycl/handler.hpp b/sycl/include/sycl/handler.hpp
index 0bfd001261b4a..7a6371624b3ef 100644
--- a/sycl/include/sycl/handler.hpp
+++ b/sycl/include/sycl/handler.hpp
@@ -32,6 +32,7 @@
 #include <sycl/ext/oneapi/device_global/device_global.hpp>
 #include <sycl/ext/oneapi/device_global/properties.hpp>
 #include <sycl/ext/oneapi/experimental/graph.hpp>
+#include <sycl/ext/oneapi/experimental/root_group.hpp>
 #include <sycl/ext/oneapi/kernel_properties/properties.hpp>
 #include <sycl/ext/oneapi/properties/properties.hpp>
 #include <sycl/group.hpp>
@@ -933,6 +934,10 @@ class __SYCL_EXPORT handler {
     } else {
       std::ignore = Props;
     }
+
+    constexpr bool UsesRootSync = PropertiesT::template has_property<
+        sycl::ext::oneapi::experimental::use_root_sync_key>();
+    setKernelIsCooperative(UsesRootSync);
   }
 
   /// Checks whether it is possible to copy the source shape to the destination
@@ -3622,6 +3627,8 @@ class __SYCL_EXPORT handler {
 
   // Set value of the gpu cache configuration for the kernel.
   void setKernelCacheConfig(sycl::detail::pi::PiKernelCacheConfig);
+  // Set the flag indicating whether the kernel is cooperative.
+  void setKernelIsCooperative(bool);
 
   template <
       ext::oneapi::experimental::detail::UnsupportedGraphFeatures FeatureT>
diff --git a/sycl/include/sycl/sycl.hpp b/sycl/include/sycl/sycl.hpp
index be923a7bfc14a..11c69ded8e798 100644
--- a/sycl/include/sycl/sycl.hpp
+++ b/sycl/include/sycl/sycl.hpp
@@ -90,6 +90,7 @@
 #include <sycl/ext/oneapi/experimental/fixed_size_group.hpp>
 #include <sycl/ext/oneapi/experimental/opportunistic_group.hpp>
 #include <sycl/ext/oneapi/experimental/prefetch.hpp>
+#include <sycl/ext/oneapi/experimental/root_group.hpp>
 #include <sycl/ext/oneapi/experimental/tangle_group.hpp>
 #include <sycl/ext/oneapi/filter_selector.hpp>
 #include <sycl/ext/oneapi/functional.hpp>
diff --git a/sycl/plugins/cuda/pi_cuda.cpp b/sycl/plugins/cuda/pi_cuda.cpp
index de715de0835fd..02fe3af901cb8 100644
--- a/sycl/plugins/cuda/pi_cuda.cpp
+++ b/sycl/plugins/cuda/pi_cuda.cpp
@@ -544,6 +544,16 @@ piEnqueueKernelLaunch(pi_queue Queue, pi_kernel Kernel, pi_uint32 WorkDim,
       NumEventsInWaitList, EventWaitList, OutEvent);
 }
 
+pi_result piextEnqueueCooperativeKernelLaunch(
+    pi_queue Queue, pi_kernel Kernel, pi_uint32 WorkDim,
+    const size_t *GlobalWorkOffset, const size_t *GlobalWorkSize,
+    const size_t *LocalWorkSize, pi_uint32 NumEventsInWaitList,
+    const pi_event *EventWaitList, pi_event *OutEvent) {
+  return pi2ur::piextEnqueueCooperativeKernelLaunch(
+      Queue, Kernel, WorkDim, GlobalWorkOffset, GlobalWorkSize, LocalWorkSize,
+      NumEventsInWaitList, EventWaitList, OutEvent);
+}
+
 pi_result piextKernelCreateWithNativeHandle(pi_native_handle NativeHandle,
                                             pi_context Context,
                                             pi_program Program,
@@ -559,6 +569,13 @@ pi_result piextKernelGetNativeHandle(pi_kernel Kernel,
   return pi2ur::piextKernelGetNativeHandle(Kernel, NativeHandle);
 }
 
+pi_result piextKernelSuggestMaxCooperativeGroupCount(
+    pi_kernel Kernel, size_t LocalWorkSize, size_t DynamicSharedMemorySize,
+    pi_uint32 *GroupCountRet) {
+  return pi2ur::piextKernelSuggestMaxCooperativeGroupCount(
+      Kernel, LocalWorkSize, DynamicSharedMemorySize, GroupCountRet);
+}
+
 pi_result piEventCreate(pi_context Context, pi_event *RetEvent) {
   return pi2ur::piEventCreate(Context, RetEvent);
 }
diff --git a/sycl/plugins/hip/pi_hip.cpp b/sycl/plugins/hip/pi_hip.cpp
index 126ada92348f6..b895727c9d0fa 100644
--- a/sycl/plugins/hip/pi_hip.cpp
+++ b/sycl/plugins/hip/pi_hip.cpp
@@ -547,6 +547,16 @@ piEnqueueKernelLaunch(pi_queue Queue, pi_kernel Kernel, pi_uint32 WorkDim,
       NumEventsInWaitList, EventWaitList, OutEvent);
 }
 
+pi_result piextEnqueueCooperativeKernelLaunch(
+    pi_queue Queue, pi_kernel Kernel, pi_uint32 WorkDim,
+    const size_t *GlobalWorkOffset, const size_t *GlobalWorkSize,
+    const size_t *LocalWorkSize, pi_uint32 NumEventsInWaitList,
+    const pi_event *EventWaitList, pi_event *OutEvent) {
+  return pi2ur::piextEnqueueCooperativeKernelLaunch(
+      Queue, Kernel, WorkDim, GlobalWorkOffset, GlobalWorkSize, LocalWorkSize,
+      NumEventsInWaitList, EventWaitList, OutEvent);
+}
+
 pi_result piextKernelCreateWithNativeHandle(pi_native_handle NativeHandle,
                                             pi_context Context,
                                             pi_program Program,
@@ -562,6 +572,13 @@ pi_result piextKernelGetNativeHandle(pi_kernel Kernel,
   return pi2ur::piextKernelGetNativeHandle(Kernel, NativeHandle);
 }
 
+pi_result piextKernelSuggestMaxCooperativeGroupCount(
+    pi_kernel Kernel, size_t LocalWorkSize, size_t DynamicSharedMemorySize,
+    pi_uint32 *GroupCountRet) {
+  return pi2ur::piextKernelSuggestMaxCooperativeGroupCount(
+      Kernel, LocalWorkSize, DynamicSharedMemorySize, GroupCountRet);
+}
+
 pi_result piEventCreate(pi_context Context, pi_event *RetEvent) {
   return pi2ur::piEventCreate(Context, RetEvent);
 }
diff --git a/sycl/plugins/level_zero/pi_level_zero.cpp b/sycl/plugins/level_zero/pi_level_zero.cpp
index 0fc36a231be6c..c923c802f1d3f 100644
--- a/sycl/plugins/level_zero/pi_level_zero.cpp
+++ b/sycl/plugins/level_zero/pi_level_zero.cpp
@@ -558,6 +558,16 @@ piEnqueueKernelLaunch(pi_queue Queue, pi_kernel Kernel, pi_uint32 WorkDim,
       NumEventsInWaitList, EventWaitList, OutEvent);
 }
 
+pi_result piextEnqueueCooperativeKernelLaunch(
+    pi_queue Queue, pi_kernel Kernel, pi_uint32 WorkDim,
+    const size_t *GlobalWorkOffset, const size_t *GlobalWorkSize,
+    const size_t *LocalWorkSize, pi_uint32 NumEventsInWaitList,
+    const pi_event *EventWaitList, pi_event *OutEvent) {
+  return pi2ur::piextEnqueueCooperativeKernelLaunch(
+      Queue, Kernel, WorkDim, GlobalWorkOffset, GlobalWorkSize, LocalWorkSize,
+      NumEventsInWaitList, EventWaitList, OutEvent);
+}
+
 pi_result piextKernelCreateWithNativeHandle(pi_native_handle NativeHandle,
                                             pi_context Context,
                                             pi_program Program,
@@ -573,6 +583,13 @@ pi_result piextKernelGetNativeHandle(pi_kernel Kernel,
   return pi2ur::piextKernelGetNativeHandle(Kernel, NativeHandle);
 }
 
+pi_result piextKernelSuggestMaxCooperativeGroupCount(
+    pi_kernel Kernel, size_t LocalWorkSize, size_t DynamicSharedMemorySize,
+    pi_uint32 *GroupCountRet) {
+  return pi2ur::piextKernelSuggestMaxCooperativeGroupCount(
+      Kernel, LocalWorkSize, DynamicSharedMemorySize, GroupCountRet);
+}
+
 //
 // Events
 //
diff --git a/sycl/plugins/opencl/pi_opencl.cpp b/sycl/plugins/opencl/pi_opencl.cpp
index c09be92f89406..45fb66575ec42 100644
--- a/sycl/plugins/opencl/pi_opencl.cpp
+++ b/sycl/plugins/opencl/pi_opencl.cpp
@@ -527,6 +527,16 @@ piEnqueueKernelLaunch(pi_queue Queue, pi_kernel Kernel, pi_uint32 WorkDim,
       NumEventsInWaitList, EventWaitList, OutEvent);
 }
 
+pi_result piextEnqueueCooperativeKernelLaunch(
+    pi_queue Queue, pi_kernel Kernel, pi_uint32 WorkDim,
+    const size_t *GlobalWorkOffset, const size_t *GlobalWorkSize,
+    const size_t *LocalWorkSize, pi_uint32 NumEventsInWaitList,
+    const pi_event *EventWaitList, pi_event *OutEvent) {
+  return pi2ur::piextEnqueueCooperativeKernelLaunch(
+      Queue, Kernel, WorkDim, GlobalWorkOffset, GlobalWorkSize, LocalWorkSize,
+      NumEventsInWaitList, EventWaitList, OutEvent);
+}
+
 pi_result piextKernelCreateWithNativeHandle(pi_native_handle NativeHandle,
                                             pi_context Context,
                                             pi_program Program,
@@ -541,6 +551,13 @@ pi_result piextKernelGetNativeHandle(pi_kernel Kernel,
   return pi2ur::piextKernelGetNativeHandle(Kernel, NativeHandle);
 }
 
+pi_result piextKernelSuggestMaxCooperativeGroupCount(
+    pi_kernel Kernel, size_t LocalWorkSize, size_t DynamicSharedMemorySize,
+    pi_uint32 *GroupCountRet) {
+  return pi2ur::piextKernelSuggestMaxCooperativeGroupCount(
+      Kernel, LocalWorkSize, DynamicSharedMemorySize, GroupCountRet);
+}
+
 pi_result piEventCreate(pi_context Context, pi_event *RetEvent) {
   return pi2ur::piEventCreate(Context, RetEvent);
 }
diff --git a/sycl/plugins/unified_runtime/pi2ur.hpp b/sycl/plugins/unified_runtime/pi2ur.hpp
index e3a92022567d0..87ee60f41e2da 100644
--- a/sycl/plugins/unified_runtime/pi2ur.hpp
+++ b/sycl/plugins/unified_runtime/pi2ur.hpp
@@ -2599,6 +2599,19 @@ inline pi_result piextKernelGetNativeHandle(pi_kernel Kernel,
   return PI_SUCCESS;
 }
 
+inline pi_result piextKernelSuggestMaxCooperativeGroupCount(
+    pi_kernel Kernel, size_t LocalWorkSize, size_t DynamicSharedMemorySize,
+    pi_uint32 *GroupCountRet) {
+  PI_ASSERT(Kernel, PI_ERROR_INVALID_KERNEL);
+  PI_ASSERT(GroupCountRet, PI_ERROR_INVALID_VALUE);
+
+  ur_kernel_handle_t UrKernel = reinterpret_cast<ur_kernel_handle_t>(Kernel);
+  HANDLE_ERRORS(urKernelSuggestMaxCooperativeGroupCountExp(
+      UrKernel, LocalWorkSize, DynamicSharedMemorySize, GroupCountRet));
+
+  return PI_SUCCESS;
+}
+
 /// API for writing data from host to a device global variable.
 ///
 /// \param Queue is the queue
@@ -3669,6 +3682,30 @@ piEnqueueKernelLaunch(pi_queue Queue, pi_kernel Kernel, pi_uint32 WorkDim,
   return PI_SUCCESS;
 }
 
+inline pi_result piextEnqueueCooperativeKernelLaunch(
+    pi_queue Queue, pi_kernel Kernel, pi_uint32 WorkDim,
+    const size_t *GlobalWorkOffset, const size_t *GlobalWorkSize,
+    const size_t *LocalWorkSize, pi_uint32 NumEventsInWaitList,
+    const pi_event *EventsWaitList, pi_event *OutEvent) {
+
+  PI_ASSERT(Kernel, PI_ERROR_INVALID_KERNEL);
+  PI_ASSERT(Queue, PI_ERROR_INVALID_QUEUE);
+  PI_ASSERT((WorkDim > 0) && (WorkDim < 4), PI_ERROR_INVALID_WORK_DIMENSION);
+
+  ur_queue_handle_t UrQueue = reinterpret_cast<ur_queue_handle_t>(Queue);
+  ur_kernel_handle_t UrKernel = reinterpret_cast<ur_kernel_handle_t>(Kernel);
+  const ur_event_handle_t *UrEventsWaitList =
+      reinterpret_cast<const ur_event_handle_t *>(EventsWaitList);
+
+  ur_event_handle_t *UREvent = reinterpret_cast<ur_event_handle_t *>(OutEvent);
+
+  HANDLE_ERRORS(urEnqueueCooperativeKernelLaunchExp(
+      UrQueue, UrKernel, WorkDim, GlobalWorkOffset, GlobalWorkSize,
+      LocalWorkSize, NumEventsInWaitList, UrEventsWaitList, UREvent));
+
+  return PI_SUCCESS;
+}
+
 inline pi_result
 piEnqueueMemImageWrite(pi_queue Queue, pi_mem Image, pi_bool BlockingWrite,
                        pi_image_offset Origin, pi_image_region Region,
diff --git a/sycl/source/detail/graph_impl.cpp b/sycl/source/detail/graph_impl.cpp
index 980ea7e3e9eba..bdfc90537b520 100644
--- a/sycl/source/detail/graph_impl.cpp
+++ b/sycl/source/detail/graph_impl.cpp
@@ -907,7 +907,7 @@ exec_graph_impl::enqueue(const std::shared_ptr<sycl::detail::queue_impl> &Queue,
               // TODO: Pass accessor mem allocations
               nullptr,
               // TODO: Extract from handler
-              PI_EXT_KERNEL_EXEC_INFO_CACHE_DEFAULT);
+              PI_EXT_KERNEL_EXEC_INFO_CACHE_DEFAULT, CG->MKernelIsCooperative);
           if (Res != pi_result::PI_SUCCESS) {
             throw sycl::exception(
                 sycl::make_error_code(sycl::errc::kernel),
diff --git a/sycl/source/detail/handler_impl.hpp b/sycl/source/detail/handler_impl.hpp
index a6f4622587fcf..c96d60bd85ecd 100644
--- a/sycl/source/detail/handler_impl.hpp
+++ b/sycl/source/detail/handler_impl.hpp
@@ -106,6 +106,8 @@ class handler_impl {
   sycl::detail::pi::PiKernelCacheConfig MKernelCacheConfig =
       PI_EXT_KERNEL_EXEC_INFO_CACHE_DEFAULT;
 
+  bool MKernelIsCooperative = false;
+
   // Extra information for bindless image copy
   sycl::detail::pi::PiMemImageDesc MImageDesc;
   sycl::detail::pi::PiMemImageFormat MImageFormat;
diff --git a/sycl/source/detail/jit_compiler.cpp b/sycl/source/detail/jit_compiler.cpp
index 469c445b0e66c..0721eace0af6a 100644
--- a/sycl/source/detail/jit_compiler.cpp
+++ b/sycl/source/detail/jit_compiler.cpp
@@ -930,7 +930,7 @@ jit_compiler::fuseKernels(QueueImplPtr Queue,
   FusedCG.reset(new detail::CGExecKernel(
       NDRDesc, nullptr, nullptr, std::move(KernelBundleImplPtr),
       std::move(CGData), std::move(FusedArgs), FusedOrCachedKernelName, {}, {},
-      CG::CGTYPE::Kernel, KernelCacheConfig));
+      CG::CGTYPE::Kernel, KernelCacheConfig, false /* KernelIsCooperative */));
   return FusedCG;
 }
 
diff --git a/sycl/source/detail/kernel_impl.hpp b/sycl/source/detail/kernel_impl.hpp
index 057a2dcf69e15..e5952fd4d22c7 100644
--- a/sycl/source/detail/kernel_impl.hpp
+++ b/sycl/source/detail/kernel_impl.hpp
@@ -16,6 +16,7 @@
 #include <sycl/detail/pi.h>
 #include <sycl/detail/pi.hpp>
 #include <sycl/device.hpp>
+#include <sycl/ext/oneapi/experimental/root_group.hpp>
 #include <sycl/info/info_desc.hpp>
 
 #include <cassert>
@@ -141,6 +142,9 @@ class kernel_impl {
   typename Param::return_type get_info(const device &Device,
                                        const range<3> &WGSize) const;
 
+  template <typename Param>
+  typename Param::return_type ext_oneapi_get_info(const queue &q) const;
+
   /// Get a reference to a raw kernel object.
   ///
   /// \return a reference to a valid PiKernel instance with raw kernel object.
@@ -257,6 +261,22 @@ kernel_impl::get_info(const device &Device,
       getPlugin());
 }
 
+template <>
+inline typename ext::oneapi::experimental::info::kernel_queue_specific::
+    max_num_work_group_sync::return_type
+    kernel_impl::ext_oneapi_get_info<
+        ext::oneapi::experimental::info::kernel_queue_specific::
+            max_num_work_group_sync>(const queue &Queue) const {
+  const auto &Plugin = getPlugin();
+  const auto &Handle = getHandleRef();
+  const auto MaxWorkGroupSize =
+      Queue.get_device().get_info<info::device::max_work_group_size>();
+  pi_uint32 GroupCount = 0;
+  Plugin->call<PiApiKind::piextKernelSuggestMaxCooperativeGroupCount>(
+      Handle, MaxWorkGroupSize, /* DynamicSharedMemorySize */ 0, &GroupCount);
+  return GroupCount;
+}
+
 } // namespace detail
 } // namespace _V1
 } // namespace sycl
diff --git a/sycl/source/detail/scheduler/commands.cpp b/sycl/source/detail/scheduler/commands.cpp
index 955adae8423dc..b113da757bd0c 100644
--- a/sycl/source/detail/scheduler/commands.cpp
+++ b/sycl/source/detail/scheduler/commands.cpp
@@ -300,9 +300,12 @@ bool Command::isHostTask() const {
 }
 
 bool Command::isFusable() const {
-  return (MType == CommandType::RUN_CG) &&
-         ((static_cast<const ExecCGCommand *>(this))->getCG().getType() ==
-          CG::CGTYPE::Kernel);
+  if ((MType != CommandType::RUN_CG)) {
+    return false;
+  }
+  const auto &CG = (static_cast<const ExecCGCommand &>(*this)).getCG();
+  return (CG.getType() == CG::CGTYPE::Kernel) &&
+         (!static_cast<const CGExecKernel &>(CG).MKernelIsCooperative);
 }
 
 static void flushCrossQueueDeps(const std::vector<EventImplPtr> &EventImpls,
@@ -2343,7 +2346,8 @@ static pi_result SetKernelParamsAndLaunch(
     std::vector<sycl::detail::pi::PiEvent> &RawEvents,
     const detail::EventImplPtr &OutEventImpl,
     const KernelArgMask *EliminatedArgMask,
-    const std::function<void *(Requirement *Req)> &getMemAllocationFunc) {
+    const std::function<void *(Requirement *Req)> &getMemAllocationFunc,
+    bool IsCooperative) {
   const PluginPtr &Plugin = Queue->getPlugin();
 
   auto setFunc = [&Plugin, Kernel, &DeviceImageImpl, &getMemAllocationFunc,
@@ -2381,11 +2385,18 @@ static pi_result SetKernelParamsAndLaunch(
   }
   if (OutEventImpl != nullptr)
     OutEventImpl->setHostEnqueueTime();
-  pi_result Error = Plugin->call_nocheck<PiApiKind::piEnqueueKernelLaunch>(
-      Queue->getHandleRef(), Kernel, NDRDesc.Dims, &NDRDesc.GlobalOffset[0],
-      &NDRDesc.GlobalSize[0], LocalSize, RawEvents.size(),
-      RawEvents.empty() ? nullptr : &RawEvents[0],
-      OutEventImpl ? &OutEventImpl->getHandleRef() : nullptr);
+  pi_result Error =
+      [&](auto... Args) {
+        if (IsCooperative) {
+          return Plugin
+              ->call_nocheck<PiApiKind::piextEnqueueCooperativeKernelLaunch>(
+                  Args...);
+        }
+        return Plugin->call_nocheck<PiApiKind::piEnqueueKernelLaunch>(Args...);
+      }(Queue->getHandleRef(), Kernel, NDRDesc.Dims, &NDRDesc.GlobalOffset[0],
+        &NDRDesc.GlobalSize[0], LocalSize, RawEvents.size(),
+        RawEvents.empty() ? nullptr : &RawEvents[0],
+        OutEventImpl ? &OutEventImpl->getHandleRef() : nullptr);
   return Error;
 }
 
@@ -2525,7 +2536,8 @@ pi_int32 enqueueImpKernel(
     std::vector<sycl::detail::pi::PiEvent> &RawEvents,
     const detail::EventImplPtr &OutEventImpl,
     const std::function<void *(Requirement *Req)> &getMemAllocationFunc,
-    sycl::detail::pi::PiKernelCacheConfig KernelCacheConfig) {
+    sycl::detail::pi::PiKernelCacheConfig KernelCacheConfig,
+    const bool KernelIsCooperative) {
 
   // Run OpenCL kernel
   auto ContextImpl = Queue->getContextImplPtr();
@@ -2616,7 +2628,8 @@ pi_int32 enqueueImpKernel(
 
     Error = SetKernelParamsAndLaunch(Queue, Args, DeviceImageImpl, Kernel,
                                      NDRDesc, EventsWaitList, OutEventImpl,
-                                     EliminatedArgMask, getMemAllocationFunc);
+                                     EliminatedArgMask, getMemAllocationFunc,
+                                     KernelIsCooperative);
 
     const PluginPtr &Plugin = Queue->getPlugin();
     if (!SyclKernelImpl && !MSyclKernel) {
@@ -2988,7 +3001,7 @@ pi_int32 ExecCGCommand::enqueueImpQueue() {
     return enqueueImpKernel(
         MQueue, NDRDesc, Args, ExecKernel->getKernelBundle(), SyclKernel,
         KernelName, RawEvents, EventImpl, getMemAllocationFunc,
-        ExecKernel->MKernelCacheConfig);
+        ExecKernel->MKernelCacheConfig, ExecKernel->MKernelIsCooperative);
   }
   case CG::CGTYPE::CopyUSM: {
     CGCopyUSM *Copy = (CGCopyUSM *)MCommandGroup.get();
diff --git a/sycl/source/detail/scheduler/commands.hpp b/sycl/source/detail/scheduler/commands.hpp
index 7898e3f65b812..8dc12120bdd9a 100644
--- a/sycl/source/detail/scheduler/commands.hpp
+++ b/sycl/source/detail/scheduler/commands.hpp
@@ -631,7 +631,8 @@ pi_int32 enqueueImpKernel(
     std::vector<sycl::detail::pi::PiEvent> &RawEvents,
     const detail::EventImplPtr &Event,
     const std::function<void *(Requirement *Req)> &getMemAllocationFunc,
-    sycl::detail::pi::PiKernelCacheConfig KernelCacheConfig);
+    sycl::detail::pi::PiKernelCacheConfig KernelCacheConfig,
+    bool KernelIsCooperative);
 
 class KernelFusionCommand;
 
diff --git a/sycl/source/detail/scheduler/graph_builder.cpp b/sycl/source/detail/scheduler/graph_builder.cpp
index e9ed3f2bb6e4b..7b50192cf3b43 100644
--- a/sycl/source/detail/scheduler/graph_builder.cpp
+++ b/sycl/source/detail/scheduler/graph_builder.cpp
@@ -1016,8 +1016,13 @@ Scheduler::GraphBuildResult Scheduler::GraphBuilder::addCG(
     } else {
       std::string s;
       std::stringstream ss(s);
-      ss << "Not fusing '" << NewCmd->getTypeString()
-         << "' command group. Can only fuse device kernel command groups.";
+      if (NewCmd->getCG().getType() == CG::CGTYPE::Kernel) {
+        ss << "Not fusing kernel with 'use_root_sync' property. Can only fuse "
+              "non-cooperative device kernels.";
+      } else {
+        ss << "Not fusing '" << NewCmd->getTypeString()
+           << "' command group. Can only fuse device kernel command groups.";
+      }
       printFusionWarning(ss.str());
     }
   }
diff --git a/sycl/source/feature_test.hpp.in b/sycl/source/feature_test.hpp.in
index f4575c22b8c6c..d640bab2624dc 100644
--- a/sycl/source/feature_test.hpp.in
+++ b/sycl/source/feature_test.hpp.in
@@ -50,6 +50,7 @@ inline namespace _V1 {
 #define SYCL_EXT_ONEAPI_ND_RANGE_REDUCTIONS 1
 #define SYCL_EXT_ONEAPI_DEFAULT_CONTEXT 1
 #define SYCL_EXT_ONEAPI_USE_PINNED_HOST_MEMORY_PROPERTY 1
+#define SYCL_EXT_ONEAPI_ROOT_GROUP 1
 #define SYCL_EXT_ONEAPI_SRGB 1
 #define SYCL_EXT_ONEAPI_SUB_GROUP 1
 #define SYCL_EXT_ONEAPI_PROPERTIES 1
diff --git a/sycl/source/handler.cpp b/sycl/source/handler.cpp
index c7ec3e02e042c..d2d3cc7171548 100644
--- a/sycl/source/handler.cpp
+++ b/sycl/source/handler.cpp
@@ -256,17 +256,34 @@ event handler::finalize() {
             // Capture the host timestamp for profiling (queue time)
             if (NewEvent != nullptr)
               NewEvent->setHostEnqueueTime();
-            MQueue->getPlugin()->call<detail::PiApiKind::piEnqueueKernelLaunch>(
-                nullptr, reinterpret_cast<pi_kernel>(MHostKernel->getPtr()),
-                MNDRDesc.Dims, &MNDRDesc.GlobalOffset[0],
-                &MNDRDesc.GlobalSize[0], &MNDRDesc.LocalSize[0], 0, nullptr,
-                nullptr);
+            [&](auto... Args) {
+              if (MImpl->MKernelIsCooperative) {
+                MQueue->getPlugin()
+                    ->call<
+                        detail::PiApiKind::piextEnqueueCooperativeKernelLaunch>(
+                        Args...);
+              } else {
+                MQueue->getPlugin()
+                    ->call<detail::PiApiKind::piEnqueueKernelLaunch>(Args...);
+              }
+            }(/* queue */
+              nullptr,
+              /* kernel */
+              reinterpret_cast<pi_kernel>(MHostKernel->getPtr()),
+              /* work_dim */
+              MNDRDesc.Dims,
+              /* global_work_offset */ &MNDRDesc.GlobalOffset[0],
+              /* global_work_size */ &MNDRDesc.GlobalSize[0],
+              /* local_work_size */ &MNDRDesc.LocalSize[0],
+              /* num_events_in_wait_list */ 0,
+              /* event_wait_list */ nullptr,
+              /* event */ nullptr);
             Result = PI_SUCCESS;
           } else {
-            Result =
-                enqueueImpKernel(MQueue, MNDRDesc, MArgs, KernelBundleImpPtr,
-                                 MKernel, MKernelName, RawEvents, NewEvent,
-                                 nullptr, MImpl->MKernelCacheConfig);
+            Result = enqueueImpKernel(
+                MQueue, MNDRDesc, MArgs, KernelBundleImpPtr, MKernel,
+                MKernelName, RawEvents, NewEvent, nullptr,
+                MImpl->MKernelCacheConfig, MImpl->MKernelIsCooperative);
           }
         }
 #ifdef XPTI_ENABLE_INSTRUMENTATION
@@ -325,7 +342,7 @@ event handler::finalize() {
         std::move(MImpl->MKernelBundle), std::move(CGData), std::move(MArgs),
         MKernelName, std::move(MStreamStorage),
         std::move(MImpl->MAuxiliaryResources), MCGType,
-        MImpl->MKernelCacheConfig, MCodeLoc));
+        MImpl->MKernelCacheConfig, MImpl->MKernelIsCooperative, MCodeLoc));
     break;
   }
   case detail::CG::CopyAccToPtr:
@@ -1399,6 +1416,10 @@ void handler::setKernelCacheConfig(
   MImpl->MKernelCacheConfig = Config;
 }
 
+void handler::setKernelIsCooperative(bool KernelIsCooperative) {
+  MImpl->MKernelIsCooperative = KernelIsCooperative;
+}
+
 void handler::ext_oneapi_graph(
     ext::oneapi::experimental::command_graph<
         ext::oneapi::experimental::graph_state::executable>
diff --git a/sycl/source/kernel.cpp b/sycl/source/kernel.cpp
index 369b631c88464..1ebaf3f718852 100644
--- a/sycl/source/kernel.cpp
+++ b/sycl/source/kernel.cpp
@@ -90,6 +90,18 @@ template __SYCL_EXPORT uint32_t
 kernel::get_info<info::kernel_device_specific::max_sub_group_size>(
     const device &, const sycl::range<3> &) const;
 
+template <typename Param>
+typename Param::return_type
+kernel::ext_oneapi_get_info(const queue &Queue) const {
+  return impl->ext_oneapi_get_info<Param>(Queue);
+}
+
+template __SYCL_EXPORT typename ext::oneapi::experimental::info::
+    kernel_queue_specific::max_num_work_group_sync::return_type
+    kernel::ext_oneapi_get_info<
+        ext::oneapi::experimental::info::kernel_queue_specific::
+            max_num_work_group_sync>(const queue &Queue) const;
+
 kernel::kernel(std::shared_ptr<detail::kernel_impl> Impl) : impl(Impl) {}
 
 pi_native_handle kernel::getNative() const { return impl->getNative(); }
diff --git a/sycl/test-e2e/KernelFusion/cooperative_kernel.cpp b/sycl/test-e2e/KernelFusion/cooperative_kernel.cpp
new file mode 100644
index 0000000000000..17f65b43ddadc
--- /dev/null
+++ b/sycl/test-e2e/KernelFusion/cooperative_kernel.cpp
@@ -0,0 +1,30 @@
+// RUN: %{build} -fsycl-embed-ir -o %t.out
+// RUN: env SYCL_RT_WARNING_LEVEL=2 %{run} %t.out 2>&1 | FileCheck %s
+
+// Test that cooperative kernels are not fused.
+
+// This test currently fails on AMD HIP due to an unresolved problem when
+// submitting a kernel with the use_root_sync property to a queue that has
+// fusion enabled.
+// XFAIL: hip_amd
+
+#include <sycl/sycl.hpp>
+
+using namespace sycl;
+
+int main() {
+  queue q{ext::codeplay::experimental::property::queue::enable_fusion{}};
+  ext::codeplay::experimental::fusion_wrapper fw(q);
+
+  {
+    // CHECK: Not fusing kernel with 'use_root_sync' property. Can only fuse non-cooperative device kernels.
+    fw.start_fusion();
+    q.submit([&](handler &cgh) {
+      const auto props = sycl::ext::oneapi::experimental::properties{
+          sycl::ext::oneapi::experimental::use_root_sync};
+      cgh.parallel_for(sycl::range<1>{1}, props, [=](sycl::id<1>) {});
+    });
+    fw.complete_fusion();
+  }
+
+}
diff --git a/sycl/test-e2e/NonUniformGroups/is_fixed_topology.cpp b/sycl/test-e2e/NonUniformGroups/is_fixed_topology.cpp
index 080b225415564..f602b4c8ec071 100644
--- a/sycl/test-e2e/NonUniformGroups/is_fixed_topology.cpp
+++ b/sycl/test-e2e/NonUniformGroups/is_fixed_topology.cpp
@@ -4,7 +4,9 @@
 namespace syclex = sycl::ext::oneapi::experimental;
 
 #ifdef SYCL_EXT_ONEAPI_ROOT_GROUP
-static_assert(syclex::is_fixed_topology_group_v<syclex::root_group>);
+static_assert(syclex::is_fixed_topology_group_v<syclex::root_group<1>>);
+static_assert(syclex::is_fixed_topology_group_v<syclex::root_group<2>>);
+static_assert(syclex::is_fixed_topology_group_v<syclex::root_group<3>>);
 #endif
 static_assert(syclex::is_fixed_topology_group_v<sycl::group<1>>);
 static_assert(syclex::is_fixed_topology_group_v<sycl::group<2>>);
diff --git a/sycl/test/abi/pi_cuda_symbol_check.dump b/sycl/test/abi/pi_cuda_symbol_check.dump
index 0aaff91e58336..13b555bec8880 100644
--- a/sycl/test/abi/pi_cuda_symbol_check.dump
+++ b/sycl/test/abi/pi_cuda_symbol_check.dump
@@ -109,6 +109,7 @@ piextDeviceSelectBinary
 piextDisablePeerAccess
 piextEnablePeerAccess
 piextEnqueueCommandBuffer
+piextEnqueueCooperativeKernelLaunch
 piextEnqueueReadHostPipe
 piextEnqueueWriteHostPipe
 piextEventCreateWithNativeHandle
@@ -120,6 +121,7 @@ piextKernelGetNativeHandle
 piextKernelSetArgMemObj
 piextKernelSetArgPointer
 piextKernelSetArgSampler
+piextKernelSuggestMaxCooperativeGroupCount
 piextMemCreateWithNativeHandle
 piextMemGetNativeHandle
 piextMemImageAllocate
diff --git a/sycl/test/abi/pi_hip_symbol_check.dump b/sycl/test/abi/pi_hip_symbol_check.dump
index 3bf9f9a3bf31d..4c091716caedb 100644
--- a/sycl/test/abi/pi_hip_symbol_check.dump
+++ b/sycl/test/abi/pi_hip_symbol_check.dump
@@ -109,6 +109,7 @@ piextDeviceSelectBinary
 piextDisablePeerAccess
 piextEnablePeerAccess
 piextEnqueueCommandBuffer
+piextEnqueueCooperativeKernelLaunch
 piextEnqueueReadHostPipe
 piextEnqueueWriteHostPipe
 piextEventCreateWithNativeHandle
@@ -120,6 +121,7 @@ piextKernelGetNativeHandle
 piextKernelSetArgMemObj
 piextKernelSetArgPointer
 piextKernelSetArgSampler
+piextKernelSuggestMaxCooperativeGroupCount
 piextMemCreateWithNativeHandle
 piextMemGetNativeHandle
 piextMemImageAllocate
diff --git a/sycl/test/abi/pi_level_zero_symbol_check.dump b/sycl/test/abi/pi_level_zero_symbol_check.dump
index 80ba2ad78d8ee..7a90e461a30f6 100644
--- a/sycl/test/abi/pi_level_zero_symbol_check.dump
+++ b/sycl/test/abi/pi_level_zero_symbol_check.dump
@@ -108,6 +108,7 @@ piextDeviceSelectBinary
 piextDisablePeerAccess
 piextEnablePeerAccess
 piextEnqueueCommandBuffer
+piextEnqueueCooperativeKernelLaunch
 piextEnqueueReadHostPipe
 piextEnqueueWriteHostPipe
 piextEventCreateWithNativeHandle
@@ -119,6 +120,7 @@ piextKernelGetNativeHandle
 piextKernelSetArgMemObj
 piextKernelSetArgPointer
 piextKernelSetArgSampler
+piextKernelSuggestMaxCooperativeGroupCount
 piextMemCreateWithNativeHandle
 piextMemGetNativeHandle
 piextMemImageAllocate
diff --git a/sycl/test/abi/pi_opencl_symbol_check.dump b/sycl/test/abi/pi_opencl_symbol_check.dump
index b2c3e857e049a..159e427835651 100644
--- a/sycl/test/abi/pi_opencl_symbol_check.dump
+++ b/sycl/test/abi/pi_opencl_symbol_check.dump
@@ -108,6 +108,7 @@ piextDeviceSelectBinary
 piextDisablePeerAccess
 piextEnablePeerAccess
 piextEnqueueCommandBuffer
+piextEnqueueCooperativeKernelLaunch
 piextEnqueueReadHostPipe
 piextEnqueueWriteHostPipe
 piextEventCreateWithNativeHandle
@@ -119,6 +120,7 @@ piextKernelGetNativeHandle
 piextKernelSetArgMemObj
 piextKernelSetArgPointer
 piextKernelSetArgSampler
+piextKernelSuggestMaxCooperativeGroupCount
 piextMemCreateWithNativeHandle
 piextMemGetNativeHandle
 piextMemImageAllocate
diff --git a/sycl/test/abi/sycl_symbols_linux.dump b/sycl/test/abi/sycl_symbols_linux.dump
index c6f7422a4808e..4fe3042f807ab 100644
--- a/sycl/test/abi/sycl_symbols_linux.dump
+++ b/sycl/test/abi/sycl_symbols_linux.dump
@@ -4139,6 +4139,7 @@ _ZN4sycl3_V17handler22ext_oneapi_fill2d_implEPvmPKvmmm
 _ZN4sycl3_V17handler22memcpyFromDeviceGlobalEPvPKvbmm
 _ZN4sycl3_V17handler22setHandlerKernelBundleENS0_6kernelE
 _ZN4sycl3_V17handler22setHandlerKernelBundleERKSt10shared_ptrINS0_6detail18kernel_bundle_implEE
+_ZN4sycl3_V17handler22setKernelIsCooperativeEb
 _ZN4sycl3_V17handler22verifyUsedKernelBundleERKNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEE
 _ZN4sycl3_V17handler24GetRangeRoundingSettingsERmS2_S2_
 _ZN4sycl3_V17handler24ext_intel_read_host_pipeERKNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEEPvmb
@@ -4622,6 +4623,7 @@ _ZNK4sycl3_V16kernel11get_backendEv
 _ZNK4sycl3_V16kernel11get_contextEv
 _ZNK4sycl3_V16kernel13getNativeImplEv
 _ZNK4sycl3_V16kernel17get_kernel_bundleEv
+_ZNK4sycl3_V16kernel19ext_oneapi_get_infoINS0_3ext6oneapi12experimental4info21kernel_queue_specific23max_num_work_group_syncEEENT_11return_typeERKNS0_5queueE
 _ZNK4sycl3_V16kernel3getEv
 _ZNK4sycl3_V16kernel7is_hostEv
 _ZNK4sycl3_V16kernel8get_infoINS0_4info22kernel_device_specific15work_group_sizeEEENS0_6detail35is_kernel_device_specific_info_descIT_E11return_typeERKNS0_6deviceE
diff --git a/sycl/test/abi/sycl_symbols_windows.dump b/sycl/test/abi/sycl_symbols_windows.dump
index 1c4b29549b5c1..435176097e7bd 100644
--- a/sycl/test/abi/sycl_symbols_windows.dump
+++ b/sycl/test/abi/sycl_symbols_windows.dump
@@ -13,6 +13,7 @@
 ??$create_sub_devices@$0BAIH@@device@_V1@sycl@@QEBA?AV?$vector@Vdevice@_V1@sycl@@V?$allocator@Vdevice@_V1@sycl@@@std@@@std@@AEBV?$vector@_KV?$allocator@_K@std@@@4@@Z
 ??$create_sub_devices@$0BAII@@device@_V1@sycl@@QEBA?AV?$vector@Vdevice@_V1@sycl@@V?$allocator@Vdevice@_V1@sycl@@@std@@@std@@W4partition_affinity_domain@info@12@@Z
 ??$create_sub_devices@$0BAIJ@@device@_V1@sycl@@QEBA?AV?$vector@Vdevice@_V1@sycl@@V?$allocator@Vdevice@_V1@sycl@@@std@@@std@@XZ
+??$ext_oneapi_get_info@Umax_num_work_group_sync@kernel_queue_specific@info@experimental@oneapi@ext@_V1@sycl@@@kernel@_V1@sycl@@QEBA_KAEBVqueue@12@@Z
 ??$getPlugin@$00@pi@detail@_V1@sycl@@YAAEBV?$shared_ptr@Vplugin@detail@_V1@sycl@@@std@@XZ
 ??$getPlugin@$01@pi@detail@_V1@sycl@@YAAEBV?$shared_ptr@Vplugin@detail@_V1@sycl@@@std@@XZ
 ??$getPlugin@$02@pi@detail@_V1@sycl@@YAAEBV?$shared_ptr@Vplugin@detail@_V1@sycl@@@std@@XZ
@@ -1493,6 +1494,7 @@
 ?setHandlerKernelBundle@handler@_V1@sycl@@AEAAXAEBV?$shared_ptr@Vkernel_bundle_impl@detail@_V1@sycl@@@std@@@Z
 ?setHandlerKernelBundle@handler@_V1@sycl@@AEAAXVkernel@23@@Z
 ?setKernelCacheConfig@handler@_V1@sycl@@AEAAXW4_pi_kernel_cache_config@@@Z
+?setKernelIsCooperative@handler@_V1@sycl@@AEAAX_N@Z
 ?setLocalAccessorArgHelper@handler@_V1@sycl@@AEAAXHAEAVLocalAccessorBaseHost@detail@23@@Z
 ?setPitches@image_impl@detail@_V1@sycl@@AEAAXAEBV?$range@$01@34@@Z
 ?setPitches@image_impl@detail@_V1@sycl@@AEAAXXZ
diff --git a/sycl/unittests/helpers/PiMockPlugin.hpp b/sycl/unittests/helpers/PiMockPlugin.hpp
index 31eac5598f588..5ab408d2eed01 100644
--- a/sycl/unittests/helpers/PiMockPlugin.hpp
+++ b/sycl/unittests/helpers/PiMockPlugin.hpp
@@ -11,6 +11,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include <cstddef>
 #include <sycl/detail/pi.hpp>
 
 #include <atomic>
@@ -853,6 +854,13 @@ mock_piextKernelGetNativeHandle(pi_kernel kernel,
   return PI_SUCCESS;
 }
 
+inline pi_result mock_piextKernelSuggestMaxCooperativeGroupCount(
+    pi_kernel kernel, size_t local_work_size, size_t dynamic_shared_memory_size,
+    pi_uint32 *group_count_ret) {
+  *group_count_ret = 1;
+  return PI_SUCCESS;
+}
+
 //
 // Events
 //
@@ -970,6 +978,15 @@ inline pi_result mock_piEnqueueKernelLaunch(
   return PI_SUCCESS;
 }
 
+inline pi_result mock_piextEnqueueCooperativeKernelLaunch(
+    pi_queue queue, pi_kernel kernel, pi_uint32 work_dim,
+    const size_t *global_work_offset, const size_t *global_work_size,
+    const size_t *local_work_size, pi_uint32 num_events_in_wait_list,
+    const pi_event *event_wait_list, pi_event *event) {
+  *event = createDummyHandle<pi_event>();
+  return PI_SUCCESS;
+}
+
 inline pi_result mock_piEnqueueEventsWait(pi_queue command_queue,
                                           pi_uint32 num_events_in_wait_list,
                                           const pi_event *event_wait_list,
diff --git a/sycl/unittests/program_manager/arg_mask/EliminatedArgMask.cpp b/sycl/unittests/program_manager/arg_mask/EliminatedArgMask.cpp
index ec54369f234a8..b3b27a502bcee 100644
--- a/sycl/unittests/program_manager/arg_mask/EliminatedArgMask.cpp
+++ b/sycl/unittests/program_manager/arg_mask/EliminatedArgMask.cpp
@@ -124,7 +124,7 @@ class MockHandler : public sycl::handler {
           std::move(CGH->CGData), std::move(CGH->MArgs),
           std::move(CGH->MKernelName), std::move(CGH->MStreamStorage),
           std::move(MImpl->MAuxiliaryResources), CGH->MCGType, {},
-          CGH->MCodeLoc));
+          MImpl->MKernelIsCooperative, CGH->MCodeLoc));
       break;
     }
     default:
diff --git a/sycl/unittests/scheduler/SchedulerTestUtils.hpp b/sycl/unittests/scheduler/SchedulerTestUtils.hpp
index e07da538a99f9..a1d1eb8058826 100644
--- a/sycl/unittests/scheduler/SchedulerTestUtils.hpp
+++ b/sycl/unittests/scheduler/SchedulerTestUtils.hpp
@@ -307,7 +307,7 @@ class MockHandlerCustomFinalize : public MockHandler {
           getNDRDesc(), std::move(getHostKernel()), getKernel(),
           std::move(MImpl->MKernelBundle), std::move(CGData), getArgs(),
           getKernelName(), getStreamStorage(), MImpl->MAuxiliaryResources,
-          getCGType(), {}, getCodeLoc()));
+          getCGType(), {}, MImpl->MKernelIsCooperative, getCodeLoc()));
       break;
     }
     case sycl::detail::CG::CodeplayHostTask: {
diff --git a/sycl/unittests/scheduler/StreamInitDependencyOnHost.cpp b/sycl/unittests/scheduler/StreamInitDependencyOnHost.cpp
index 0da5a71c4eaac..18c0b3e1a8070 100644
--- a/sycl/unittests/scheduler/StreamInitDependencyOnHost.cpp
+++ b/sycl/unittests/scheduler/StreamInitDependencyOnHost.cpp
@@ -34,7 +34,7 @@ class MockHandlerStreamInit : public MockHandler {
                                         getRequirements(), getEvents()),
           getArgs(), getKernelName(), getStreamStorage(),
           std::move(MImpl->MAuxiliaryResources), getCGType(), {},
-          getCodeLoc()));
+          MImpl->MKernelIsCooperative, getCodeLoc()));
       break;
     }
     default:

From 033943c618d1cbf0f01dbc80f7f19c01ba2f7028 Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Thu, 22 Feb 2024 08:49:43 -0800
Subject: [PATCH 22/30] Bump cryptography from 42.0.2 to 42.0.4 in
 /llvm/utils/git (#12791)

Bumps [cryptography](https://github.com/pyca/cryptography) from 42.0.2
to 42.0.4.
<details>
<summary>Changelog</summary>
<p><em>Sourced from <a
href="https://github.com/pyca/cryptography/blob/main/CHANGELOG.rst">cryptography's
changelog</a>.</em></p>
<blockquote>
<p>42.0.4 - 2024-02-20</p>
<ul>
<li>Fixed a null-pointer-dereference and segfault that could occur when
creating a PKCS#12 bundle. Credit to <strong>Alexander-Programming</strong>
for reporting the issue. <strong>CVE-2024-26130</strong></li>
<li>Fixed ASN.1 encoding for PKCS7/SMIME signed messages. The fields
<code>SMIMECapabilities</code> and <code>SignatureAlgorithmIdentifier</code>
should now be correctly encoded according to the definitions in
RFC 2633 and RFC 3370.</li>
</ul>
<p>.. _v42-0-3:</p>
<p>42.0.3 - 2024-02-15</p>
<ul>
<li>Fixed an initialization issue that caused key loading failures for
some users.</li>
</ul>
<p>.. _v42-0-2:</p>
</blockquote>
</details>
<details>
<summary>Commits</summary>
<ul>
<li><a
href="https://github.com/pyca/cryptography/commit/fe18470f7d05f963e7267e34fdf985d81ea6ceea"><code>fe18470</code></a>
Bump for 42.0.4 release (<a
href="https://redirect.github.com/pyca/cryptography/issues/10445">#10445</a>)</li>
<li><a
href="https://github.com/pyca/cryptography/commit/aaa2dd06ed470695de818405a982d4c459869803"><code>aaa2dd0</code></a>
Fix ASN.1 issues in PKCS#7 and S/MIME signing (<a
href="https://redirect.github.com/pyca/cryptography/issues/10373">#10373</a>)
(<a
href="https://redirect.github.com/pyca/cryptography/issues/10442">#10442</a>)</li>
<li><a
href="https://github.com/pyca/cryptography/commit/7a4d012991061974da5d9cb7614de65eac94f49b"><code>7a4d012</code></a>
Fixes <a
href="https://redirect.github.com/pyca/cryptography/issues/10422">#10422</a>
-- don't crash when a PKCS#12 key and cert don't match (<a
href="https://redirect.github.com/pyca/cryptography/issues/10423">#10423</a>)
...</li>
<li><a
href="https://github.com/pyca/cryptography/commit/df314bb182bdfd661333969a94325e4680d785f6"><code>df314bb</code></a>
backport actions m1 switch to 42.0.x (<a
href="https://redirect.github.com/pyca/cryptography/issues/10415">#10415</a>)</li>
<li><a
href="https://github.com/pyca/cryptography/commit/c49a7a5271178c6e8ef36fa1c499f62c63ec19b9"><code>c49a7a5</code></a>
changelog and version bump for 42.0.3 (<a
href="https://redirect.github.com/pyca/cryptography/issues/10396">#10396</a>)</li>
<li><a
href="https://github.com/pyca/cryptography/commit/396bcf64c5be826ec00e7d7f45838c858c049cbc"><code>396bcf6</code></a>
fix provider loading take two (<a
href="https://redirect.github.com/pyca/cryptography/issues/10390">#10390</a>)
(<a
href="https://redirect.github.com/pyca/cryptography/issues/10395">#10395</a>)</li>
<li><a
href="https://github.com/pyca/cryptography/commit/0e0e46f5f73f477b8ee9682738c42129d5d60177"><code>0e0e46f</code></a>
backport: initialize openssl's legacy provider in rust (<a
href="https://redirect.github.com/pyca/cryptography/issues/10323">#10323</a>)
(<a
href="https://redirect.github.com/pyca/cryptography/issues/10333">#10333</a>)</li>
<li>See full diff in <a
href="https://github.com/pyca/cryptography/compare/42.0.2...42.0.4">compare
view</a></li>
</ul>
</details>
<br />


[![Dependabot compatibility
score](https://dependabot-badges.githubapp.com/badges/compatibility_score?dependency-name=cryptography&package-manager=pip&previous-version=42.0.2&new-version=42.0.4)](https://docs.github.com/en/github/managing-security-vulnerabilities/about-dependabot-security-updates#about-compatibility-scores)

Dependabot will resolve any conflicts with this PR as long as you don't
alter it yourself. You can also trigger a rebase manually by commenting
`@dependabot rebase`.

[//]: # (dependabot-automerge-start)
[//]: # (dependabot-automerge-end)

---

<details>
<summary>Dependabot commands and options</summary>
<br />

You can trigger Dependabot actions by commenting on this PR:
- `@dependabot rebase` will rebase this PR
- `@dependabot recreate` will recreate this PR, overwriting any edits
that have been made to it
- `@dependabot merge` will merge this PR after your CI passes on it
- `@dependabot squash and merge` will squash and merge this PR after
your CI passes on it
- `@dependabot cancel merge` will cancel a previously requested merge
and block automerging
- `@dependabot reopen` will reopen this PR if it is closed
- `@dependabot close` will close this PR and stop Dependabot recreating
it. You can achieve the same result by closing it manually
- `@dependabot show <dependency name> ignore conditions` will show all
of the ignore conditions of the specified dependency
- `@dependabot ignore this major version` will close this PR and stop
Dependabot creating any more for this major version (unless you reopen
the PR or upgrade to it yourself)
- `@dependabot ignore this minor version` will close this PR and stop
Dependabot creating any more for this minor version (unless you reopen
the PR or upgrade to it yourself)
- `@dependabot ignore this dependency` will close this PR and stop
Dependabot creating any more for this dependency (unless you reopen the
PR or upgrade to it yourself)
You can disable automated security fix PRs for this repo from the
[Security Alerts page](https://github.com/intel/llvm/network/alerts).

</details>

---------

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
Co-authored-by: Alexey Bader <alexey.bader@intel.com>
---
 llvm/utils/git/requirements.txt            | 2 +-
 llvm/utils/git/requirements_formatting.txt | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/llvm/utils/git/requirements.txt b/llvm/utils/git/requirements.txt
index befc81ee1fa18..4f5bbbfef12fe 100644
--- a/llvm/utils/git/requirements.txt
+++ b/llvm/utils/git/requirements.txt
@@ -14,7 +14,7 @@ cffi==1.15.1
     #   pynacl
 charset-normalizer==2.1.1
     # via requests
-cryptography==42.0.2
+cryptography==42.0.4
     # via pyjwt
 deprecated==1.2.13
     # via pygithub
diff --git a/llvm/utils/git/requirements_formatting.txt b/llvm/utils/git/requirements_formatting.txt
index 56b3273d384c6..3218c38ff93ec 100644
--- a/llvm/utils/git/requirements_formatting.txt
+++ b/llvm/utils/git/requirements_formatting.txt
@@ -18,7 +18,7 @@ charset-normalizer==3.2.0
     # via requests
 click==8.1.7
     # via black
-cryptography==42.0.2
+cryptography==42.0.4
     # via pyjwt
 darker==1.7.2
     # via -r llvm/utils/git/requirements_formatting.txt.in

From 2d77b21592bbb209d5a176e30433ce17ba568d37 Mon Sep 17 00:00:00 2001
From: Piotr Balcer <piotr.balcer@intel.com>
Date: Thu, 22 Feb 2024 19:22:58 +0100
Subject: [PATCH 23/30] [UR][L0] fix a deadlock in queue sync and event status
 query (#12795)

---
 sycl/plugins/unified_runtime/CMakeLists.txt | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/sycl/plugins/unified_runtime/CMakeLists.txt b/sycl/plugins/unified_runtime/CMakeLists.txt
index c362246b6f77d..66f0c4e70c543 100644
--- a/sycl/plugins/unified_runtime/CMakeLists.txt
+++ b/sycl/plugins/unified_runtime/CMakeLists.txt
@@ -57,13 +57,13 @@ if(SYCL_PI_UR_USE_FETCH_CONTENT)
   include(FetchContent)
 
   set(UNIFIED_RUNTIME_REPO "https://github.com/oneapi-src/unified-runtime.git")
-  # commit b4150ad1512476eb6ea0f2ede3bd29a6e3fd2b9e
-  # Merge: 4814e717 123c00f1
+  # commit 588615e90bfd2b889834120dfff172236c6b8aa8
+  # Merge: 4e69cc60 47084751
   # Author: Kenneth Benzie (Benie) <k.benzie@codeplay.com>
-  # Date:   Thu Feb 22 10:42:39 2024 +0000
-  #     Merge pull request #1366 from nrspruit/fix_multidevice_event_cache
-  #     [L0] Fix the multi device event cache to allocate lists as pointers
-  set(UNIFIED_RUNTIME_TAG b4150ad1512476eb6ea0f2ede3bd29a6e3fd2b9e)
+  # Date:   Thu Feb 22 16:10:13 2024 +0000
+  #     Merge pull request #1371 from pbalcer/l0-query-status-sync-deadlock
+  #     [L0] fix a deadlock in queue sync and event status query
+  set(UNIFIED_RUNTIME_TAG 588615e90bfd2b889834120dfff172236c6b8aa8)
 
   if(SYCL_PI_UR_OVERRIDE_FETCH_CONTENT_REPO)
     set(UNIFIED_RUNTIME_REPO "${SYCL_PI_UR_OVERRIDE_FETCH_CONTENT_REPO}")

From ebec5bfb2d70acd1237a27ec8889717130ea9d49 Mon Sep 17 00:00:00 2001
From: aelovikov-intel <andrei.elovikov@intel.com>
Date: Thu, 22 Feb 2024 10:31:02 -0800
Subject: [PATCH 24/30] [CI] Run performance tests on AMD/NVIDIA runners too
 (#12790)

---
 .github/workflows/sycl-linux-precommit.yml | 28 +++++++++++++++++-----
 1 file changed, 22 insertions(+), 6 deletions(-)

diff --git a/.github/workflows/sycl-linux-precommit.yml b/.github/workflows/sycl-linux-precommit.yml
index 61abda9de327a..eb3b319cf14f9 100644
--- a/.github/workflows/sycl-linux-precommit.yml
+++ b/.github/workflows/sycl-linux-precommit.yml
@@ -109,21 +109,37 @@ jobs:
       fail-fast: false
       matrix:
         include:
-          - name: Perf tests on Intel GEN12 Graphics system
+          - name: Intel GEN12 Graphics system
             runner: '["Linux", "gen12"]'
-          - name: Perf tests on Intel Arc A-Series Graphics system
+            image: ghcr.io/intel/llvm/ubuntu2204_intel_drivers:latest
+            image_extra_opts: --device=/dev/dri
+            reset_gpu: true
+          - name: Intel Arc A-Series Graphics system
             runner: '["Linux", "arc"]'
+            image: ghcr.io/intel/llvm/ubuntu2204_intel_drivers:latest
+            image_extra_opts: --device=/dev/dri
+            reset_gpu: true
+          - name: AMD system
+            runner: '["Linux", "amdgpu"]'
+            image: ghcr.io/intel/llvm/ubuntu2204_build:latest
+            image_extra_opts: --device=/dev/dri --device=/dev/kfd
+            extra_cmake_args: -DHIP_PLATFORM="AMD" -DAMD_ARCH="gfx1031"
+          - name: CUDA system
+            runner: '["Linux", "cuda"]'
+            image: ghcr.io/intel/llvm/ubuntu2204_build:latest
+            image_extra_opts: --gpus all
     uses: ./.github/workflows/sycl-linux-run-tests.yml
     with:
-      name: ${{ matrix.name }}
+      name: Perf tests on ${{ matrix.name }}
       runner: ${{ matrix. runner }}
-      image: ghcr.io/intel/llvm/ubuntu2204_intel_drivers:latest
-      image_options: -u 1001 --device=/dev/dri --privileged --cap-add SYS_ADMIN
+      image: ${{ matrix.image }}
+      image_options: -u 1001 --privileged --cap-add SYS_ADMIN ${{ matrix.image_extra_opts }}
       target_devices: all
-      reset_gpu: true
+      reset_gpu: ${{ matrix.reset_gpu }}
 
       env: '{"LIT_FILTER":"PerformanceTests/"}'
       extra_lit_opts: -a -j 1 --param enable-perf-tests=True
+      extra_cmake_args: ${{ matrix.extra_cmake_args }}
 
       ref: ${{ github.sha }}
       merge_ref: ''

From 4be884493938a3212412c5b1f1913bb59784debe Mon Sep 17 00:00:00 2001
From: Mike Rice <michael.p.rice@intel.com>
Date: Thu, 22 Feb 2024 11:30:24 -0800
Subject: [PATCH 25/30] Revert "[sycl-web] Fit LIT tests failures in
 CodeGenSYCL. (#10820)" (#12776)

This reverts commit 66c18fee337fdf04f04f48e0fefb68157915b8a8.

This restores the lost vtable type changes from upstream
(8acdcf4016876d122733991561be706b64026e73).
---
 clang/lib/CodeGen/CGVTT.cpp                   |  13 +-
 clang/lib/CodeGen/CGVTables.cpp               |  32 +-
 clang/lib/CodeGen/CodeGenModule.cpp           |   2 +-
 clang/lib/CodeGen/ItaniumCXXABI.cpp           |  34 +--
 .../CodeGenCXX/dynamic-cast-address-space.cpp |   2 -
 .../CodeGenCXX/vtable-align-address-space.cpp |  13 +
 .../vtable-assume-load-address-space.cpp      | 288 ++++++++++++++++++
 .../vtable-consteval-address-space.cpp        |  44 +++
 .../vtable-constexpr-address-space.cpp        |  27 ++
 .../vtable-key-function-address-space.cpp     |  33 ++
 .../vtable-layout-extreme-address-space.cpp   | 210 +++++++++++++
 .../vtable-linkage-address-space.cpp          | 217 +++++++++++++
 ...e-pointer-initialization-address-space.cpp |  60 ++++
 clang/test/CodeGenCXX/vtt-address-space.cpp   |  17 +-
 .../CodeGenCXX/vtt-layout-address-space.cpp   |  89 ++++++
 clang/test/Headers/hip-header.hip             |  21 +-
 16 files changed, 1051 insertions(+), 51 deletions(-)
 create mode 100644 clang/test/CodeGenCXX/vtable-align-address-space.cpp
 create mode 100644 clang/test/CodeGenCXX/vtable-assume-load-address-space.cpp
 create mode 100644 clang/test/CodeGenCXX/vtable-consteval-address-space.cpp
 create mode 100644 clang/test/CodeGenCXX/vtable-constexpr-address-space.cpp
 create mode 100644 clang/test/CodeGenCXX/vtable-key-function-address-space.cpp
 create mode 100644 clang/test/CodeGenCXX/vtable-layout-extreme-address-space.cpp
 create mode 100644 clang/test/CodeGenCXX/vtable-linkage-address-space.cpp
 create mode 100644 clang/test/CodeGenCXX/vtable-pointer-initialization-address-space.cpp
 create mode 100644 clang/test/CodeGenCXX/vtt-layout-address-space.cpp

diff --git a/clang/lib/CodeGen/CGVTT.cpp b/clang/lib/CodeGen/CGVTT.cpp
index bceeb3aab0f3d..1d3f14f1c5344 100644
--- a/clang/lib/CodeGen/CGVTT.cpp
+++ b/clang/lib/CodeGen/CGVTT.cpp
@@ -42,8 +42,8 @@ CodeGenVTables::EmitVTTDefinition(llvm::GlobalVariable *VTT,
                                   llvm::GlobalVariable::LinkageTypes Linkage,
                                   const CXXRecordDecl *RD) {
   VTTBuilder Builder(CGM.getContext(), RD, /*GenerateDefinition=*/true);
-  llvm::ArrayType *ArrayType =
-      llvm::ArrayType::get(CGM.DefaultInt8PtrTy, Builder.getVTTComponents().size());
+  llvm::ArrayType *ArrayType = llvm::ArrayType::get(
+      CGM.GlobalsInt8PtrTy, Builder.getVTTComponents().size());
 
   SmallVector<llvm::GlobalVariable *, 8> VTables;
   SmallVector<VTableAddressPointsMapTy, 8> VTableAddressPoints;
@@ -81,9 +81,6 @@ CodeGenVTables::EmitVTTDefinition(llvm::GlobalVariable *VTT,
          VTable->getValueType(), VTable, Idxs, /*InBounds=*/true,
          /*InRangeIndex=*/1);
 
-     Init = llvm::ConstantExpr::getPointerBitCastOrAddrSpaceCast(
-         Init, CGM.Int8PtrTy);
-
      VTTComponents.push_back(Init);
   }
 
@@ -117,9 +114,9 @@ llvm::GlobalVariable *CodeGenVTables::GetAddrOfVTT(const CXXRecordDecl *RD) {
 
   VTTBuilder Builder(CGM.getContext(), RD, /*GenerateDefinition=*/false);
 
-  llvm::ArrayType *ArrayType =
-    llvm::ArrayType::get(CGM.Int8PtrTy, Builder.getVTTComponents().size());
-  llvm::Align Align = CGM.getDataLayout().getABITypeAlign(CGM.Int8PtrTy);
+  llvm::ArrayType *ArrayType = llvm::ArrayType::get(
+      CGM.GlobalsInt8PtrTy, Builder.getVTTComponents().size());
+  llvm::Align Align = CGM.getDataLayout().getABITypeAlign(CGM.GlobalsInt8PtrTy);
 
   llvm::GlobalVariable *GV = CGM.CreateOrReplaceCXXRuntimeVariable(
       Name, ArrayType, llvm::GlobalValue::ExternalLinkage, Align);
diff --git a/clang/lib/CodeGen/CGVTables.cpp b/clang/lib/CodeGen/CGVTables.cpp
index 913c24bc7e859..8dee3f74b44b4 100644
--- a/clang/lib/CodeGen/CGVTables.cpp
+++ b/clang/lib/CodeGen/CGVTables.cpp
@@ -692,7 +692,7 @@ bool CodeGenVTables::useRelativeLayout() const {
 llvm::Type *CodeGenModule::getVTableComponentType() const {
   if (UseRelativeLayout(*this))
     return Int32Ty;
-  return Int8PtrTy;
+  return GlobalsInt8PtrTy;
 }
 
 llvm::Type *CodeGenVTables::getVTableComponentType() const {
@@ -704,7 +704,7 @@ static void AddPointerLayoutOffset(const CodeGenModule &CGM,
                                    CharUnits offset) {
   builder.add(llvm::ConstantExpr::getIntToPtr(
       llvm::ConstantInt::get(CGM.PtrDiffTy, offset.getQuantity()),
-      CGM.Int8PtrTy));
+      CGM.GlobalsInt8PtrTy));
 }
 
 static void AddRelativeLayoutOffset(const CodeGenModule &CGM,
@@ -741,7 +741,7 @@ void CodeGenVTables::addVTableComponent(ConstantArrayBuilder &builder,
                                   vtableHasLocalLinkage,
                                   /*isCompleteDtor=*/false);
     else
-      return builder.add(llvm::ConstantExpr::getBitCast(rtti, CGM.Int8PtrTy));
+      return builder.add(rtti);
 
   case VTableComponent::CK_FunctionPointer:
   case VTableComponent::CK_CompleteDtorPointer:
@@ -760,7 +760,8 @@ void CodeGenVTables::addVTableComponent(ConstantArrayBuilder &builder,
               ? MD->hasAttr<CUDADeviceAttr>()
               : (MD->hasAttr<CUDAHostAttr>() || !MD->hasAttr<CUDADeviceAttr>());
       if (!CanEmitMethod)
-        return builder.add(llvm::ConstantExpr::getNullValue(CGM.Int8PtrTy));
+        return builder.add(
+            llvm::ConstantExpr::getNullValue(CGM.GlobalsInt8PtrTy));
       // Method is acceptable, continue processing as usual.
     }
 
@@ -773,20 +774,20 @@ void CodeGenVTables::addVTableComponent(ConstantArrayBuilder &builder,
       // with the local symbol. As a temporary solution, fill these components
       // with zero. We shouldn't be calling these in the first place anyway.
       if (useRelativeLayout())
-        return llvm::ConstantPointerNull::get(CGM.Int8PtrTy);
+        return llvm::ConstantPointerNull::get(CGM.GlobalsInt8PtrTy);
 
       // For NVPTX devices in OpenMP emit special functon as null pointers,
       // otherwise linking ends up with unresolved references.
       if (CGM.getLangOpts().OpenMP && CGM.getLangOpts().OpenMPIsTargetDevice &&
           CGM.getTriple().isNVPTX())
-        return llvm::ConstantPointerNull::get(CGM.Int8PtrTy);
+        return llvm::ConstantPointerNull::get(CGM.GlobalsInt8PtrTy);
       llvm::FunctionType *fnTy =
           llvm::FunctionType::get(CGM.VoidTy, /*isVarArg=*/false);
       llvm::Constant *fn = cast<llvm::Constant>(
           CGM.CreateRuntimeFunction(fnTy, name).getCallee());
       if (auto f = dyn_cast<llvm::Function>(fn))
         f->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::Global);
-      return llvm::ConstantExpr::getBitCast(fn, CGM.Int8PtrTy);
+      return fn;
     };
 
     llvm::Constant *fnPtr;
@@ -824,15 +825,26 @@ void CodeGenVTables::addVTableComponent(ConstantArrayBuilder &builder,
       return addRelativeComponent(
           builder, fnPtr, vtableAddressPoint, vtableHasLocalLinkage,
           component.getKind() == VTableComponent::CK_CompleteDtorPointer);
-    } else
-      return builder.add(llvm::ConstantExpr::getBitCast(fnPtr, CGM.Int8PtrTy));
+    } else {
+      // TODO: this is icky and only exists due to functions being in the
+      //       generic address space, rather than the global one, even though
+      //       they are globals; fixing said issue might be intrusive, and will
+      //       be done later.
+      unsigned FnAS = fnPtr->getType()->getPointerAddressSpace();
+      unsigned GVAS = CGM.GlobalsInt8PtrTy->getPointerAddressSpace();
+
+      if (FnAS != GVAS)
+        fnPtr =
+            llvm::ConstantExpr::getAddrSpaceCast(fnPtr, CGM.GlobalsInt8PtrTy);
+      return builder.add(fnPtr);
+    }
   }
 
   case VTableComponent::CK_UnusedFunctionPointer:
     if (useRelativeLayout())
       return builder.add(llvm::ConstantExpr::getNullValue(CGM.Int32Ty));
     else
-      return builder.addNullPointer(CGM.Int8PtrTy);
+      return builder.addNullPointer(CGM.GlobalsInt8PtrTy);
   }
 
   llvm_unreachable("Unexpected vtable component kind");
diff --git a/clang/lib/CodeGen/CodeGenModule.cpp b/clang/lib/CodeGen/CodeGenModule.cpp
index 133b39bc8fc46..1c5b7149dbf51 100644
--- a/clang/lib/CodeGen/CodeGenModule.cpp
+++ b/clang/lib/CodeGen/CodeGenModule.cpp
@@ -7982,7 +7982,7 @@ llvm::Constant *CodeGenModule::GetAddrOfRTTIDescriptor(QualType Ty,
   // FIXME: should we even be calling this method if RTTI is disabled
   // and it's not for EH?
   if (!shouldEmitRTTI(ForEH))
-    return llvm::Constant::getNullValue(Int8PtrTy);
+    return llvm::Constant::getNullValue(GlobalsInt8PtrTy);
 
   if (ForEH && Ty->isObjCObjectPointerType() &&
       LangOpts.ObjCRuntime.isGNUFamily())
diff --git a/clang/lib/CodeGen/ItaniumCXXABI.cpp b/clang/lib/CodeGen/ItaniumCXXABI.cpp
index bc197762278ed..ef18a057f2a73 100644
--- a/clang/lib/CodeGen/ItaniumCXXABI.cpp
+++ b/clang/lib/CodeGen/ItaniumCXXABI.cpp
@@ -670,7 +670,7 @@ CGCallee ItaniumCXXABI::EmitLoadOfMemberFunctionPointer(
   CGF.EmitBlock(FnVirtual);
 
   // Cast the adjusted this to a pointer to vtable pointer and load.
-  llvm::Type *VTableTy = Builder.getInt8PtrTy();
+  llvm::Type *VTableTy = CGF.CGM.GlobalsInt8PtrTy;
   CharUnits VTablePtrAlign =
     CGF.CGM.getDynamicOffsetAlignment(ThisAddr.getAlignment(), RD,
                                       CGF.getPointerAlign());
@@ -1942,11 +1942,11 @@ llvm::Value *ItaniumCXXABI::getVTableAddressPointInStructorWithVTT(
   /// Load the VTT.
   llvm::Value *VTT = CGF.LoadCXXVTT();
   if (VirtualPointerIndex)
-    VTT = CGF.Builder.CreateConstInBoundsGEP1_64(
-        CGF.VoidPtrTy, VTT, VirtualPointerIndex);
+    VTT = CGF.Builder.CreateConstInBoundsGEP1_64(CGF.GlobalsVoidPtrTy, VTT,
+                                                 VirtualPointerIndex);
 
   // And load the address point from the VTT.
-  return CGF.Builder.CreateAlignedLoad(CGF.VoidPtrTy, VTT,
+  return CGF.Builder.CreateAlignedLoad(CGF.GlobalsVoidPtrTy, VTT,
                                        CGF.getPointerAlign());
 }
 
@@ -1974,12 +1974,13 @@ llvm::GlobalVariable *ItaniumCXXABI::getAddrOfVTable(const CXXRecordDecl *RD,
       CGM.getItaniumVTableContext().getVTableLayout(RD);
   llvm::Type *VTableType = CGM.getVTables().getVTableType(VTLayout);
 
-  // Use pointer alignment for the vtable. Otherwise we would align them based
-  // on the size of the initializer which doesn't make sense as only single
-  // values are read.
+  // Use pointer to global alignment for the vtable. Otherwise we would align
+  // them based on the size of the initializer which doesn't make sense as only
+  // single values are read.
+  LangAS AS = CGM.GetGlobalVarAddressSpace(nullptr);
   unsigned PAlign = CGM.getItaniumVTableContext().isRelativeLayout()
                         ? 32
-                        : CGM.getTarget().getPointerAlign(LangAS::Default);
+                        : CGM.getTarget().getPointerAlign(AS);
 
   VTable = CGM.CreateOrReplaceCXXRuntimeVariable(
       Name, VTableType, llvm::GlobalValue::ExternalLinkage,
@@ -3281,10 +3282,9 @@ ItaniumRTTIBuilder::GetAddrOfExternalRTTIDescriptor(QualType Ty) {
     // Note for the future: If we would ever like to do deferred emission of
     // RTTI, check if emitting vtables opportunistically need any adjustment.
 
-    GV = new llvm::GlobalVariable(CGM.getModule(), CGM.Int8PtrTy,
-                                  /*isConstant=*/true,
-                                  llvm::GlobalValue::ExternalLinkage, nullptr,
-                                  Name);
+    GV = new llvm::GlobalVariable(
+        CGM.getModule(), CGM.GlobalsInt8PtrTy,
+        /*isConstant=*/true, llvm::GlobalValue::ExternalLinkage, nullptr, Name);
     const CXXRecordDecl *RD = Ty->getAsCXXRecordDecl();
     CGM.setGVProperties(GV, RD);
     // Import the typeinfo symbol when all non-inline virtual methods are
@@ -3680,8 +3680,8 @@ void ItaniumRTTIBuilder::BuildVTablePointer(const Type *Ty) {
   if (CGM.getItaniumVTableContext().isRelativeLayout())
     VTable = CGM.getModule().getNamedAlias(VTableName);
   if (!VTable) {
-    llvm::Type *Ty = llvm::ArrayType::get(CGM.DefaultInt8PtrTy, 0);
-    VTable = CGM.CreateRuntimeVariable(Ty, VTableName);
+    llvm::Type *Ty = llvm::ArrayType::get(CGM.GlobalsInt8PtrTy, 0);
+    VTable = CGM.getModule().getOrInsertGlobal(VTableName, Ty);
   }
 
   CGM.setDSOLocal(cast<llvm::GlobalValue>(VTable->stripPointerCasts()));
@@ -3698,7 +3698,7 @@ void ItaniumRTTIBuilder::BuildVTablePointer(const Type *Ty) {
         llvm::ConstantExpr::getInBoundsGetElementPtr(CGM.Int8Ty, VTable, Eight);
   } else {
     llvm::Constant *Two = llvm::ConstantInt::get(PtrDiffTy, 2);
-    VTable = llvm::ConstantExpr::getInBoundsGetElementPtr(CGM.DefaultInt8PtrTy,
+    VTable = llvm::ConstantExpr::getInBoundsGetElementPtr(CGM.GlobalsInt8PtrTy,
                                                           VTable, Two);
   }
 
@@ -3835,7 +3835,7 @@ llvm::Constant *ItaniumRTTIBuilder::BuildTypeInfo(
         llvm::ConstantInt::get(CGM.Int64Ty, ((uint64_t)1) << 63);
     TypeNameField = llvm::ConstantExpr::getAdd(TypeNameField, flag);
     TypeNameField =
-        llvm::ConstantExpr::getIntToPtr(TypeNameField, CGM.Int8PtrTy);
+        llvm::ConstantExpr::getIntToPtr(TypeNameField, CGM.GlobalsInt8PtrTy);
   } else {
     TypeNameField = TypeName;
   }
@@ -3965,7 +3965,7 @@ llvm::Constant *ItaniumRTTIBuilder::BuildTypeInfo(
     GV->setComdat(M.getOrInsertComdat(GV->getName()));
 
   CharUnits Align = CGM.getContext().toCharUnitsFromBits(
-      CGM.getTarget().getPointerAlign(LangAS::Default));
+      CGM.getTarget().getPointerAlign(CGM.GetGlobalVarAddressSpace(nullptr)));
   GV->setAlignment(Align.getAsAlign());
 
   // The Itanium ABI specifies that type_info objects must be globally
diff --git a/clang/test/CodeGenCXX/dynamic-cast-address-space.cpp b/clang/test/CodeGenCXX/dynamic-cast-address-space.cpp
index e891566df8117..83a408984b760 100644
--- a/clang/test/CodeGenCXX/dynamic-cast-address-space.cpp
+++ b/clang/test/CodeGenCXX/dynamic-cast-address-space.cpp
@@ -1,5 +1,3 @@
-// XFAIL: *
-//
 // RUN: %clang_cc1 -I%S %s -triple amdgcn-amd-amdhsa -emit-llvm -fcxx-exceptions -fexceptions -o - | FileCheck %s
 struct A { virtual void f(); };
 struct B : A { };
diff --git a/clang/test/CodeGenCXX/vtable-align-address-space.cpp b/clang/test/CodeGenCXX/vtable-align-address-space.cpp
new file mode 100644
index 0000000000000..5eac0bd75dc5e
--- /dev/null
+++ b/clang/test/CodeGenCXX/vtable-align-address-space.cpp
@@ -0,0 +1,13 @@
+// RUN: %clang_cc1 %s -triple=amdgcn-amd-amdhsa -std=c++11 -emit-llvm -o - | FileCheck %s
+
+struct A {
+  virtual void f();
+  virtual void g();
+  virtual void h();
+};
+
+void A::f() {}
+
+// CHECK: @_ZTV1A ={{.*}} unnamed_addr addrspace(1) constant { [5 x ptr addrspace(1)] } { [5 x ptr addrspace(1)] [ptr addrspace(1) null, ptr addrspace(1) @_ZTI1A, ptr addrspace(1) addrspacecast (ptr @_ZN1A1fEv to ptr addrspace(1)), ptr addrspace(1) addrspacecast (ptr @_ZN1A1gEv to ptr addrspace(1)), ptr addrspace(1) addrspacecast (ptr @_ZN1A1hEv to ptr addrspace(1))]
+// CHECK: @_ZTS1A ={{.*}} constant [3 x i8] c"1A\00", align 1
+// CHECK: @_ZTI1A ={{.*}} addrspace(1) constant { ptr addrspace(1), ptr addrspace(1) } { ptr addrspace(1) getelementptr inbounds (ptr addrspace(1), ptr addrspace(1) @_ZTVN10__cxxabiv117__class_type_infoE, i64 2), ptr addrspace(1) @_ZTS1A }, align 8
diff --git a/clang/test/CodeGenCXX/vtable-assume-load-address-space.cpp b/clang/test/CodeGenCXX/vtable-assume-load-address-space.cpp
new file mode 100644
index 0000000000000..251d12bbb62f3
--- /dev/null
+++ b/clang/test/CodeGenCXX/vtable-assume-load-address-space.cpp
@@ -0,0 +1,288 @@
+// RUN: %clang_cc1 %s -triple=amdgcn-amd-amdhsa -std=c++11 -emit-llvm -o %t.ll -O1 -disable-llvm-passes -fms-extensions -fstrict-vtable-pointers
+// FIXME: Assume load should not require -fstrict-vtable-pointers
+
+// RUN: FileCheck --check-prefix=CHECK1 --input-file=%t.ll %s
+// RUN: FileCheck --check-prefix=CHECK2 --input-file=%t.ll %s
+// RUN: FileCheck --check-prefix=CHECK3 --input-file=%t.ll %s
+// RUN: FileCheck --check-prefix=CHECK4 --input-file=%t.ll %s
+// RUN: FileCheck --check-prefix=CHECK5 --input-file=%t.ll %s
+// RUN: FileCheck --check-prefix=CHECK6 --input-file=%t.ll %s
+// RUN: FileCheck --check-prefix=CHECK7 --input-file=%t.ll %s
+// RUN: FileCheck --check-prefix=CHECK8 --input-file=%t.ll %s
+namespace test1 {
+
+struct A {
+  A();
+  virtual void foo();
+};
+
+struct B : A {
+  virtual void foo();
+};
+
+void g(A *a) { a->foo(); }
+
+// CHECK1-LABEL: define{{.*}} void @_ZN5test14fooAEv()
+// CHECK1: call void @_ZN5test11AC1Ev(ptr
+// CHECK1: %[[VTABLE:.*]] = load ptr addrspace(1), ptr %{{.*}}
+// CHECK1: %[[CMP:.*]] = icmp eq ptr addrspace(1) %[[VTABLE]], getelementptr inbounds ({ [3 x ptr addrspace(1)] }, ptr addrspace(1) @_ZTVN5test11AE, i32 0, inrange i32 0, i32 2)
+// CHECK1: call void @llvm.assume(i1 %[[CMP]])
+// CHECK1-LABEL: {{^}}}
+
+void fooA() {
+  A a;
+  g(&a);
+}
+
+// CHECK1-LABEL: define{{.*}} void @_ZN5test14fooBEv()
+// CHECK1: call void @_ZN5test11BC1Ev(ptr {{[^,]*}} %{{.*}})
+// CHECK1: %[[VTABLE:.*]] = load ptr addrspace(1), ptr %{{.*}}
+// CHECK1: %[[CMP:.*]] = icmp eq ptr addrspace(1) %[[VTABLE]], getelementptr inbounds ({ [3 x ptr addrspace(1)] }, ptr addrspace(1) @_ZTVN5test11BE, i32 0, inrange i32 0, i32 2)
+// CHECK1: call void @llvm.assume(i1 %[[CMP]])
+// CHECK1-LABEL: {{^}}}
+
+void fooB() {
+  B b;
+  g(&b);
+}
+// there should not be any assumes in the ctor that calls base ctor
+// CHECK1-LABEL: define linkonce_odr void @_ZN5test11BC2Ev(ptr
+// CHECK1-NOT: @llvm.assume(
+// CHECK1-LABEL: {{^}}}
+}
+namespace test2 {
+struct A {
+  A();
+  virtual void foo();
+};
+
+struct B {
+  B();
+  virtual void bar();
+};
+
+struct C : A, B {
+  C();
+  virtual void foo();
+};
+void g(A *a) { a->foo(); }
+void h(B *b) { b->bar(); }
+
+// CHECK2-LABEL: define{{.*}} void @_ZN5test24testEv()
+// CHECK2: call void @_ZN5test21CC1Ev(ptr
+// CHECK2: %[[VTABLE:.*]] = load ptr addrspace(1), ptr {{.*}}
+// CHECK2: %[[CMP:.*]] = icmp eq ptr addrspace(1) %[[VTABLE]], getelementptr inbounds ({ [3 x ptr addrspace(1)], [3 x ptr addrspace(1)] }, ptr addrspace(1) @_ZTVN5test21CE, i32 0, inrange i32 0, i32 2)
+// CHECK2: call void @llvm.assume(i1 %[[CMP]])
+
+// CHECK2: %[[ADD_PTR:.*]] = getelementptr inbounds i8, ptr %{{.*}}, i64 8
+// CHECK2: %[[VTABLE2:.*]] = load ptr addrspace(1), ptr %[[ADD_PTR]]
+// CHECK2: %[[CMP2:.*]] = icmp eq ptr addrspace(1) %[[VTABLE2]], getelementptr inbounds ({ [3 x ptr addrspace(1)], [3 x ptr addrspace(1)] }, ptr addrspace(1) @_ZTVN5test21CE, i32 0, inrange i32 1, i32 2)
+// CHECK2: call void @llvm.assume(i1 %[[CMP2]])
+
+// CHECK2: call void @_ZN5test21gEPNS_1AE(
+// CHECK2-LABEL: {{^}}}
+
+void test() {
+  C c;
+  g(&c);
+  h(&c);
+}
+}
+
+namespace test3 {
+struct A {
+  A();
+};
+
+struct B : A {
+  B();
+  virtual void foo();
+};
+
+struct C : virtual A, B {
+  C();
+  virtual void foo();
+};
+void g(B *a) { a->foo(); }
+
+// CHECK3-LABEL: define{{.*}} void @_ZN5test34testEv()
+// CHECK3: call void @_ZN5test31CC1Ev(ptr
+// CHECK3: %[[CMP:.*]] = icmp eq ptr addrspace(1) %{{.*}}, getelementptr inbounds ({ [4 x ptr addrspace(1)] }, ptr addrspace(1) @_ZTVN5test31CE, i32 0, inrange i32 0, i32 3)
+// CHECK3: call void @llvm.assume(i1 %[[CMP]])
+// CHECK3-LABLEL: }
+void test() {
+  C c;
+  g(&c);
+}
+} // test3
+
+namespace test4 {
+struct A {
+  A();
+  virtual void foo();
+};
+
+struct B : virtual A {
+  B();
+  virtual void foo();
+};
+struct C : B {
+  C();
+  virtual void foo();
+};
+
+void g(C *c) { c->foo(); }
+
+// CHECK4-LABEL: define{{.*}} void @_ZN5test44testEv()
+// CHECK4: call void @_ZN5test41CC1Ev(ptr
+// CHECK4: %[[VTABLE:.*]] = load ptr addrspace(1), ptr %{{.*}}
+// CHECK4: %[[CMP:.*]] = icmp eq ptr addrspace(1) %[[VTABLE]], getelementptr inbounds ({ [5 x ptr addrspace(1)] }, ptr addrspace(1) @_ZTVN5test41CE, i32 0, inrange i32 0, i32 4)
+// CHECK4: call void @llvm.assume(i1 %[[CMP]]
+
+// CHECK4: %[[VTABLE2:.*]] = load ptr addrspace(1), ptr %{{.*}}
+// CHECK4: %[[CMP2:.*]] = icmp eq ptr addrspace(1) %[[VTABLE2]], getelementptr inbounds ({ [5 x ptr addrspace(1)] }, ptr addrspace(1) @_ZTVN5test41CE, i32 0, inrange i32 0, i32 4)
+// CHECK4: call void @llvm.assume(i1 %[[CMP2]])
+// CHECK4-LABEL: {{^}}}
+
+void test() {
+  C c;
+  g(&c);
+}
+} // test4
+
+namespace test6 {
+struct A {
+  A();
+  virtual void foo();
+  virtual ~A() {}
+};
+struct B : A {
+  B();
+};
+// FIXME: Because A's vtable is external, and no virtual functions are hidden,
+// it's safe to generate assumption loads.
+// CHECK5-LABEL: define{{.*}} void @_ZN5test61gEv()
+// CHECK5: call void @_ZN5test61AC1Ev(
+// CHECK5-NOT: call void @llvm.assume(
+
+// We can't emit assumption loads for B, because if we referred to the vtable,
+// it would refer to functions that cannot be found (like the implicit inline
+// destructor).
+
+// CHECK5-LABEL:   call void @_ZN5test61BC1Ev(
+// CHECK5-NOT: call void @llvm.assume(
+// CHECK5-LABEL: {{^}}}
+void g() {
+  A *a = new A;
+  B *b = new B;
+}
+}
+
+namespace test7 {
+// Because A's key function is defined here, vtable is generated in this TU
+// CHECK6: @_ZTVN5test71AE ={{.*}} unnamed_addr addrspace(1) constant
+struct A {
+  A();
+  virtual void foo();
+  virtual void bar();
+};
+void A::foo() {}
+
+// CHECK6-LABEL: define{{.*}} void @_ZN5test71gEv()
+// CHECK6: call void @_ZN5test71AC1Ev(
+// CHECK6: call void @llvm.assume(
+// CHECK6-LABEL: {{^}}}
+void g() {
+  A *a = new A();
+  a->bar();
+}
+}
+
+namespace test8 {
+
+struct A {
+  virtual void foo();
+  virtual void bar();
+};
+
+// CHECK7-DAG: @_ZTVN5test81BE = available_externally unnamed_addr addrspace(1) constant
+struct B : A {
+  B();
+  void foo();
+  void bar();
+};
+
+// CHECK7-DAG: @_ZTVN5test81CE = linkonce_odr unnamed_addr addrspace(1) constant
+struct C : A {
+  C();
+  void bar();
+  void foo() {}
+};
+inline void C::bar() {}
+
+struct D : A {
+  D();
+  void foo();
+  void inline bar();
+};
+void D::bar() {}
+
+// CHECK7-DAG: @_ZTVN5test81EE = linkonce_odr unnamed_addr addrspace(1) constant
+struct E : A {
+  E();
+};
+
+// CHECK7-LABEL: define{{.*}} void @_ZN5test81bEv()
+// CHECK7: call void @llvm.assume(
+// CHECK7-LABEL: {{^}}}
+void b() {
+  B b;
+  b.bar();
+}
+
+// FIXME: C has inline virtual functions which prohibits us from generating
+// assumption loads, but because vtable is generated in this TU (key function
+// defined here) it would be correct to refer to it.
+// CHECK7-LABEL: define{{.*}} void @_ZN5test81cEv()
+// CHECK7-NOT: call void @llvm.assume(
+// CHECK7-LABEL: {{^}}}
+void c() {
+  C c;
+  c.bar();
+}
+
+// FIXME: We could generate assumption loads here.
+// CHECK7-LABEL: define{{.*}} void @_ZN5test81dEv()
+// CHECK7-NOT: call void @llvm.assume(
+// CHECK7-LABEL: {{^}}}
+void d() {
+  D d;
+  d.bar();
+}
+
+// CHECK7-LABEL: define{{.*}} void @_ZN5test81eEv()
+// CHECK7: call void @llvm.assume(
+// CHECK7-LABEL: {{^}}}
+void e() {
+  E e;
+  e.bar();
+}
+}
+
+namespace test9 {
+
+struct S {
+  S();
+  __attribute__((visibility("hidden"))) virtual void doStuff();
+};
+
+// CHECK8-LABEL: define{{.*}} void @_ZN5test94testEv()
+// CHECK8-NOT: @llvm.assume(
+// CHECK8: }
+void test() {
+  S *s = new S();
+  s->doStuff();
+  delete s;
+}
+}
+
diff --git a/clang/test/CodeGenCXX/vtable-consteval-address-space.cpp b/clang/test/CodeGenCXX/vtable-consteval-address-space.cpp
new file mode 100644
index 0000000000000..bf91e8736b43d
--- /dev/null
+++ b/clang/test/CodeGenCXX/vtable-consteval-address-space.cpp
@@ -0,0 +1,44 @@
+// RUN: %clang_cc1 -std=c++20 -triple=amdgcn-amd-amdhsa %s -emit-llvm -o - | FileCheck %s --check-prefix=ITANIUM --implicit-check-not=DoNotEmit
+
+// FIXME: The MSVC ABI rule in use here was discussed with MS folks prior to
+// them implementing virtual consteval functions, but we do not know for sure
+// if this is the ABI rule they will use.
+
+// ITANIUM-DAG: @_ZTV1A = {{.*}} addrspace(1) constant { [2 x ptr addrspace(1)] } {{.*}} null, {{.*}} @_ZTI1A
+struct A {
+  virtual consteval void DoNotEmit_f() {}
+};
+// ITANIUM-DAG: @a = addrspace(1) global { {{.*}} ptr addrspace(1) @_ZTV1A,
+A a;
+
+// ITANIUM-DAG: @_ZTV1B = {{.*}} addrspace(1) constant { [4 x ptr addrspace(1)] } {{.*}} addrspace(1) null, ptr addrspace(1) @_ZTI1B, ptr addrspace(1) addrspacecast (ptr @_ZN1B1fEv to ptr addrspace(1)), ptr addrspace(1) addrspacecast (ptr @_ZN1B1hEv to ptr addrspace(1))
+struct B {
+  virtual void f() {}
+  virtual consteval void DoNotEmit_g() {}
+  virtual void h() {}
+};
+// ITANIUM-DAG: @b = addrspace(1) global { {{.*}} @_ZTV1B,
+B b;
+
+// ITANIUM-DAG: @_ZTV1C = {{.*}} addrspace(1) constant { [4 x ptr addrspace(1)] } {{.*}} addrspace(1) null, ptr addrspace(1) @_ZTI1C, ptr addrspace(1) addrspacecast (ptr @_ZN1CD1Ev to ptr addrspace(1)), ptr addrspace(1) addrspacecast (ptr @_ZN1CD0Ev to ptr addrspace(1))
+struct C {
+  virtual ~C() = default;
+  virtual consteval C &operator=(const C&) = default;
+};
+// ITANIUM-DAG: @c = addrspace(1) global { {{.*}} @_ZTV1C,
+C c;
+
+// ITANIUM-DAG: @_ZTV1D = {{.*}} addrspace(1) constant { [4 x ptr addrspace(1)] } {{.*}} addrspace(1) null, ptr addrspace(1) @_ZTI1D, ptr addrspace(1) addrspacecast (ptr @_ZN1DD1Ev to ptr addrspace(1)), ptr addrspace(1) addrspacecast (ptr @_ZN1DD0Ev to ptr addrspace(1))
+struct D : C {};
+// ITANIUM-DAG: @d = addrspace(1) global { ptr addrspace(1) } { {{.*}} @_ZTV1D,
+D d;
+
+// ITANIUM-DAG: @_ZTV1E = {{.*}} addrspace(1) constant { [3 x ptr addrspace(1)] } {{.*}} addrspace(1) null, ptr addrspace(1) @_ZTI1E, ptr addrspace(1) addrspacecast (ptr @_ZN1E1fEv to ptr addrspace(1))
+struct E { virtual void f() {} };
+// ITANIUM-DAG: @e = addrspace(1) global { {{.*}} @_ZTV1E,
+E e;
+
+// ITANIUM-DAG: @_ZTV1F = {{.*}} addrspace(1) constant { [3 x ptr addrspace(1)] } {{.*}} addrspace(1) null, ptr addrspace(1) @_ZTI1F, ptr addrspace(1) addrspacecast (ptr @_ZN1E1fEv to ptr addrspace(1))
+struct F : E { virtual consteval void DoNotEmit_g(); };
+// ITANIUM-DAG: @f = addrspace(1) global { ptr addrspace(1) } { {{.*}} @_ZTV1F,
+F f;
diff --git a/clang/test/CodeGenCXX/vtable-constexpr-address-space.cpp b/clang/test/CodeGenCXX/vtable-constexpr-address-space.cpp
new file mode 100644
index 0000000000000..67746328ce0aa
--- /dev/null
+++ b/clang/test/CodeGenCXX/vtable-constexpr-address-space.cpp
@@ -0,0 +1,27 @@
+// RUN: %clang_cc1 -std=c++20 -triple=amdgcn-amd-amdhsa %s -emit-llvm -o - | FileCheck %s --implicit-check-not=DoNotEmit
+
+// constexpr virtual functions can be called at runtime and go in the vtable as
+// normal. But they are implicitly inline so are never the key function.
+
+struct DoNotEmit {
+  virtual constexpr void f();
+};
+constexpr void DoNotEmit::f() {}
+
+// CHECK-DAG: @_ZTV1B = {{.*}} addrspace(1) constant { [3 x ptr addrspace(1)] } { {{.*}} null, {{.*}} @_ZTI1B, {{.*}} @_ZN1B1fEv
+struct B {
+  // CHECK-DAG: define {{.*}} @_ZN1B1fEv
+  virtual constexpr void f() {}
+};
+B b;
+
+struct CBase {
+  virtual constexpr void f(); // not key function
+};
+
+// CHECK-DAG: @_ZTV1C = {{.*}} addrspace(1) constant {{.*}} null, {{.*}} @_ZTI1C, {{.*}} @_ZN1C1fEv
+struct C : CBase {
+  void f(); // key function
+};
+// CHECK-DAG: define {{.*}} @_ZN1C1fEv
+void C::f() {}
diff --git a/clang/test/CodeGenCXX/vtable-key-function-address-space.cpp b/clang/test/CodeGenCXX/vtable-key-function-address-space.cpp
new file mode 100644
index 0000000000000..2163bfaadbfb7
--- /dev/null
+++ b/clang/test/CodeGenCXX/vtable-key-function-address-space.cpp
@@ -0,0 +1,33 @@
+// RUN: %clang_cc1 %s -triple=amdgcn-amd-amdhsa -emit-llvm -o - | FileCheck %s
+// PR5697
+namespace PR5697 {
+struct A {
+  virtual void f() { }
+  A();
+  A(int);
+};
+
+// A does not have a key function, so the first constructor we emit should
+// cause the vtable to be defined (without assertions.)
+// CHECK: @_ZTVN6PR56971AE = linkonce_odr unnamed_addr addrspace(1) constant
+A::A() { }
+A::A(int) { }
+}
+
+// Make sure that we don't assert when building the vtable for a class
+// template specialization or explicit instantiation with a key
+// function.
+template<typename T>
+struct Base {
+  virtual ~Base();
+};
+
+template<typename T>
+struct Derived : public Base<T> { };
+
+template<>
+struct Derived<char> : public Base<char> {
+  virtual void anchor();
+};
+
+void Derived<char>::anchor() { }
diff --git a/clang/test/CodeGenCXX/vtable-layout-extreme-address-space.cpp b/clang/test/CodeGenCXX/vtable-layout-extreme-address-space.cpp
new file mode 100644
index 0000000000000..8bc9dd9d46c92
--- /dev/null
+++ b/clang/test/CodeGenCXX/vtable-layout-extreme-address-space.cpp
@@ -0,0 +1,210 @@
+// RUN: %clang_cc1 %s -triple=amdgcn-amd-amdhsa -std=c++11 -emit-llvm-only -fdump-vtable-layouts 2>&1 | FileCheck %s
+
+// A collection of big class hierarchies and their vtables.
+
+namespace Test1 {
+
+class C0
+{
+};
+class C1
+ :  virtual public C0
+{
+  int k0;
+};
+class C2
+ :  public C0
+ ,  virtual public C1
+{
+  int k0;
+};
+class C3
+ :  virtual public C0
+ ,  virtual public C1
+ ,  public C2
+{
+  int k0;
+  int k1;
+  int k2;
+  int k3;
+};
+class C4
+ :  public C2
+ ,  virtual public C3
+ ,  public C0
+{
+  int k0;
+};
+class C5
+ :  public C0
+ ,  virtual public C4
+ ,  public C2
+ ,  public C1
+ ,  virtual public C3
+{
+  int k0;
+};
+class C6
+ :  virtual public C3
+ ,  public C0
+ ,  public C5
+ ,  public C4
+ ,  public C1
+{
+  int k0;
+};
+class C7
+ :  virtual public C5
+ ,  virtual public C6
+ ,  virtual public C3
+ ,  public C4
+ ,  virtual public C2
+{
+  int k0;
+  int k1;
+};
+class C8
+ :  public C7
+ ,  public C5
+ ,  public C3
+ ,  virtual public C4
+ ,  public C1
+ ,  public C2
+{
+  int k0;
+  int k1;
+};
+
+// CHECK:     Vtable for 'Test1::C9' (87 entries).
+// CHECK-NEXT:   0 | vbase_offset (344)
+// CHECK-NEXT:   1 | vbase_offset (312)
+// CHECK-NEXT:   2 | vbase_offset (184)
+// CHECK-NEXT:   3 | vbase_offset (168)
+// CHECK-NEXT:   4 | vbase_offset (120)
+// CHECK-NEXT:   5 | vbase_offset (48)
+// CHECK-NEXT:   6 | vbase_offset (148)
+// CHECK-NEXT:   7 | vbase_offset (152)
+// CHECK-NEXT:   8 | offset_to_top (0)
+// CHECK-NEXT:   9 | Test1::C9 RTTI
+// CHECK-NEXT:       -- (Test1::C2, 0) vtable address --
+// CHECK-NEXT:       -- (Test1::C9, 0) vtable address --
+// CHECK-NEXT:  10 | void Test1::C9::f()
+// CHECK-NEXT:  11 | vbase_offset (104)
+// CHECK-NEXT:  12 | vbase_offset (132)
+// CHECK-NEXT:  13 | vbase_offset (136)
+// CHECK-NEXT:  14 | offset_to_top (-16)
+// CHECK-NEXT:  15 | Test1::C9 RTTI
+// CHECK-NEXT:       -- (Test1::C2, 16) vtable address --
+// CHECK-NEXT:       -- (Test1::C4, 16) vtable address --
+// CHECK-NEXT:  16 | vbase_offset (72)
+// CHECK-NEXT:  17 | vbase_offset (120)
+// CHECK-NEXT:  18 | vbase_offset (100)
+// CHECK-NEXT:  19 | vbase_offset (104)
+// CHECK-NEXT:  20 | offset_to_top (-48)
+// CHECK-NEXT:  21 | Test1::C9 RTTI
+// CHECK-NEXT:       -- (Test1::C2, 48) vtable address --
+// CHECK-NEXT:       -- (Test1::C5, 48) vtable address --
+// CHECK-NEXT:       -- (Test1::C6, 48) vtable address --
+// CHECK-NEXT:  22 | vbase_offset (84)
+// CHECK-NEXT:  23 | offset_to_top (-64)
+// CHECK-NEXT:  24 | Test1::C9 RTTI
+// CHECK-NEXT:       -- (Test1::C1, 64) vtable address --
+// CHECK-NEXT:  25 | vbase_offset (32)
+// CHECK-NEXT:  26 | vbase_offset (60)
+// CHECK-NEXT:  27 | vbase_offset (64)
+// CHECK-NEXT:  28 | offset_to_top (-88)
+// CHECK-NEXT:  29 | Test1::C9 RTTI
+// CHECK-NEXT:       -- (Test1::C2, 88) vtable address --
+// CHECK-NEXT:       -- (Test1::C4, 88) vtable address --
+// CHECK-NEXT:  30 | vbase_offset (44)
+// CHECK-NEXT:  31 | offset_to_top (-104)
+// CHECK-NEXT:  32 | Test1::C9 RTTI
+// CHECK-NEXT:       -- (Test1::C1, 104) vtable address --
+// CHECK-NEXT:  33 | vbase_offset (28)
+// CHECK-NEXT:  34 | vbase_offset (32)
+// CHECK-NEXT:  35 | offset_to_top (-120)
+// CHECK-NEXT:  36 | Test1::C9 RTTI
+// CHECK-NEXT:       -- (Test1::C2, 120) vtable address --
+// CHECK-NEXT:       -- (Test1::C3, 120) vtable address --
+// CHECK-NEXT:  37 | vbase_offset (-4)
+// CHECK-NEXT:  38 | offset_to_top (-152)
+// CHECK-NEXT:  39 | Test1::C9 RTTI
+// CHECK-NEXT:       -- (Test1::C1, 152) vtable address --
+// CHECK-NEXT:  40 | vbase_offset (-48)
+// CHECK-NEXT:  41 | vbase_offset (-20)
+// CHECK-NEXT:  42 | vbase_offset (-16)
+// CHECK-NEXT:  43 | offset_to_top (-168)
+// CHECK-NEXT:  44 | Test1::C9 RTTI
+// CHECK-NEXT:       -- (Test1::C2, 168) vtable address --
+// CHECK-NEXT:       -- (Test1::C4, 168) vtable address --
+// CHECK-NEXT:  45 | vbase_offset (160)
+// CHECK-NEXT:  46 | vbase_offset (-136)
+// CHECK-NEXT:  47 | vbase_offset (-16)
+// CHECK-NEXT:  48 | vbase_offset (128)
+// CHECK-NEXT:  49 | vbase_offset (-64)
+// CHECK-NEXT:  50 | vbase_offset (-36)
+// CHECK-NEXT:  51 | vbase_offset (-32)
+// CHECK-NEXT:  52 | offset_to_top (-184)
+// CHECK-NEXT:  53 | Test1::C9 RTTI
+// CHECK-NEXT:       -- (Test1::C2, 184) vtable address --
+// CHECK-NEXT:       -- (Test1::C4, 184) vtable address --
+// CHECK-NEXT:       -- (Test1::C7, 184) vtable address --
+// CHECK-NEXT:       -- (Test1::C8, 184) vtable address --
+// CHECK-NEXT:  54 | vbase_offset (-88)
+// CHECK-NEXT:  55 | vbase_offset (-40)
+// CHECK-NEXT:  56 | vbase_offset (-60)
+// CHECK-NEXT:  57 | vbase_offset (-56)
+// CHECK-NEXT:  58 | offset_to_top (-208)
+// CHECK-NEXT:  59 | Test1::C9 RTTI
+// CHECK-NEXT:       -- (Test1::C2, 208) vtable address --
+// CHECK-NEXT:       -- (Test1::C5, 208) vtable address --
+// CHECK-NEXT:  60 | vbase_offset (-76)
+// CHECK-NEXT:  61 | offset_to_top (-224)
+// CHECK-NEXT:  62 | Test1::C9 RTTI
+// CHECK-NEXT:       -- (Test1::C1, 224) vtable address --
+// CHECK-NEXT:  63 | vbase_offset (-92)
+// CHECK-NEXT:  64 | vbase_offset (-88)
+// CHECK-NEXT:  65 | offset_to_top (-240)
+// CHECK-NEXT:  66 | Test1::C9 RTTI
+// CHECK-NEXT:       -- (Test1::C2, 240) vtable address --
+// CHECK-NEXT:       -- (Test1::C3, 240) vtable address --
+// CHECK-NEXT:  67 | vbase_offset (-124)
+// CHECK-NEXT:  68 | offset_to_top (-272)
+// CHECK-NEXT:  69 | Test1::C9 RTTI
+// CHECK-NEXT:       -- (Test1::C1, 272) vtable address --
+// CHECK-NEXT:  70 | vbase_offset (-140)
+// CHECK-NEXT:  71 | vbase_offset (-136)
+// CHECK-NEXT:  72 | offset_to_top (-288)
+// CHECK-NEXT:  73 | Test1::C9 RTTI
+// CHECK-NEXT:       -- (Test1::C2, 288) vtable address --
+// CHECK-NEXT:  74 | vbase_offset (-192)
+// CHECK-NEXT:  75 | vbase_offset (-144)
+// CHECK-NEXT:  76 | vbase_offset (-164)
+// CHECK-NEXT:  77 | vbase_offset (-160)
+// CHECK-NEXT:  78 | offset_to_top (-312)
+// CHECK-NEXT:  79 | Test1::C9 RTTI
+// CHECK-NEXT:       -- (Test1::C2, 312) vtable address --
+// CHECK-NEXT:       -- (Test1::C5, 312) vtable address --
+// CHECK-NEXT:  80 | vbase_offset (-180)
+// CHECK-NEXT:  81 | offset_to_top (-328)
+// CHECK-NEXT:  82 | Test1::C9 RTTI
+// CHECK-NEXT:       -- (Test1::C1, 328) vtable address --
+// CHECK-NEXT:  83 | vbase_offset (-196)
+// CHECK-NEXT:  84 | vbase_offset (-192)
+// CHECK-NEXT:  85 | offset_to_top (-344)
+// CHECK-NEXT:  86 | Test1::C9 RTTI
+class C9
+ :  virtual public C6
+ ,  public C2
+ ,  public C4
+ ,  virtual public C8
+{
+  int k0;
+  int k1;
+  int k2;
+  int k3;
+  virtual void f();
+};
+void C9::f() { }
+
+}
diff --git a/clang/test/CodeGenCXX/vtable-linkage-address-space.cpp b/clang/test/CodeGenCXX/vtable-linkage-address-space.cpp
new file mode 100644
index 0000000000000..988dfb254c0db
--- /dev/null
+++ b/clang/test/CodeGenCXX/vtable-linkage-address-space.cpp
@@ -0,0 +1,217 @@
+// RUN: %clang_cc1 %s -triple=amdgcn-amd-amdhsa -emit-llvm -o %t
+// RUN: %clang_cc1 %s -triple=amdgcn-amd-amdhsa -emit-llvm -std=c++03 -o %t.03
+// RUN: %clang_cc1 %s -triple=amdgcn-amd-amdhsa -emit-llvm -std=c++11 -o %t.11
+// RUN: %clang_cc1 %s -triple=amdgcn-amd-amdhsa -disable-llvm-passes -O3 -emit-llvm -o %t.opt
+// RUN: FileCheck %s < %t
+// RUN: FileCheck %s < %t.03
+// RUN: FileCheck %s < %t.11
+// RUN: FileCheck --check-prefix=CHECK-OPT %s < %t.opt
+
+namespace {
+  struct A {
+    virtual void f() { }
+  };
+}
+
+void f() { A b; }
+
+struct B {
+  B();
+  virtual void f();
+};
+
+B::B() { }
+
+struct C : virtual B {
+  C();
+  virtual void f() { }
+};
+
+C::C() { }
+
+struct D {
+  virtual void f();
+};
+
+void D::f() { }
+
+static struct : D { } e;
+
+// Force 'e' to be constructed and therefore have a vtable defined.
+void use_e() {
+  e.f();
+}
+
+// The destructor is the key function.
+template<typename T>
+struct E {
+  virtual ~E();
+};
+
+template<typename T> E<T>::~E() { }
+
+// Anchor is the key function
+template<>
+struct E<char> {
+  virtual void anchor();
+};
+
+void E<char>::anchor() { }
+
+template struct E<short>;
+extern template struct E<int>;
+
+void use_E() {
+  E<int> ei;
+  (void)ei;
+  E<long> el;
+  (void)el;
+}
+
+// No key function
+template<typename T>
+struct F {
+  virtual void foo() { }
+};
+
+// No key function
+template<>
+struct F<char> {
+  virtual void foo() { }
+};
+
+template struct F<short>;
+extern template struct F<int>;
+
+void use_F() {
+  F<char> fc;
+  fc.foo();
+  F<int> fi;
+  fi.foo();
+  F<long> fl;
+  (void)fl;
+}
+
+// B has a key function that is not defined in this translation unit so its vtable
+// has external linkage.
+// CHECK-DAG: @_ZTV1B = external unnamed_addr addrspace(1) constant
+
+// C has no key function, so its vtable should have linkonce_odr linkage
+// and no explicit visibility marker (rdar://problem/7523229).
+// CHECK-DAG: @_ZTV1C = linkonce_odr unnamed_addr addrspace(1) constant {{.*}}, comdat, align 8{{$}}
+// CHECK-DAG: @_ZTS1C = linkonce_odr addrspace(1) constant {{.*}}, comdat, align 1{{$}}
+// CHECK-DAG: @_ZTI1C = linkonce_odr addrspace(1) constant {{.*}}, comdat, align 8{{$}}
+// CHECK-DAG: @_ZTT1C = linkonce_odr unnamed_addr addrspace(1) constant {{.*}}, comdat, align 8{{$}}
+
+// D has a key function that is defined in this translation unit so its vtable is
+// defined in the translation unit.
+// CHECK-DAG: @_ZTV1D ={{.*}} unnamed_addr addrspace(1) constant
+// CHECK-DAG: @_ZTS1D ={{.*}} addrspace(1) constant
+// CHECK-DAG: @_ZTI1D ={{.*}} addrspace(1) constant
+
+// E<char> is an explicit specialization with a key function defined
+// in this translation unit, so its vtable should have external
+// linkage.
+// CHECK-DAG: @_ZTV1EIcE ={{.*}} unnamed_addr addrspace(1) constant
+// CHECK-DAG: @_ZTS1EIcE ={{.*}} addrspace(1) constant
+// CHECK-DAG: @_ZTI1EIcE ={{.*}} addrspace(1) constant
+
+// E<short> is an explicit template instantiation with a key function
+// defined in this translation unit, so its vtable should have
+// weak_odr linkage.
+// CHECK-DAG: @_ZTV1EIsE = weak_odr unnamed_addr addrspace(1) constant {{.*}}, comdat,
+// CHECK-DAG: @_ZTS1EIsE = weak_odr addrspace(1) constant {{.*}}, comdat, align 1{{$}}
+// CHECK-DAG: @_ZTI1EIsE = weak_odr addrspace(1) constant {{.*}}, comdat, align 8{{$}}
+
+// F<short> is an explicit template instantiation without a key
+// function, so its vtable should have weak_odr linkage
+// CHECK-DAG: @_ZTV1FIsE = weak_odr unnamed_addr addrspace(1) constant {{.*}}, comdat,
+// CHECK-DAG: @_ZTS1FIsE = weak_odr addrspace(1) constant {{.*}}, comdat, align 1{{$}}
+// CHECK-DAG: @_ZTI1FIsE = weak_odr addrspace(1) constant {{.*}}, comdat, align 8{{$}}
+
+// E<long> is an implicit template instantiation with a key function
+// defined in this translation unit, so its vtable should have
+// linkonce_odr linkage.
+// CHECK-DAG: @_ZTV1EIlE = linkonce_odr unnamed_addr addrspace(1) constant {{.*}}, comdat,
+// CHECK-DAG: @_ZTS1EIlE = linkonce_odr addrspace(1) constant {{.*}}, comdat, align 1{{$}}
+// CHECK-DAG: @_ZTI1EIlE = linkonce_odr addrspace(1) constant {{.*}}, comdat, align 8{{$}}
+
+// F<long> is an implicit template instantiation with no key function,
+// so its vtable should have linkonce_odr linkage.
+// CHECK-DAG: @_ZTV1FIlE = linkonce_odr unnamed_addr addrspace(1) constant {{.*}}, comdat,
+// CHECK-DAG: @_ZTS1FIlE = linkonce_odr addrspace(1) constant {{.*}}, comdat, align 1{{$}}
+// CHECK-DAG: @_ZTI1FIlE = linkonce_odr addrspace(1) constant {{.*}}, comdat, align 8{{$}}
+
+// F<int> is an explicit template instantiation declaration without a
+// key function, so its vtable should have external linkage.
+// CHECK-DAG: @_ZTV1FIiE = external unnamed_addr addrspace(1) constant
+// CHECK-OPT-DAG: @_ZTV1FIiE = available_externally unnamed_addr addrspace(1) constant
+
+// E<int> is an explicit template instantiation declaration. It has a
+// key function that is not instantiated, so we know that the vtable definition
+// will be generated in the TU where the key function is defined,
+// so we can mark it as external (without optimizations) and
+// available_externally (with optimizations) because all of the inline
+// virtual functions have been emitted.
+// CHECK-DAG: @_ZTV1EIiE = external unnamed_addr addrspace(1) constant
+// CHECK-OPT-DAG: @_ZTV1EIiE = available_externally unnamed_addr addrspace(1) constant
+
+// The anonymous struct for e has no linkage, so the vtable should have
+// internal linkage.
+// CHECK-DAG: @"_ZTV3$_0" = internal unnamed_addr addrspace(1) constant
+// CHECK-DAG: @"_ZTS3$_0" = internal addrspace(1) constant
+// CHECK-DAG: @"_ZTI3$_0" = internal addrspace(1) constant
+
+// The A vtable should have internal linkage since it is inside an anonymous
+// namespace.
+// CHECK-DAG: @_ZTVN12_GLOBAL__N_11AE = internal unnamed_addr addrspace(1) constant
+// CHECK-DAG: @_ZTSN12_GLOBAL__N_11AE = internal addrspace(1) constant
+// CHECK-DAG: @_ZTIN12_GLOBAL__N_11AE = internal addrspace(1) constant
+
+// F<char> is an explicit specialization without a key function, so
+// its vtable should have linkonce_odr linkage.
+// CHECK-DAG: @_ZTV1FIcE = linkonce_odr unnamed_addr addrspace(1) constant {{.*}}, comdat,
+// CHECK-DAG: @_ZTS1FIcE = linkonce_odr addrspace(1) constant {{.*}}, comdat, align 1{{$}}
+// CHECK-DAG: @_ZTI1FIcE = linkonce_odr addrspace(1) constant {{.*}}, comdat, align 8{{$}}
+
+// CHECK-DAG: @_ZTV1GIiE = linkonce_odr unnamed_addr addrspace(1) constant {{.*}}, comdat,
+template <typename T>
+class G {
+public:
+  G() {}
+  virtual void f0();
+  virtual void f1();
+};
+template <>
+void G<int>::f1() {}
+template <typename T>
+void G<T>::f0() {}
+void G_f0()  { new G<int>(); }
+
+// H<int> has a key function without a body but it's a template instantiation
+// so its VTable must be emitted.
+// CHECK-DAG: @_ZTV1HIiE = linkonce_odr unnamed_addr addrspace(1) constant {{.*}}, comdat,
+template <typename T>
+class H {
+public:
+  virtual ~H();
+};
+
+void use_H() {
+  H<int> h;
+}
+
+// I<int> has an explicit instantiation declaration and needs a VTT and
+// construction vtables.
+
+// CHECK-DAG: @_ZTV1IIiE = external unnamed_addr addrspace(1) constant
+// CHECK-DAG: @_ZTT1IIiE = external unnamed_addr addrspace(1) constant
+// CHECK-NOT: @_ZTC1IIiE
+//
+// CHECK-OPT-DAG: @_ZTV1IIiE = available_externally unnamed_addr addrspace(1) constant
+// CHECK-OPT-DAG: @_ZTT1IIiE = available_externally unnamed_addr addrspace(1) constant
+struct VBase1 { virtual void f(); }; struct VBase2 : virtual VBase1 {};
+template<typename T>
+struct I : VBase2 {};
+extern template struct I<int>;
+I<int> i;
diff --git a/clang/test/CodeGenCXX/vtable-pointer-initialization-address-space.cpp b/clang/test/CodeGenCXX/vtable-pointer-initialization-address-space.cpp
new file mode 100644
index 0000000000000..247864862fecf
--- /dev/null
+++ b/clang/test/CodeGenCXX/vtable-pointer-initialization-address-space.cpp
@@ -0,0 +1,60 @@
+// RUN: %clang_cc1 %s -triple=amdgcn-amd-amdhsa -std=c++11 -emit-llvm -o - | FileCheck %s
+
+struct Field {
+  Field();
+  ~Field();
+};
+
+struct Base {
+  Base();
+  ~Base();
+};
+
+struct A : Base {
+  A();
+  ~A();
+
+  virtual void f();
+
+  Field field;
+};
+
+// CHECK-LABEL: define{{.*}} void @_ZN1AC2Ev(ptr {{[^,]*}} %this) unnamed_addr
+// CHECK: call void @_ZN4BaseC2Ev(
+// CHECK: store ptr addrspace(1) getelementptr inbounds ({ [3 x ptr addrspace(1)] }, ptr addrspace(1) @_ZTV1A, i32 0, inrange i32 0, i32 2)
+// CHECK: call void @_ZN5FieldC1Ev(
+// CHECK: ret void
+A::A() { }
+
+// CHECK-LABEL: define{{.*}} void @_ZN1AD2Ev(ptr {{[^,]*}} %this) unnamed_addr
+// CHECK: store ptr addrspace(1) getelementptr inbounds ({ [3 x ptr addrspace(1)] }, ptr addrspace(1) @_ZTV1A, i32 0, inrange i32 0, i32 2)
+// CHECK: call void @_ZN5FieldD1Ev(
+// CHECK: call void @_ZN4BaseD2Ev(
+// CHECK: ret void
+A::~A() { }
+
+struct B : Base {
+  virtual void f();
+
+  Field field;
+};
+
+void f() { B b; }
+
+// CHECK-LABEL: define linkonce_odr void @_ZN1BC1Ev(ptr {{[^,]*}} %this) unnamed_addr
+// CHECK: call void @_ZN1BC2Ev(
+
+// CHECK-LABEL: define linkonce_odr void @_ZN1BD1Ev(ptr {{[^,]*}} %this) unnamed_addr
+// CHECK: call void @_ZN1BD2Ev(
+
+// CHECK-LABEL: define linkonce_odr void @_ZN1BC2Ev(ptr {{[^,]*}} %this) unnamed_addr
+// CHECK: call void @_ZN4BaseC2Ev(
+// CHECK: store ptr addrspace(1) getelementptr inbounds ({ [3 x ptr addrspace(1)] }, ptr addrspace(1) @_ZTV1B, i32 0, inrange i32 0, i32 2)
+// CHECK: call void @_ZN5FieldC1Ev
+// CHECK: ret void
+
+// CHECK-LABEL: define linkonce_odr void @_ZN1BD2Ev(ptr {{[^,]*}} %this) unnamed_addr
+// CHECK: store ptr addrspace(1) getelementptr inbounds ({ [3 x ptr addrspace(1)] }, ptr addrspace(1) @_ZTV1B, i32 0, inrange i32 0, i32 2)
+// CHECK: call void @_ZN5FieldD1Ev(
+// CHECK: call void @_ZN4BaseD2Ev(
+// CHECK: ret void
diff --git a/clang/test/CodeGenCXX/vtt-address-space.cpp b/clang/test/CodeGenCXX/vtt-address-space.cpp
index 595587923d5f6..e567ae49811a4 100644
--- a/clang/test/CodeGenCXX/vtt-address-space.cpp
+++ b/clang/test/CodeGenCXX/vtt-address-space.cpp
@@ -1,7 +1,4 @@
 // RUN: %clang_cc1 %s -triple=amdgcn-amd-amdhsa -std=c++11 -emit-llvm -o - | FileCheck %s
-// This is temporarily disabled as it requires fixing typeinfo & vptr handling
-// as well; it will be enabled once those fixes are in.
-// XFAIL: *
 
 // This is the sample from the C++ Itanium ABI, p2.6.2.
 namespace Test {
@@ -21,10 +18,10 @@ namespace Test {
   D d;
 }
 
-// CHECK: @_ZTTN4Test1DE = linkonce_odr unnamed_addr addrspace(1) constant [13 x ptr] [ptr addrspacecast (ptr addrspace(1) getelementptr inbounds ({ [5 x ptr], [7 x ptr], [4 x ptr], [3 x ptr] }, ptr addrspace(1) @_ZTVN4Test1DE, i32 0, inrange i32 0, i32 5) to ptr), ptr addrspacecast (ptr addrspace(1) getelementptr inbounds ({ [3 x ptr], [4 x ptr] }, ptr addrspace(1) @_ZTCN4Test1DE0_NS_2C1E, i32 0, inrange i32 0, i32 3) to ptr), ptr addrspacecast (ptr addrspace(1) getelementptr inbounds ({ [3 x ptr], [4 x ptr] }, ptr addrspace(1) @_ZTCN4Test1DE0_NS_2C1E, i32 0, inrange i32 1, i32 3) to ptr), ptr addrspacecast (ptr addrspace(1) getelementptr inbounds ({ [7 x ptr], [3 x ptr], [4 x ptr] }, ptr addrspace(1) @_ZTCN4Test1DE16_NS_2C2E, i32 0, inrange i32 0, i32 6) to ptr), ptr addrspacecast (ptr addrspace(1) getelementptr inbounds ({ [7 x ptr], [3 x ptr], [4 x ptr] }, ptr addrspace(1) @_ZTCN4Test1DE16_NS_2C2E, i32 0, inrange i32 0, i32 6) to ptr), ptr addrspacecast (ptr addrspace(1) getelementptr inbounds ({ [7 x ptr], [3 x ptr], [4 x ptr] }, ptr addrspace(1) @_ZTCN4Test1DE16_NS_2C2E, i32 0, inrange i32 1, i32 3) to ptr), ptr addrspacecast (ptr addrspace(1) getelementptr inbounds ({ [7 x ptr], [3 x ptr], [4 x ptr] }, ptr addrspace(1) @_ZTCN4Test1DE16_NS_2C2E, i32 0, inrange i32 2, i32 3) to ptr), ptr addrspacecast (ptr addrspace(1) getelementptr inbounds ({ [5 x ptr], [7 x ptr], [4 x ptr], [3 x ptr] }, ptr addrspace(1) @_ZTVN4Test1DE, i32 0, inrange i32 2, i32 3) to ptr), ptr addrspacecast (ptr addrspace(1) getelementptr inbounds ({ [5 x ptr], [7 x ptr], [4 x ptr], [3 x ptr] }, ptr addrspace(1) @_ZTVN4Test1DE, i32 0, inrange i32 1, i32 6) to ptr), ptr addrspacecast (ptr addrspace(1) getelementptr inbounds ({ [5 x ptr], [7 x ptr], [4 x ptr], [3 x ptr] }, ptr addrspace(1) @_ZTVN4Test1DE, i32 0, inrange i32 1, i32 6) to ptr), ptr addrspacecast (ptr addrspace(1) getelementptr inbounds ({ [5 x ptr], [7 x ptr], [4 x ptr], [3 x ptr] }, ptr addrspace(1) @_ZTVN4Test1DE, i32 0, 
inrange i32 3, i32 3) to ptr), ptr addrspacecast (ptr addrspace(1) getelementptr inbounds ({ [3 x ptr], [4 x ptr] }, ptr addrspace(1) @_ZTCN4Test1DE64_NS_2V2E, i32 0, inrange i32 0, i32 3) to ptr), ptr addrspacecast (ptr addrspace(1) getelementptr inbounds ({ [3 x ptr], [4 x ptr] }, ptr addrspace(1) @_ZTCN4Test1DE64_NS_2V2E, i32 0, inrange i32 1, i32 3) to ptr)], comdat, align 8
-// CHECK: call void @_ZN4Test2V2C2Ev(ptr noundef nonnull align 8 dereferenceable(20) %2, ptr addrspace(1) noundef getelementptr inbounds ([13 x ptr], ptr addrspace(1) @_ZTTN4Test1DE, i64 0, i64 11))
-// CHECK: call void @_ZN4Test2C1C2Ev(ptr noundef nonnull align 8 dereferenceable(12) %this1, ptr addrspace(1) noundef getelementptr inbounds ([13 x ptr], ptr addrspace(1) @_ZTTN4Test1DE, i64 0, i64 1))
-// CHECK: call void @_ZN4Test2C2C2Ev(ptr noundef nonnull align 8 dereferenceable(12) %3, ptr addrspace(1) noundef getelementptr inbounds ([13 x ptr], ptr addrspace(1) @_ZTTN4Test1DE, i64 0, i64 3))
-// CHECK-NEXT: define linkonce_odr void @_ZN4Test2V2C2Ev(ptr noundef nonnull align 8 dereferenceable(20) %this, ptr addrspace(1) noundef %vtt)
-// CHECK-NEXT: define linkonce_odr void @_ZN4Test2C1C2Ev(ptr noundef nonnull align 8 dereferenceable(12) %this, ptr addrspace(1) noundef %vtt)
-// CHECK-NEXT: define linkonce_odr void @_ZN4Test2C2C2Ev(ptr noundef nonnull align 8 dereferenceable(12) %this, ptr addrspace(1) noundef %vtt)
+// CHECK: linkonce_odr unnamed_addr addrspace(1) constant [13 x ptr addrspace(1)] [ptr addrspace(1) getelementptr inbounds ({ [5 x ptr addrspace(1)], [7 x ptr addrspace(1)], [4 x ptr addrspace(1)], [3 x ptr addrspace(1)] }, ptr addrspace(1) @_ZTVN4Test1DE, i32 0, inrange i32 0, i32 5), ptr addrspace(1) getelementptr inbounds ({ [3 x ptr addrspace(1)], [4 x ptr addrspace(1)] }, ptr addrspace(1) @_ZTCN4Test1DE0_NS_2C1E, i32 0, inrange i32 0, i32 3), ptr addrspace(1) getelementptr inbounds ({ [3 x ptr addrspace(1)], [4 x ptr addrspace(1)] }, ptr addrspace(1) @_ZTCN4Test1DE0_NS_2C1E, i32 0, inrange i32 1, i32 3), ptr addrspace(1) getelementptr inbounds ({ [7 x ptr addrspace(1)], [3 x ptr addrspace(1)], [4 x ptr addrspace(1)] }, ptr addrspace(1) @_ZTCN4Test1DE16_NS_2C2E, i32 0, inrange i32 0, i32 6), ptr addrspace(1) getelementptr inbounds ({ [7 x ptr addrspace(1)], [3 x ptr addrspace(1)], [4 x ptr addrspace(1)] }, ptr addrspace(1) @_ZTCN4Test1DE16_NS_2C2E, i32 0, inrange i32 0, i32 6), ptr addrspace(1) getelementptr inbounds ({ [7 x ptr addrspace(1)], [3 x ptr addrspace(1)], [4 x ptr addrspace(1)] }, ptr addrspace(1) @_ZTCN4Test1DE16_NS_2C2E, i32 0, inrange i32 1, i32 3), ptr addrspace(1) getelementptr inbounds ({ [7 x ptr addrspace(1)], [3 x ptr addrspace(1)], [4 x ptr addrspace(1)] }, ptr addrspace(1) @_ZTCN4Test1DE16_NS_2C2E, i32 0, inrange i32 2, i32 3), ptr addrspace(1) getelementptr inbounds ({ [5 x ptr addrspace(1)], [7 x ptr addrspace(1)], [4 x ptr addrspace(1)], [3 x ptr addrspace(1)] }, ptr addrspace(1) @_ZTVN4Test1DE, i32 0, inrange i32 2, i32 3), ptr addrspace(1) getelementptr inbounds ({ [5 x ptr addrspace(1)], [7 x ptr addrspace(1)], [4 x ptr addrspace(1)], [3 x ptr addrspace(1)] }, ptr addrspace(1) @_ZTVN4Test1DE, i32 0, inrange i32 1, i32 6), ptr addrspace(1) getelementptr inbounds ({ [5 x ptr addrspace(1)], [7 x ptr addrspace(1)], [4 x ptr addrspace(1)], [3 x ptr addrspace(1)] }, ptr addrspace(1) @_ZTVN4Test1DE, i32 0, inrange i32 1, i32 6), ptr 
addrspace(1) getelementptr inbounds ({ [5 x ptr addrspace(1)], [7 x ptr addrspace(1)], [4 x ptr addrspace(1)], [3 x ptr addrspace(1)] }, ptr addrspace(1) @_ZTVN4Test1DE, i32 0, inrange i32 3, i32 3), ptr addrspace(1) getelementptr inbounds ({ [3 x ptr addrspace(1)], [4 x ptr addrspace(1)] }, ptr addrspace(1) @_ZTCN4Test1DE64_NS_2V2E, i32 0, inrange i32 0, i32 3), ptr addrspace(1) getelementptr inbounds ({ [3 x ptr addrspace(1)], [4 x ptr addrspace(1)] }, ptr addrspace(1) @_ZTCN4Test1DE64_NS_2V2E, i32 0, inrange i32 1, i32 3)], comdat, align 8
+// CHECK: call void @_ZN4Test2V2C2Ev(ptr noundef nonnull align 8 dereferenceable(20) %2, ptr addrspace(1) noundef getelementptr inbounds ([13 x ptr addrspace(1)], ptr addrspace(1) @_ZTTN4Test1DE, i64 0, i64 11))
+// CHECK: call void @_ZN4Test2C1C2Ev(ptr noundef nonnull align 8 dereferenceable(12) %this1, ptr addrspace(1) noundef getelementptr inbounds ([13 x ptr addrspace(1)], ptr addrspace(1) @_ZTTN4Test1DE, i64 0, i64 1))
+// CHECK: call void @_ZN4Test2C2C2Ev(ptr noundef nonnull align 8 dereferenceable(12) %3, ptr addrspace(1) noundef getelementptr inbounds ([13 x ptr addrspace(1)], ptr addrspace(1) @_ZTTN4Test1DE, i64 0, i64 3))
+// CHECK: define linkonce_odr void @_ZN4Test2V2C2Ev(ptr noundef nonnull align 8 dereferenceable(20) %this, ptr addrspace(1) noundef %vtt)
+// CHECK: define linkonce_odr void @_ZN4Test2C1C2Ev(ptr noundef nonnull align 8 dereferenceable(12) %this, ptr addrspace(1) noundef %vtt)
+// CHECK: define linkonce_odr void @_ZN4Test2C2C2Ev(ptr noundef nonnull align 8 dereferenceable(12) %this, ptr addrspace(1) noundef %vtt)
diff --git a/clang/test/CodeGenCXX/vtt-layout-address-space.cpp b/clang/test/CodeGenCXX/vtt-layout-address-space.cpp
new file mode 100644
index 0000000000000..2f80c9ec5f9fe
--- /dev/null
+++ b/clang/test/CodeGenCXX/vtt-layout-address-space.cpp
@@ -0,0 +1,89 @@
+// RUN: %clang_cc1 %s -triple=amdgcn-amd-amdhsa -std=c++11 -emit-llvm -o - | FileCheck %s
+
+// Test1::B should just have a single entry in its VTT, which points to the vtable.
+namespace Test1 {
+struct A { };
+
+struct B : virtual A {
+  virtual void f();
+};
+
+void B::f() { }
+}
+
+// Check that we don't add a secondary virtual pointer for Test2::A, since Test2::A doesn't have any virtual member functions or bases.
+namespace Test2 {
+  struct A { };
+
+  struct B : A { virtual void f(); };
+  struct C : virtual B { };
+
+  C c;
+}
+
+// This is the sample from the C++ Itanium ABI, p2.6.2.
+namespace Test3 {
+  class A1 { int i; };
+  class A2 { int i; virtual void f(); };
+  class V1 : public A1, public A2 { int i; };
+  class B1 { int i; };
+  class B2 { int i; };
+  class V2 : public B1, public B2, public virtual V1 { int i; };
+  class V3 {virtual void g(); };
+  class C1 : public virtual V1 { int i; };
+  class C2 : public virtual V3, virtual V2 { int i; };
+  class X1 { int i; };
+  class C3 : public X1 { int i; };
+  class D : public C1, public C2, public C3 { int i;  };
+
+  D d;
+}
+
+// This is the sample from the C++ Itanium ABI, p2.6.2, with the change suggested
+// (making A2 a virtual base of V1)
+namespace Test4 {
+  class A1 { int i; };
+  class A2 { int i; virtual void f(); };
+  class V1 : public A1, public virtual A2 { int i; };
+  class B1 { int i; };
+  class B2 { int i; };
+  class V2 : public B1, public B2, public virtual V1 { int i; };
+  class V3 {virtual void g(); };
+  class C1 : public virtual V1 { int i; };
+  class C2 : public virtual V3, virtual V2 { int i; };
+  class X1 { int i; };
+  class C3 : public X1 { int i; };
+  class D : public C1, public C2, public C3 { int i;  };
+
+  D d;
+}
+
+namespace Test5 {
+  struct A {
+    virtual void f() = 0;
+    virtual void anchor();
+  };
+
+  void A::anchor() {
+  }
+}
+
+namespace Test6 {
+  struct A {
+    virtual void f() = delete;
+    virtual void anchor();
+  };
+
+  void A::anchor() {
+  }
+}
+
+// CHECK: @_ZTTN5Test11BE ={{.*}} unnamed_addr addrspace(1) constant [1 x ptr addrspace(1)] [ptr addrspace(1) getelementptr inbounds ({ [4 x ptr addrspace(1)] }, ptr addrspace(1) @_ZTVN5Test11BE, i32 0, inrange i32 0, i32 3)]
+// CHECK: @_ZTVN5Test51AE ={{.*}} unnamed_addr addrspace(1) constant { [4 x ptr addrspace(1)] } { [4 x ptr addrspace(1)] [ptr addrspace(1) null, ptr addrspace(1) @_ZTIN5Test51AE, ptr addrspace(1) addrspacecast (ptr @__cxa_pure_virtual to ptr addrspace(1)), ptr addrspace(1) addrspacecast (ptr @_ZN5Test51A6anchorEv to ptr addrspace(1))] }
+// CHECK: @_ZTVN5Test61AE ={{.*}} unnamed_addr addrspace(1) constant { [4 x ptr addrspace(1)] } { [4 x ptr addrspace(1)] [ptr addrspace(1) null, ptr addrspace(1) @_ZTIN5Test61AE, ptr addrspace(1) addrspacecast (ptr @__cxa_deleted_virtual to ptr addrspace(1)), ptr addrspace(1) addrspacecast (ptr @_ZN5Test61A6anchorEv to ptr addrspace(1))] }
+// CHECK: @_ZTTN5Test21CE = linkonce_odr unnamed_addr addrspace(1) constant [2 x ptr addrspace(1)] [ptr addrspace(1) getelementptr inbounds ({ [5 x ptr addrspace(1)] }, ptr addrspace(1) @_ZTVN5Test21CE, i32 0, inrange i32 0, i32 4), ptr addrspace(1) getelementptr inbounds ({ [5 x ptr addrspace(1)] }, ptr addrspace(1) @_ZTVN5Test21CE, i32 0, inrange i32 0, i32 4)]
+// CHECK: @_ZTTN5Test31DE = linkonce_odr unnamed_addr addrspace(1) constant [13 x ptr addrspace(1)] [ptr addrspace(1) getelementptr inbounds ({ [5 x ptr addrspace(1)], [7 x ptr addrspace(1)], [4 x ptr addrspace(1)], [3 x ptr addrspace(1)] }, ptr addrspace(1) @_ZTVN5Test31DE, i32 0, inrange i32 0, i32 5), ptr addrspace(1) getelementptr inbounds ({ [3 x ptr addrspace(1)], [4 x ptr addrspace(1)] }, ptr addrspace(1) @_ZTCN5Test31DE0_NS_2C1E, i32 0, inrange i32 0, i32 3), ptr addrspace(1) getelementptr inbounds ({ [3 x ptr addrspace(1)], [4 x ptr addrspace(1)] }, ptr addrspace(1) @_ZTCN5Test31DE0_NS_2C1E, i32 0, inrange i32 1, i32 3), ptr addrspace(1) getelementptr inbounds ({ [7 x ptr addrspace(1)], [3 x ptr addrspace(1)], [4 x ptr addrspace(1)] }, ptr addrspace(1) @_ZTCN5Test31DE16_NS_2C2E, i32 0, inrange i32 0, i32 6), ptr addrspace(1) getelementptr inbounds ({ [7 x ptr addrspace(1)], [3 x ptr addrspace(1)], [4 x ptr addrspace(1)] }, ptr addrspace(1) @_ZTCN5Test31DE16_NS_2C2E, i32 0, inrange i32 0, i32 6), ptr addrspace(1) getelementptr inbounds ({ [7 x ptr addrspace(1)], [3 x ptr addrspace(1)], [4 x ptr addrspace(1)] }, ptr addrspace(1) @_ZTCN5Test31DE16_NS_2C2E, i32 0, inrange i32 1, i32 3), ptr addrspace(1) getelementptr inbounds ({ [7 x ptr addrspace(1)], [3 x ptr addrspace(1)], [4 x ptr addrspace(1)] }, ptr addrspace(1) @_ZTCN5Test31DE16_NS_2C2E, i32 0, inrange i32 2, i32 3), ptr addrspace(1) getelementptr inbounds ({ [5 x ptr addrspace(1)], [7 x ptr addrspace(1)], [4 x ptr addrspace(1)], [3 x ptr addrspace(1)] }, ptr addrspace(1) @_ZTVN5Test31DE, i32 0, inrange i32 2, i32 3), ptr addrspace(1) getelementptr inbounds ({ [5 x ptr addrspace(1)], [7 x ptr addrspace(1)], [4 x ptr addrspace(1)], [3 x ptr addrspace(1)] }, ptr addrspace(1) @_ZTVN5Test31DE, i32 0, inrange i32 1, i32 6), ptr addrspace(1) getelementptr inbounds ({ [5 x ptr addrspace(1)], [7 x ptr addrspace(1)], [4 x ptr addrspace(1)], [3 x ptr addrspace(1)] }, ptr addrspace(1) @_ZTVN5Test31DE, i32 0, 
inrange i32 1, i32 6), ptr addrspace(1) getelementptr inbounds ({ [5 x ptr addrspace(1)], [7 x ptr addrspace(1)], [4 x ptr addrspace(1)], [3 x ptr addrspace(1)] }, ptr addrspace(1) @_ZTVN5Test31DE, i32 0, inrange i32 3, i32 3), ptr addrspace(1) getelementptr inbounds ({ [3 x ptr addrspace(1)], [4 x ptr addrspace(1)] }, ptr addrspace(1) @_ZTCN5Test31DE64_NS_2V2E, i32 0, inrange i32 0, i32 3), ptr addrspace(1) getelementptr inbounds ({ [3 x ptr addrspace(1)], [4 x ptr addrspace(1)] }, ptr addrspace(1) @_ZTCN5Test31DE64_NS_2V2E, i32 0, inrange i32 1, i32 3)]
+// CHECK: @_ZTVN5Test41DE = linkonce_odr unnamed_addr addrspace(1) constant { [6 x ptr addrspace(1)], [8 x ptr addrspace(1)], [3 x ptr addrspace(1)], [4 x ptr addrspace(1)], [4 x ptr addrspace(1)] } { [6 x ptr addrspace(1)] [ptr addrspace(1) inttoptr (i64 72 to ptr addrspace(1)), ptr addrspace(1) inttoptr (i64 16 to ptr addrspace(1)), ptr addrspace(1) inttoptr (i64 56 to ptr addrspace(1)), ptr addrspace(1) inttoptr (i64 40 to ptr addrspace(1)), ptr addrspace(1) null, ptr addrspace(1) @_ZTIN5Test41DE], [8 x ptr addrspace(1)] [ptr addrspace(1) inttoptr (i64 40 to ptr addrspace(1)), ptr addrspace(1) inttoptr (i64 24 to ptr addrspace(1)), ptr addrspace(1) inttoptr (i64 56 to ptr addrspace(1)), ptr addrspace(1) null, ptr addrspace(1) null, ptr addrspace(1) inttoptr (i64 -16 to ptr addrspace(1)), ptr addrspace(1) @_ZTIN5Test41DE, ptr addrspace(1) addrspacecast (ptr @_ZN5Test42V31gEv to ptr addrspace(1))], [3 x ptr addrspace(1)] [ptr addrspace(1) inttoptr (i64 16 to ptr addrspace(1)), ptr addrspace(1) inttoptr (i64 -40 to ptr addrspace(1)), ptr addrspace(1) @_ZTIN5Test41DE], [4 x ptr addrspace(1)] [ptr addrspace(1) null, ptr addrspace(1) inttoptr (i64 -56 to ptr addrspace(1)), ptr addrspace(1) @_ZTIN5Test41DE, ptr addrspace(1) addrspacecast (ptr @_ZN5Test42A21fEv to ptr addrspace(1))], [4 x ptr addrspace(1)] [ptr addrspace(1) inttoptr (i64 -16 to ptr addrspace(1)), ptr addrspace(1) inttoptr (i64 -32 to ptr addrspace(1)), ptr addrspace(1) inttoptr (i64 -72 to ptr addrspace(1)), ptr addrspace(1) @_ZTIN5Test41DE] }
+// CHECK: @_ZTTN5Test41DE = linkonce_odr unnamed_addr addrspace(1) constant [19 x ptr addrspace(1)] [ptr addrspace(1) getelementptr inbounds ({ [6 x ptr addrspace(1)], [8 x ptr addrspace(1)], [3 x ptr addrspace(1)], [4 x ptr addrspace(1)], [4 x ptr addrspace(1)] }, ptr addrspace(1) @_ZTVN5Test41DE, i32 0, inrange i32 0, i32 6), ptr addrspace(1) getelementptr inbounds ({ [4 x ptr addrspace(1)], [3 x ptr addrspace(1)], [4 x ptr addrspace(1)] }, ptr addrspace(1) @_ZTCN5Test41DE0_NS_2C1E, i32 0, inrange i32 0, i32 4), ptr addrspace(1) getelementptr inbounds ({ [4 x ptr addrspace(1)], [3 x ptr addrspace(1)], [4 x ptr addrspace(1)] }, ptr addrspace(1) @_ZTCN5Test41DE0_NS_2C1E, i32 0, inrange i32 1, i32 3), ptr addrspace(1) getelementptr inbounds ({ [4 x ptr addrspace(1)], [3 x ptr addrspace(1)], [4 x ptr addrspace(1)] }, ptr addrspace(1) @_ZTCN5Test41DE0_NS_2C1E, i32 0, inrange i32 2, i32 3), ptr addrspace(1) getelementptr inbounds ({ [8 x ptr addrspace(1)], [4 x ptr addrspace(1)], [3 x ptr addrspace(1)], [4 x ptr addrspace(1)] }, ptr addrspace(1) @_ZTCN5Test41DE16_NS_2C2E, i32 0, inrange i32 0, i32 7), ptr addrspace(1) getelementptr inbounds ({ [8 x ptr addrspace(1)], [4 x ptr addrspace(1)], [3 x ptr addrspace(1)], [4 x ptr addrspace(1)] }, ptr addrspace(1) @_ZTCN5Test41DE16_NS_2C2E, i32 0, inrange i32 0, i32 7), ptr addrspace(1) getelementptr inbounds ({ [8 x ptr addrspace(1)], [4 x ptr addrspace(1)], [3 x ptr addrspace(1)], [4 x ptr addrspace(1)] }, ptr addrspace(1) @_ZTCN5Test41DE16_NS_2C2E, i32 0, inrange i32 1, i32 4), ptr addrspace(1) getelementptr inbounds ({ [8 x ptr addrspace(1)], [4 x ptr addrspace(1)], [3 x ptr addrspace(1)], [4 x ptr addrspace(1)] }, ptr addrspace(1) @_ZTCN5Test41DE16_NS_2C2E, i32 0, inrange i32 2, i32 3), ptr addrspace(1) getelementptr inbounds ({ [8 x ptr addrspace(1)], [4 x ptr addrspace(1)], [3 x ptr addrspace(1)], [4 x ptr addrspace(1)] }, ptr addrspace(1) @_ZTCN5Test41DE16_NS_2C2E, i32 0, inrange i32 3, i32 3), ptr addrspace(1) 
getelementptr inbounds ({ [6 x ptr addrspace(1)], [8 x ptr addrspace(1)], [3 x ptr addrspace(1)], [4 x ptr addrspace(1)], [4 x ptr addrspace(1)] }, ptr addrspace(1) @_ZTVN5Test41DE, i32 0, inrange i32 2, i32 3), ptr addrspace(1) getelementptr inbounds ({ [6 x ptr addrspace(1)], [8 x ptr addrspace(1)], [3 x ptr addrspace(1)], [4 x ptr addrspace(1)], [4 x ptr addrspace(1)] }, ptr addrspace(1) @_ZTVN5Test41DE, i32 0, inrange i32 3, i32 3), ptr addrspace(1) getelementptr inbounds ({ [6 x ptr addrspace(1)], [8 x ptr addrspace(1)], [3 x ptr addrspace(1)], [4 x ptr addrspace(1)], [4 x ptr addrspace(1)] }, ptr addrspace(1) @_ZTVN5Test41DE, i32 0, inrange i32 1, i32 7), ptr addrspace(1) getelementptr inbounds ({ [6 x ptr addrspace(1)], [8 x ptr addrspace(1)], [3 x ptr addrspace(1)], [4 x ptr addrspace(1)], [4 x ptr addrspace(1)] }, ptr addrspace(1) @_ZTVN5Test41DE, i32 0, inrange i32 1, i32 7), ptr addrspace(1) getelementptr inbounds ({ [6 x ptr addrspace(1)], [8 x ptr addrspace(1)], [3 x ptr addrspace(1)], [4 x ptr addrspace(1)], [4 x ptr addrspace(1)] }, ptr addrspace(1) @_ZTVN5Test41DE, i32 0, inrange i32 4, i32 4), ptr addrspace(1) getelementptr inbounds ({ [3 x ptr addrspace(1)], [4 x ptr addrspace(1)] }, ptr addrspace(1) @_ZTCN5Test41DE40_NS_2V1E, i32 0, inrange i32 0, i32 3), ptr addrspace(1) getelementptr inbounds ({ [3 x ptr addrspace(1)], [4 x ptr addrspace(1)] }, ptr addrspace(1) @_ZTCN5Test41DE40_NS_2V1E, i32 0, inrange i32 1, i32 3), ptr addrspace(1) getelementptr inbounds ({ [4 x ptr addrspace(1)], [3 x ptr addrspace(1)], [4 x ptr addrspace(1)] }, ptr addrspace(1) @_ZTCN5Test41DE72_NS_2V2E, i32 0, inrange i32 0, i32 4), ptr addrspace(1) getelementptr inbounds ({ [4 x ptr addrspace(1)], [3 x ptr addrspace(1)], [4 x ptr addrspace(1)] }, ptr addrspace(1) @_ZTCN5Test41DE72_NS_2V2E, i32 0, inrange i32 1, i32 3), ptr addrspace(1) getelementptr inbounds ({ [4 x ptr addrspace(1)], [3 x ptr addrspace(1)], [4 x ptr addrspace(1)] }, ptr addrspace(1) 
@_ZTCN5Test41DE72_NS_2V2E, i32 0, inrange i32 2, i32 3)]
+// CHECK: declare void @__cxa_pure_virtual() unnamed_addr
+// CHECK: declare void @__cxa_deleted_virtual() unnamed_addr
diff --git a/clang/test/Headers/hip-header.hip b/clang/test/Headers/hip-header.hip
index 154929fab4f15..146a43b643dba 100644
--- a/clang/test/Headers/hip-header.hip
+++ b/clang/test/Headers/hip-header.hip
@@ -57,6 +57,22 @@
 
 // expected-no-diagnostics
 
+// Check handling of overridden, implicitly __host__ dtor (should emit as a
+// nullptr to global)
+
+struct vbase {
+    virtual ~vbase();
+};
+
+template<typename T>
+struct vderived : public vbase {
+    ~vderived();
+};
+
+template struct vderived<void>;
+
+// CHECK: @_ZTV8vderivedIvE = weak_odr unnamed_addr addrspace(1) constant { [4 x ptr addrspace(1)] } zeroinitializer, comdat, align 8
+
 // Check support for pure and deleted virtual functions
 struct base {
   __host__
@@ -74,9 +90,8 @@ struct derived:base {
 __device__ void test_vf() {
     derived d;
 }
-// CHECK: @_ZTV7derived = linkonce_odr unnamed_addr addrspace(1) constant { [4 x ptr] } { [4 x ptr] [ptr null, ptr null, ptr @_ZN7derived2pvEv, ptr @__cxa_deleted_virtual] }, comdat, align 8
-// CHECK: @_ZTV4base = linkonce_odr unnamed_addr addrspace(1) constant { [4 x ptr] } { [4 x ptr] [ptr null, ptr null, ptr @__cxa_pure_virtual, ptr @__cxa_deleted_virtual] }, comdat, align 8
-
+// CHECK: @_ZTV7derived = linkonce_odr unnamed_addr addrspace(1) constant { [4 x ptr addrspace(1)] } { [4 x ptr addrspace(1)] [ptr addrspace(1) null, ptr addrspace(1) null, ptr addrspace(1) addrspacecast (ptr @_ZN7derived2pvEv to ptr addrspace(1)), ptr addrspace(1) addrspacecast (ptr @__cxa_deleted_virtual to ptr addrspace(1))] }, comdat, align 8
+// CHECK: @_ZTV4base = linkonce_odr unnamed_addr addrspace(1) constant { [4 x ptr addrspace(1)] } { [4 x ptr addrspace(1)] [ptr addrspace(1) null, ptr addrspace(1) null, ptr addrspace(1) addrspacecast (ptr @__cxa_pure_virtual to ptr addrspace(1)), ptr addrspace(1) addrspacecast (ptr @__cxa_deleted_virtual to ptr addrspace(1))] }, comdat, align 8
 // CHECK: define{{.*}}void @__cxa_pure_virtual()
 // CHECK: define{{.*}}void @__cxa_deleted_virtual()
 

From 0cfe7e359047e9b5eb1ec6656d6da113d578e38f Mon Sep 17 00:00:00 2001
From: Nick Sarnie <sarnex@users.noreply.github.com>
Date: Fri, 23 Feb 2024 04:59:23 +0900
Subject: [PATCH 26/30] [SYCL][ESIMD] Implement compile-time properties version
 of scatter(acc, ...) (#12670)

This implements the new compile-time properties API for scatter with
accessors. I believe this is the last missing piece.

---------

Signed-off-by: Sarnie, Nick <nick.sarnie@intel.com>
---
 sycl/include/sycl/ext/intel/esimd/memory.hpp  | 264 +++++++++++++++-
 .../ext/intel/experimental/esimd/memory.hpp   |  18 +-
 .../unified_memory_api/Inputs/scatter.hpp     | 294 +++++++++++++++---
 .../ESIMD/unified_memory_api/scatter_acc.cpp  |  36 +++
 .../scatter_acc_dg2_pvc.cpp                   |  38 +++
 sycl/test/esimd/memory_properties.cpp         |  61 ++++
 6 files changed, 636 insertions(+), 75 deletions(-)
 create mode 100644 sycl/test-e2e/ESIMD/unified_memory_api/scatter_acc.cpp
 create mode 100644 sycl/test-e2e/ESIMD/unified_memory_api/scatter_acc_dg2_pvc.cpp

diff --git a/sycl/include/sycl/ext/intel/esimd/memory.hpp b/sycl/include/sycl/ext/intel/esimd/memory.hpp
index 592c2e129b0ad..b922cb02025d3 100644
--- a/sycl/include/sycl/ext/intel/esimd/memory.hpp
+++ b/sycl/include/sycl/ext/intel/esimd/memory.hpp
@@ -2737,6 +2737,55 @@ scatter_impl(AccessorTy acc, simd<T, N> vals, simd<uint32_t, N> offsets,
   }
 }
 
+#ifndef __ESIMD_FORCE_STATELESS_MEM
+/// Accessor-based scatter.
+/// Supported platforms: DG2, PVC
+/// VISA instruction: lsc_store.ugm
+///
+/// Scatters elements to surface.
+///
+/// @tparam T is element type.
+/// @tparam NElts is the number of elements to store per address.
+/// @tparam DS is the data size.
+/// @tparam L1H is L1 cache hint.
+/// @tparam L2H is L2 cache hint.
+/// @tparam N is the number of channels (platform dependent).
+/// @tparam AccessorTy is the \ref sycl::accessor type.
+/// @tparam OffsetT is the integral type of the offsets (4 bytes or less).
+/// @param acc is the SYCL accessor.
+/// @param offsets is the zero-based offsets in bytes.
+/// @param vals is values to store.
+/// @param pred is predicates.
+template <typename T, int NElts, lsc_data_size DS, cache_hint L1H,
+          cache_hint L2H, int N, typename AccessorTy, typename OffsetT>
+__ESIMD_API std::enable_if_t<
+    is_device_accessor_with_v<AccessorTy, accessor_mode_cap::can_write>>
+scatter_impl(AccessorTy acc, simd<OffsetT, N> offsets, simd<T, N * NElts> vals,
+             simd_mask<N> pred) {
+  static_assert(std::is_integral_v<OffsetT>,
+                "Scatter must have integral byte_offset type");
+  static_assert(sizeof(OffsetT) <= 4,
+                "Implicit truncation of 64-bit byte_offset to 32-bit is "
+                "disabled. Use -fsycl-esimd-force-stateless-mem or explicitly "
+                "convert offsets to a 32-bit vector");
+  check_lsc_vector_size<NElts>();
+  check_lsc_data_size<T, DS>();
+  check_cache_hint<cache_action::store, L1H, L2H>();
+  constexpr uint16_t AddressScale = 1;
+  constexpr int ImmOffset = 0;
+  constexpr lsc_data_size EDS = expand_data_size(finalize_data_size<T, DS>());
+  constexpr lsc_vector_size LSCNElts = to_lsc_vector_size<NElts>();
+  constexpr lsc_data_order Transposed = lsc_data_order::nontranspose;
+  using MsgT = typename lsc_expand_type<T>::type;
+  simd<MsgT, N * NElts> Tmp = lsc_format_input<MsgT, T>(vals);
+  simd<uint32_t, N> ByteOffsets32 = convert<uint32_t>(offsets);
+  auto si = get_surface_index(acc);
+  __esimd_lsc_store_bti<MsgT, L1H, L2H, AddressScale, ImmOffset, EDS, LSCNElts,
+                        Transposed, N>(pred.data(), ByteOffsets32.data(),
+                                       Tmp.data(), si);
+}
+#endif // __ESIMD_FORCE_STATELESS_MEM
+
 template <typename T, int N, typename AccessorTy>
 __ESIMD_API std::enable_if_t<
     (std::is_same_v<detail::LocalAccessorMarker, AccessorTy> ||
@@ -3343,6 +3392,197 @@ gather(AccessorT acc, OffsetSimdViewT byte_offsets, PropertyListT props = {}) {
 /// @anchor accessor_scatter
 /// Accessor-based scatter.
 ///
+/// template <typename T, int N, int VS = 1, typename AccessorTy,
+/// typename OffsetT, typename PropertyListT = empty_properties_t>
+/// void scatter(AccessorTy acc, simd<OffsetT, N / VS> byte_offsets,
+///              simd<T, N> vals, simd_mask<N / VS> mask,
+///              PropertyListT props = {});                        // (acc-sc-1)
+///
+/// template <typename T, int N, int VS = 1, typename AccessorTy,
+/// typename OffsetT, typename PropertyListT = empty_properties_t>
+/// void scatter(AccessorTy acc, simd<OffsetT, N / VS> byte_offsets,
+///              simd<T, N> vals, PropertyListT props = {});      // (acc-sc-2)
+
+/// The following two functions are similar to acc-sc-{1,2} with the
+/// 'byte_offsets' parameter represented as 'simd_view'.
+
+/// template <typename T, int N, int VS = 1, typename AccessorTy,
+/// typename OffsetSimdViewT, typename PropertyListT = empty_properties_t>
+/// void scatter(AccessorTy acc, OffsetSimdViewT byte_offsets, simd<T, N> vals,
+/// 	         simd_mask<N / VS> mask, PropertyListT props = {});// (acc-sc-3)
+///
+/// template <typename T, int N, int VS = 1, typename AccessorTy,
+/// typename OffsetSimdViewT, typename PropertyListT = empty_properties_t>
+/// void scatter(AccessorTy acc, OffsetSimdViewT byte_offsets, simd<T, N> vals,
+/// 	         PropertyListT props = {});                       // (acc-sc-4)
+///
+/// template <typename T, int N, int VS = 1, typename AccessorTy,
+/// typename OffsetT, typename PropertyListT = empty_properties_t>
+/// void scatter(AccessorTy acc, simd<OffsetT, N / VS> byte_offsets,
+///              simd<T, N> vals, simd_mask<N / VS> mask,
+///              PropertyListT props = {});                      // (acc-sc-1)
+///
+/// Stores ("scatters") elements of the type 'T' to memory locations addressed
+/// by the accessor \p acc and byte offsets \p byte_offsets.
+/// Access to any element's memory location can be disabled via the input vector
+/// of predicates \p mask. If mask[i] is unset, then the store to
+/// (acc + byte_offsets[i]) is skipped.
+/// @tparam T Element type.
+/// @tparam N Number of elements to write.
+/// @tparam VS Vector size. It can also be read as the number of writes per each
+/// address. The parameter 'N' must be divisible by 'VS'. (VS > 1) is supported
+/// only on DG2 and PVC and only for 4- and 8-byte element vectors.
+/// @param acc Accessor referencing the data to store.
+/// @param byte_offsets the vector of 32-bit or 64-bit offsets in bytes.
+/// For each i, (acc + byte_offsets[i]) must be element size aligned.
+/// If the alignment property is not passed, then it is assumed that each
+/// accessed address is aligned by element-size.
+/// @param mask The access mask.
+/// @param props The optional compile-time properties. Only 'alignment'
+/// and cache hint properties are used.
+template <typename T, int N, int VS = 1, typename AccessorTy, typename OffsetT,
+          typename PropertyListT =
+              ext::oneapi::experimental::detail::empty_properties_t>
+__ESIMD_API std::enable_if_t<
+    detail::is_device_accessor_with_v<AccessorTy,
+                                      detail::accessor_mode_cap::can_write> &&
+    ext::oneapi::experimental::is_property_list_v<PropertyListT>>
+scatter(AccessorTy acc, simd<OffsetT, N / VS> byte_offsets, simd<T, N> vals,
+        simd_mask<N / VS> mask, PropertyListT props = {}) {
+#ifdef __ESIMD_FORCE_STATELESS_MEM
+  scatter<T, N, VS>(__ESIMD_DNS::accessorToPointer<T>(acc), byte_offsets, vals,
+                    mask, props);
+#else
+  constexpr size_t Alignment =
+      detail::getPropertyValue<PropertyListT, alignment_key>(sizeof(T));
+  static_assert(Alignment >= sizeof(T),
+                "scatter() requires at least element-size alignment");
+  constexpr auto L1Hint =
+      detail::getPropertyValue<PropertyListT, cache_hint_L1_key>(
+          cache_hint::none);
+  constexpr auto L2Hint =
+      detail::getPropertyValue<PropertyListT, cache_hint_L2_key>(
+          cache_hint::none);
+  static_assert(!PropertyListT::template has_property<cache_hint_L3_key>(),
+                "L3 cache hint is reserved. The old/experimental L3 LSC cache "
+                "hint is cache_level::L2 now.");
+  // Cache hints, VS > 1, or N failing isPowerOf2(N, 32) select the LSC path.
+  if constexpr (L1Hint != cache_hint::none || L2Hint != cache_hint::none ||
+                VS > 1 || !detail::isPowerOf2(N, 32)) {
+    detail::scatter_impl<T, VS, detail::lsc_data_size::default_size, L1Hint,
+                         L2Hint>(acc, byte_offsets, vals, mask);
+  } else {
+    detail::scatter_impl<T, N, AccessorTy>(acc, vals, byte_offsets, 0, mask);
+  }
+
+#endif // __ESIMD_FORCE_STATELESS_MEM
+}
+/// template <typename T, int N, int VS = 1, typename AccessorTy,
+/// typename OffsetT, typename PropertyListT = empty_properties_t>
+/// void scatter(AccessorTy acc, simd<OffsetT, N / VS> byte_offsets,
+///              simd<T, N> vals, PropertyListT props = {});   // (acc-sc-2)
+///
+/// Stores ("scatters") elements of the type 'T' to memory locations addressed
+/// by the accessor \p acc and byte offsets \p byte_offsets.
+/// @tparam T Element type.
+/// @tparam N Number of elements to write.
+/// @tparam VS Vector size. It can also be read as the number of writes per each
+/// address. The parameter 'N' must be divisible by 'VS'. (VS > 1) is supported
+/// only on DG2 and PVC and only for 4- and 8-byte element vectors.
+/// @param acc Accessor referencing the data to store.
+/// @param byte_offsets the vector of 32-bit or 64-bit offsets in bytes.
+/// For each i, (acc + byte_offsets[i]) must be element size aligned.
+/// If the alignment property is not passed, then it is assumed that each
+/// accessed address is aligned by element-size.
+/// @param props The optional compile-time properties. Only 'alignment'
+/// and cache hint properties are used.
+template <typename T, int N, int VS = 1, typename AccessorTy, typename OffsetT,
+          typename PropertyListT =
+              ext::oneapi::experimental::detail::empty_properties_t>
+__ESIMD_API std::enable_if_t<
+    detail::is_device_accessor_with_v<AccessorTy,
+                                      detail::accessor_mode_cap::can_write> &&
+    ext::oneapi::experimental::is_property_list_v<PropertyListT>>
+scatter(AccessorTy acc, simd<OffsetT, N / VS> byte_offsets, simd<T, N> vals,
+        PropertyListT props = {}) {
+  simd_mask<N / VS> Mask = 1;
+  scatter<T, N, VS>(acc, byte_offsets, vals, Mask, props);
+}
+
+/// template <typename T, int N, int VS = 1, typename AccessorTy,
+/// typename OffsetSimdViewT, typename PropertyListT = empty_properties_t>
+/// void scatter(AccessorTy acc, OffsetSimdViewT byte_offsets, simd<T, N> vals,
+/// 	         simd_mask<N / VS> mask,
+///              PropertyListT props = {});                       // (acc-sc-3)
+///
+/// Stores ("scatters") elements of the type 'T' to memory locations addressed
+/// by the accessor \p acc and byte offsets \p byte_offsets.
+/// Access to any element's memory location can be disabled via the input vector
+/// of predicates \p mask. If mask[i] is unset, then the store to
+/// (acc + byte_offsets[i]) is skipped.
+/// @tparam T Element type.
+/// @tparam N Number of elements to write.
+/// @tparam VS Vector size. It can also be read as the number of writes per each
+/// address. The parameter 'N' must be divisible by 'VS'. (VS > 1) is supported
+/// only on DG2 and PVC and only for 4- and 8-byte element vectors.
+/// @param acc Accessor referencing the data to store.
+/// @param byte_offsets the vector of 32-bit or 64-bit offsets in bytes
+/// represented as a 'simd_view' object.
+/// For each i, (acc + byte_offsets[i]) must be element size aligned.
+/// If the alignment property is not passed, then it is assumed that each
+/// accessed address is aligned by element-size.
+/// @param mask The access mask.
+/// @param props The optional compile-time properties. Only 'alignment'
+/// and cache hint properties are used.
+template <typename T, int N, int VS = 1, typename AccessorTy,
+          typename OffsetSimdViewT,
+          typename PropertyListT =
+              ext::oneapi::experimental::detail::empty_properties_t>
+__ESIMD_API std::enable_if_t<
+    detail::is_device_accessor_with_v<AccessorTy,
+                                      detail::accessor_mode_cap::can_write> &&
+    detail::is_simd_view_type_v<OffsetSimdViewT> &&
+    ext::oneapi::experimental::is_property_list_v<PropertyListT>>
+scatter(AccessorTy acc, OffsetSimdViewT byte_offsets, simd<T, N> vals,
+        simd_mask<N / VS> mask, PropertyListT props = {}) {
+  scatter<T, N, VS>(acc, byte_offsets.read(), vals, mask, props);
+}
+
+/// template <typename T, int N, int VS = 1, typename AccessorTy,
+/// typename OffsetSimdViewT, typename PropertyListT = empty_properties_t>
+/// void scatter(AccessorTy acc, OffsetSimdViewT byte_offsets, simd<T, N> vals,
+/// 	         PropertyListT props = {});                        // (acc-sc-4)
+///
+/// Stores ("scatters") elements of the type 'T' to memory locations addressed
+/// by the accessor \p acc and byte offsets \p byte_offsets.
+/// @tparam T Element type.
+/// @tparam N Number of elements to write.
+/// @tparam VS Vector size. It can also be read as the number of writes per each
+/// address. The parameter 'N' must be divisible by 'VS'. (VS > 1) is supported
+/// only on DG2 and PVC and only for 4- and 8-byte element vectors.
+/// @param acc Accessor referencing the data to store.
+/// @param byte_offsets the vector of 32-bit or 64-bit offsets in bytes
+/// represented as a 'simd_view' object.
+/// For each i, (acc + byte_offsets[i]) must be element size aligned.
+/// If the alignment property is not passed, then it is assumed that each
+/// accessed address is aligned by element-size.
+/// @param props The optional compile-time properties. Only 'alignment'
+/// and cache hint properties are used.
+template <typename T, int N, int VS = 1, typename AccessorTy,
+          typename OffsetSimdViewT,
+          typename PropertyListT =
+              ext::oneapi::experimental::detail::empty_properties_t>
+__ESIMD_API std::enable_if_t<
+    detail::is_device_accessor_with_v<AccessorTy,
+                                      detail::accessor_mode_cap::can_write> &&
+    detail::is_simd_view_type_v<OffsetSimdViewT> &&
+    ext::oneapi::experimental::is_property_list_v<PropertyListT>>
+scatter(AccessorTy acc, OffsetSimdViewT byte_offsets, simd<T, N> vals,
+        PropertyListT props = {}) {
+  simd_mask<N / VS> Mask = 1;
+  scatter<T, N, VS>(acc, byte_offsets.read(), vals, Mask, props);
+}
+
 /// Writes elements of a \ref simd object into an accessor at given offsets.
 /// An element can be a 1, 2 or 4-byte value.
 ///
@@ -3365,25 +3605,31 @@ __ESIMD_API
                      detail::is_device_accessor_with_v<
                          AccessorTy, detail::accessor_mode_cap::can_write>>
     scatter(AccessorTy acc, simd<detail::DeviceAccessorOffsetT, N> offsets,
-            simd<T, N> vals, detail::DeviceAccessorOffsetT glob_offset = 0,
+            simd<T, N> vals, detail::DeviceAccessorOffsetT glob_offset,
             simd_mask<N> mask = 1) {
-#ifdef __ESIMD_FORCE_STATELESS_MEM
-  scatter<T, N>(__ESIMD_DNS::accessorToPointer<T>(acc, glob_offset), offsets,
-                vals, mask);
-#else
-  detail::scatter_impl<T, N, AccessorTy>(acc, vals, offsets, glob_offset, mask);
-#endif
+  offsets += glob_offset;
+  scatter<T, N>(acc, offsets, vals, mask);
+}
+
+template <typename T, int N, typename AccessorTy>
+__ESIMD_API
+    std::enable_if_t<(detail::isPowerOf2(N, 32)) &&
+                     detail::is_device_accessor_with_v<
+                         AccessorTy, detail::accessor_mode_cap::can_write>>
+    scatter(AccessorTy acc, detail::DeviceAccessorOffsetT glob_offset,
+            simd<T, N> vals, simd_mask<N> mask = 1) {
+  simd<detail::DeviceAccessorOffsetT, N> ByteOffsets = 0;
+  scatter<T, N>(acc, ByteOffsets, vals, glob_offset, mask);
 }
 
 #ifdef __ESIMD_FORCE_STATELESS_MEM
 template <typename T, int N, typename AccessorTy, typename Toffset>
 __ESIMD_API std::enable_if_t<
-    (detail::isPowerOf2(N, 32)) &&
     detail::is_device_accessor_with_v<AccessorTy,
                                       detail::accessor_mode_cap::can_write> &&
     std::is_integral_v<Toffset> && !std::is_same_v<Toffset, uint64_t>>
 scatter(AccessorTy acc, simd<Toffset, N> offsets, simd<T, N> vals,
-        uint64_t glob_offset = 0, simd_mask<N> mask = 1) {
+        uint64_t glob_offset, simd_mask<N> mask = 1) {
   scatter<T, N, AccessorTy>(acc, convert<uint64_t>(offsets), vals, glob_offset,
                             mask);
 }
diff --git a/sycl/include/sycl/ext/intel/experimental/esimd/memory.hpp b/sycl/include/sycl/ext/intel/experimental/esimd/memory.hpp
index 9d5054b4392ba..0759f0b37b461 100644
--- a/sycl/include/sycl/ext/intel/experimental/esimd/memory.hpp
+++ b/sycl/include/sycl/ext/intel/experimental/esimd/memory.hpp
@@ -1531,23 +1531,7 @@ lsc_scatter(AccessorTy acc,
   lsc_scatter<T, NElts, DS, L1H, L3H>(__ESIMD_DNS::accessorToPointer<T>(acc),
                                       offsets, vals, pred);
 #else
-  detail::check_lsc_vector_size<NElts>();
-  detail::check_lsc_data_size<T, DS>();
-  detail::check_lsc_cache_hint<detail::lsc_action::store, L1H, L3H>();
-  constexpr uint16_t _AddressScale = 1;
-  constexpr int _ImmOffset = 0;
-  constexpr lsc_data_size _DS =
-      detail::expand_data_size(detail::finalize_data_size<T, DS>());
-  constexpr detail::lsc_vector_size _VS = detail::to_lsc_vector_size<NElts>();
-  constexpr detail::lsc_data_order _Transposed =
-      detail::lsc_data_order::nontranspose;
-  using MsgT = typename detail::lsc_expand_type<T>::type;
-  using _CstT = __ESIMD_DNS::uint_type_t<sizeof(T)>;
-  __ESIMD_NS::simd<MsgT, N * NElts> Tmp = vals.template bit_cast_view<_CstT>();
-  auto si = __ESIMD_NS::get_surface_index(acc);
-  __esimd_lsc_store_bti<MsgT, L1H, L3H, _AddressScale, _ImmOffset, _DS, _VS,
-                        _Transposed, N>(pred.data(), offsets.data(), Tmp.data(),
-                                        si);
+  __ESIMD_DNS::scatter_impl<T, NElts, DS, L1H, L3H>(acc, offsets, vals, pred);
 #endif
 }
 
diff --git a/sycl/test-e2e/ESIMD/unified_memory_api/Inputs/scatter.hpp b/sycl/test-e2e/ESIMD/unified_memory_api/Inputs/scatter.hpp
index b3b3d498276f4..a23c9884fa993 100644
--- a/sycl/test-e2e/ESIMD/unified_memory_api/Inputs/scatter.hpp
+++ b/sycl/test-e2e/ESIMD/unified_memory_api/Inputs/scatter.hpp
@@ -174,59 +174,141 @@ bool testUSM(queue Q, uint32_t MaskStride,
   return Passed;
 }
 
-template <typename T, TestFeatures Features> bool testUSM(queue Q) {
-  constexpr bool CheckMask = true;
-  constexpr bool CheckProperties = true;
-  properties EmptyProps;
-  properties AlignElemProps{alignment<sizeof(T)>};
-
-  bool Passed = true;
-
-  // Test scatter() that is available on Gen12 and PVC.
-  Passed &= testUSM<T, 1, 1, !CheckMask, CheckProperties>(Q, 2, EmptyProps);
-  Passed &= testUSM<T, 2, 1, !CheckMask, CheckProperties>(Q, 1, EmptyProps);
-  Passed &= testUSM<T, 4, 1, !CheckMask, CheckProperties>(Q, 2, EmptyProps);
-  Passed &= testUSM<T, 8, 1, !CheckMask, CheckProperties>(Q, 2, EmptyProps);
-  Passed &= testUSM<T, 16, 1, !CheckMask, CheckProperties>(Q, 2, EmptyProps);
-
-  Passed &= testUSM<T, 32, 1, !CheckMask, CheckProperties>(Q, 2, EmptyProps);
-
-  // Test scatter() without passing compile-time properties argument.
-  Passed &= testUSM<T, 16, 1, !CheckMask, !CheckProperties>(Q, 2, EmptyProps);
-  Passed &= testUSM<T, 32, 1, !CheckMask, !CheckProperties>(Q, 2, EmptyProps);
+template <typename T, uint16_t N, uint16_t VS, bool UseMask, bool UseProperties,
+          typename ScatterPropertiesT>
+bool testACC(queue Q, uint32_t MaskStride,
+             ScatterPropertiesT ScatterProperties) {
+  uint32_t Groups = 8;
+  uint32_t Threads = 16;
+  size_t Size = Groups * Threads * N;
+  using shared_allocator = sycl::usm_allocator<T, sycl::usm::alloc::shared, 16>;
+  using shared_vector = std::vector<T, shared_allocator>;
+  static_assert(VS > 0 && N % VS == 0,
+                "Incorrect VS parameter. N must be divisible by VS.");
+  constexpr int NOffsets = N / VS;
+  using Tuint = sycl::_V1::ext::intel::esimd::detail::uint_type_t<sizeof(T)>;
 
-  // Test scatter() with mask
-  Passed &= testUSM<T, 2, 1, CheckMask, CheckProperties>(Q, 2, EmptyProps);
-  Passed &= testUSM<T, 4, 1, CheckMask, CheckProperties>(Q, 2, EmptyProps);
-  Passed &= testUSM<T, 8, 1, CheckMask, CheckProperties>(Q, 2, EmptyProps);
-  Passed &= testUSM<T, 16, 1, CheckMask, !CheckProperties>(Q, 2, EmptyProps);
+  std::cout << "ACC case: T=" << esimd_test::type_name<T>() << ",N=" << N
+            << ", VS=" << VS << ",UseMask=" << UseMask
+            << ",UseProperties=" << UseProperties << std::endl;
 
-  if constexpr (Features == TestFeatures::PVC ||
-                Features == TestFeatures::DG2) {
-    properties LSCProps{cache_hint_L1<cache_hint::streaming>,
-                        cache_hint_L2<cache_hint::uncached>,
-                        alignment<sizeof(T)>};
-    Passed &= testUSM<T, 1, 1, !CheckMask, CheckProperties>(Q, 2, LSCProps);
-    Passed &= testUSM<T, 2, 1, CheckMask, CheckProperties>(Q, 2, LSCProps);
-    Passed &= testUSM<T, 4, 1, CheckMask, CheckProperties>(Q, 2, LSCProps);
-    Passed &= testUSM<T, 8, 1, CheckMask, CheckProperties>(Q, 2, LSCProps);
+  sycl::range<1> GlobalRange{Groups};
+  sycl::range<1> LocalRange{Threads};
+  sycl::nd_range<1> Range{GlobalRange * LocalRange, LocalRange};
+  shared_vector Out(Size, shared_allocator{Q});
+  for (size_t i = 0; i < Size; i++)
+    Out[i] = i;
 
-    Passed &= testUSM<T, 32, 1, CheckMask, CheckProperties>(Q, 2, LSCProps);
+  try {
+    buffer<T, 1> OutBuf(Out);
+    Q.submit([&](handler &cgh) {
+       accessor OutAcc{OutBuf, cgh};
+       cgh.parallel_for(Range, [=](sycl::nd_item<1> ndi) SYCL_ESIMD_KERNEL {
+         ScatterPropertiesT Props{};
+         uint16_t GlobalID = ndi.get_global_id(0);
+         simd<int32_t, NOffsets> ByteOffsets(GlobalID * N * sizeof(T),
+                                             VS * sizeof(T));
+         auto ByteOffsetsView = ByteOffsets.template select<NOffsets, 1>();
+         simd<T, N> Vals = gather<T, N, VS>(OutAcc, ByteOffsets);
+         Vals *= 2;
+         auto ValsView = Vals.template select<N, 1>();
+         simd_mask<NOffsets> Pred = 0;
+         for (int I = 0; I < NOffsets; I++)
+           Pred[I] = (I % MaskStride == 0) ? 1 : 0;
+         if constexpr (VS > 1) { // VS > 1 requires specifying <T, N, VS>
+           if constexpr (UseMask) {
+             if constexpr (UseProperties) {
+               if (GlobalID % 4 == 0)
+                 scatter<T, N, VS>(OutAcc, ByteOffsets, Vals, Pred, Props);
+               else if (GlobalID % 4 == 1)
+                 scatter<T, N, VS>(OutAcc, ByteOffsetsView, Vals, Pred, Props);
+               else if (GlobalID % 4 == 2)
+                 scatter<T, N, VS>(OutAcc, ByteOffsets, ValsView, Pred, Props);
+               else if (GlobalID % 4 == 3)
+                 scatter<T, N, VS>(OutAcc, ByteOffsetsView, ValsView, Pred,
+                                   Props);
+             } else { // UseProperties == false
+               if (GlobalID % 4 == 0)
+                 scatter<T, N, VS>(OutAcc, ByteOffsets, Vals, Pred);
+               else if (GlobalID % 4 == 1)
+                 scatter<T, N, VS>(OutAcc, ByteOffsetsView, Vals, Pred);
+               else if (GlobalID % 4 == 2)
+                 scatter<T, N, VS>(OutAcc, ByteOffsets, ValsView, Pred);
+               else if (GlobalID % 4 == 3)
+                 scatter<T, N, VS>(OutAcc, ByteOffsetsView, ValsView, Pred);
+             }
+           } else { // UseMask == false
+             if constexpr (UseProperties) {
+               if (GlobalID % 4 == 0)
+                 scatter<T, N, VS>(OutAcc, ByteOffsets, Vals, Props);
+               else if (GlobalID % 4 == 1)
+                 scatter<T, N, VS>(OutAcc, ByteOffsetsView, Vals, Props);
+               else if (GlobalID % 4 == 2)
+                 scatter<T, N, VS>(OutAcc, ByteOffsets, ValsView, Props);
+               else if (GlobalID % 4 == 3)
+                 scatter<T, N, VS>(OutAcc, ByteOffsetsView, ValsView, Props);
+             } else { // UseProperties == false
+               if (GlobalID % 4 == 0)
+                 scatter<T, N, VS>(OutAcc, ByteOffsets, Vals);
+               else if (GlobalID % 4 == 1)
+                 scatter<T, N, VS>(OutAcc, ByteOffsetsView, Vals);
+               else if (GlobalID % 4 == 2)
+                 scatter<T, N, VS>(OutAcc, ByteOffsets, ValsView);
+               else if (GlobalID % 4 == 3)
+                 scatter<T, N, VS>(OutAcc, ByteOffsetsView, ValsView);
+             }
+           }
+         } else { // VS == 1
+           if constexpr (UseMask) {
+             if constexpr (UseProperties) {
+               if (GlobalID % 4 == 0)
+                 scatter(OutAcc, ByteOffsets, Vals, Pred, Props);
+               else if (GlobalID % 4 == 1)
+                 scatter(OutAcc, ByteOffsetsView, Vals, Pred, Props);
+               else if (GlobalID % 4 == 2)
+                 scatter<T, N>(OutAcc, ByteOffsets, ValsView, Pred, Props);
+               else if (GlobalID % 4 == 3)
+                 scatter<T, N>(OutAcc, ByteOffsetsView, ValsView, Pred, Props);
+             } else { // UseProperties == false
+               if (GlobalID % 4 == 0)
+                 scatter(OutAcc, ByteOffsets, Vals, Pred);
+               else if (GlobalID % 4 == 1)
+                 scatter(OutAcc, ByteOffsetsView, Vals, Pred);
+               else if (GlobalID % 4 == 2)
+                 scatter<T, N>(OutAcc, ByteOffsets, ValsView, Pred);
+               else if (GlobalID % 4 == 3)
+                 scatter<T, N>(OutAcc, ByteOffsetsView, ValsView, Pred);
+             }
+           } else { // UseMask == false
+             if constexpr (UseProperties) {
+               if (GlobalID % 4 == 0)
+                 scatter(OutAcc, ByteOffsets, Vals, Props);
+               else if (GlobalID % 4 == 1)
+                 scatter(OutAcc, ByteOffsetsView, Vals, Props);
+               else if (GlobalID % 4 == 2)
+                 scatter<T, N>(OutAcc, ByteOffsets, ValsView, Props);
+               else if (GlobalID % 4 == 3)
+                 scatter<T, N>(OutAcc, ByteOffsetsView, ValsView, Props);
+             } else { // UseProperties == false
+               if (GlobalID % 4 == 0)
+                 scatter(OutAcc, ByteOffsets, Vals);
+               else if (GlobalID % 4 == 1)
+                 scatter(OutAcc, ByteOffsetsView, Vals);
+               else if (GlobalID % 4 == 2)
+                 scatter<T, N>(OutAcc, ByteOffsets, ValsView);
+               else if (GlobalID % 4 == 3)
+                 scatter<T, N>(OutAcc, ByteOffsetsView, ValsView);
+             }
+           }
+         }
+       });
+     }).wait();
+  } catch (sycl::exception const &e) {
+    std::cout << "SYCL exception caught: " << e.what() << '\n';
+    return false;
+  }
 
-    // Check VS > 1. GPU supports only dwords and qwords in this mode.
-    if constexpr (sizeof(T) >= 4) {
-      // TODO: This test case causes flaky fail. Enable it after the issue
-      // in GPU driver is fixed.
-      // Passed &=
-      //     testUSM<T, 16, 2, CheckMask, CheckProperties>(Q, 2, AlignElemProps)
-      Passed &=
-          testUSM<T, 32, 2, !CheckMask, CheckProperties>(Q, 2, AlignElemProps);
-      Passed &=
-          testUSM<T, 32, 2, CheckMask, CheckProperties>(Q, 2, AlignElemProps);
-      Passed &=
-          testUSM<T, 32, 2, CheckMask, !CheckProperties>(Q, 2, AlignElemProps);
-    }
-  } // TestPVCFeatures
+  bool Passed = verify(Out.data(), N, Size, VS, MaskStride, UseMask);
 
   return Passed;
 }
@@ -395,6 +477,120 @@ bool testSLM(queue Q, uint32_t MaskStride,
   return Passed;
 }
 
+template <typename T, TestFeatures Features> bool testUSM(queue Q) {
+  constexpr bool CheckMask = true;
+  constexpr bool CheckProperties = true;
+  properties EmptyProps;
+  properties AlignElemProps{alignment<sizeof(T)>};
+
+  bool Passed = true;
+
+  // Test scatter() that is available on Gen12 and PVC.
+  Passed &= testUSM<T, 1, 1, !CheckMask, CheckProperties>(Q, 2, EmptyProps);
+  Passed &= testUSM<T, 2, 1, !CheckMask, CheckProperties>(Q, 1, EmptyProps);
+  Passed &= testUSM<T, 4, 1, !CheckMask, CheckProperties>(Q, 2, EmptyProps);
+  Passed &= testUSM<T, 8, 1, !CheckMask, CheckProperties>(Q, 2, EmptyProps);
+  Passed &= testUSM<T, 16, 1, !CheckMask, CheckProperties>(Q, 2, EmptyProps);
+
+  Passed &= testUSM<T, 32, 1, !CheckMask, CheckProperties>(Q, 2, EmptyProps);
+
+  // Test scatter() without passing compile-time properties argument.
+  Passed &= testUSM<T, 16, 1, !CheckMask, !CheckProperties>(Q, 2, EmptyProps);
+  Passed &= testUSM<T, 32, 1, !CheckMask, !CheckProperties>(Q, 2, EmptyProps);
+
+  // Test scatter() with mask
+  Passed &= testUSM<T, 2, 1, CheckMask, CheckProperties>(Q, 2, EmptyProps);
+  Passed &= testUSM<T, 4, 1, CheckMask, CheckProperties>(Q, 2, EmptyProps);
+  Passed &= testUSM<T, 8, 1, CheckMask, CheckProperties>(Q, 2, EmptyProps);
+  Passed &= testUSM<T, 16, 1, CheckMask, !CheckProperties>(Q, 2, EmptyProps);
+
+  if constexpr (Features == TestFeatures::PVC ||
+                Features == TestFeatures::DG2) {
+    properties LSCProps{cache_hint_L1<cache_hint::streaming>,
+                        cache_hint_L2<cache_hint::uncached>,
+                        alignment<sizeof(T)>};
+    Passed &= testUSM<T, 1, 1, !CheckMask, CheckProperties>(Q, 2, LSCProps);
+    Passed &= testUSM<T, 2, 1, CheckMask, CheckProperties>(Q, 2, LSCProps);
+    Passed &= testUSM<T, 4, 1, CheckMask, CheckProperties>(Q, 2, LSCProps);
+    Passed &= testUSM<T, 8, 1, CheckMask, CheckProperties>(Q, 2, LSCProps);
+
+    Passed &= testUSM<T, 32, 1, CheckMask, CheckProperties>(Q, 2, LSCProps);
+
+    // Check VS > 1. GPU supports only dwords and qwords in this mode.
+    if constexpr (sizeof(T) >= 4) {
+      // TODO: This test case causes flaky fail. Enable it after the issue
+      // in GPU driver is fixed.
+      // Passed &=
+      //     testUSM<T, 16, 2, CheckMask, CheckProperties>(Q, 2, AlignElemProps)
+      Passed &=
+          testUSM<T, 32, 2, !CheckMask, CheckProperties>(Q, 2, AlignElemProps);
+      Passed &=
+          testUSM<T, 32, 2, CheckMask, CheckProperties>(Q, 2, AlignElemProps);
+      Passed &=
+          testUSM<T, 32, 2, CheckMask, !CheckProperties>(Q, 2, AlignElemProps);
+    }
+  } // TestPVCFeatures
+
+  return Passed;
+}
+
+template <typename T, TestFeatures Features> bool testACC(queue Q) {
+  constexpr bool CheckMask = true;
+  constexpr bool CheckProperties = true;
+  properties EmptyProps;
+  properties AlignElemProps{alignment<sizeof(T)>};
+
+  bool Passed = true;
+
+  // Test scatter() that is available on Gen12 and PVC.
+  Passed &= testACC<T, 1, 1, !CheckMask, CheckProperties>(Q, 2, EmptyProps);
+  Passed &= testACC<T, 2, 1, !CheckMask, CheckProperties>(Q, 1, EmptyProps);
+  Passed &= testACC<T, 4, 1, !CheckMask, CheckProperties>(Q, 2, EmptyProps);
+  Passed &= testACC<T, 8, 1, !CheckMask, CheckProperties>(Q, 2, EmptyProps);
+  Passed &= testACC<T, 16, 1, !CheckMask, CheckProperties>(Q, 2, EmptyProps);
+
+  Passed &= testACC<T, 32, 1, !CheckMask, CheckProperties>(Q, 2, EmptyProps);
+
+  // Test scatter() without passing compile-time properties argument.
+  Passed &= testACC<T, 16, 1, !CheckMask, !CheckProperties>(Q, 2, EmptyProps);
+  Passed &= testACC<T, 32, 1, !CheckMask, !CheckProperties>(Q, 2, EmptyProps);
+
+  // Test scatter() with mask
+  Passed &= testACC<T, 2, 1, CheckMask, CheckProperties>(Q, 2, EmptyProps);
+  Passed &= testACC<T, 4, 1, CheckMask, CheckProperties>(Q, 2, EmptyProps);
+  Passed &= testACC<T, 8, 1, CheckMask, CheckProperties>(Q, 2, EmptyProps);
+  Passed &= testACC<T, 16, 1, CheckMask, !CheckProperties>(Q, 2, EmptyProps);
+
+  if constexpr (Features == TestFeatures::PVC ||
+                Features == TestFeatures::DG2) {
+    properties LSCProps{cache_hint_L1<cache_hint::streaming>,
+                        cache_hint_L2<cache_hint::uncached>,
+                        alignment<sizeof(T)>};
+    Passed &= testACC<T, 1, 1, !CheckMask, CheckProperties>(Q, 2, LSCProps);
+    Passed &= testACC<T, 2, 1, CheckMask, CheckProperties>(Q, 2, LSCProps);
+    Passed &= testACC<T, 4, 1, CheckMask, CheckProperties>(Q, 2, LSCProps);
+    Passed &= testACC<T, 8, 1, CheckMask, CheckProperties>(Q, 2, LSCProps);
+
+    Passed &= testACC<T, 32, 1, CheckMask, CheckProperties>(Q, 2, LSCProps);
+
+    // Check VS > 1. GPU supports only dwords and qwords in this mode.
+    if constexpr (sizeof(T) >= 4) {
+      // TODO: This test case causes flaky fail. Enable it after the issue
+      // in GPU driver is fixed.
+      // Passed &=
+      //     testACC<T, 16, 2, CheckMask, CheckProperties>(Q, 2, AlignElemProps)
+      Passed &=
+          testACC<T, 32, 2, !CheckMask, CheckProperties>(Q, 2, AlignElemProps);
+      Passed &=
+          testACC<T, 32, 2, CheckMask, CheckProperties>(Q, 2, AlignElemProps);
+      Passed &=
+          testACC<T, 32, 2, CheckMask, !CheckProperties>(Q, 2, AlignElemProps);
+    }
+  } // TestPVCFeatures
+
+  return Passed;
+}
+
 template <typename T, TestFeatures Features> bool testSLM(queue Q) {
   constexpr bool CheckMask = true;
   constexpr bool CheckProperties = true;
diff --git a/sycl/test-e2e/ESIMD/unified_memory_api/scatter_acc.cpp b/sycl/test-e2e/ESIMD/unified_memory_api/scatter_acc.cpp
new file mode 100644
index 0000000000000..3d042db56cbbf
--- /dev/null
+++ b/sycl/test-e2e/ESIMD/unified_memory_api/scatter_acc.cpp
@@ -0,0 +1,36 @@
+//==------- scatter_acc.cpp - DPC++ ESIMD on-device test ---------------==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===------------------------------------------------------------------===//
+// RUN: %{build} -fsycl-device-code-split=per_kernel -o %t.out
+// RUN: %{run} %t.out
+
+// The test verifies esimd::scatter() functions accepting accessors
+// and optional compile-time esimd::properties.
+// The scatter() calls in this test do not use cache-hint
+// properties, so the test does not require DG2/PVC features.
+
+#include "Inputs/scatter.hpp"
+
+int main() {
+  auto Q = queue{gpu_selector_v};
+  esimd_test::printTestLabel(Q);
+
+  constexpr auto TestFeatures = TestFeatures::Generic;
+  bool Passed = true;
+
+  Passed &= testACC<int8_t, TestFeatures>(Q);
+  Passed &= testACC<int16_t, TestFeatures>(Q);
+  if (Q.get_device().has(sycl::aspect::fp16))
+    Passed &= testACC<sycl::half, TestFeatures>(Q);
+  Passed &= testACC<uint32_t, TestFeatures>(Q);
+  Passed &= testACC<float, TestFeatures>(Q);
+  if (Q.get_device().has(sycl::aspect::fp64))
+    Passed &= testACC<double, TestFeatures>(Q);
+
+  std::cout << (Passed ? "Passed\n" : "FAILED\n");
+  return Passed ? 0 : 1;
+}
diff --git a/sycl/test-e2e/ESIMD/unified_memory_api/scatter_acc_dg2_pvc.cpp b/sycl/test-e2e/ESIMD/unified_memory_api/scatter_acc_dg2_pvc.cpp
new file mode 100644
index 0000000000000..c0567ad922b89
--- /dev/null
+++ b/sycl/test-e2e/ESIMD/unified_memory_api/scatter_acc_dg2_pvc.cpp
@@ -0,0 +1,38 @@
+//==------- scatter_acc_dg2_pvc.cpp - DPC++ ESIMD on-device test--------==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===------------------------------------------------------------------===//
+// REQUIRES: gpu-intel-pvc || gpu-intel-dg2
+// RUN: %{build} -fsycl-device-code-split=per_kernel -o %t.out
+// RUN: %{run} %t.out
+
+// The test verifies esimd::scatter() functions accepting accessors
+// and optional compile-time esimd::properties.
+// The scatter() calls in this test use cache-hint
+// properties and require DG2 or PVC.
+
+#include "Inputs/scatter.hpp"
+
+int main() {
+  auto Q = queue{gpu_selector_v};
+  esimd_test::printTestLabel(Q);
+
+  constexpr auto TestFeatures = TestFeatures::PVC;
+  bool Passed = true;
+
+  Passed &= testACC<int8_t, TestFeatures>(Q);
+  Passed &= testACC<int16_t, TestFeatures>(Q);
+  if (Q.get_device().has(sycl::aspect::fp16))
+    Passed &= testACC<sycl::half, TestFeatures>(Q);
+  Passed &= testACC<uint32_t, TestFeatures>(Q);
+  Passed &= testACC<float, TestFeatures>(Q);
+  Passed &= testACC<int64_t, TestFeatures>(Q);
+  if (Q.get_device().has(sycl::aspect::fp64))
+    Passed &= testACC<double, TestFeatures>(Q);
+
+  std::cout << (Passed ? "Passed\n" : "FAILED\n");
+  return Passed ? 0 : 1;
+}
diff --git a/sycl/test/esimd/memory_properties.cpp b/sycl/test/esimd/memory_properties.cpp
index f27fdad643370..f2c435bee571f 100644
--- a/sycl/test/esimd/memory_properties.cpp
+++ b/sycl/test/esimd/memory_properties.cpp
@@ -1351,6 +1351,67 @@ test_gather_scatter(AccType &acc, LocalAccType &local_acc, float *ptrf,
   // intrinsic is used
   // CHECK-COUNT-1: call void @llvm.masked.scatter.v10f32.v10p4(<10 x float> {{[^)]+}}, <10 x ptr addrspace(4)> {{[^)]+}}, i32 4, <10 x i1> {{[^)]+}})
   scatter(ptrf, ioffset_n10, usm_n10);
+
+  // Test accessor
+  // CHECK-STATEFUL-COUNT-4: call void @llvm.genx.scatter.scaled.v32i1.v32i32.v32f32(<32 x i1> {{[^)]+}}, i16 0, i32 {{[^)]+}}, i32 {{[^)]+}}, <32 x i32> {{[^)]+}}, <32 x float> {{[^)]+}})
+  // CHECK-STATELESS-COUNT-4: call void @llvm.masked.scatter.v32f32.v32p4(<32 x float> {{[^)]+}}, <32 x ptr addrspace(4)> {{[^)]+}}, i32 4, <32 x i1> {{[^)]+}})
+  scatter(acc, ioffset_n32, usm, mask_n32);
+
+  scatter(acc, ioffset_n32, usm);
+
+  scatter(acc, ioffset_n32, usm, mask_n32, props_align4);
+
+  scatter(acc, ioffset_n32, usm, props_align4);
+
+  // CHECK-STATEFUL-COUNT-8: call void @llvm.genx.lsc.store.bti.v32i1.v32i32.v32i32(<32 x i1> {{[^)]+}}, i8 4, i8 1, i8 1, i16 1, i32 0, i8 3, i8 1, i8 1, i8 0, <32 x i32> {{[^)]+}}, <32 x i32> {{[^)]+}}, i32 {{[^)]+}})
+  // CHECK-STATELESS-COUNT-8: call void @llvm.genx.lsc.store.stateless.v32i1.v32i64.v32i32(<32 x i1> {{[^)]+}}, i8 4, i8 1, i8 1, i16 1, i32 0, i8 3, i8 1, i8 1, i8 0, <32 x i64> {{[^)]+}}, <32 x i32> {{[^)]+}}, i32 0)
+  scatter(acc, ioffset_n32, usm, mask_n32, props_cache_load);
+  scatter(acc, ioffset_n32, usm, props_cache_load);
+
+  scatter(acc, ioffset_n32_view, usm, mask_n32, props_cache_load);
+  scatter(acc, ioffset_n32_view, usm, props_cache_load);
+
+  scatter<float, 32>(acc, ioffset_n32, usm_view, mask_n32, props_cache_load);
+  scatter<float, 32>(acc, ioffset_n32, usm_view, props_cache_load);
+
+  scatter<float, 32>(acc, ioffset_n32_view, usm_view, mask_n32,
+                     props_cache_load);
+  scatter<float, 32>(acc, ioffset_n32_view, usm_view, props_cache_load);
+
+  // VS > 1
+  // CHECK-STATELESS-COUNT-8: call void @llvm.genx.lsc.store.stateless.v16i1.v16i64.v32i32(<16 x i1> {{[^)]+}}, i8 4, i8 1, i8 1, i16 1, i32 0, i8 3, i8 2, i8 1, i8 0, <16 x i64> {{[^)]+}}, <32 x i32> {{[^)]+}}, i32 0)
+  // CHECK-STATEFUL-COUNT-8: call void @llvm.genx.lsc.store.bti.v16i1.v16i32.v32i32(<16 x i1> {{[^)]+}}, i8 4, i8 1, i8 1, i16 1, i32 0, i8 3, i8 2, i8 1, i8 0, <16 x i32> {{[^)]+}}, <32 x i32> {{[^)]+}}, i32 {{[^)]+}})
+  scatter<float, 32, 2>(acc, ioffset_n16, usm, mask_n16, props_cache_load);
+
+  scatter<float, 32, 2>(acc, ioffset_n16, usm, props_cache_load);
+
+  scatter<float, 32, 2>(acc, ioffset_n16_view, usm, mask_n16, props_cache_load);
+  scatter<float, 32, 2>(acc, ioffset_n16_view, usm, props_cache_load);
+
+  scatter<float, 32, 2>(acc, ioffset_n16, usm_view, mask_n16, props_cache_load);
+  scatter<float, 32, 2>(acc, ioffset_n16, usm_view, props_cache_load);
+
+  scatter<float, 32, 2>(acc, ioffset_n16_view, usm_view, mask_n16,
+                        props_cache_load);
+  scatter<float, 32, 2>(acc, ioffset_n16_view, usm_view, props_cache_load);
+
+  // CHECK-STATELESS-COUNT-8: call void @llvm.genx.lsc.store.stateless.v16i1.v16i64.v32i32(<16 x i1> {{[^)]+}}, i8 4, i8 0, i8 0, i16 1, i32 0, i8 3, i8 2, i8 1, i8 0, <16 x i64> {{[^)]+}}, <32 x i32> {{[^)]+}}, i32 0)
+  // CHECK-STATEFUL-COUNT-8:  call void @llvm.genx.lsc.store.bti.v16i1.v16i32.v32i32(<16 x i1> {{[^)]+}}, i8 4, i8 0, i8 0, i16 1, i32 0, i8 3, i8 2, i8 1, i8 0, <16 x i32> {{[^)]+}}, <32 x i32> {{[^)]+}}, i32 {{[^)]+}})
+  scatter<float, 32, 2>(acc, ioffset_n16, usm, mask_n16);
+
+  scatter<float, 32, 2>(acc, ioffset_n16, usm);
+
+  scatter<float, 32, 2>(acc, ioffset_n16_view, usm, mask_n16);
+
+  scatter<float, 32, 2>(acc, ioffset_n16_view, usm);
+
+  scatter<float, 32, 2>(acc, ioffset_n16, usm_view, mask_n16);
+
+  scatter<float, 32, 2>(acc, ioffset_n16, usm_view);
+
+  scatter<float, 32, 2>(acc, ioffset_n16_view, usm_view, mask_n16);
+
+  scatter<float, 32, 2>(acc, ioffset_n16_view, usm_view);
 }
 
 // CHECK-LABEL: define {{.*}} @_Z23test_slm_gather_scatter{{.*}}

From a261ac158bb2952f2bca609758916d78f0c762be Mon Sep 17 00:00:00 2001
From: fineg74 <61437305+fineg74@users.noreply.github.com>
Date: Thu, 22 Feb 2024 12:18:40 -0800
Subject: [PATCH 27/30] [SYCL][ESIMD] Fix local accessor scatter test failure
 on PVC (#12745)

---
 .../unified_memory_api/Inputs/scatter.hpp     | 23 ++++++++++++++-----
 1 file changed, 17 insertions(+), 6 deletions(-)

diff --git a/sycl/test-e2e/ESIMD/unified_memory_api/Inputs/scatter.hpp b/sycl/test-e2e/ESIMD/unified_memory_api/Inputs/scatter.hpp
index a23c9884fa993..aed21c8b54509 100644
--- a/sycl/test-e2e/ESIMD/unified_memory_api/Inputs/scatter.hpp
+++ b/sycl/test-e2e/ESIMD/unified_memory_api/Inputs/scatter.hpp
@@ -672,7 +672,7 @@ bool testLACC(queue Q, uint32_t MaskStride,
 
   try {
     Q.submit([&](handler &cgh) {
-       constexpr uint32_t SLMSize = N;
+       constexpr uint32_t SLMSize = (Threads * N + 8);
        auto LocalAcc = local_accessor<T, 1>(SLMSize, cgh);
 
        cgh.parallel_for(Range, [=](sycl::nd_item<1> ndi) SYCL_ESIMD_KERNEL {
@@ -682,10 +682,17 @@ bool testLACC(queue Q, uint32_t MaskStride,
          uint32_t GlobalElemOffset = GlobalID * N;
          uint32_t LocalElemOffset = LocalID * N;
 
-         simd<T, N> InVec(GlobalElemOffset, 1);
+         if (LocalID == 0) {
+           for (int I = 0; I < Threads * N; I += 8) {
+             simd<T, 8> InVec(Out + GlobalElemOffset + I);
+             simd<uint32_t, 8> Offsets(I * sizeof(T), sizeof(T));
+             scatter<T>(LocalAcc, Offsets, InVec);
+           }
+         }
+         barrier();
 
-         simd<uint32_t, NOffsets> ByteOffsets(0, VS * sizeof(T));
-         scatter<T, N, VS>(LocalAcc, ByteOffsets, InVec);
+         simd<uint32_t, NOffsets> ByteOffsets(LocalElemOffset * sizeof(T),
+                                              VS * sizeof(T));
          auto ByteOffsetsView = ByteOffsets.template select<NOffsets, 1>();
          simd<T, N> Vals = gather<T, N, VS>(LocalAcc, ByteOffsets, Props);
 
@@ -786,8 +793,12 @@ bool testLACC(queue Q, uint32_t MaskStride,
            }
          }
 
-         simd<T, N> OutVec = gather<T, N, VS>(LocalAcc, ByteOffsets, Props);
-         OutVec.copy_to(Out + GlobalElemOffset);
+         barrier();
+         if (LocalID == 0) {
+           for (int I = 0; I < Threads * N; I++) {
+             Out[GlobalElemOffset + I] = LocalAcc[I];
+           }
+         }
        });
      }).wait();
   } catch (sycl::exception const &e) {

From 652d3eaa6e81f3876e2fbb97c8b773f671475bce Mon Sep 17 00:00:00 2001
From: Dounia Khaldi <dounia.khaldi@intel.com>
Date: Fri, 23 Feb 2024 00:32:14 -0600
Subject: [PATCH 28/30] [SYCL][Matrix tests] Missing general double type case
 in initialization and ref compute (#12800)

---
 sycl/test-e2e/Matrix/common.hpp | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/sycl/test-e2e/Matrix/common.hpp b/sycl/test-e2e/Matrix/common.hpp
index 155cf012d9754..1e27fe6d7d989 100644
--- a/sycl/test-e2e/Matrix/common.hpp
+++ b/sycl/test-e2e/Matrix/common.hpp
@@ -64,7 +64,9 @@ void matrix_multiply_ref(Ta *A, Tb *B, Tc *C, int M, int N, int K,
             acc += make_fp32(va[i]) * make_fp32(vb[i]);
           else if constexpr (std::is_same_v<Ta, float> &&
                                  std::is_same_v<Tc, float> ||
-                             std::is_integral_v<Ta> && std::is_integral_v<Tc>)
+                             std::is_integral_v<Ta> && std::is_integral_v<Tc> ||
+                             (std::is_same_v<Ta, double> &&
+                              std::is_same_v<Tc, double>))
             acc += va[i] * vb[i];
           else if constexpr (std::is_same_v<Ta, sycl::half> &&
                              std::is_same_v<Tc, float>)
@@ -127,7 +129,8 @@ void matrix_rand(unsigned int rows, unsigned int cols, T *src, T val) {
 
   for (unsigned int i = 0; i < rows; i++) {
     for (unsigned int j = 0; j < cols; j++) {
-      if constexpr (std::is_same_v<T, bfloat16> || std::is_same_v<T, float>) {
+      if constexpr (std::is_same_v<T, bfloat16> || std::is_same_v<T, float> ||
+                    std::is_same_v<T, double>) {
         src[i * cols + j] = T(fdistr(dev));
       } else if constexpr (std::is_same_v<T, int8_t> ||
                            std::is_same_v<T, int32_t>) {

From 6e3aa21803a28780279a49ad5aa72632307d6dc6 Mon Sep 17 00:00:00 2001
From: Udit Agarwal <16324601+uditagarwal97@users.noreply.github.com>
Date: Fri, 23 Feb 2024 00:05:11 -0800
Subject: [PATCH 29/30] [SYCL] Add '--ignore-device-selectors' CLI option to
 sycl-ls and improve warning messages (#12718)

This PR adds a '--ignore-device-selectors' CLI option to sycl-ls that
prints all platforms available in the user's system, irrespective of the
DPCPP filter environment variables like ONEAPI_DEVICE_SELECTOR.
---
 sycl/test/tools/sycl-ls.test   |  19 ++++-
 sycl/tools/sycl-ls/sycl-ls.cpp | 128 ++++++++++++++++++++++++++-------
 2 files changed, 121 insertions(+), 26 deletions(-)

diff --git a/sycl/test/tools/sycl-ls.test b/sycl/test/tools/sycl-ls.test
index 422f2edbdda23..9e9fc9b079c37 100644
--- a/sycl/test/tools/sycl-ls.test
+++ b/sycl/test/tools/sycl-ls.test
@@ -1,3 +1,18 @@
--- Check that sycl-ls exits with 0 exit code. 
+-- Check sycl-ls exit code and output.
 
-RUN: sycl-ls --verbose
+RUN: sycl-ls --verbose > vanilla_verbose.out
+RUN: sycl-ls > vanilla.out
+
+-- Check the functioning of '--ignore-device-selectors' CLI option.
+
+RUN: env ONEAPI_DEVICE_SELECTOR="opencl:*" sycl-ls --ignore-device-selectors > ods_ignore_device_selector.out
+RUN: diff vanilla.out ods_ignore_device_selector.out
+
+RUN: env ONEAPI_DEVICE_SELECTOR="opencl:*" sycl-ls --ignore-device-selectors --verbose > ods_ignore_device_selector_v.out
+RUN: diff vanilla_verbose.out ods_ignore_device_selector_v.out
+
+RUN: env SYCL_DEVICE_ALLOWLIST="BackendName:opencl" sycl-ls --ignore-device-selectors > sda_ignore_device_selector.out
+RUN: diff vanilla.out sda_ignore_device_selector.out
+
+RUN: env SYCL_DEVICE_ALLOWLIST="BackendName:opencl" sycl-ls --ignore-device-selectors --verbose > sda_ignore_device_selector_v.out
+RUN: diff vanilla_verbose.out sda_ignore_device_selector_v.out
diff --git a/sycl/tools/sycl-ls/sycl-ls.cpp b/sycl/tools/sycl-ls/sycl-ls.cpp
index ef2ea8fd2f121..1e7e69964cd6a 100644
--- a/sycl/tools/sycl-ls/sycl-ls.cpp
+++ b/sycl/tools/sycl-ls/sycl-ls.cpp
@@ -22,12 +22,20 @@
 #include <iostream>
 #include <map>
 #include <stdlib.h>
+#include <vector>
 
 using namespace sycl;
+using namespace std::literals;
 
 // Controls verbose output vs. concise.
 bool verbose;
 
+// Controls whether to discard filter environment variables or not.
+bool DiscardFilters;
+
+// To store various filter environment variables.
+std::vector<std::string> FilterEnvVars;
+
 // Trivial custom selector that selects a device of the given type.
 class custom_selector : public device_selector {
   info::device_type MType;
@@ -105,44 +113,116 @@ static void printSelectorChoice(const device_selector &Selector,
   }
 }
 
-int main(int argc, char **argv) {
+static int printUsageAndExit() {
+  std::cout << "Usage: sycl-ls [--verbose] [--ignore-device-selectors]"
+            << std::endl;
+  std::cout << "This program lists all devices and backends discovered by SYCL."
+            << std::endl;
+  std::cout << "\n Options:" << std::endl;
+  std::cout
+      << "\t --verbose " << "\t Verbosely prints all the discovered platforms. "
+      << "It also lists the device chosen by various SYCL device selectors."
+      << std::endl;
+  std::cout
+      << "\t --ignore-device-selectors "
+      << "\t Lists all platforms available on the system irrespective "
+      << "of DPCPP filter environment variables (like ONEAPI_DEVICE_SELECTOR)."
+      << std::endl;
 
-  // See if verbose output is requested
-  if (argc == 1)
-    verbose = false;
-  else if (argc == 2 && std::string(argv[1]) == "--verbose")
-    verbose = true;
-  else {
-    std::cout << "Usage: sycl-ls [--verbose]" << std::endl;
-    return EXIT_FAILURE;
-  }
+  return EXIT_FAILURE;
+}
 
-  bool SuppressNumberPrinting = false;
+// Print a warning and suppress printing of device ids if any of
+// the filter environment variables is set.
+static void printWarningIfFiltersUsed(bool &SuppressNumberPrinting) {
 
 #ifndef __INTEL_PREVIEW_BREAKING_CHANGES
   const char *filter = std::getenv("SYCL_DEVICE_FILTER");
   if (filter) {
-    std::cerr << "Warning: SYCL_DEVICE_FILTER environment variable is set to "
-              << filter << "." << std::endl;
-    std::cerr << "To see device ids, please unset SYCL_DEVICE_FILTER."
-              << std::endl
-              << std::endl;
-    SuppressNumberPrinting = true;
+    if (!DiscardFilters) {
+      std::cerr << "INFO: Output filtered by SYCL_DEVICE_FILTER "
+                << "environment variable, which is set to " << filter << "."
+                << std::endl;
+      std::cerr
+          << "To see device ids, use the --ignore-device-selectors CLI option."
+          << std::endl
+          << std::endl;
+      SuppressNumberPrinting = true;
+    } else
+      FilterEnvVars.push_back("SYCL_DEVICE_FILTER");
   }
 #endif
 
   const char *ods_targets = std::getenv("ONEAPI_DEVICE_SELECTOR");
   if (ods_targets) {
-    std::cerr
-        << "Warning: ONEAPI_DEVICE_SELECTOR environment variable is set to "
-        << ods_targets << "." << std::endl;
-    std::cerr << "To see device ids, please unset ONEAPI_DEVICE_SELECTOR."
-              << std::endl
-              << std::endl;
-    SuppressNumberPrinting = true;
+    if (!DiscardFilters) {
+      std::cerr << "INFO: Output filtered by ONEAPI_DEVICE_SELECTOR "
+                << "environment variable, which is set to " << ods_targets
+                << "." << std::endl;
+      std::cerr
+          << "To see device ids, use the --ignore-device-selectors CLI option."
+          << std::endl
+          << std::endl;
+      SuppressNumberPrinting = true;
+    } else
+      FilterEnvVars.push_back("ONEAPI_DEVICE_SELECTOR");
+  }
+
+  const char *sycl_dev_allow = std::getenv("SYCL_DEVICE_ALLOWLIST");
+  if (sycl_dev_allow) {
+    if (!DiscardFilters) {
+      std::cerr << "INFO: Output filtered by SYCL_DEVICE_ALLOWLIST "
+                << "environment variable, which is set to " << sycl_dev_allow
+                << "." << std::endl;
+      std::cerr
+          << "To see device ids, use the --ignore-device-selectors CLI option."
+          << std::endl
+          << std::endl;
+      SuppressNumberPrinting = true;
+    } else
+      FilterEnvVars.push_back("SYCL_DEVICE_ALLOWLIST");
+  }
+}
+
+// Unset every filter-related environment variable recorded in FilterEnvVars
+// (SYCL_DEVICE_FILTER, ONEAPI_DEVICE_SELECTOR and/or SYCL_DEVICE_ALLOWLIST).
+static void unsetFilterEnvVars() {
+  for (const auto &EnvVar : FilterEnvVars) {
+#ifdef _WIN32
+    _putenv_s(EnvVar.c_str(), ""); // An empty value removes the variable.
+#else
+    unsetenv(EnvVar.c_str());
+#endif
+  }
+}
+
+int main(int argc, char **argv) {
+
+  if (argc == 1) {
+    verbose = false;
+    DiscardFilters = false;
+  } else {
+    // Parse CLI options.
+    for (int i = 1; i < argc; i++) {
+      if (argv[i] == "--verbose"sv)
+        verbose = true;
+      else if (argv[i] == "--ignore-device-selectors"sv)
+        DiscardFilters = true;
+      else
+        return printUsageAndExit();
+    }
+  }
+
+  bool SuppressNumberPrinting = false;
+  // Print a warning and suppress printing of device ids if any of
+  // the filter environment variables is set.
+  printWarningIfFiltersUsed(SuppressNumberPrinting);
 
   try {
+    // Unset all filter env. vars to get all available devices in the system.
+    if (DiscardFilters)
+      unsetFilterEnvVars();
+
     const auto &Platforms = platform::get_platforms();
 
     // Keep track of the number of devices per backend

From 5a92a19f0e4e4535560a2f6190d774d1b8e19fc5 Mon Sep 17 00:00:00 2001
From: aelovikov-intel <andrei.elovikov@intel.com>
Date: Fri, 23 Feb 2024 00:53:44 -0800
Subject: [PATCH 30/30] [SYCL][E2E] Disable select_device.cpp on Arc GPU
 (#12808)

Started to fail in post commit after
https://github.com/intel/llvm/pull/12719. This looks like a pre-existing
bug in the test so I'm going to temporarily disable the test instead of
reverting. Will work on the fix with highest priority after that.
---
 sycl/test-e2e/Config/select_device.cpp | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/sycl/test-e2e/Config/select_device.cpp b/sycl/test-e2e/Config/select_device.cpp
index 970e3088b3e71..4787aa044021b 100644
--- a/sycl/test-e2e/Config/select_device.cpp
+++ b/sycl/test-e2e/Config/select_device.cpp
@@ -1,4 +1,6 @@
 // REQUIRES: gpu
+// Post-commit fails due to a bug in test, will fix in a couple of days.
+// UNSUPPORTED: gpu-intel-dg2
 // RUN: %{build} -o %t.out
 //
 // RUN: env ONEAPI_DEVICE_SELECTOR="*:gpu" %{run-unfiltered-devices} %t.out DEVICE_INFO write > %t.txt
@@ -159,6 +161,8 @@ static std::vector<DevDescT> getAllowListDesc(std::string allowList) {
     }
 
     else if (allowList.find('|', pos) != std::string::npos) {
+      // FIXME: This is wrong and results in an infinite loop. We restart
+      // processing the string from the start here.
       pos = allowList.find('|') + 1;
       while (allowList[pos] == ' ') {
         pos++;