diff --git a/sycl/test-e2e/Matrix/SG32/get_coord_float_matC.cpp b/sycl/test-e2e/Matrix/SG32/get_coord_float_matC.cpp
deleted file mode 100644
index 9b4777faced20..0000000000000
--- a/sycl/test-e2e/Matrix/SG32/get_coord_float_matC.cpp
+++ /dev/null
@@ -1,20 +0,0 @@
-//==----------- get_coord_float_matC.cpp  - DPC++ joint_matrix---------==//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-// SG size = 32 is not currently supported for SYCL Joint Matrix by IGC on DG2
-// UNSUPPORTED: gpu-intel-dg2
-// REQUIRES: aspect-ext_intel_matrix
-// REQUIRES-INTEL-DRIVER: lin: 27501, win: 101.4943
-
-// RUN: %{build} -o %t.out
-// RUN: %{run} %t.out
-
-#include "../common.hpp"
-
-#define SG_SZ 32
-
-#include "../get_coord_float_matC_impl.hpp"
diff --git a/sycl/test-e2e/Matrix/SG32/get_coord_int8_matA.cpp b/sycl/test-e2e/Matrix/SG32/get_coord_int8_matA.cpp
deleted file mode 100644
index 09c3ccf9983ab..0000000000000
--- a/sycl/test-e2e/Matrix/SG32/get_coord_int8_matA.cpp
+++ /dev/null
@@ -1,20 +0,0 @@
-//==----------- get_coord_int8_matA.cpp  - DPC++ joint_matrix---------==//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-// SG size = 32 is not currently supported for SYCL Joint Matrix by IGC on DG2
-// UNSUPPORTED: gpu-intel-dg2
-// REQUIRES: aspect-ext_intel_matrix
-// REQUIRES-INTEL-DRIVER: lin: 27501, win: 101.4943
-
-// RUN: %{build} -o %t.out
-// RUN: %{run} %t.out
-
-#include "../common.hpp"
-
-#define SG_SZ 32
-
-#include "../get_coord_int8_matA_impl.hpp"
diff --git a/sycl/test-e2e/Matrix/SG32/get_coord_int8_matB.cpp b/sycl/test-e2e/Matrix/SG32/get_coordinate_ops.cpp
similarity index 78%
rename from sycl/test-e2e/Matrix/SG32/get_coord_int8_matB.cpp
rename to sycl/test-e2e/Matrix/SG32/get_coordinate_ops.cpp
index 7d7e92e102780..86a8bf730c2df 100644
--- a/sycl/test-e2e/Matrix/SG32/get_coord_int8_matB.cpp
+++ b/sycl/test-e2e/Matrix/SG32/get_coordinate_ops.cpp
@@ -1,4 +1,4 @@
-//==----------- get_coord_int8_matB.cpp  - DPC++ joint_matrix---------==//
+//==----------- get_coordinate_ops.cpp - DPC++ joint_matrix---------==//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -10,6 +10,9 @@
 // REQUIRES: aspect-ext_intel_matrix
 // REQUIRES-INTEL-DRIVER: lin: 30049, win: 101.4943
 
+// XFAIL: !igc-dev
+// XFAIL-TRACKER: GSD-6376
+
 // RUN: %{build} -o %t.out
 // RUN: %{run} %t.out
 
@@ -17,4 +20,4 @@
 
 #define SG_SZ 32
 
-#include "../get_coord_int8_matB_impl.hpp"
+#include "../get_coordinate_ops_impl.hpp"
\ No newline at end of file
diff --git a/sycl/test-e2e/Matrix/get_coord_float_matC.cpp b/sycl/test-e2e/Matrix/get_coord_float_matC.cpp
deleted file mode 100644
index 044c8072bb3bc..0000000000000
--- a/sycl/test-e2e/Matrix/get_coord_float_matC.cpp
+++ /dev/null
@@ -1,14 +0,0 @@
-//==----------- get_coord_float_matC.cpp  - DPC++ joint_matrix---------==//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-// REQUIRES: aspect-ext_intel_matrix
-
-// RUN: %{build} -o %t.out
-// RUN: %{run} %t.out
-
-#include "common.hpp"
-#include "get_coord_float_matC_impl.hpp"
diff --git a/sycl/test-e2e/Matrix/get_coord_float_matC_impl.hpp b/sycl/test-e2e/Matrix/get_coord_float_matC_impl.hpp
deleted file mode 100644
index a6c7c5646a548..0000000000000
--- a/sycl/test-e2e/Matrix/get_coord_float_matC_impl.hpp
+++ /dev/null
@@ -1,130 +0,0 @@
-//==----------- get_coord_float_matC_impl.hpp  - DPC++ joint_matrix---------==//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-#include <sycl/atomic_ref.hpp>
-#include <sycl/group_algorithm.hpp>
-
-template <size_t TileRows, size_t TileCols> class add_rows;
-
-// clang-format off
-/*
-Here's how the data is distributed for sub group size = 16 on PVC
-W0 --> 0 1 2 3 4 5 6 7
-wi [0,0] -> i=0, [0, 0]        wi [0,1] --> i=0, [0, 1]     wi [0,15] --> i=0, [0, 15]
-            i=1, [1, 0]                     i=1, [1, 1]                   i=1, [1, 15]
-            i=2, [2, 0]                     i=2, [2, 1]                   ...
-            ...                             ....
-            i=7, [7, 0]                     i=7, [7, 1]
-*/
-// clang-format on
-
-template <typename T, size_t Rows, size_t Cols, size_t TileRows,
-          size_t TileCols>
-void matrix_sum_rows(big_matrix<T, Rows, Cols> &C, T *sum_rows) {
-  buffer<T, 2> bufC((T *)C.get_data(), range<2>(Rows, Cols));
-  buffer<T> sum_rows_v(sum_rows, Rows);
-
-  queue q;
-  size_t sg_size = get_sg_size<add_rows<TileRows, TileCols>>(q);
-  q.submit([&](handler &cgh) {
-     sycl::accessor accC{bufC, cgh, sycl::read_write};
-     sycl::accessor v{sum_rows_v, cgh, sycl::read_write};
-
-     cgh.parallel_for<add_rows<TileRows, TileCols>>(
-         nd_range<2>({Rows / TileRows, Cols / TileCols * sg_size},
-                     {1, 1 * sg_size}),
-         [=](nd_item<2> spmd_item)
-#ifdef SG_SZ
-             [[sycl::reqd_sub_group_size(SG_SZ)]]
-#endif
-         {
-           // The submatrix API has to be accessed by all the workitems in a
-           // subgroup these functions will be called once by the subgroup no
-           // code divergence between the workitems
-           const auto global_idx = spmd_item.get_global_id(0);
-           const auto global_idy = spmd_item.get_global_id(1);
-           const auto sg_startx = global_idx - spmd_item.get_local_id(0);
-           const auto sg_starty = global_idy - spmd_item.get_local_id(1);
-
-           sub_group sg = spmd_item.get_sub_group();
-           joint_matrix<sub_group, T, use::accumulator, TileRows, TileCols>
-               sub_c;
-
-           joint_matrix_load(
-               sg, sub_c,
-               accC.template get_multi_ptr<access::decorated::no>() +
-                   (sg_startx * TileRows) * Cols +
-                   sg_starty / sg_size * TileCols,
-               Cols, layout::row_major);
-
-           T sum_local_rows[Rows] = {0};
-
-           ext::intel::experimental::matrix::joint_matrix_apply(
-               sg, sub_c, [&](T &x, size_t row, size_t col) {
-                 sum_local_rows[row + global_idx * TileRows] += x;
-               });
-           for (int i = 0; i < Rows; i++) {
-             sum_local_rows[i] =
-                 reduce_over_group(sg, sum_local_rows[i], sycl::plus<>());
-             // only Groups leader perform the global reduction
-             if (global_idy % sg_size == 0) {
-               sycl::atomic_ref<T, sycl::memory_order::relaxed,
-                                sycl::memory_scope::device>
-                   aref(v[i]);
-               aref.fetch_add(sum_local_rows[i]);
-             }
-           }
-         }); // parallel for
-   }).wait();
-}
-
-template <typename T, size_t TM, size_t TN> void test() {
-  constexpr size_t SCALE = 2;
-  static constexpr size_t Rows = TM * SCALE;
-  static constexpr size_t Cols = TN * SCALE;
-
-  T sum_rows[Rows] = {0};
-  T sum_rows_ref[Rows] = {0};
-  T C[Rows][Cols];
-  big_matrix<T, Rows, Cols> MC((T *)&C);
-
-  matrix_rand(Rows, Cols, (T *)&C, (T)100);
-  matrix_sum_rows<T, Rows, Cols, TM, TN>(MC, sum_rows);
-
-  for (int i = 0; i < Rows; i++) {
-    for (int j = 0; j < Cols; j++) {
-      sum_rows_ref[i] += C[i][j];
-    }
-    assert(std::fabs(sum_rows_ref[i] - sum_rows[i]) <= FLOAT_EPSILON);
-  }
-}
-
-int main() {
-  queue q;
-  std::vector<combination> combinations =
-      q.get_device()
-          .get_info<sycl::ext::oneapi::experimental::info::device::
-                        matrix_combinations>();
-
-  for (unsigned int i = 0; i < combinations.size(); i++) {
-    if (combinations[i].nsize == 0) { // Intel AMX
-      test<float, /*TM*/ 16, /*TN*/ 16>();
-      break;
-    }
-
-    if (combinations[i].nsize == 16) { // architecture::intel_gpu_pvc
-      test<float, /*TM*/ 8, /*TN*/ 16>();
-      break;
-    }
-
-    if (combinations[i].nsize == 8) { // architecture::intel_gpu_dg2*
-      test<float, /*TM*/ 8, /*TN*/ 8>();
-      break;
-    }
-  }
-  return 0;
-}
diff --git a/sycl/test-e2e/Matrix/get_coord_int8_matA.cpp b/sycl/test-e2e/Matrix/get_coord_int8_matA.cpp
deleted file mode 100644
index 8e953c8689305..0000000000000
--- a/sycl/test-e2e/Matrix/get_coord_int8_matA.cpp
+++ /dev/null
@@ -1,14 +0,0 @@
-//==----------- get_coord_int8_matA.cpp  - DPC++ joint_matrix---------==//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-// REQUIRES: aspect-ext_intel_matrix
-
-// RUN: %{build} -o %t.out
-// RUN: %{run} %t.out
-
-#include "common.hpp"
-#include "get_coord_int8_matA_impl.hpp"
diff --git a/sycl/test-e2e/Matrix/get_coord_int8_matA_impl.hpp b/sycl/test-e2e/Matrix/get_coord_int8_matA_impl.hpp
deleted file mode 100644
index 0bf9281ab9f45..0000000000000
--- a/sycl/test-e2e/Matrix/get_coord_int8_matA_impl.hpp
+++ /dev/null
@@ -1,176 +0,0 @@
-//==----------- get_coord_int8_matA_impl.hpp  - DPC++ joint_matrix---------==//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-#include <sycl/atomic_ref.hpp>
-#include <sycl/group_algorithm.hpp>
-
-template <size_t TileRows, size_t TileCols> class add_rows;
-
-template <typename T, typename TResult, size_t Rows, size_t Cols>
-void sum_rows_ref(
-    host_accessor<T, 2, access::mode::read_write> A,
-    host_accessor<TResult, 1, access::mode::read_write> sum_rows) {
-  int sum_rows_ref[Rows] = {0};
-  for (size_t i = 0; i < Rows; i++) {
-    for (size_t j = 0; j < Cols; j++) {
-      sum_rows_ref[i] += A[i][j];
-    }
-    auto diff = sum_rows[i] - sum_rows_ref[i];
-    assert(std::fabs(static_cast<TResult>(diff)) <=
-           std::numeric_limits<TResult>::epsilon());
-  }
-}
-
-// clang-format off
-/* For sub group size = 16:
-wi [0,0] -> i=0, [0, 0]        wi [0,1] --> i=0, [0, 2]     wi [0,15] --> i=0, [0, 30]
-            i=1, [0, 1]                     i=1, [0, 3]                   i=1, [0, 31]
-            i=2, [1, 0]                     i=2, [1, 2]                   i=2, [1, 30]
-            i=3, [1, 1]                     i=3, [1, 3]                   i=3, [1, 31]
-            i=4, [2, 0]                     i=4, [2, 2]                   ...
-            i=5, [2, 1]                     i=5, [2, 3]
-            ...                             ....
-            i=14,[7, 0]                     i=14, [7, 2]
-            i=15,[7, 1]                     i=15, [7, 3]                  i=15, [7, 31]
-
-Here's how the distribution of the A matrix looks like for this test case
-x x x x  x x x x    x x x x  x x x x  x x x x  x x x x    x x x x  x x x x
-x x x x  x x x x    x x x x  x x x x  x x x x  x x x x    x x x x  x x x x
-x x x x  x x x x    x x x x  x x x x  x x x x  x x x x    x x x x  x x x x
-x x x x  x x x x    x x x x  x x x x  x x x x  x x x x    x x x x  x x x x
-x x x x  x x x x    x x x x  x x x x  x x x x  x x x x    x x x x  x x x x
-x x x x  x x x x    x x x x  x x x x  x x x x  x x x x    x x x x  x x x x
-x x x x  x x x x    x x x x  x x x x  x x x x  x x x x    x x x x  x x x x
-x x x x  x x x x    x x x x  x x x x  x x x x  x x x x    x x x x  x x x x
-<---------------------------------  SG1 --------------------------------->
-x x x x  x x x x    x x x x  x x x x  x x x x  x x x x    x x x x  x x x x
-x x x x  x x x x    x x x x  x x x x  x x x x  x x x x    x x x x  x x x x
-x x x x  x x x x    x x x x  x x x x  x x x x  x x x x    x x x x  x x x x
-x x x x  x x x x    x x x x  x x x x  x x x x  x x x x    x x x x  x x x x
-x x x x  x x x x    x x x x  x x x x  x x x x  x x x x    x x x x  x x x x
-x x x x  x x x x    x x x x  x x x x  x x x x  x x x x    x x x x  x x x x
-x x x x  x x x x    x x x x  x x x x  x x x x  x x x x    x x x x  x x x x
-x x x x  x x x x    x x x x  x x x x  x x x x  x x x x    x x x x  x x x x
-<0> <1>  <2> <3>    <4> <5>  <6> <7>  ..... WORK ITEMS
-Each work item has 16 elements <8 rows and 2 cols of the original matrix>
-the data_slice holds the matrix elements in the following order:
-0 0  0 0
-   /
-  /
-1 1  1 1
-   /
-  /
-2 2  2 2
-  /
- / 
-3 3  3 3 
-W0 --> 0 0 1 1 2 2 3 3 .... 7 7
-*/
-// clang-format on
-
-template <typename T, typename TResult, size_t Rows, size_t Cols,
-          size_t TileRows, size_t TileCols>
-void matrix_sum_rows(big_matrix<T, Rows, Cols> &A) {
-  buffer<T, 2> bufA(A.get_data(), range<2>(Rows, Cols));
-
-  // size of vector is equal to number of rows in big matrix
-  TResult sum_rows[Rows] = {0};
-  buffer<TResult> sum_rows_v(sum_rows, Rows);
-  queue q;
-  size_t sg_size = get_sg_size<add_rows<TileRows, TileCols>>(q);
-  nd_range<2> r({Rows / TileRows, Cols / TileCols * sg_size}, {1, 1 * sg_size});
-  q.submit([&](handler &cgh) {
-     sycl::accessor accA{bufA, cgh, sycl::read_write};
-     sycl::accessor v{sum_rows_v, cgh, sycl::read_write};
-
-     cgh.parallel_for<add_rows<TileRows, TileCols>>(
-         r, [=](nd_item<2> spmd_item)
-#ifdef SG_SZ
-                [[sycl::reqd_sub_group_size(SG_SZ)]]
-#endif
-         {
-           const auto global_idx = spmd_item.get_global_id(0);
-           const auto global_idy = spmd_item.get_global_id(1);
-           const auto sg_startx = global_idx - spmd_item.get_local_id(0);
-           const auto sg_starty = global_idy - spmd_item.get_local_id(1);
-
-           sycl::sub_group sg = spmd_item.get_sub_group();
-           joint_matrix<sub_group, T, use::a, TileRows, TileCols,
-                        layout::row_major>
-               sub_a;
-           joint_matrix_load(
-               sg, sub_a,
-               accA.template get_multi_ptr<access::decorated::no>() +
-                   (sg_startx * TileRows * Cols) +
-                   sg_starty / sg_size * TileCols,
-               Cols);
-
-           TResult sum_local_rows[Rows] = {0};
-
-           ext::intel::experimental::matrix::joint_matrix_apply(
-               sg, sub_a, [&](T &x, size_t row, size_t col) {
-                 sum_local_rows[row + global_idx * TileRows] += x;
-               });
-           for (int i = 0; i < Rows; ++i) {
-             sum_local_rows[i] =
-                 reduce_over_group(sg, sum_local_rows[i], sycl::plus<>());
-
-             // only Groups leader performs the global reduction
-             if (global_idy % sg_size == 0) {
-               sycl::atomic_ref<TResult, sycl::memory_order::relaxed,
-                                sycl::memory_scope::device>
-                   aref(v[i]);
-               aref.fetch_add(sum_local_rows[i]);
-             }
-           }
-         }); // parallel for
-   }).wait();
-  sum_rows_ref<T, TResult, Rows, Cols>(bufA.get_host_access(),
-                                       sum_rows_v.get_host_access());
-}
-
-template <typename T, typename TResult, size_t TM, size_t TK> void test() {
-  static constexpr size_t Rows = TM * 2;
-  static constexpr size_t Cols = TK * 2;
-  T A[Rows][Cols];
-
-  big_matrix<T, Rows, Cols> MA((T *)&A);
-
-  for (int i = 0; i < Rows; i++) {
-    for (int j = 0; j < Cols; j++) {
-      A[i][j] = i + j;
-    }
-  }
-
-  matrix_sum_rows<T, TResult, Rows, Cols, TM, TK>(MA);
-}
-
-int main() {
-  queue q;
-  std::vector<combination> combinations =
-      q.get_device()
-          .get_info<sycl::ext::oneapi::experimental::info::device::
-                        matrix_combinations>();
-
-  for (unsigned int i = 0; i < combinations.size(); i++) {
-    if (combinations[i].nsize == 0) { // Intel AMX
-      test<int8_t, int, /*TM*/ 16, /*TK*/ 64>();
-      break;
-    }
-
-    if (combinations[i].nsize == 16) { // architecture::intel_gpu_pvc
-      test<int8_t, int, /*TM*/ 8, /*TK*/ 32>();
-      break;
-    }
-
-    if (combinations[i].nsize == 8) { // architecture::intel_gpu_dg2*
-      test<int8_t, int, /*TM*/ 8, /*TK*/ 32>();
-      break;
-    }
-  }
-  return 0;
-}
diff --git a/sycl/test-e2e/Matrix/get_coord_int8_matB_impl.hpp b/sycl/test-e2e/Matrix/get_coord_int8_matB_impl.hpp
deleted file mode 100644
index 08cb616cc6cc4..0000000000000
--- a/sycl/test-e2e/Matrix/get_coord_int8_matB_impl.hpp
+++ /dev/null
@@ -1,214 +0,0 @@
-//==----------- get_coord_int8_matB_impl.hpp  - DPC++ joint_matrix---------==//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-#include <sycl/atomic_ref.hpp>
-#include <sycl/group_algorithm.hpp>
-
-template <size_t TileRows, size_t TileCols> class add_cols;
-
-template <typename T, typename TResult, size_t Rows, size_t Cols>
-void sum_cols_ref(
-    host_accessor<T, 2, access::mode::read_write> B,
-    host_accessor<TResult, 1, access::mode::read_write> sum_cols) {
-  TResult sum_cols_ref[Cols] = {0};
-  for (size_t j = 0; j < Cols; j++) {
-    for (size_t i = 0; i < Rows; i++) {
-      sum_cols_ref[j] += B[i][j];
-    }
-    auto diff = sum_cols[j] - sum_cols_ref[j];
-    assert(std::fabs(static_cast<TResult>(diff)) <=
-           std::numeric_limits<TResult>::epsilon());
-  }
-}
-
-// clang-format off
-/* 
-    Here is a demonstration of how matrix B will be divided across
-    work items for this test case for sub group size = 16 on PVC.
-    <    ---------------    128    ---------------------------------->
-    x x x x x x x x x x x x x x x x       ..........    x x x x x x   ^
-    x x x x x x x x x x x x x x x x       ..........    x x x x x x  16
-    x x x x x x x x x x x x x x x x       ..........    x x x x x x   |
-    .....                                                             |
-    x x x x x x x x x x x x x x x x       ..........    x x x x x x   |
-    x x x x x x x x x x x x x x x x       ..........    x x x x x x   v
-    
-    ---------------    64    ---------------->
-    x x x x   x x    ..........    x x  x x x x   ^
-    x x x x   x x    ..........    x x  x x x x   8
-    x x x x   x x    ..........    x x  x x x x   |  <-- part of (VNNI-ed) 
-    .....                                         |   original matrix each SG
-    x x x x   x x    ..........    x x  x x x x   |   holds
-    x x x x   x x    ..........    x x  x x x x   v
-    < WI0 >                            < WI15 >
-    <--------    16    ------------->
-    x x x     ..........    x x x   ^
-    x x x     ..........    x x x   |
-    x x x     ..........    x x x   | <-- part of (non-VNNI-ed) original matrix
-    .....                           |           each SG holds
-    x x x     ..........    x x x   |
-    x x x     ..........    x x x   |
-    x x x     ..........    x x x  32
-    x x x     ..........    x x x   |
-    x x x     ..........    x x x   |
-    x x x     ..........    x x x   |
-    x x x     ..........    x x x   |
-    x x x     ..........    x x x   |
-    x x x     ..........    x x x   v
-    If we divide the above matrix across 16 (SG_SZ) work items,
-    each WI will hold 32 elements.  And these 32 elements will be
-    8x4 chunks as shown in the VNNI-ed matrix figure. 
-
-The total distribution among the WIs in ALL the sub-groups is as follows:
-This is useful to figure out how the global index is to be calculated
-
-W0 --> 0 0 0 0   1 1 1 1 ...   7 7 7 7 --> total 32 elements
-wi [0,0] --> i=0, [0, 0]        wi [0,1] --> i=0, [0, 4]     wi [0,15] --> i=0, [0, 60] | wi [0,16] --> i=0, [0, 64]
-            i=1, [0, 1]                     i=1, [0, 5]                   i=1, [0, 61]  |               i=1, [0, 65]
-            i=2, [0, 2]                     i=2, [0, 6]                   i=2, [0, 62]  |               i=2, [0, 66]
-            i=3, [0, 3]                     i=3, [0, 7]                   i=3, [0, 63]  |               i=3, [0, 67]              
-            i=4, [1, 0]                     i=4, [1, 4]                   i=4, [1, 60]  |               ....
-            i=5, [1, 1]                     i=5, [1, 5]                   i=5, [1, 61]  |
-            i=6, [1, 2]                     i=6, [1, 6]                   i=6, [1, 62]  |
-            i=7, [1, 3]                     i=7, [1, 7]                   i=7, [1, 63]  |
-            ...                             ...                           ....          |
-            i=28,[7, 0]                     i=28,[7, 4]                   i=28,[7, 60]  |               i=28, [7, 124]
-            i=29,[7, 1]                     i=29,[7, 5]                   i=29,[7, 61]  |               i=29, [7, 125]
-            i=30,[7, 2]                     i=30,[7, 6]                   i=30,[7, 62]  |               i=30, [7, 126]
-            i=31,[7, 3]                     i=31,[7, 7]                   i=31,[7, 63]  |               i=31, [7, 127]
----------------------------------------------------------------------------------------- ---------------------------
-wi [1,0] -->    i=0, [8, 0]
-                i=1, [8, 1]
-                i=2, [8, 2]
-                i=3, [8, 2]
-                ...
-                i=28, [15, 0]
-                i=29, [15, 1]
-                i=30, [15, 2]
-                i=31, [15, 3]
-*/
-
-// clang-format on
-
-template <typename T, typename TResult, size_t Rows, size_t Cols,
-          size_t TileRows, size_t TileCols, size_t VNNI>
-void matrix_sum_cols(big_matrix<T, Rows, Cols> &B,
-                     big_matrix<T, Rows / VNNI, Cols * VNNI> &Bvnni) {
-  buffer<T, 2> bufB(B.get_data(), range<2>(Rows, Cols));
-  buffer<T, 2> bufBvnni(Bvnni.get_data(), range<2>(Rows / VNNI, Cols * VNNI));
-
-  TResult sum_cols[Cols] = {0};
-  buffer<TResult> sum_cols_v(sum_cols, Cols);
-
-  size_t NDRangeK = Rows / TileRows;
-  size_t NDRangeN = Cols / TileCols;
-  queue q;
-  size_t sg_size = get_sg_size<add_cols<TileRows, TileCols>>(q);
-  nd_range<2> r({NDRangeK, NDRangeN * sg_size}, {1, 1 * sg_size});
-
-  q.submit([&](handler &cgh) {
-     sycl::accessor accB{bufBvnni, cgh, sycl::read_write};
-     sycl::accessor v{sum_cols_v, cgh, sycl::read_write};
-
-     cgh.parallel_for<add_cols<TileRows, TileCols>>(
-         r, [=](nd_item<2> spmd_item)
-#ifdef SG_SZ
-                [[sycl::reqd_sub_group_size(SG_SZ)]]
-#endif
-         {
-           const auto global_idx = spmd_item.get_global_id(0);
-           const auto global_idy = spmd_item.get_global_id(1);
-           const auto sg_startx = global_idx - spmd_item.get_local_id(0);
-           const auto sg_starty = global_idy - spmd_item.get_local_id(1);
-
-           sycl::sub_group sg = spmd_item.get_sub_group();
-
-           joint_matrix<sub_group, T, use::b, TileRows, TileCols,
-                        layout::ext_intel_packed>
-               sub_b;
-
-           joint_matrix_load(
-               sg, sub_b,
-               accB.template get_multi_ptr<access::decorated::no>() +
-                   (sg_startx * (TileRows / VNNI) * Cols * VNNI) +
-                   sg_starty / sg_size * TileCols * VNNI,
-               Cols * VNNI);
-
-           TResult sum_local_cols[Cols] = {0};
-           ext::intel::experimental::matrix::joint_matrix_apply(
-               sg, sub_b, [&](T &x, size_t row, size_t col) {
-                 // the coordinates returned are in the logical range
-                 // [Rows,Cols] If users want to retrieve the VNNIed
-                 // coordinates, they can be obtained using colVNNI = col/VNNI
-                 // rowVNNI = row*VNNI
-                 size_t global_index = col + global_idy / sg_size * TileCols;
-                 sum_local_cols[global_index] += x;
-               });
-
-           for (int i = 0; i < Cols; i++) {
-             sum_local_cols[i] =
-                 reduce_over_group(sg, sum_local_cols[i], sycl::plus<>());
-             if (global_idy % sg_size == 0) {
-               sycl::atomic_ref<TResult, sycl::memory_order::relaxed,
-                                sycl::memory_scope::device>
-                   aref(v[i]);
-               aref.fetch_add(sum_local_cols[i]);
-             }
-           }
-         }); // parallel for
-   }).wait();
-  sum_cols_ref<T, TResult, Rows, Cols>(bufB.get_host_access(),
-                                       sum_cols_v.get_host_access());
-}
-
-template <typename T, typename TResult, size_t VNNI, size_t TK, size_t TN>
-void test() {
-  static constexpr size_t scale = 2;
-  static constexpr size_t MATRIX_K = TK * scale;
-  static constexpr size_t MATRIX_N = TN * scale;
-
-  T B[MATRIX_K][MATRIX_N];
-  big_matrix<T, MATRIX_K, MATRIX_N> MB((T *)&B);
-
-  T Bvnni[MATRIX_K / VNNI][MATRIX_N * VNNI];
-  big_matrix<T, MATRIX_K / VNNI, MATRIX_N * VNNI> MBvnni((T *)&Bvnni);
-
-  for (int i = 0; i < MATRIX_K; i++) {
-    for (int j = 0; j < MATRIX_N; j++) {
-      B[i][j] = i + j;
-    }
-  }
-  matrix_vnni<T>(MATRIX_K, MATRIX_N, *B, *Bvnni, VNNI);
-  // This test calculates sum of columns in the non VNNI B matrix
-  matrix_sum_cols<T, TResult, MATRIX_K, MATRIX_N, TK, TN, VNNI>(MB, MBvnni);
-}
-
-int main() {
-  queue q;
-  std::vector<combination> combinations =
-      q.get_device()
-          .get_info<sycl::ext::oneapi::experimental::info::device::
-                        matrix_combinations>();
-
-  for (unsigned int i = 0; i < combinations.size(); i++) {
-    if (combinations[i].nsize == 0) { // Intel AMX
-      test<int8_t, int32_t, 4, /*TK*/ 64, /*TN*/ 16>();
-      break;
-    }
-
-    if (combinations[i].nsize == 16) { // architecture::intel_gpu_pvc
-      test<int8_t, int32_t, 4, /*TK*/ 32, /*TN*/ 16>();
-      break;
-    }
-
-    if (combinations[i].nsize == 8) { // architecture::intel_gpu_dg2*
-      test<int8_t, int32_t, 4, /*TK*/ 32, /*TN*/ 8>();
-      break;
-    }
-  }
-  return 0;
-}
\ No newline at end of file
diff --git a/sycl/test-e2e/Matrix/get_coord_int8_matB.cpp b/sycl/test-e2e/Matrix/get_coordinate_ops.cpp
similarity index 73%
rename from sycl/test-e2e/Matrix/get_coord_int8_matB.cpp
rename to sycl/test-e2e/Matrix/get_coordinate_ops.cpp
index 255331dca89cb..b32145b954422 100644
--- a/sycl/test-e2e/Matrix/get_coord_int8_matB.cpp
+++ b/sycl/test-e2e/Matrix/get_coordinate_ops.cpp
@@ -1,4 +1,4 @@
-//==----------- get_coord_int8_matB.cpp  - DPC++ joint_matrix---------==//
+//==----------- get_coordinate_ops.cpp - DPC++ joint_matrix---------==//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -7,9 +7,12 @@
 //===----------------------------------------------------------------------===//
 // REQUIRES: aspect-ext_intel_matrix
 
+// XFAIL: !igc-dev
+// XFAIL-TRACKER: GSD-6376
+// REQUIRES-INTEL-DRIVER: lin: 30049
+
 // RUN: %{build} -o %t.out
 // RUN: %{run} %t.out
-// REQUIRES-INTEL-DRIVER: lin: 30049
 
 #include "common.hpp"
-#include "get_coord_int8_matB_impl.hpp"
+#include "get_coordinate_ops_impl.hpp"
diff --git a/sycl/test-e2e/Matrix/get_coordinate_ops_impl.hpp b/sycl/test-e2e/Matrix/get_coordinate_ops_impl.hpp
new file mode 100644
index 0000000000000..a21f3daa47b1a
--- /dev/null
+++ b/sycl/test-e2e/Matrix/get_coordinate_ops_impl.hpp
@@ -0,0 +1,227 @@
+//==----------- get_coordinate_ops_impl.hpp  - DPC++ joint_matrix---------==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+#include <sycl/atomic_ref.hpp>
+#include <sycl/group_algorithm.hpp>
+
+template <typename T, size_t Rows, size_t Cols, layout Layout, use Use>
+class matrix_process;
+
+template <typename T, typename TResult, size_t NUM_ROWS, size_t NUM_COLS,
+          size_t SROWS, size_t SCOLS, use Use, layout Layout, size_t VF>
+void matrix_sum(big_matrix<T, NUM_ROWS / VF, NUM_COLS * VF> &M,
+                TResult *sum_rows, TResult *sum_cols) {
+  buffer<T, 2> buf((T *)M.get_data(), range<2>(NUM_ROWS / VF, NUM_COLS * VF));
+  buffer<TResult> sum_rows_v(sum_rows, NUM_ROWS);
+  buffer<TResult> sum_cols_v(sum_cols, NUM_COLS);
+
+  queue q;
+  size_t sg_size =
+      get_sg_size<matrix_process<T, NUM_ROWS, NUM_COLS, Layout, Use>>(q);
+  q.submit([&](handler &cgh) {
+     sycl::accessor acc{buf, cgh, sycl::read_write};
+     sycl::accessor v_rows{sum_rows_v, cgh, sycl::read_write};
+     sycl::accessor v_cols{sum_cols_v, cgh, sycl::read_write};
+
+     cgh.parallel_for<matrix_process<T, NUM_ROWS, NUM_COLS, Layout, Use>>(
+         nd_range<2>({NUM_ROWS / SROWS, NUM_COLS / SCOLS * sg_size},
+                     {1, 1 * sg_size}),
+         [=](nd_item<2> spmd_item)
+#ifdef SG_SZ
+             [[intel::reqd_sub_group_size(SG_SZ)]]
+#endif
+         {
+           // The submatrix API has to be accessed by all the workitems in a
+           // subgroup these functions will be called once by the subgroup no
+           // code divergence between the workitems
+           const auto global_idx = spmd_item.get_global_id(0);
+           const auto global_idy = spmd_item.get_global_id(1);
+           const auto sg_startx = global_idx - spmd_item.get_local_id(0);
+           const auto sg_starty = global_idy - spmd_item.get_local_id(1);
+
+           sub_group sg = spmd_item.get_sub_group();
+
+           TResult sum_local_rows[NUM_ROWS] = {0};
+           TResult sum_local_cols[NUM_COLS] = {0};
+
+           if (Use == use::accumulator) {
+             joint_matrix<sub_group, T, use::accumulator, SROWS, SCOLS,
+                          layout::dynamic>
+                 sub;
+
+             joint_matrix_load(
+                 sg, sub,
+                 acc.template get_multi_ptr<access::decorated::no>() +
+                     (sg_startx * SROWS * NUM_COLS) +
+                     sg_starty / sg_size * SCOLS,
+                 NUM_COLS, Layout);
+
+             ext::intel::experimental::matrix::joint_matrix_apply(
+                 sg, sub, [&](T &x, size_t row, size_t col) {
+                   sum_local_rows[row + global_idx * SROWS] += x;
+                   sum_local_cols[col + global_idy / sg_size * SCOLS] += x;
+                 });
+
+           } else {
+             joint_matrix<sub_group, T, Use, SROWS, SCOLS, Layout> sub;
+
+             joint_matrix_load(
+                 sg, sub,
+                 acc.template get_multi_ptr<access::decorated::no>() +
+                     (sg_startx * (SROWS / VF) * NUM_COLS * VF) +
+                     sg_starty / sg_size * SCOLS * VF,
+                 NUM_COLS * VF);
+
+             ext::intel::experimental::matrix::joint_matrix_apply(
+                 sg, sub, [&](T &x, size_t row, size_t col) {
+                   sum_local_rows[row + global_idx * SROWS] += x;
+                   sum_local_cols[col + global_idy / sg_size * SCOLS] += x;
+                 });
+           }
+
+           for (int i = 0; i < NUM_ROWS; i++) {
+             sum_local_rows[i] =
+                 reduce_over_group(sg, sum_local_rows[i], sycl::plus<>());
+             // only Groups leader perform the global reduction
+             if (global_idy % sg_size == 0) {
+               sycl::atomic_ref<TResult, sycl::memory_order::relaxed,
+                                sycl::memory_scope::device>
+                   aref(v_rows[i]);
+               aref.fetch_add(sum_local_rows[i]);
+             }
+           }
+
+           for (int i = 0; i < NUM_COLS; i++) {
+             sum_local_cols[i] =
+                 reduce_over_group(sg, sum_local_cols[i], sycl::plus<>());
+             // only Groups leader perform the global reduction
+             if (global_idy % sg_size == 0) {
+               sycl::atomic_ref<TResult, sycl::memory_order::relaxed,
+                                sycl::memory_scope::device>
+                   aref(v_cols[i]);
+               aref.fetch_add(sum_local_cols[i]);
+             }
+           }
+         }); // parallel for
+   }).wait();
+}
+
+template <typename T, typename TResult, size_t SROWS, size_t SCOLS, use Use,
+          layout Layout, size_t VF>
+void test_get_coord_op() {
+  constexpr size_t SCALE = 2;
+  static constexpr size_t Rows = SROWS * SCALE;
+  static constexpr size_t Cols = SCOLS * SCALE;
+
+  T M[Rows][Cols];
+  T Mvnni[Rows / VF][Cols * VF];
+  TResult sum_rows[Rows] = {0};
+  TResult sum_rows_ref[Rows] = {0};
+  TResult sum_cols[Cols] = {0};
+  TResult sum_cols_ref[Cols] = {0};
+
+  for (int i = 0; i < Rows; i++) {
+    for (int j = 0; j < Cols; j++) {
+      M[i][j] = i + j;
+    }
+  }
+
+  matrix_vnni<T>(Rows, Cols, *M, *Mvnni, VF);
+  big_matrix<T, Rows / VF, Cols * VF> MM((T *)&Mvnni);
+
+  matrix_sum<T, TResult, Rows, Cols, SROWS, SCOLS, Use, Layout, VF>(
+      MM, sum_rows, sum_cols);
+
+  for (int i = 0; i < Rows; i++) {
+    for (int j = 0; j < Cols; j++) {
+      sum_rows_ref[i] += (int)M[i][j];
+    }
+    assert(std::fabs(sum_rows_ref[i] - sum_rows[i]) <= FLOAT_EPSILON);
+  }
+
+  for (int j = 0; j < Cols; j++) {
+    for (int i = 0; i < Rows; i++) {
+      sum_cols_ref[j] += (int)M[i][j];
+    }
+    assert(std::fabs(sum_cols_ref[j] - sum_cols[j]) <= FLOAT_EPSILON);
+  }
+}
+
+int main() {
+  queue q;
+  std::vector<combination> combinations =
+      q.get_device()
+          .get_info<sycl::ext::oneapi::experimental::info::device::
+                        matrix_combinations>();
+
+  for (unsigned int i = 0; i < combinations.size(); i++) {
+    if (combinations[i].nsize == 0) { // Intel AMX
+      test_get_coord_op<bfloat16, float, /*TM*/ 16, /*TK*/ 32, use::a,
+                        layout::row_major, 1>();
+      test_get_coord_op<int8_t, int, /*TM*/ 16, /*TK*/ 64, use::a,
+                        layout::row_major, 1>();
+      test_get_coord_op<bfloat16, float, /*TK*/ 16, /*TN*/ 16, use::b,
+                        layout::row_major, 1>();
+      test_get_coord_op<int8_t, int32_t, /*TK*/ 64, /*TN*/ 16, use::b,
+                        layout::row_major, 1>();
+      test_get_coord_op<bfloat16, float, /*TK*/ 16, /*TN*/ 16, use::b,
+                        layout::ext_intel_packed, 2>();
+      test_get_coord_op<int8_t, int32_t, /*TK*/ 64, /*TN*/ 16, use::b,
+                        layout::ext_intel_packed, 4>();
+      test_get_coord_op<float, float, /*TM*/ 16, /*TN*/ 16, use::accumulator,
+                        layout::row_major, 1>();
+      test_get_coord_op<int32_t, int32_t, /*TM*/ 16, /*TN*/ 16,
+                        use::accumulator, layout::row_major, 1>();
+      break;
+    }
+
+    if (combinations[i].nsize == 16) { // architecture::intel_gpu_pvc
+      test_get_coord_op<bfloat16, float, /*TM*/ 8, /*TK*/ 16, use::a,
+                        layout::row_major, 1>();
+      test_get_coord_op<int8_t, int, /*TM*/ 8, /*TK*/ 32, use::a,
+                        layout::row_major, 1>();
+      test_get_coord_op<bfloat16, float, /*TK*/ 16, /*TN*/ 16, use::b,
+                        layout::ext_intel_packed, 2>();
+      test_get_coord_op<int8_t, int32_t, /*TK*/ 32, /*TN*/ 16, use::b,
+                        layout::ext_intel_packed, 4>();
+      test_get_coord_op<float, float, /*TM*/ 8, /*TN*/ 16, use::accumulator,
+                        layout::row_major, 1>();
+      test_get_coord_op<int32_t, int32_t, /*TM*/ 8, /*TN*/ 16, use::accumulator,
+                        layout::row_major, 1>();
+      // This combination is not currently supported for sub group size = 32 in
+      // IGC
+#if (!defined(SG_SZ) || SG_SZ != 32)
+      test_get_coord_op<bfloat16, float, /*TK*/ 16, /*TN*/ 16, use::b,
+                        layout::row_major, 1>();
+      test_get_coord_op<int8_t, int32_t, /*TK*/ 32, /*TN*/ 16, use::b,
+                        layout::row_major, 1>();
+#endif
+      break;
+    }
+
+    if (combinations[i].nsize == 8) { // architecture::intel_gpu_dg2*
+      test_get_coord_op<bfloat16, float, /*TM*/ 8, /*TK*/ 16, use::a,
+                        layout::row_major, 1>();
+      test_get_coord_op<int8_t, int, /*TM*/ 8, /*TK*/ 32, use::a,
+                        layout::row_major, 1>();
+      test_get_coord_op<bfloat16, float, /*TK*/ 16, /*TN*/ 8, use::b,
+                        layout::row_major, 1>();
+      test_get_coord_op<int8_t, int32_t, /*TK*/ 32, /*TN*/ 8, use::b,
+                        layout::row_major, 1>();
+      test_get_coord_op<bfloat16, float, /*TK*/ 16, /*TN*/ 8, use::b,
+                        layout::ext_intel_packed, 2>();
+      test_get_coord_op<int8_t, int32_t, /*TK*/ 32, /*TN*/ 8, use::b,
+                        layout::ext_intel_packed, 4>();
+      test_get_coord_op<float, float, /*TM*/ 8, /*TN*/ 8, use::accumulator,
+                        layout::row_major, 1>();
+      test_get_coord_op<int32_t, int32_t, /*TM*/ 8, /*TN*/ 8, use::accumulator,
+                        layout::row_major, 1>();
+      break;
+    }
+  }
+  return 0;
+}
diff --git a/sycl/test/e2e_test_requirements/no-unsupported-without-info.cpp b/sycl/test/e2e_test_requirements/no-unsupported-without-info.cpp
index ec44b32f182f7..33a61de5bc8cd 100644
--- a/sycl/test/e2e_test_requirements/no-unsupported-without-info.cpp
+++ b/sycl/test/e2e_test_requirements/no-unsupported-without-info.cpp
@@ -54,7 +54,7 @@
 // tests to match the required format and in that case you should just update
 // (i.e. reduce) the number and the list below.
 //
-// NUMBER-OF-UNSUPPORTED-WITHOUT-INFO: 381
+// NUMBER-OF-UNSUPPORTED-WITHOUT-INFO: 379
 //
 // List of improperly UNSUPPORTED tests.
 // Remove the CHECK once the test has been properly UNSUPPORTED.
@@ -264,9 +264,7 @@
 // CHECK-NEXT: Matrix/SG32/element_wise_all_ops_int8_packed.cpp
 // CHECK-NEXT: Matrix/SG32/element_wise_all_sizes.cpp
 // CHECK-NEXT: Matrix/SG32/element_wise_ops.cpp
-// CHECK-NEXT: Matrix/SG32/get_coord_float_matC.cpp
-// CHECK-NEXT: Matrix/SG32/get_coord_int8_matA.cpp
-// CHECK-NEXT: Matrix/SG32/get_coord_int8_matB.cpp
+// CHECK-NEXT: Matrix/SG32/get_coordinate_ops.cpp
 // CHECK-NEXT: Matrix/SG32/joint_matrix_all_sizes.cpp
 // CHECK-NEXT: Matrix/SG32/joint_matrix_apply_bf16.cpp
 // CHECK-NEXT: Matrix/SG32/joint_matrix_apply_two_matrices.cpp