diff --git a/sycl/test-e2e/Matrix/SG32/get_coord_float_matC.cpp b/sycl/test-e2e/Matrix/SG32/get_coord_float_matC.cpp deleted file mode 100644 index 9b4777faced20..0000000000000 --- a/sycl/test-e2e/Matrix/SG32/get_coord_float_matC.cpp +++ /dev/null @@ -1,20 +0,0 @@ -//==----------- get_coord_float_matC.cpp - DPC++ joint_matrix---------==// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// SG size = 32 is not currently supported for SYCL Joint Matrix by IGC on DG2 -// UNSUPPORTED: gpu-intel-dg2 -// REQUIRES: aspect-ext_intel_matrix -// REQUIRES-INTEL-DRIVER: lin: 27501, win: 101.4943 - -// RUN: %{build} -o %t.out -// RUN: %{run} %t.out - -#include "../common.hpp" - -#define SG_SZ 32 - -#include "../get_coord_float_matC_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SG32/get_coord_int8_matA.cpp b/sycl/test-e2e/Matrix/SG32/get_coord_int8_matA.cpp deleted file mode 100644 index 09c3ccf9983ab..0000000000000 --- a/sycl/test-e2e/Matrix/SG32/get_coord_int8_matA.cpp +++ /dev/null @@ -1,20 +0,0 @@ -//==----------- get_coord_int8_matA.cpp - DPC++ joint_matrix---------==// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// SG size = 32 is not currently supported for SYCL Joint Matrix by IGC on DG2 -// UNSUPPORTED: gpu-intel-dg2 -// REQUIRES: aspect-ext_intel_matrix -// REQUIRES-INTEL-DRIVER: lin: 27501, win: 101.4943 - -// RUN: %{build} -o %t.out -// RUN: %{run} %t.out - -#include "../common.hpp" - -#define SG_SZ 32 - -#include "../get_coord_int8_matA_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SG32/get_coord_int8_matB.cpp b/sycl/test-e2e/Matrix/SG32/get_coordinate_ops.cpp similarity index 78% rename from sycl/test-e2e/Matrix/SG32/get_coord_int8_matB.cpp rename to sycl/test-e2e/Matrix/SG32/get_coordinate_ops.cpp index 7d7e92e102780..86a8bf730c2df 100644 --- a/sycl/test-e2e/Matrix/SG32/get_coord_int8_matB.cpp +++ b/sycl/test-e2e/Matrix/SG32/get_coordinate_ops.cpp @@ -1,4 +1,4 @@ -//==----------- get_coord_int8_matB.cpp - DPC++ joint_matrix---------==// +//==----------- get_coordinate_ops.cpp - DPC++ joint_matrix---------==// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -10,6 +10,9 @@ // REQUIRES: aspect-ext_intel_matrix // REQUIRES-INTEL-DRIVER: lin: 30049, win: 101.4943 +// XFAIL: !igc-dev +// XFAIL-TRACKER: GSD-6376 + // RUN: %{build} -o %t.out // RUN: %{run} %t.out @@ -17,4 +20,4 @@ #define SG_SZ 32 -#include "../get_coord_int8_matB_impl.hpp" +#include "../get_coordinate_ops_impl.hpp" \ No newline at end of file diff --git a/sycl/test-e2e/Matrix/get_coord_float_matC.cpp b/sycl/test-e2e/Matrix/get_coord_float_matC.cpp deleted file mode 100644 index 044c8072bb3bc..0000000000000 --- a/sycl/test-e2e/Matrix/get_coord_float_matC.cpp +++ /dev/null @@ -1,14 +0,0 @@ -//==----------- get_coord_float_matC.cpp - DPC++ joint_matrix---------==// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// REQUIRES: aspect-ext_intel_matrix - -// RUN: %{build} -o %t.out -// RUN: %{run} %t.out - -#include "common.hpp" -#include "get_coord_float_matC_impl.hpp" diff --git a/sycl/test-e2e/Matrix/get_coord_float_matC_impl.hpp b/sycl/test-e2e/Matrix/get_coord_float_matC_impl.hpp deleted file mode 100644 index a6c7c5646a548..0000000000000 --- a/sycl/test-e2e/Matrix/get_coord_float_matC_impl.hpp +++ /dev/null @@ -1,130 +0,0 @@ -//==----------- get_coord_float_matC_impl.hpp - DPC++ joint_matrix---------==// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -#include -#include - -template class add_rows; - -// clang-format off -/* -Here's how the data is distributed for sub group size = 16 on PVC -W0 --> 0 1 2 3 4 5 6 7 -wi [0,0] -> i=0, [0, 0] wi [0,1] --> i=0, [0, 1] wi [0,15] --> i=0, [0, 15] - i=1, [1, 0] i=1, [1, 1] i=1, [1, 15] - i=2, [2, 0] i=2, [2, 1] ... - ... .... - i=7, [7, 0] i=7, [7, 1] -*/ -// clang-format on - -template -void matrix_sum_rows(big_matrix &C, T *sum_rows) { - buffer bufC((T *)C.get_data(), range<2>(Rows, Cols)); - buffer sum_rows_v(sum_rows, Rows); - - queue q; - size_t sg_size = get_sg_size>(q); - q.submit([&](handler &cgh) { - sycl::accessor accC{bufC, cgh, sycl::read_write}; - sycl::accessor v{sum_rows_v, cgh, sycl::read_write}; - - cgh.parallel_for>( - nd_range<2>({Rows / TileRows, Cols / TileCols * sg_size}, - {1, 1 * sg_size}), - [=](nd_item<2> spmd_item) -#ifdef SG_SZ - [[sycl::reqd_sub_group_size(SG_SZ)]] -#endif - { - // The submatrix API has to be accessed by all the workitems in a - // subgroup these functions will be called once by the subgroup no - // code divergence between the workitems - const auto global_idx = spmd_item.get_global_id(0); - const auto global_idy = spmd_item.get_global_id(1); - const auto sg_startx = global_idx - spmd_item.get_local_id(0); - const auto sg_starty = global_idy - spmd_item.get_local_id(1); - - sub_group sg = spmd_item.get_sub_group(); - joint_matrix - sub_c; - - joint_matrix_load( - sg, sub_c, - accC.template get_multi_ptr() + - (sg_startx * TileRows) * Cols + - sg_starty / sg_size * TileCols, - Cols, layout::row_major); - - T sum_local_rows[Rows] = {0}; - - ext::intel::experimental::matrix::joint_matrix_apply( - sg, sub_c, [&](T &x, size_t row, size_t col) { - sum_local_rows[row + global_idx * TileRows] += x; - }); - for (int i = 0; i < Rows; i++) { - sum_local_rows[i] = - reduce_over_group(sg, sum_local_rows[i], sycl::plus<>()); - // only Groups leader perform the global reduction - if (global_idy % sg_size == 0) { - sycl::atomic_ref - aref(v[i]); - aref.fetch_add(sum_local_rows[i]); - } - } - }); // parallel for - }).wait(); -} - -template void test() { - constexpr size_t SCALE = 2; - static constexpr size_t Rows = TM * SCALE; - static constexpr size_t Cols = TN * SCALE; - - T sum_rows[Rows] = {0}; - T sum_rows_ref[Rows] = {0}; - T C[Rows][Cols]; - big_matrix MC((T *)&C); - - matrix_rand(Rows, Cols, (T *)&C, (T)100); - matrix_sum_rows(MC, sum_rows); - - for (int i = 0; i < Rows; i++) { - for (int j = 0; j < Cols; j++) { - sum_rows_ref[i] += C[i][j]; - } - assert(std::fabs(sum_rows_ref[i] - sum_rows[i]) <= FLOAT_EPSILON); - } -} - -int main() { - queue q; - std::vector combinations = - q.get_device() - .get_info(); - - for (unsigned int i = 0; i < combinations.size(); i++) { - if (combinations[i].nsize == 0) { // Intel AMX - test(); - break; - } - - if (combinations[i].nsize == 16) { // architecture::intel_gpu_pvc - test(); - break; - } - - if (combinations[i].nsize == 8) { // architecture::intel_gpu_dg2* - test(); - break; - } - } - return 0; -} diff --git a/sycl/test-e2e/Matrix/get_coord_int8_matA.cpp b/sycl/test-e2e/Matrix/get_coord_int8_matA.cpp deleted file mode 100644 index 8e953c8689305..0000000000000 --- a/sycl/test-e2e/Matrix/get_coord_int8_matA.cpp +++ /dev/null @@ -1,14 +0,0 @@ -//==----------- get_coord_int8_matA.cpp - DPC++ joint_matrix---------==// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// REQUIRES: aspect-ext_intel_matrix - -// RUN: %{build} -o %t.out -// RUN: %{run} %t.out - -#include "common.hpp" -#include "get_coord_int8_matA_impl.hpp" diff --git a/sycl/test-e2e/Matrix/get_coord_int8_matA_impl.hpp b/sycl/test-e2e/Matrix/get_coord_int8_matA_impl.hpp deleted file mode 100644 index 0bf9281ab9f45..0000000000000 --- a/sycl/test-e2e/Matrix/get_coord_int8_matA_impl.hpp +++ /dev/null @@ -1,176 +0,0 @@ -//==----------- get_coord_int8_matA_impl.hpp - DPC++ joint_matrix---------==// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -#include -#include - -template class add_rows; - -template -void sum_rows_ref( - host_accessor A, - host_accessor sum_rows) { - int sum_rows_ref[Rows] = {0}; - for (size_t i = 0; i < Rows; i++) { - for (size_t j = 0; j < Cols; j++) { - sum_rows_ref[i] += A[i][j]; - } - auto diff = sum_rows[i] - sum_rows_ref[i]; - assert(std::fabs(static_cast(diff)) <= - std::numeric_limits::epsilon()); - } -} - -// clang-format off -/* For sub group size = 16: -wi [0,0] -> i=0, [0, 0] wi [0,1] --> i=0, [0, 2] wi [0,15] --> i=0, [0, 30] - i=1, [0, 1] i=1, [0, 3] i=1, [0, 31] - i=2, [1, 0] i=2, [1, 2] i=2, [1, 30] - i=3, [1, 1] i=3, [1, 3] i=3, [1, 31] - i=4, [2, 0] i=4, [2, 2] ... - i=5, [2, 1] i=5, [2, 3] - ... .... - i=14,[7, 0] i=14, [7, 2] - i=15,[7, 1] i=15, [7, 3] i=15, [7, 31] - -Here's how the distribution of the A matrix looks like for this test case -x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x -x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x -x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x -x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x -x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x -x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x -x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x -x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x -<--------------------------------- SG1 ---------------------------------> -x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x -x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x -x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x -x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x -x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x -x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x -x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x -x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x -<0> <1> <2> <3> <4> <5> <6> <7> ..... WORK ITEMS -Each work item has 16 elements <8 rows and 2 cols of the original matrix> -the data_slice holds the matrix elements in the following order: -0 0 0 0 - / - / -1 1 1 1 - / - / -2 2 2 2 - / - / -3 3 3 3 -W0 --> 0 0 1 1 2 2 3 3 .... 7 7 -*/ -// clang-format on - -template -void matrix_sum_rows(big_matrix &A) { - buffer bufA(A.get_data(), range<2>(Rows, Cols)); - - // size of vector is equal to number of rows in big matrix - TResult sum_rows[Rows] = {0}; - buffer sum_rows_v(sum_rows, Rows); - queue q; - size_t sg_size = get_sg_size>(q); - nd_range<2> r({Rows / TileRows, Cols / TileCols * sg_size}, {1, 1 * sg_size}); - q.submit([&](handler &cgh) { - sycl::accessor accA{bufA, cgh, sycl::read_write}; - sycl::accessor v{sum_rows_v, cgh, sycl::read_write}; - - cgh.parallel_for>( - r, [=](nd_item<2> spmd_item) -#ifdef SG_SZ - [[sycl::reqd_sub_group_size(SG_SZ)]] -#endif - { - const auto global_idx = spmd_item.get_global_id(0); - const auto global_idy = spmd_item.get_global_id(1); - const auto sg_startx = global_idx - spmd_item.get_local_id(0); - const auto sg_starty = global_idy - spmd_item.get_local_id(1); - - sycl::sub_group sg = spmd_item.get_sub_group(); - joint_matrix - sub_a; - joint_matrix_load( - sg, sub_a, - accA.template get_multi_ptr() + - (sg_startx * TileRows * Cols) + - sg_starty / sg_size * TileCols, - Cols); - - TResult sum_local_rows[Rows] = {0}; - - ext::intel::experimental::matrix::joint_matrix_apply( - sg, sub_a, [&](T &x, size_t row, size_t col) { - sum_local_rows[row + global_idx * TileRows] += x; - }); - for (int i = 0; i < Rows; ++i) { - sum_local_rows[i] = - reduce_over_group(sg, sum_local_rows[i], sycl::plus<>()); - - // only Groups leader performs the global reduction - if (global_idy % sg_size == 0) { - sycl::atomic_ref - aref(v[i]); - aref.fetch_add(sum_local_rows[i]); - } - } - }); // parallel for - }).wait(); - sum_rows_ref(bufA.get_host_access(), - sum_rows_v.get_host_access()); -} - -template void test() { - static constexpr size_t Rows = TM * 2; - static constexpr size_t Cols = TK * 2; - T A[Rows][Cols]; - - big_matrix MA((T *)&A); - - for (int i = 0; i < Rows; i++) { - for (int j = 0; j < Cols; j++) { - A[i][j] = i + j; - } - } - - matrix_sum_rows(MA); -} - -int main() { - queue q; - std::vector combinations = - q.get_device() - .get_info(); - - for (unsigned int i = 0; i < combinations.size(); i++) { - if (combinations[i].nsize == 0) { // Intel AMX - test(); - break; - } - - if (combinations[i].nsize == 16) { // architecture::intel_gpu_pvc - test(); - break; - } - - if (combinations[i].nsize == 8) { // architecture::intel_gpu_dg2* - test(); - break; - } - } - return 0; -} diff --git a/sycl/test-e2e/Matrix/get_coord_int8_matB_impl.hpp b/sycl/test-e2e/Matrix/get_coord_int8_matB_impl.hpp deleted file mode 100644 index 08cb616cc6cc4..0000000000000 --- a/sycl/test-e2e/Matrix/get_coord_int8_matB_impl.hpp +++ /dev/null @@ -1,214 +0,0 @@ -//==----------- get_coord_int8_matB_impl.hpp - DPC++ joint_matrix---------==// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -#include -#include - -template class add_cols; - -template -void sum_cols_ref( - host_accessor B, - host_accessor sum_cols) { - TResult sum_cols_ref[Cols] = {0}; - for (size_t j = 0; j < Cols; j++) { - for (size_t i = 0; i < Rows; i++) { - sum_cols_ref[j] += B[i][j]; - } - auto diff = sum_cols[j] - sum_cols_ref[j]; - assert(std::fabs(static_cast(diff)) <= - std::numeric_limits::epsilon()); - } -} - -// clang-format off -/* - Here is a demonstration of how matrix B will be divided across - work items for this test case for sub group size = 16 on PVC. - < --------------- 128 ----------------------------------> - x x x x x x x x x x x x x x x x .......... x x x x x x ^ - x x x x x x x x x x x x x x x x .......... x x x x x x 16 - x x x x x x x x x x x x x x x x .......... x x x x x x | - ..... | - x x x x x x x x x x x x x x x x .......... x x x x x x | - x x x x x x x x x x x x x x x x .......... x x x x x x v - - --------------- 64 ----------------> - x x x x x x .......... x x x x x x ^ - x x x x x x .......... x x x x x x 8 - x x x x x x .......... x x x x x x | <-- part of (VNNI-ed) - ..... | original matrix each SG - x x x x x x .......... x x x x x x | holds - x x x x x x .......... x x x x x x v - < WI0 > < WI15 > - <-------- 16 -------------> - x x x .......... x x x ^ - x x x .......... x x x | - x x x .......... x x x | <-- part of (non-VNNI-ed) original matrix - ..... | each SG holds - x x x .......... x x x | - x x x .......... x x x | - x x x .......... x x x 32 - x x x .......... x x x | - x x x .......... x x x | - x x x .......... x x x | - x x x .......... x x x | - x x x .......... x x x | - x x x .......... x x x v - If we divide the above matrix across 16 (SG_SZ) work items, - each WI will hold 32 elements. And these 32 elements will be - 8x4 chunks as shown in the VNNI-ed matrix figure. - -The total distribution among the WIs in ALL the sub-groups is as follows: -This is useful to figure out how the global index is to be calculated - -W0 --> 0 0 0 0 1 1 1 1 ... 7 7 7 7 --> total 32 elements -wi [0,0] --> i=0, [0, 0] wi [0,1] --> i=0, [0, 4] wi [0,15] --> i=0, [0, 60] | wi [0,16] --> i=0, [0, 64] - i=1, [0, 1] i=1, [0, 5] i=1, [0, 61] | i=1, [0, 65] - i=2, [0, 2] i=2, [0, 6] i=2, [0, 62] | i=2, [0, 66] - i=3, [0, 3] i=3, [0, 7] i=3, [0, 63] | i=3, [0, 67] - i=4, [1, 0] i=4, [1, 4] i=4, [1, 60] | .... - i=5, [1, 1] i=5, [1, 5] i=5, [1, 61] | - i=6, [1, 2] i=6, [1, 6] i=6, [1, 62] | - i=7, [1, 3] i=7, [1, 7] i=7, [1, 63] | - ... ... .... | - i=28,[7, 0] i=28,[7, 4] i=28,[7, 60] | i=28, [7, 124] - i=29,[7, 1] i=29,[7, 5] i=29,[7, 61] | i=29, [7, 125] - i=30,[7, 2] i=30,[7, 6] i=30,[7, 62] | i=30, [7, 126] - i=31,[7, 3] i=31,[7, 7] i=31,[7, 63] | i=31, [7, 127] ----------------------------------------------------------------------------------------- --------------------------- -wi [1,0] --> i=0, [8, 0] - i=1, [8, 1] - i=2, [8, 2] - i=3, [8, 2] - ... - i=28, [15, 0] - i=29, [15, 1] - i=30, [15, 2] - i=31, [15, 3] -*/ - -// clang-format on - -template -void matrix_sum_cols(big_matrix &B, - big_matrix &Bvnni) { - buffer bufB(B.get_data(), range<2>(Rows, Cols)); - buffer bufBvnni(Bvnni.get_data(), range<2>(Rows / VNNI, Cols * VNNI)); - - TResult sum_cols[Cols] = {0}; - buffer sum_cols_v(sum_cols, Cols); - - size_t NDRangeK = Rows / TileRows; - size_t NDRangeN = Cols / TileCols; - queue q; - size_t sg_size = get_sg_size>(q); - nd_range<2> r({NDRangeK, NDRangeN * sg_size}, {1, 1 * sg_size}); - - q.submit([&](handler &cgh) { - sycl::accessor accB{bufBvnni, cgh, sycl::read_write}; - sycl::accessor v{sum_cols_v, cgh, sycl::read_write}; - - cgh.parallel_for>( - r, [=](nd_item<2> spmd_item) -#ifdef SG_SZ - [[sycl::reqd_sub_group_size(SG_SZ)]] -#endif - { - const auto global_idx = spmd_item.get_global_id(0); - const auto global_idy = spmd_item.get_global_id(1); - const auto sg_startx = global_idx - spmd_item.get_local_id(0); - const auto sg_starty = global_idy - spmd_item.get_local_id(1); - - sycl::sub_group sg = spmd_item.get_sub_group(); - - joint_matrix - sub_b; - - joint_matrix_load( - sg, sub_b, - accB.template get_multi_ptr() + - (sg_startx * (TileRows / VNNI) * Cols * VNNI) + - sg_starty / sg_size * TileCols * VNNI, - Cols * VNNI); - - TResult sum_local_cols[Cols] = {0}; - ext::intel::experimental::matrix::joint_matrix_apply( - sg, sub_b, [&](T &x, size_t row, size_t col) { - // the coordinates returned are in the logical range - // [Rows,Cols] If users want to retrieve the VNNIed - // coordinates, they can be obtained using colVNNI = col/VNNI - // rowVNNI = row*VNNI - size_t global_index = col + global_idy / sg_size * TileCols; - sum_local_cols[global_index] += x; - }); - - for (int i = 0; i < Cols; i++) { - sum_local_cols[i] = - reduce_over_group(sg, sum_local_cols[i], sycl::plus<>()); - if (global_idy % sg_size == 0) { - sycl::atomic_ref - aref(v[i]); - aref.fetch_add(sum_local_cols[i]); - } - } - }); // parallel for - }).wait(); - sum_cols_ref(bufB.get_host_access(), - sum_cols_v.get_host_access()); -} - -template -void test() { - static constexpr size_t scale = 2; - static constexpr size_t MATRIX_K = TK * scale; - static constexpr size_t MATRIX_N = TN * scale; - - T B[MATRIX_K][MATRIX_N]; - big_matrix MB((T *)&B); - - T Bvnni[MATRIX_K / VNNI][MATRIX_N * VNNI]; - big_matrix MBvnni((T *)&Bvnni); - - for (int i = 0; i < MATRIX_K; i++) { - for (int j = 0; j < MATRIX_N; j++) { - B[i][j] = i + j; - } - } - matrix_vnni(MATRIX_K, MATRIX_N, *B, *Bvnni, VNNI); - // This test calculates sum of columns in the non VNNI B matrix - matrix_sum_cols(MB, MBvnni); -} - -int main() { - queue q; - std::vector combinations = - q.get_device() - .get_info(); - - for (unsigned int i = 0; i < combinations.size(); i++) { - if (combinations[i].nsize == 0) { // Intel AMX - test(); - break; - } - - if (combinations[i].nsize == 16) { // architecture::intel_gpu_pvc - test(); - break; - } - - if (combinations[i].nsize == 8) { // architecture::intel_gpu_dg2* - test(); - break; - } - } - return 0; -} \ No newline at end of file diff --git a/sycl/test-e2e/Matrix/get_coord_int8_matB.cpp b/sycl/test-e2e/Matrix/get_coordinate_ops.cpp similarity index 73% rename from sycl/test-e2e/Matrix/get_coord_int8_matB.cpp rename to sycl/test-e2e/Matrix/get_coordinate_ops.cpp index 255331dca89cb..b32145b954422 100644 --- a/sycl/test-e2e/Matrix/get_coord_int8_matB.cpp +++ b/sycl/test-e2e/Matrix/get_coordinate_ops.cpp @@ -1,4 +1,4 @@ -//==----------- get_coord_int8_matB.cpp - DPC++ joint_matrix---------==// +//==----------- get_coordinate_ops.cpp - DPC++ joint_matrix---------==// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -7,9 +7,12 @@ //===----------------------------------------------------------------------===// // REQUIRES: aspect-ext_intel_matrix +// XFAIL: !igc-dev +// XFAIL-TRACKER: GSD-6376 +// REQUIRES-INTEL-DRIVER: lin: 30049 + // RUN: %{build} -o %t.out // RUN: %{run} %t.out -// REQUIRES-INTEL-DRIVER: lin: 30049 #include "common.hpp" -#include "get_coord_int8_matB_impl.hpp" +#include "get_coordinate_ops_impl.hpp" diff --git a/sycl/test-e2e/Matrix/get_coordinate_ops_impl.hpp b/sycl/test-e2e/Matrix/get_coordinate_ops_impl.hpp new file mode 100644 index 0000000000000..a21f3daa47b1a --- /dev/null +++ b/sycl/test-e2e/Matrix/get_coordinate_ops_impl.hpp @@ -0,0 +1,227 @@ +//==----------- get_coordinate_ops_impl.hpp - DPC++ joint_matrix---------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +#include +#include + +template +class matrix_process; + +template +void matrix_sum(big_matrix &M, + TResult *sum_rows, TResult *sum_cols) { + buffer buf((T *)M.get_data(), range<2>(NUM_ROWS / VF, NUM_COLS * VF)); + buffer sum_rows_v(sum_rows, NUM_ROWS); + buffer sum_cols_v(sum_cols, NUM_COLS); + + queue q; + size_t sg_size = + get_sg_size>(q); + q.submit([&](handler &cgh) { + sycl::accessor acc{buf, cgh, sycl::read_write}; + sycl::accessor v_rows{sum_rows_v, cgh, sycl::read_write}; + sycl::accessor v_cols{sum_cols_v, cgh, sycl::read_write}; + + cgh.parallel_for>( + nd_range<2>({NUM_ROWS / SROWS, NUM_COLS / SCOLS * sg_size}, + {1, 1 * sg_size}), + [=](nd_item<2> spmd_item) +#ifdef SG_SZ + [[intel::reqd_sub_group_size(SG_SZ)]] +#endif + { + // The submatrix API has to be accessed by all the workitems in a + // subgroup these functions will be called once by the subgroup no + // code divergence between the workitems + const auto global_idx = spmd_item.get_global_id(0); + const auto global_idy = spmd_item.get_global_id(1); + const auto sg_startx = global_idx - spmd_item.get_local_id(0); + const auto sg_starty = global_idy - spmd_item.get_local_id(1); + + sub_group sg = spmd_item.get_sub_group(); + + TResult sum_local_rows[NUM_ROWS] = {0}; + TResult sum_local_cols[NUM_COLS] = {0}; + + if (Use == use::accumulator) { + joint_matrix + sub; + + joint_matrix_load( + sg, sub, + acc.template get_multi_ptr() + + (sg_startx * SROWS * NUM_COLS) + + sg_starty / sg_size * SCOLS, + NUM_COLS, Layout); + + ext::intel::experimental::matrix::joint_matrix_apply( + sg, sub, [&](T &x, size_t row, size_t col) { + sum_local_rows[row + global_idx * SROWS] += x; + sum_local_cols[col + global_idy / sg_size * SCOLS] += x; + }); + + } else { + joint_matrix sub; + + joint_matrix_load( + sg, sub, + acc.template get_multi_ptr() + + (sg_startx * (SROWS / VF) * NUM_COLS * VF) + + sg_starty / sg_size * SCOLS * VF, + NUM_COLS * VF); + + ext::intel::experimental::matrix::joint_matrix_apply( + sg, sub, [&](T &x, size_t row, size_t col) { + sum_local_rows[row + global_idx * SROWS] += x; + sum_local_cols[col + global_idy / sg_size * SCOLS] += x; + }); + } + + for (int i = 0; i < NUM_ROWS; i++) { + sum_local_rows[i] = + reduce_over_group(sg, sum_local_rows[i], sycl::plus<>()); + // only Groups leader perform the global reduction + if (global_idy % sg_size == 0) { + sycl::atomic_ref + aref(v_rows[i]); + aref.fetch_add(sum_local_rows[i]); + } + } + + for (int i = 0; i < NUM_COLS; i++) { + sum_local_cols[i] = + reduce_over_group(sg, sum_local_cols[i], sycl::plus<>()); + // only Groups leader perform the global reduction + if (global_idy % sg_size == 0) { + sycl::atomic_ref + aref(v_cols[i]); + aref.fetch_add(sum_local_cols[i]); + } + } + }); // parallel for + }).wait(); +} + +template +void test_get_coord_op() { + constexpr size_t SCALE = 2; + static constexpr size_t Rows = SROWS * SCALE; + static constexpr size_t Cols = SCOLS * SCALE; + + T M[Rows][Cols]; + T Mvnni[Rows / VF][Cols * VF]; + TResult sum_rows[Rows] = {0}; + TResult sum_rows_ref[Rows] = {0}; + TResult sum_cols[Cols] = {0}; + TResult sum_cols_ref[Cols] = {0}; + + for (int i = 0; i < Rows; i++) { + for (int j = 0; j < Cols; j++) { + M[i][j] = i + j; + } + } + + matrix_vnni(Rows, Cols, *M, *Mvnni, VF); + big_matrix MM((T *)&Mvnni); + + matrix_sum( + MM, sum_rows, sum_cols); + + for (int i = 0; i < Rows; i++) { + for (int j = 0; j < Cols; j++) { + sum_rows_ref[i] += (int)M[i][j]; + } + assert(std::fabs(sum_rows_ref[i] - sum_rows[i]) <= FLOAT_EPSILON); + } + + for (int j = 0; j < Cols; j++) { + for (int i = 0; i < Rows; i++) { + sum_cols_ref[j] += (int)M[i][j]; + } + assert(std::fabs(sum_cols_ref[j] - sum_cols[j]) <= FLOAT_EPSILON); + } +} + +int main() { + queue q; + std::vector combinations = + q.get_device() + .get_info(); + + for (unsigned int i = 0; i < combinations.size(); i++) { + if (combinations[i].nsize == 0) { // Intel AMX + test_get_coord_op(); + test_get_coord_op(); + test_get_coord_op(); + test_get_coord_op(); + test_get_coord_op(); + test_get_coord_op(); + test_get_coord_op(); + test_get_coord_op(); + break; + } + + if (combinations[i].nsize == 16) { // architecture::intel_gpu_pvc + test_get_coord_op(); + test_get_coord_op(); + test_get_coord_op(); + test_get_coord_op(); + test_get_coord_op(); + test_get_coord_op(); + // This combination is not currently supported for sub group size = 32 in + // IGC +#if (!defined(SG_SZ) || SG_SZ != 32) + test_get_coord_op(); + test_get_coord_op(); +#endif + break; + } + + if (combinations[i].nsize == 8) { // architecture::intel_gpu_dg2* + test_get_coord_op(); + test_get_coord_op(); + test_get_coord_op(); + test_get_coord_op(); + test_get_coord_op(); + test_get_coord_op(); + test_get_coord_op(); + test_get_coord_op(); + break; + } + } + return 0; +} diff --git a/sycl/test/e2e_test_requirements/no-unsupported-without-info.cpp b/sycl/test/e2e_test_requirements/no-unsupported-without-info.cpp index ec44b32f182f7..33a61de5bc8cd 100644 --- a/sycl/test/e2e_test_requirements/no-unsupported-without-info.cpp +++ b/sycl/test/e2e_test_requirements/no-unsupported-without-info.cpp @@ -54,7 +54,7 @@ // tests to match the required format and in that case you should just update // (i.e. reduce) the number and the list below. // -// NUMBER-OF-UNSUPPORTED-WITHOUT-INFO: 381 +// NUMBER-OF-UNSUPPORTED-WITHOUT-INFO: 379 // // List of improperly UNSUPPORTED tests. // Remove the CHECK once the test has been properly UNSUPPORTED. @@ -264,9 +264,7 @@ // CHECK-NEXT: Matrix/SG32/element_wise_all_ops_int8_packed.cpp // CHECK-NEXT: Matrix/SG32/element_wise_all_sizes.cpp // CHECK-NEXT: Matrix/SG32/element_wise_ops.cpp -// CHECK-NEXT: Matrix/SG32/get_coord_float_matC.cpp -// CHECK-NEXT: Matrix/SG32/get_coord_int8_matA.cpp -// CHECK-NEXT: Matrix/SG32/get_coord_int8_matB.cpp +// CHECK-NEXT: Matrix/SG32/get_coordinate_ops.cpp // CHECK-NEXT: Matrix/SG32/joint_matrix_all_sizes.cpp // CHECK-NEXT: Matrix/SG32/joint_matrix_apply_bf16.cpp // CHECK-NEXT: Matrix/SG32/joint_matrix_apply_two_matrices.cpp