From 61a47e6e2c1fbd4e3353716505a7900d9988c5b1 Mon Sep 17 00:00:00 2001 From: hyunback Date: Fri, 7 Feb 2025 08:43:04 +0900 Subject: [PATCH 1/4] [GPU] Fix resmlp-12-distilled-224 failure. implicit casting error when different input and output data type. Signed-off-by: hyunback --- .../src/kernel_selector/cl_kernels/permute_f_y_axes.cl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/permute_f_y_axes.cl b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/permute_f_y_axes.cl index 9f74654bd45a65..14031c4f420cf3 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/permute_f_y_axes.cl +++ b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/permute_f_y_axes.cl @@ -122,7 +122,7 @@ KERNEL (permute_f_y_axes)( #endif ) { - __local OUTPUT_TYPE transpose_buf[TILE_SIZE][TILE_SIZE+1]; + __local INPUT0_TYPE transpose_buf[TILE_SIZE][TILE_SIZE+1]; const int bf = get_global_id(2); const int b_idx = bf / INPUT0_FEATURE_NUM; @@ -154,7 +154,7 @@ KERNEL (permute_f_y_axes)( __attribute__((opencl_unroll_hint(J_TIMES))) for (int j = 0; j < J_TIMES; ++j) { const int j_vec = j * VEC_SIZE; - OUT_VEC_TYPE res = READ_VEC(0, &transpose_buf[bf_local][j_vec]); + IN_VEC_TYPE res = READ_VEC(0, &transpose_buf[bf_local][j_vec]); const int f_out_idx = y_begin + bf_local; const int y_out_idx = (f_begin + j_vec) % INPUT0_FEATURE_NUM;; FUSED_OPS_VEC; From f6efbaa81e66e0cc1b19b950ad732229f4243e8c Mon Sep 17 00:00:00 2001 From: hyunback Date: Fri, 7 Feb 2025 10:48:11 +0900 Subject: [PATCH 2/4] Fix func_test_failure. Signed-off-by: hyunback --- .../src/kernel_selector/cl_kernels/permute_f_y_axes.cl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/permute_f_y_axes.cl b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/permute_f_y_axes.cl index 14031c4f420cf3..2109ebec19f6a3 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/permute_f_y_axes.cl +++ b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/permute_f_y_axes.cl @@ -169,7 +169,7 @@ KERNEL (permute_f_y_axes)( const int f = (f_begin + j_vec) % INPUT0_FEATURE_NUM;; const int y_idx = y_begin + bf_local; const int output_idx = OUTPUT_GET_INDEX(b_idx, y_idx, f, x_idx); - WRITE_VEC(READ_VEC(0, &transpose_buf[bf_local][j_vec]), 0, &output[output_idx]); + WRITE_VEC(TO_OUT_VEC_TYPE(READ_VEC(0, &transpose_buf[bf_local][j_vec])), 0, &output[output_idx]); } #endif From 5aa9ed2ecc6ad5d6746810fb7da5ef9e4b9391fe Mon Sep 17 00:00:00 2001 From: hyunback Date: Fri, 7 Feb 2025 17:23:57 +0900 Subject: [PATCH 3/4] Use accumulator_type macro. Signed-off-by: hyunback --- .../src/kernel_selector/cl_kernels/permute_f_y_axes.cl | 4 ++-- .../kernels/permute/permute_kernel_f_y_axes.cpp | 2 ++ 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/permute_f_y_axes.cl b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/permute_f_y_axes.cl index 2109ebec19f6a3..8d93743094801b 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/permute_f_y_axes.cl +++ b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/permute_f_y_axes.cl @@ -122,7 +122,7 @@ KERNEL (permute_f_y_axes)( #endif ) { - __local INPUT0_TYPE transpose_buf[TILE_SIZE][TILE_SIZE+1]; + __local ACCUMULATOR_TYPE transpose_buf[TILE_SIZE][TILE_SIZE+1]; const int bf = get_global_id(2); const int b_idx = bf / INPUT0_FEATURE_NUM; @@ -154,7 +154,7 @@ KERNEL (permute_f_y_axes)( __attribute__((opencl_unroll_hint(J_TIMES))) for (int j = 0; j < J_TIMES; ++j) { const int j_vec = j * VEC_SIZE; - IN_VEC_TYPE res = READ_VEC(0, &transpose_buf[bf_local][j_vec]); + ACC_VEC_TYPE res = READ_VEC(0, &transpose_buf[bf_local][j_vec]); const int f_out_idx = y_begin + bf_local; const int y_out_idx = (f_begin + j_vec) % INPUT0_FEATURE_NUM;; FUSED_OPS_VEC; diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/permute/permute_kernel_f_y_axes.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/permute/permute_kernel_f_y_axes.cpp index deb25d17618347..b2d3141ee74142 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/permute/permute_kernel_f_y_axes.cpp +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/permute/permute_kernel_f_y_axes.cpp @@ -145,6 +145,8 @@ JitConstants PermuteKernel_f_y_axes::GetJitConstants(const permute_params& param jit.AddConstant(MakeJitConstant("SUB_GROUP_SIZE", subgroup_size)); } + jit.Merge(MakeTypeJitConstants(params.inputs[0].GetDType(), "ACCUMULATOR")); + if (!params.fused_ops.empty()) { const std::vector original_output_order = {"b_idx", "f_out_idx", "y_out_idx", "x_idx"}; const FusedOpsConfiguration conf_scalar = {"", original_output_order, "res", params.inputs[0].GetDType(), 1}; From 64c9694d60784dcfde82ffab87a79887a8c8ffda Mon Sep 17 00:00:00 2001 From: hyunback Date: Fri, 7 Feb 2025 18:04:54 +0900 Subject: [PATCH 4/4] Add unit_test. Signed-off-by: hyunback --- .../intel_gpu/tests/unit/fusions/permute_fusion_test.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/plugins/intel_gpu/tests/unit/fusions/permute_fusion_test.cpp b/src/plugins/intel_gpu/tests/unit/fusions/permute_fusion_test.cpp index 66ecf6a7918c2b..2714653727f380 100644 --- a/src/plugins/intel_gpu/tests/unit/fusions/permute_fusion_test.cpp +++ b/src/plugins/intel_gpu/tests/unit/fusions/permute_fusion_test.cpp @@ -103,6 +103,7 @@ class PermuteReorderFusingTest : public ::BaseFusingTest #define CASE_PERMUTE_F32_5 { 1, 32, 4, 5 }, { 32, 4, 5, 1 }, { 1, 3, 0, 2 }, tensor{ 0 }, data_types::f32, format::b_fs_yx_fsv16, data_types::f32, format::bfyx #define CASE_PERMUTE_F32_6 { 1, 16, 4, 5 }, { 5, 16, 4, 1 }, { 2, 1, 0, 3 }, tensor{ 0 }, data_types::f32, format::b_fs_yx_fsv16, data_types::f32, format::bfyx #define CASE_PERMUTE_F32_7 { 1, 16, 1, 1 }, { 1, 1, 1, 16 }, { 3, 2, 1, 0 }, tensor{ 0 }, data_types::f32, format::b_fs_yx_fsv16, data_types::f32, format::bfyx +#define CASE_PERMUTE_F32_8 { 1, 16, 1, 32 }, { 1, 16, 1, 32 }, { 0, 2, 1, 3 }, tensor{ 0 }, data_types::f32, format::bfyx, data_types::f32, format::bfyx #define CASE_PERMUTE_F16_0 { 1, 16, 4, 5 }, { 1, 16, 4, 5 }, { 0, 1, 2, 3 }, tensor{ 0 }, data_types::f16, format::b_fs_yx_fsv16, data_types::f32, format::bfyx #define CASE_PERMUTE_F16_1 { 2, 16, 4, 5 }, { 16, 4, 5, 2 }, { 1, 3, 0, 2 }, tensor{ 0 }, data_types::f16, format::b_fs_yx_fsv16, data_types::f32, format::bfyx @@ -314,6 +315,7 @@ INSTANTIATE_TEST_SUITE_P(fusings_gpu, permute_quant_u8, ::testing::ValuesIn(std: permute_params{ CASE_PERMUTE_F16_0, 2, 3 }, permute_params{ CASE_PERMUTE_F16_1, 2, 3 }, + permute_params{ CASE_PERMUTE_F32_8, 2, 3 }, })); class permute_scale_actv_eltw_scale_actv_quant_i8: public PermuteFusingTest {};