From 61a47e6e2c1fbd4e3353716505a7900d9988c5b1 Mon Sep 17 00:00:00 2001
From: hyunback <hyunback.kim@intel.com>
Date: Fri, 7 Feb 2025 08:43:04 +0900
Subject: [PATCH 1/4] [GPU] Fix resmlp-12-distilled-224 failure.

implicit casting error when different input and output data type.

Signed-off-by: hyunback <hyunback.kim@intel.com>
---
 .../src/kernel_selector/cl_kernels/permute_f_y_axes.cl        | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/permute_f_y_axes.cl b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/permute_f_y_axes.cl
index 9f74654bd45a65..14031c4f420cf3 100644
--- a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/permute_f_y_axes.cl
+++ b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/permute_f_y_axes.cl
@@ -122,7 +122,7 @@ KERNEL (permute_f_y_axes)(
 #endif
     )
 {
-    __local OUTPUT_TYPE transpose_buf[TILE_SIZE][TILE_SIZE+1];
+    __local INPUT0_TYPE transpose_buf[TILE_SIZE][TILE_SIZE+1];
 
     const int bf = get_global_id(2);
     const int b_idx = bf / INPUT0_FEATURE_NUM;
@@ -154,7 +154,7 @@ KERNEL (permute_f_y_axes)(
     __attribute__((opencl_unroll_hint(J_TIMES)))
     for (int j = 0; j < J_TIMES; ++j) {
         const int j_vec = j * VEC_SIZE;
-        OUT_VEC_TYPE res = READ_VEC(0, &transpose_buf[bf_local][j_vec]);
+        IN_VEC_TYPE res = READ_VEC(0, &transpose_buf[bf_local][j_vec]);
         const int f_out_idx = y_begin + bf_local;
         const int y_out_idx = (f_begin + j_vec) % INPUT0_FEATURE_NUM;;
         FUSED_OPS_VEC;

From f6efbaa81e66e0cc1b19b950ad732229f4243e8c Mon Sep 17 00:00:00 2001
From: hyunback <hyunback.kim@intel.com>
Date: Fri, 7 Feb 2025 10:48:11 +0900
Subject: [PATCH 2/4] Fix func_test_failure.

Signed-off-by: hyunback <hyunback.kim@intel.com>
---
 .../src/kernel_selector/cl_kernels/permute_f_y_axes.cl          | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/permute_f_y_axes.cl b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/permute_f_y_axes.cl
index 14031c4f420cf3..2109ebec19f6a3 100644
--- a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/permute_f_y_axes.cl
+++ b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/permute_f_y_axes.cl
@@ -169,7 +169,7 @@ KERNEL (permute_f_y_axes)(
         const int f = (f_begin + j_vec) % INPUT0_FEATURE_NUM;;
         const int y_idx = y_begin + bf_local;
         const int output_idx = OUTPUT_GET_INDEX(b_idx, y_idx, f, x_idx);
-        WRITE_VEC(READ_VEC(0, &transpose_buf[bf_local][j_vec]), 0, &output[output_idx]);
+        WRITE_VEC(TO_OUT_VEC_TYPE(READ_VEC(0, &transpose_buf[bf_local][j_vec])), 0, &output[output_idx]);
     }
 #endif
 

From 5aa9ed2ecc6ad5d6746810fb7da5ef9e4b9391fe Mon Sep 17 00:00:00 2001
From: hyunback <hyunback.kim@intel.com>
Date: Fri, 7 Feb 2025 17:23:57 +0900
Subject: [PATCH 3/4] Use accumulator_type macro.

Signed-off-by: hyunback <hyunback.kim@intel.com>
---
 .../src/kernel_selector/cl_kernels/permute_f_y_axes.cl        | 4 ++--
 .../kernels/permute/permute_kernel_f_y_axes.cpp               | 2 ++
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/permute_f_y_axes.cl b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/permute_f_y_axes.cl
index 2109ebec19f6a3..8d93743094801b 100644
--- a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/permute_f_y_axes.cl
+++ b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/permute_f_y_axes.cl
@@ -122,7 +122,7 @@ KERNEL (permute_f_y_axes)(
 #endif
     )
 {
-    __local INPUT0_TYPE transpose_buf[TILE_SIZE][TILE_SIZE+1];
+    __local ACCUMULATOR_TYPE transpose_buf[TILE_SIZE][TILE_SIZE+1];
 
     const int bf = get_global_id(2);
     const int b_idx = bf / INPUT0_FEATURE_NUM;
@@ -154,7 +154,7 @@ KERNEL (permute_f_y_axes)(
     __attribute__((opencl_unroll_hint(J_TIMES)))
     for (int j = 0; j < J_TIMES; ++j) {
         const int j_vec = j * VEC_SIZE;
-        IN_VEC_TYPE res = READ_VEC(0, &transpose_buf[bf_local][j_vec]);
+        ACC_VEC_TYPE res = READ_VEC(0, &transpose_buf[bf_local][j_vec]);
         const int f_out_idx = y_begin + bf_local;
         const int y_out_idx = (f_begin + j_vec) % INPUT0_FEATURE_NUM;;
         FUSED_OPS_VEC;
diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/permute/permute_kernel_f_y_axes.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/permute/permute_kernel_f_y_axes.cpp
index deb25d17618347..b2d3141ee74142 100644
--- a/src/plugins/intel_gpu/src/kernel_selector/kernels/permute/permute_kernel_f_y_axes.cpp
+++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/permute/permute_kernel_f_y_axes.cpp
@@ -145,6 +145,8 @@ JitConstants PermuteKernel_f_y_axes::GetJitConstants(const permute_params& param
         jit.AddConstant(MakeJitConstant("SUB_GROUP_SIZE", subgroup_size));
     }
 
+    jit.Merge(MakeTypeJitConstants(params.inputs[0].GetDType(), "ACCUMULATOR"));
+
     if (!params.fused_ops.empty()) {
         const std::vector<std::string> original_output_order = {"b_idx", "f_out_idx", "y_out_idx", "x_idx"};
         const FusedOpsConfiguration conf_scalar = {"", original_output_order, "res", params.inputs[0].GetDType(), 1};

From 64c9694d60784dcfde82ffab87a79887a8c8ffda Mon Sep 17 00:00:00 2001
From: hyunback <hyunback.kim@intel.com>
Date: Fri, 7 Feb 2025 18:04:54 +0900
Subject: [PATCH 4/4] Add unit_test.

Signed-off-by: hyunback <hyunback.kim@intel.com>
---
 .../intel_gpu/tests/unit/fusions/permute_fusion_test.cpp        | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/plugins/intel_gpu/tests/unit/fusions/permute_fusion_test.cpp b/src/plugins/intel_gpu/tests/unit/fusions/permute_fusion_test.cpp
index 66ecf6a7918c2b..2714653727f380 100644
--- a/src/plugins/intel_gpu/tests/unit/fusions/permute_fusion_test.cpp
+++ b/src/plugins/intel_gpu/tests/unit/fusions/permute_fusion_test.cpp
@@ -103,6 +103,7 @@ class PermuteReorderFusingTest : public ::BaseFusingTest<permute_reorder_params>
 #define CASE_PERMUTE_F32_5 { 1, 32, 4, 5 }, { 32, 4, 5, 1 }, { 1, 3, 0, 2 }, tensor{ 0 }, data_types::f32, format::b_fs_yx_fsv16, data_types::f32, format::bfyx
 #define CASE_PERMUTE_F32_6 { 1, 16, 4, 5 }, { 5, 16, 4, 1 }, { 2, 1, 0, 3 }, tensor{ 0 }, data_types::f32, format::b_fs_yx_fsv16, data_types::f32, format::bfyx
 #define CASE_PERMUTE_F32_7 { 1, 16, 1, 1 }, { 1, 1, 1, 16 }, { 3, 2, 1, 0 }, tensor{ 0 }, data_types::f32, format::b_fs_yx_fsv16, data_types::f32, format::bfyx
+#define CASE_PERMUTE_F32_8 { 1, 16, 1, 32 }, { 1, 16, 1, 32 }, { 0, 2, 1, 3 }, tensor{ 0 }, data_types::f32, format::bfyx, data_types::f32, format::bfyx
 
 #define CASE_PERMUTE_F16_0 { 1, 16, 4, 5 }, { 1, 16, 4, 5 }, { 0, 1, 2, 3 }, tensor{ 0 }, data_types::f16, format::b_fs_yx_fsv16, data_types::f32, format::bfyx
 #define CASE_PERMUTE_F16_1 { 2, 16, 4, 5 }, { 16, 4, 5, 2 }, { 1, 3, 0, 2 }, tensor{ 0 }, data_types::f16, format::b_fs_yx_fsv16, data_types::f32, format::bfyx
@@ -314,6 +315,7 @@ INSTANTIATE_TEST_SUITE_P(fusings_gpu, permute_quant_u8, ::testing::ValuesIn(std:
 
     permute_params{ CASE_PERMUTE_F16_0, 2, 3 },
     permute_params{ CASE_PERMUTE_F16_1, 2, 3 },
+    permute_params{ CASE_PERMUTE_F32_8, 2, 3 },
 }));
 
 class permute_scale_actv_eltw_scale_actv_quant_i8: public PermuteFusingTest {};