Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[GPU] Fix resmlp-12-distilled-224 failure. #28861

Open
wants to merge 4 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -122,7 +122,7 @@ KERNEL (permute_f_y_axes)(
#endif
)
{
__local OUTPUT_TYPE transpose_buf[TILE_SIZE][TILE_SIZE+1];
__local ACCUMULATOR_TYPE transpose_buf[TILE_SIZE][TILE_SIZE+1];

const int bf = get_global_id(2);
const int b_idx = bf / INPUT0_FEATURE_NUM;
Expand Down Expand Up @@ -154,7 +154,7 @@ KERNEL (permute_f_y_axes)(
__attribute__((opencl_unroll_hint(J_TIMES)))
for (int j = 0; j < J_TIMES; ++j) {
const int j_vec = j * VEC_SIZE;
OUT_VEC_TYPE res = READ_VEC(0, &transpose_buf[bf_local][j_vec]);
ACC_VEC_TYPE res = READ_VEC(0, &transpose_buf[bf_local][j_vec]);
const int f_out_idx = y_begin + bf_local;
const int y_out_idx = (f_begin + j_vec) % INPUT0_FEATURE_NUM;;
FUSED_OPS_VEC;
Expand All @@ -169,7 +169,7 @@ KERNEL (permute_f_y_axes)(
const int f = (f_begin + j_vec) % INPUT0_FEATURE_NUM;;
const int y_idx = y_begin + bf_local;
const int output_idx = OUTPUT_GET_INDEX(b_idx, y_idx, f, x_idx);
WRITE_VEC(READ_VEC(0, &transpose_buf[bf_local][j_vec]), 0, &output[output_idx]);
WRITE_VEC(TO_OUT_VEC_TYPE(READ_VEC(0, &transpose_buf[bf_local][j_vec])), 0, &output[output_idx]);
}
#endif

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -145,6 +145,8 @@ JitConstants PermuteKernel_f_y_axes::GetJitConstants(const permute_params& param
jit.AddConstant(MakeJitConstant("SUB_GROUP_SIZE", subgroup_size));
}

jit.Merge(MakeTypeJitConstants(params.inputs[0].GetDType(), "ACCUMULATOR"));

if (!params.fused_ops.empty()) {
const std::vector<std::string> original_output_order = {"b_idx", "f_out_idx", "y_out_idx", "x_idx"};
const FusedOpsConfiguration conf_scalar = {"", original_output_order, "res", params.inputs[0].GetDType(), 1};
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,7 @@ class PermuteReorderFusingTest : public ::BaseFusingTest<permute_reorder_params>
#define CASE_PERMUTE_F32_5 { 1, 32, 4, 5 }, { 32, 4, 5, 1 }, { 1, 3, 0, 2 }, tensor{ 0 }, data_types::f32, format::b_fs_yx_fsv16, data_types::f32, format::bfyx
#define CASE_PERMUTE_F32_6 { 1, 16, 4, 5 }, { 5, 16, 4, 1 }, { 2, 1, 0, 3 }, tensor{ 0 }, data_types::f32, format::b_fs_yx_fsv16, data_types::f32, format::bfyx
#define CASE_PERMUTE_F32_7 { 1, 16, 1, 1 }, { 1, 1, 1, 16 }, { 3, 2, 1, 0 }, tensor{ 0 }, data_types::f32, format::b_fs_yx_fsv16, data_types::f32, format::bfyx
#define CASE_PERMUTE_F32_8 { 1, 16, 1, 32 }, { 1, 16, 1, 32 }, { 0, 2, 1, 3 }, tensor{ 0 }, data_types::f32, format::bfyx, data_types::f32, format::bfyx

#define CASE_PERMUTE_F16_0 { 1, 16, 4, 5 }, { 1, 16, 4, 5 }, { 0, 1, 2, 3 }, tensor{ 0 }, data_types::f16, format::b_fs_yx_fsv16, data_types::f32, format::bfyx
#define CASE_PERMUTE_F16_1 { 2, 16, 4, 5 }, { 16, 4, 5, 2 }, { 1, 3, 0, 2 }, tensor{ 0 }, data_types::f16, format::b_fs_yx_fsv16, data_types::f32, format::bfyx
Expand Down Expand Up @@ -314,6 +315,7 @@ INSTANTIATE_TEST_SUITE_P(fusings_gpu, permute_quant_u8, ::testing::ValuesIn(std:

permute_params{ CASE_PERMUTE_F16_0, 2, 3 },
permute_params{ CASE_PERMUTE_F16_1, 2, 3 },
permute_params{ CASE_PERMUTE_F32_8, 2, 3 },
}));

class permute_scale_actv_eltw_scale_actv_quant_i8: public PermuteFusingTest {};
Expand Down
Loading