From 51268b3a71741d6f71ca6b9b980b548b49a20679 Mon Sep 17 00:00:00 2001 From: Ashwin Natesan Date: Fri, 27 Oct 2023 16:57:35 +0530 Subject: [PATCH] svcenc: Redundant code removed The following lines of code have been removed to improve coverage - [x] Functions - - isvc_interleaved_copy - isvc_16bit_interleaved_copy - isvc_16bit_interleaved_memset - isvc_iquant_itrans_recon_chroma_4x4_neon - isvc_iquant_itrans_recon_chroma_4x4_sse42 - isvc_iquant_itrans_recon_4x4_dc_with_res_output_neon - isvc_iquant_itrans_recon_res_dc_4x4_sse42 - isvc_iquant_itrans_recon_4x4_dc_with_res_accumulate_neon - isvc_iquant_itrans_recon_res_dc_with_res_acc_4x4_sse42 - isvce_wait_for_thread [x] Function pointer initialisations for the functions above Test: svc_enc_fuzzer --- .../arm/svc/isvc_iquant_itrans_recon_neon.c | 452 -------- common/svc/isvc_mem_fns.c | 158 --- common/svc/isvc_mem_fns.h | 8 - common/svc/isvc_trans_quant_itrans_iquant.h | 8 - .../x86/svc/isvc_iquant_itrans_recon_sse42.c | 967 +++--------------- encoder/arm/svc/isvce_function_selector_a9q.c | 14 - encoder/arm/svc/isvce_function_selector_av8.c | 16 +- encoder/svc/isvce_api.c | 265 ++--- encoder/svc/isvce_encode.c | 33 - encoder/svc/isvce_function_selector_generic.c | 4 - encoder/svc/isvce_ilp_mv.c | 2 +- encoder/svc/isvce_structs.h | 8 - .../x86/svc/isvce_function_selector_sse42.c | 7 - 13 files changed, 205 insertions(+), 1737 deletions(-) diff --git a/common/arm/svc/isvc_iquant_itrans_recon_neon.c b/common/arm/svc/isvc_iquant_itrans_recon_neon.c index 270adde4..8a97fbc5 100644 --- a/common/arm/svc/isvc_iquant_itrans_recon_neon.c +++ b/common/arm/svc/isvc_iquant_itrans_recon_neon.c @@ -587,193 +587,6 @@ void isvc_iquant_itrans_recon_4x4_with_res_accumulate_neon( vreinterpret_u32_u8(pred23_un), 1); } -void isvc_iquant_itrans_recon_chroma_4x4_neon( - buffer_container_t *ps_src, buffer_container_t *ps_pred, buffer_container_t *ps_res_pred, - buffer_container_t *ps_res, buffer_container_t *ps_rec, - iq_it_res_rec_constants_t *ps_iq_it_res_rec_constants, WORD16 *pi2_tmp, WORD16 *pi2_dc_src, - WORD32 i4_iq_start_idx, UWORD8 u1_res_accumulate) -{ - WORD16 *pi2_src = (WORD16 *) ps_src->pv_data; - UWORD8 *pu1_pred = (UWORD8 *) ps_pred->pv_data; - UWORD8 *pu1_out = (UWORD8 *) ps_rec->pv_data; - WORD32 i4_pred_stride = ps_pred->i4_data_stride; - WORD32 i4_out_stride = ps_rec->i4_data_stride; - const UWORD16 *pu2_iscal_mat = ps_iq_it_res_rec_constants->pu2_iscal_mat; - const UWORD16 *pu2_weigh_mat = ps_iq_it_res_rec_constants->pu2_weigh_mat; - UWORD32 u4_qp_div_6 = ps_iq_it_res_rec_constants->u4_qp_div_6; - - WORD16 i2_rnd_factor = (u4_qp_div_6 < 4) ? 1 << (3 - u4_qp_div_6) : 0; - - int16x4x4_t src_16x4x2; - int16x4x4_t iscal_16x4x2; - int16x4x4_t weigh_16x4x2; - - int16x4_t q0_16x4, q1_16x4, q2_16x4, q3_16x4; - int32x4_t q0_32x4, q1_32x4, q2_32x4, q3_32x4; - int16x4_t rq1_16x4, rq3_16x4; - int16x4_t x0_16x4, x1_16x4, x2_16x4, x3_16x4; - int16x8_t x0_16x8, x1_16x8, x2_16x8, x3_16x8; - int16x4_t xx0_16x4, xx1_16x4, xx2_16x4, xx3_16x4; - int16x4x2_t xx0_16x4x2, xx1_16x4x2; - int32x2x2_t x0_32x2x2, x1_32x2x2; - int16x4_t weigh0_16x4, weigh1_16x4, weigh2_16x4, weigh3_16x4; - - uint8x8_t pred0_in, pred1_in, pred2_in, pred3_in; - int16x8_t pred0, pred1, pred2, pred3; - int16x8_t rec0, rec1, rec2, rec3; - uint8x8_t rec0_un, rec1_un, rec2_un, rec3_un; - uint8x8_t out0, out1, out2, out3; - - uint8x8_t chroma_mask_8x8 = vreinterpret_u8_u16(vdup_n_u16(0x00ff)); - - int16x4_t pos_255_16x4 = vdup_n_s16(((WORD16) UINT8_MAX)); - int16x4_t neg_255_16x4 = vdup_n_s16(-((WORD16) UINT8_MAX)); - int32x4_t qp_div_6_32x4 = vdupq_n_s32(u4_qp_div_6); - int32x4_t rnd_fact = vdupq_n_s32(i2_rnd_factor); - - UNUSED(i4_iq_start_idx); - UNUSED(ps_res); - UNUSED(ps_res_pred); - UNUSED(u1_res_accumulate); - - src_16x4x2 = vld4_s16(pi2_src); - iscal_16x4x2 = vld4_s16((const int16_t *) pu2_iscal_mat); - weigh_16x4x2 = vld4_s16((const int16_t *) pu2_weigh_mat); - - weigh0_16x4 = vmul_s16(weigh_16x4x2.val[0], iscal_16x4x2.val[0]); - weigh1_16x4 = vmul_s16(weigh_16x4x2.val[1], iscal_16x4x2.val[1]); - weigh2_16x4 = vmul_s16(weigh_16x4x2.val[2], iscal_16x4x2.val[2]); - weigh3_16x4 = vmul_s16(weigh_16x4x2.val[3], iscal_16x4x2.val[3]); - - q0_32x4 = vmull_s16(weigh0_16x4, src_16x4x2.val[0]); - q1_32x4 = vmull_s16(weigh1_16x4, src_16x4x2.val[1]); - q2_32x4 = vmull_s16(weigh2_16x4, src_16x4x2.val[2]); - q3_32x4 = vmull_s16(weigh3_16x4, src_16x4x2.val[3]); - - q0_32x4 = vaddq_s32(q0_32x4, rnd_fact); - q1_32x4 = vaddq_s32(q1_32x4, rnd_fact); - q2_32x4 = vaddq_s32(q2_32x4, rnd_fact); - q3_32x4 = vaddq_s32(q3_32x4, rnd_fact); - - q0_32x4 = vshlq_s32(q0_32x4, qp_div_6_32x4); - q1_32x4 = vshlq_s32(q1_32x4, qp_div_6_32x4); - q2_32x4 = vshlq_s32(q2_32x4, qp_div_6_32x4); - q3_32x4 = vshlq_s32(q3_32x4, qp_div_6_32x4); - - q0_16x4 = vqshrn_n_s32(q0_32x4, 4); - q1_16x4 = vqshrn_n_s32(q1_32x4, 4); - q2_16x4 = vqshrn_n_s32(q2_32x4, 4); - q3_16x4 = vqshrn_n_s32(q3_32x4, 4); - - q0_16x4 = vset_lane_s16(pi2_dc_src[0], q0_16x4, 0); - - rq1_16x4 = vshr_n_s16(q1_16x4, 1); - rq3_16x4 = vshr_n_s16(q3_16x4, 1); - - x0_16x4 = vadd_s16(q0_16x4, q2_16x4); - x1_16x4 = vsub_s16(q0_16x4, q2_16x4); - x2_16x4 = vsub_s16(rq1_16x4, q3_16x4); - x3_16x4 = vadd_s16(q1_16x4, rq3_16x4); - - xx0_16x4 = vadd_s16(x0_16x4, x3_16x4); - xx1_16x4 = vadd_s16(x1_16x4, x2_16x4); - xx2_16x4 = vsub_s16(x1_16x4, x2_16x4); - xx3_16x4 = vsub_s16(x0_16x4, x3_16x4); - - /* row 0 to row 3 */ - xx0_16x4x2 = vtrn_s16(xx0_16x4, xx1_16x4); - xx1_16x4x2 = vtrn_s16(xx2_16x4, xx3_16x4); - x0_32x2x2 = - vzip_s32(vreinterpret_s32_s16(xx0_16x4x2.val[0]), vreinterpret_s32_s16(xx1_16x4x2.val[0])); - x1_32x2x2 = - vzip_s32(vreinterpret_s32_s16(xx0_16x4x2.val[1]), vreinterpret_s32_s16(xx1_16x4x2.val[1])); - - x0_16x4 = vreinterpret_s16_s32(x0_32x2x2.val[0]); - x1_16x4 = vreinterpret_s16_s32(x1_32x2x2.val[0]); - x2_16x4 = vreinterpret_s16_s32(x0_32x2x2.val[1]); - x3_16x4 = vreinterpret_s16_s32(x1_32x2x2.val[1]); - - /* Store Horz transform output into temp */ - vst1_s16(pi2_tmp, x0_16x4); - vst1_s16(pi2_tmp + 4, x1_16x4); - vst1_s16(pi2_tmp + 8, x2_16x4); - vst1_s16(pi2_tmp + 12, x3_16x4); - - /* vertical inverse transform */ - rq1_16x4 = vshr_n_s16(x1_16x4, 1); - rq3_16x4 = vshr_n_s16(x3_16x4, 1); - - xx0_16x4 = vadd_s16(x0_16x4, x2_16x4); - xx1_16x4 = vsub_s16(x0_16x4, x2_16x4); - xx2_16x4 = vsub_s16(rq1_16x4, x3_16x4); - xx3_16x4 = vadd_s16(x1_16x4, rq3_16x4); - - x0_16x4 = vadd_s16(xx0_16x4, xx3_16x4); - x1_16x4 = vadd_s16(xx1_16x4, xx2_16x4); - x2_16x4 = vsub_s16(xx1_16x4, xx2_16x4); - x3_16x4 = vsub_s16(xx0_16x4, xx3_16x4); - - x0_16x4 = vrshr_n_s16(x0_16x4, 6); - x1_16x4 = vrshr_n_s16(x1_16x4, 6); - x2_16x4 = vrshr_n_s16(x2_16x4, 6); - x3_16x4 = vrshr_n_s16(x3_16x4, 6); - - /* Saturate all values < -255 to -255 and retain the rest as it is */ - x0_16x4 = vmax_s16(x0_16x4, neg_255_16x4); - x1_16x4 = vmax_s16(x1_16x4, neg_255_16x4); - x2_16x4 = vmax_s16(x2_16x4, neg_255_16x4); - x3_16x4 = vmax_s16(x3_16x4, neg_255_16x4); - - /* Saturate all values > 255 to 255 and retain the rest as it is */ - x0_16x4 = vmin_s16(x0_16x4, pos_255_16x4); - x1_16x4 = vmin_s16(x1_16x4, pos_255_16x4); - x2_16x4 = vmin_s16(x2_16x4, pos_255_16x4); - x3_16x4 = vmin_s16(x3_16x4, pos_255_16x4); - - x0_16x8 = vreinterpretq_s16_s32(vmovl_s16(x0_16x4)); - x1_16x8 = vreinterpretq_s16_s32(vmovl_s16(x1_16x4)); - x2_16x8 = vreinterpretq_s16_s32(vmovl_s16(x2_16x4)); - x3_16x8 = vreinterpretq_s16_s32(vmovl_s16(x3_16x4)); - - pred0_in = vld1_u8((uint8_t *) pu1_pred); - pred1_in = vld1_u8((uint8_t *) pu1_pred + (i4_pred_stride)); - pred2_in = vld1_u8((uint8_t *) pu1_pred + (i4_pred_stride << 1)); - pred3_in = vld1_u8((uint8_t *) pu1_pred + (i4_pred_stride * 3)); - - pred0 = vreinterpretq_s16_u16(vmovl_u8(pred0_in)); - pred1 = vreinterpretq_s16_u16(vmovl_u8(pred1_in)); - pred2 = vreinterpretq_s16_u16(vmovl_u8(pred2_in)); - pred3 = vreinterpretq_s16_u16(vmovl_u8(pred3_in)); - - /* Out pixel = pred + res */ - rec0 = vaddq_s16(pred0, x0_16x8); - rec1 = vaddq_s16(pred1, x1_16x8); - rec2 = vaddq_s16(pred2, x2_16x8); - rec3 = vaddq_s16(pred3, x3_16x8); - - out0 = vld1_u8(pu1_out); - out1 = vld1_u8(pu1_out + i4_out_stride); - out2 = vld1_u8(pu1_out + i4_out_stride * 2); - out3 = vld1_u8(pu1_out + i4_out_stride * 3); - - /* Convert to 8 bit unsigned with saturation */ - rec0_un = vqmovun_s16(rec0); - rec1_un = vqmovun_s16(rec1); - rec2_un = vqmovun_s16(rec2); - rec3_un = vqmovun_s16(rec3); - - /* Store in alternate postions */ - out0 = vbsl_u8(chroma_mask_8x8, rec0_un, out0); - out1 = vbsl_u8(chroma_mask_8x8, rec1_un, out1); - out2 = vbsl_u8(chroma_mask_8x8, rec2_un, out2); - out3 = vbsl_u8(chroma_mask_8x8, rec3_un, out3); - - vst1_u8((pu1_out), out0); - vst1_u8((pu1_out + i4_out_stride), out1); - vst1_u8((pu1_out + (i4_out_stride << 1)), out2); - vst1_u8((pu1_out + ((i4_out_stride << 1) + i4_out_stride)), out3); -} - void isvc_iquant_itrans_recon_chroma_4x4_with_res_output_neon( buffer_container_t *ps_src, buffer_container_t *ps_pred, buffer_container_t *ps_res_pred, buffer_container_t *ps_res, buffer_container_t *ps_rec, @@ -1280,271 +1093,6 @@ void isvc_iquant_itrans_recon_4x4_dc_neon(buffer_container_t *ps_src, buffer_con vst1_lane_u32((uint32_t *) (pu1_out + i4_out_stride * 3), vreinterpret_u32_u8(pred3_in), 0); } -void isvc_iquant_itrans_recon_4x4_dc_with_res_output_neon( - buffer_container_t *ps_src, buffer_container_t *ps_pred, buffer_container_t *ps_res_pred, - buffer_container_t *ps_res, buffer_container_t *ps_rec, - iq_it_res_rec_constants_t *ps_iq_it_res_rec_constants, WORD16 *pi2_tmp, WORD16 *pi2_dc_src, - WORD32 i4_iq_start_idx, UWORD8 u1_res_accumulate) -{ - WORD16 *pi2_src = (WORD16 *) ps_src->pv_data; - WORD16 *pi2_res = (WORD16 *) ps_res->pv_data; - UWORD8 *pu1_pred = (UWORD8 *) ps_pred->pv_data; - UWORD8 *pu1_out = (UWORD8 *) ps_rec->pv_data; - WORD32 i4_res_stride = ps_res->i4_data_stride; - WORD32 i4_pred_stride = ps_pred->i4_data_stride; - WORD32 i4_out_stride = ps_rec->i4_data_stride; - const UWORD16 *pu2_iscal_mat = ps_iq_it_res_rec_constants->pu2_iscal_mat; - const UWORD16 *pu2_weigh_mat = ps_iq_it_res_rec_constants->pu2_weigh_mat; - UWORD32 u4_qp_div_6 = ps_iq_it_res_rec_constants->u4_qp_div_6; - WORD16 rnd_fact = (u4_qp_div_6 < 4) ? 1 << (3 - u4_qp_div_6) : 0; - - WORD16 i2_it_out; - WORD32 i4_iq_out_temp; - int16x8_t temp_0; - int16x4_t residue_res; - uint8x8_t pred0_in, pred1_in, pred2_in, pred3_in; - int16x8_t pred0, pred1, pred2, pred3; - - UNUSED(pi2_tmp); - UNUSED(ps_res_pred); - UNUSED(u1_res_accumulate); - - if(i4_iq_start_idx == 0) - { - i4_iq_out_temp = pi2_src[0]; - INV_QUANT(i4_iq_out_temp, pu2_iscal_mat[0], pu2_weigh_mat[0], u4_qp_div_6, rnd_fact, 4); - } - else - { - i4_iq_out_temp = pi2_dc_src[0]; - } - - i2_it_out = ((i4_iq_out_temp + 32) >> 6); - temp_0 = vdupq_n_s16(i2_it_out); - residue_res = vdup_n_s16(isvc_get_residue(i2_it_out, 0, 0)); - - vst1_s16(pi2_res, residue_res); - vst1_s16(pi2_res + i4_res_stride, residue_res); - vst1_s16(pi2_res + (i4_res_stride << 1), residue_res); - vst1_s16(pi2_res + (i4_res_stride << 1) + i4_res_stride, residue_res); - - pred0_in = vld1_u8(pu1_pred); - pu1_pred = pu1_pred + i4_pred_stride; - pred1_in = vld1_u8(pu1_pred); - pu1_pred = pu1_pred + i4_pred_stride; - pred2_in = vld1_u8(pu1_pred); - pu1_pred = pu1_pred + i4_pred_stride; - pred3_in = vld1_u8(pu1_pred); - - pred0 = vreinterpretq_s16_u16(vmovl_u8(pred0_in)); - pred1 = vreinterpretq_s16_u16(vmovl_u8(pred1_in)); - pred2 = vreinterpretq_s16_u16(vmovl_u8(pred2_in)); - pred3 = vreinterpretq_s16_u16(vmovl_u8(pred3_in)); - - /* Out pixel = Res + pred */ - pred0 = vaddq_s16(pred0, temp_0); - pred1 = vaddq_s16(pred1, temp_0); - pred2 = vaddq_s16(pred2, temp_0); - pred3 = vaddq_s16(pred3, temp_0); - - /* Convert to unsigned 8 bit with saturation */ - pred0_in = vqmovun_s16(pred0); - pred1_in = vqmovun_s16(pred1); - pred2_in = vqmovun_s16(pred2); - pred3_in = vqmovun_s16(pred3); - - vst1_lane_u32((uint32_t *) (pu1_out), vreinterpret_u32_u8(pred0_in), 0); - vst1_lane_u32((uint32_t *) (pu1_out + i4_out_stride), vreinterpret_u32_u8(pred1_in), 0); - vst1_lane_u32((uint32_t *) (pu1_out + i4_out_stride * 2), vreinterpret_u32_u8(pred2_in), 0); - vst1_lane_u32((uint32_t *) (pu1_out + i4_out_stride * 3), vreinterpret_u32_u8(pred3_in), 0); -} - -void isvc_iquant_itrans_recon_4x4_dc_with_res_accumulate_neon( - buffer_container_t *ps_src, buffer_container_t *ps_pred, buffer_container_t *ps_res_pred, - buffer_container_t *ps_res, buffer_container_t *ps_rec, - iq_it_res_rec_constants_t *ps_iq_it_res_rec_constants, WORD16 *pi2_tmp, WORD16 *pi2_dc_src, - WORD32 i4_iq_start_idx, UWORD8 u1_res_accumulate) -{ - WORD16 *pi2_src = (WORD16 *) ps_src->pv_data; - WORD16 *pi2_res = (WORD16 *) ps_res->pv_data; - WORD16 *pi2_res_pred = (WORD16 *) ps_res_pred->pv_data; - UWORD8 *pu1_pred = (UWORD8 *) ps_pred->pv_data; - UWORD8 *pu1_out = (UWORD8 *) ps_rec->pv_data; - WORD32 i4_res_stride = ps_res->i4_data_stride; - WORD32 i4_res_pred_stride = ps_res_pred->i4_data_stride; - WORD32 i4_pred_stride = ps_pred->i4_data_stride; - WORD32 i4_out_stride = ps_rec->i4_data_stride; - const UWORD16 *pu2_iscal_mat = ps_iq_it_res_rec_constants->pu2_iscal_mat; - const UWORD16 *pu2_weigh_mat = ps_iq_it_res_rec_constants->pu2_weigh_mat; - UWORD32 u4_qp_div_6 = ps_iq_it_res_rec_constants->u4_qp_div_6; - WORD16 rnd_fact = (u4_qp_div_6 < 4) ? 1 << (3 - u4_qp_div_6) : 0; - - WORD32 i4_iq_out_temp; - int16x4_t temp_0; - uint8x8_t pred0_in, pred1_in, pred2_in, pred3_in; - int16x8_t pred0, pred1, pred2, pred3; - int16x8_t pred01_in, pred23_in; - uint8x8_t pred01_un, pred23_un; - - int16x4_t resd0_in, resd1_in, resd2_in, resd3_in; - int16x8_t resd01_in, resd23_in; - int16x4_t pos_255 = vdup_n_s16(((WORD16) UINT8_MAX)); - int16x4_t neg_255 = vdup_n_s16(-((WORD16) UINT8_MAX)); - - UNUSED(pi2_tmp); - UNUSED(u1_res_accumulate); - - if(i4_iq_start_idx == 0) - { - i4_iq_out_temp = pi2_src[0]; - INV_QUANT(i4_iq_out_temp, pu2_iscal_mat[0], pu2_weigh_mat[0], u4_qp_div_6, rnd_fact, 4); - } - else - { - i4_iq_out_temp = pi2_dc_src[0]; - } - - temp_0 = vdup_n_s16((i4_iq_out_temp + 32) >> 6); - - resd0_in = vld1_s16((int16_t *) pi2_res_pred); - resd1_in = vld1_s16((int16_t *) pi2_res_pred + i4_res_pred_stride); - resd2_in = vld1_s16((int16_t *) pi2_res_pred + (i4_res_pred_stride * 2)); - resd3_in = vld1_s16((int16_t *) pi2_res_pred + (i4_res_pred_stride * 3)); - - /* Add res pred to the res obtained */ - resd0_in = vadd_s16(resd0_in, temp_0); - resd1_in = vadd_s16(resd1_in, temp_0); - resd2_in = vadd_s16(resd2_in, temp_0); - resd3_in = vadd_s16(resd3_in, temp_0); - - /* Saturate all values < -255 to -255 and retain the rest as it is */ - resd0_in = vmax_s16(resd0_in, neg_255); - resd1_in = vmax_s16(resd1_in, neg_255); - resd2_in = vmax_s16(resd2_in, neg_255); - resd3_in = vmax_s16(resd3_in, neg_255); - - /* Saturate all values > 255 to 255 and retain the rest as it is */ - resd0_in = vmin_s16(resd0_in, pos_255); - resd1_in = vmin_s16(resd1_in, pos_255); - resd2_in = vmin_s16(resd2_in, pos_255); - resd3_in = vmin_s16(resd3_in, pos_255); - - vst1_s16(pi2_res, resd0_in); - vst1_s16(pi2_res + i4_res_stride, resd1_in); - vst1_s16(pi2_res + (i4_res_stride << 1), resd2_in); - vst1_s16(pi2_res + (i4_res_stride << 1) + i4_res_stride, resd3_in); - - resd01_in = vcombine_s16(resd0_in, resd1_in); - resd23_in = vcombine_s16(resd2_in, resd3_in); - - pred0_in = vld1_u8(pu1_pred); - pu1_pred = pu1_pred + i4_pred_stride; - pred1_in = vld1_u8(pu1_pred); - pu1_pred = pu1_pred + i4_pred_stride; - pred2_in = vld1_u8(pu1_pred); - pu1_pred = pu1_pred + i4_pred_stride; - pred3_in = vld1_u8(pu1_pred); - - pred0 = vreinterpretq_s16_u16(vmovl_u8(pred0_in)); - pred1 = vreinterpretq_s16_u16(vmovl_u8(pred1_in)); - pred2 = vreinterpretq_s16_u16(vmovl_u8(pred2_in)); - pred3 = vreinterpretq_s16_u16(vmovl_u8(pred3_in)); - - pred01_in = vcombine_s16(vget_low_s16(pred0), vget_low_s16(pred1)); - pred23_in = vcombine_s16(vget_low_s16(pred2), vget_low_s16(pred3)); - - /* Out pixel = Res + pred */ - pred01_in = vaddq_s16(pred01_in, resd01_in); - pred23_in = vaddq_s16(pred23_in, resd23_in); - - /* Convert to unsigned 8 bit with saturation */ - pred01_un = vqmovun_s16(pred01_in); - pred23_un = vqmovun_s16(pred23_in); - - vst1_lane_u32((uint32_t *) (pu1_out), vreinterpret_u32_u8(pred01_un), 0); - vst1_lane_u32((uint32_t *) (pu1_out + i4_out_stride), vreinterpret_u32_u8(pred01_un), 1); - vst1_lane_u32((uint32_t *) (pu1_out + (i4_out_stride << 1)), vreinterpret_u32_u8(pred23_un), 0); - vst1_lane_u32((uint32_t *) (pu1_out + ((i4_out_stride << 1) + i4_out_stride)), - vreinterpret_u32_u8(pred23_un), 1); -} - -void isvc_iquant_itrans_recon_chroma_4x4_dc_neon( - buffer_container_t *ps_src, buffer_container_t *ps_pred, buffer_container_t *ps_res_pred, - buffer_container_t *ps_res, buffer_container_t *ps_rec, - iq_it_res_rec_constants_t *ps_iq_it_res_rec_constants, WORD16 *pi2_tmp, WORD16 *pi2_dc_src, - WORD32 i4_iq_start_idx, UWORD8 u1_res_accumulate) -{ - WORD16 *pi2_src = (WORD16 *) ps_src->pv_data; - UWORD8 *pu1_pred = (UWORD8 *) ps_pred->pv_data; - UWORD8 *pu1_out = (UWORD8 *) ps_rec->pv_data; - WORD32 i4_pred_stride = ps_pred->i4_data_stride; - WORD32 i4_out_stride = ps_rec->i4_data_stride; - const UWORD16 *pu2_iscal_mat = ps_iq_it_res_rec_constants->pu2_iscal_mat; - const UWORD16 *pu2_weigh_mat = ps_iq_it_res_rec_constants->pu2_weigh_mat; - UWORD32 u4_qp_div_6 = ps_iq_it_res_rec_constants->u4_qp_div_6; - - WORD32 i4_iq_out_temp; - int16x8_t temp_0; - uint8x8_t pred0_in, pred1_in, pred2_in, pred3_in; - int16x8_t pred0, pred1, pred2, pred3; - uint8x8_t i4_out_horz_8x8_r0, i4_out_horz_8x8_r1, i4_out_horz_8x8_r2, i4_out_horz_8x8_r3; - uint8x8_t chroma_mask_8x8 = vreinterpret_u8_u16(vdup_n_u16(0x00ff)); - - UNUSED(pi2_src); - UNUSED(pu2_iscal_mat); - UNUSED(pu2_weigh_mat); - UNUSED(u4_qp_div_6); - UNUSED(pi2_tmp); - UNUSED(i4_iq_start_idx); - UNUSED(ps_res); - UNUSED(ps_res_pred); - UNUSED(u1_res_accumulate); - - i4_iq_out_temp = pi2_dc_src[0]; - temp_0 = vdupq_n_s16((i4_iq_out_temp + 32) >> 6); - - pred0_in = vld1_u8(pu1_pred); - pu1_pred = pu1_pred + i4_pred_stride; - pred1_in = vld1_u8(pu1_pred); - pu1_pred = pu1_pred + i4_pred_stride; - pred2_in = vld1_u8(pu1_pred); - pu1_pred = pu1_pred + i4_pred_stride; - pred3_in = vld1_u8(pu1_pred); - - pred0 = vreinterpretq_s16_u16(vmovl_u8(pred0_in)); - pred1 = vreinterpretq_s16_u16(vmovl_u8(pred1_in)); - pred2 = vreinterpretq_s16_u16(vmovl_u8(pred2_in)); - pred3 = vreinterpretq_s16_u16(vmovl_u8(pred3_in)); - - /* Out pixel = Res + pred */ - pred0 = vaddq_s16(pred0, temp_0); - pred1 = vaddq_s16(pred1, temp_0); - pred2 = vaddq_s16(pred2, temp_0); - pred3 = vaddq_s16(pred3, temp_0); - - /* Convert to unsigned 8 bit with saturation */ - pred0_in = vqmovun_s16(pred0); - pred1_in = vqmovun_s16(pred1); - pred2_in = vqmovun_s16(pred2); - pred3_in = vqmovun_s16(pred3); - - i4_out_horz_8x8_r0 = vld1_u8(pu1_out); - i4_out_horz_8x8_r1 = vld1_u8(pu1_out + i4_out_stride); - i4_out_horz_8x8_r2 = vld1_u8(pu1_out + i4_out_stride * 2); - i4_out_horz_8x8_r3 = vld1_u8(pu1_out + i4_out_stride * 3); - - /* Store out pixels in alternate positions */ - i4_out_horz_8x8_r0 = vbsl_u8(chroma_mask_8x8, pred0_in, i4_out_horz_8x8_r0); - i4_out_horz_8x8_r1 = vbsl_u8(chroma_mask_8x8, pred1_in, i4_out_horz_8x8_r1); - i4_out_horz_8x8_r2 = vbsl_u8(chroma_mask_8x8, pred2_in, i4_out_horz_8x8_r2); - i4_out_horz_8x8_r3 = vbsl_u8(chroma_mask_8x8, pred3_in, i4_out_horz_8x8_r3); - - vst1_u8((uint8_t *) (pu1_out), i4_out_horz_8x8_r0); - vst1_u8((uint8_t *) (pu1_out + i4_out_stride), i4_out_horz_8x8_r1); - vst1_u8((uint8_t *) (pu1_out + i4_out_stride * 2), i4_out_horz_8x8_r2); - vst1_u8((uint8_t *) (pu1_out + i4_out_stride * 3), i4_out_horz_8x8_r3); -} - void isvc_iquant_itrans_recon_chroma_4x4_dc_with_res_output_neon( buffer_container_t *ps_src, buffer_container_t *ps_pred, buffer_container_t *ps_res_pred, buffer_container_t *ps_res, buffer_container_t *ps_rec, diff --git a/common/svc/isvc_mem_fns.c b/common/svc/isvc_mem_fns.c index 345715af..35a4c662 100644 --- a/common/svc/isvc_mem_fns.c +++ b/common/svc/isvc_mem_fns.c @@ -120,164 +120,6 @@ void isvc_memset_2d(UWORD8 *pu1_dst, WORD32 i4_dst_stride, UWORD8 u1_val, WORD32 } } -/** - ******************************************************************************* - * - * @brief - * Function for copying to an interleaved destination - * - * @par Description: - * Copies the array of width 'wd' and height 'ht' from the location pointed - * by 'src' to the location pointed by 'dst' - * - * @param[in] pu1_src - * UWORD8 pointer to the source - * - * @param[out] pu1_dst - * UWORD8 pointer to the destination - * - * @param[in] src_strd - * integer source stride - * - * @param[in] dst_strd - * integer destination stride - * - * @param[in] ht - * integer height of the array - * - * @param[in] wd - * integer width of the array - * - * @returns - * - * @remarks - * The alternate elements of src will be copied to alternate locations in dsr - * Other locations are not touched - * - ******************************************************************************* - */ -void isvc_interleaved_copy(UWORD8 *pu1_src, UWORD8 *pu1_dst, WORD32 src_strd, WORD32 dst_strd, - WORD32 ht, WORD32 wd) -{ - WORD32 row, col; - wd *= 2; - - for(row = 0; row < ht; row++) - { - for(col = 0; col < wd; col += 2) - { - pu1_dst[col] = pu1_src[col]; - } - - pu1_src += src_strd; - pu1_dst += dst_strd; - } -} - -/** - ******************************************************************************* - * - * @brief - * Function for copying to an interleaved destination - * - * @par Description: - * Copies the array of width 'wd' and height 'ht' from the location pointed - * by 'src' to the location pointed by 'dst' - * - * @param[in] pu1_src - * UWORD8 pointer to the source - * - * @param[out] pu1_dst - * UWORD8 pointer to the destination - * - * @param[in] src_strd - * integer source stride - * - * @param[in] dst_strd - * integer destination stride - * - * @param[in] ht - * integer height of the array - * - * @param[in] wd - * integer width of the array - * - * @returns - * - * @remarks - * The alternate elements of src will be copied to alternate locations in dsr - * Other locations are not touched - * - ******************************************************************************* - */ -void isvc_16bit_interleaved_copy(WORD16 *pi2_src, WORD16 *pi2_dst, WORD32 src_strd, WORD32 dst_strd, - WORD32 ht, WORD32 wd) -{ - WORD32 row, col; - wd *= 2; - - for(row = 0; row < ht; row++) - { - for(col = 0; col < wd; col += 2) - { - pi2_dst[col] = pi2_src[col]; - } - - pi2_src += src_strd; - pi2_dst += dst_strd; - } -} - -/** - ******************************************************************************* - * - * @brief - * Function for memsetting to an interleaved destination - * - * @par Description: - * Memsets the array of width 'wd' and height 'ht' pointed by 'src' - * - * @param[in] pu1_src - * UWORD8 pointer to the source - * - * @param[in] src_strd - * integer source stride - * - * @param[in] value - * Value to set - * - * @param[in] ht - * integer height of the array - * - * @param[in] wd - * integer width of the array - * - * @returns - * - * @remarks - * The alternate elements of src will be copied to alternate locations in dsr - * Other locations are not touched - * - ******************************************************************************* - */ -void isvc_16bit_interleaved_memset(WORD16 *pi2_src, WORD32 i4_src_strd, WORD16 i2_value, - WORD32 i4_wd, WORD32 i4_ht) -{ - WORD32 row, col; - - i4_wd *= 2; - - for(row = 0; row < i4_ht; row++) - { - for(col = 0; col < i4_wd; col += 2) - { - pi2_src[col] = i2_value; - } - - pi2_src += i4_src_strd; - } -} - /** ******************************************************************************* * diff --git a/common/svc/isvc_mem_fns.h b/common/svc/isvc_mem_fns.h index a4d95f71..581e478b 100644 --- a/common/svc/isvc_mem_fns.h +++ b/common/svc/isvc_mem_fns.h @@ -64,32 +64,24 @@ typedef UWORD8 FT_NONZERO_CHECKER(UWORD8 *pu1_data, WORD32 i4_data_strd, UWORD32 UWORD32 u4_ht); /* C function declarations */ -extern FT_MEMCPY ih264_memcpy; extern FT_MEMCPY ih264_memcpy_mul_8; -extern FT_MEMSET ih264_memset; extern FT_MEMSET ih264_memset_mul_8; extern FT_MEMSET_16BIT ih264_memset_16bit; extern FT_MEMSET_16BIT ih264_memset_16bit_mul_8; extern FT_COPY_2D isvc_copy_2d; extern FT_MEMSET_2D isvc_memset_2d; -extern FT_16BIT_INTERLEAVED_COPY isvc_16bit_interleaved_copy; -extern FT_16BIT_INTERLEAVED_MEMSET isvc_16bit_interleaved_memset; extern FT_NONZERO_CHECKER isvc_is_nonzero_blk; extern FT_MEM_ALLOC isvc_memory_alloc; extern FT_MEM_FREE isvc_memory_free; /* A9 Q function declarations */ -extern FT_MEMCPY isvc_memcpy_a9q; extern FT_MEMCPY ih264_memcpy_mul_8_a9q; -extern FT_MEMSET ih264_memset_a9q; extern FT_MEMSET ih264_memset_mul_8_a9q; extern FT_MEMSET_16BIT ih264_memset_16bit_a9q; extern FT_MEMSET_16BIT ih264_memset_16bit_mul_8_a9q; /* AV8 function declarations */ -extern FT_MEMCPY ih264_memcpy_av8; extern FT_MEMCPY ih264_memcpy_mul_8_av8; -extern FT_MEMSET ih264_memset_av8; extern FT_MEMSET ih264_memset_mul_8_av8; extern FT_MEMSET_16BIT ih264_memset_16bit_av8; extern FT_MEMSET_16BIT ih264_memset_16bit_mul_8_av8; diff --git a/common/svc/isvc_trans_quant_itrans_iquant.h b/common/svc/isvc_trans_quant_itrans_iquant.h index fd15dccd..7ded8112 100644 --- a/common/svc/isvc_trans_quant_itrans_iquant.h +++ b/common/svc/isvc_trans_quant_itrans_iquant.h @@ -195,15 +195,11 @@ extern FT_IQ_IT_RECON isvc_iquant_itrans_recon_4x4_sse42; extern FT_IQ_IT_RECON isvc_iquant_itrans_recon_res_4x4_sse42; extern FT_IQ_IT_RECON isvc_iquant_itrans_recon_res_4x4_with_res_acc_sse42; -extern FT_IQ_IT_RECON isvc_iquant_itrans_recon_chroma_4x4_sse42; extern FT_IQ_IT_RECON isvc_iquant_itrans_recon_res_chroma_4x4_sse42; extern FT_IQ_IT_RECON isvc_iquant_itrans_recon_res_chroma_4x4_with_res_acc_sse42; extern FT_IQ_IT_RECON isvc_iquant_itrans_recon_dc_4x4_sse42; -extern FT_IQ_IT_RECON isvc_iquant_itrans_recon_res_dc_4x4_sse42; -extern FT_IQ_IT_RECON isvc_iquant_itrans_recon_res_dc_with_res_acc_4x4_sse42; -extern FT_IQ_IT_RECON isvc_iquant_itrans_recon_chroma_4x4_dc_sse42; extern FT_IQ_IT_RECON isvc_iquant_itrans_recon_res_chroma_4x4_dc_sse42; extern FT_IQ_IT_RECON isvc_iquant_itrans_recon_res_chroma_4x4_dc_with_res_acc_sse42; @@ -217,15 +213,11 @@ extern FT_IQ_IT_RECON isvc_iquant_itrans_recon_4x4_neon; extern FT_IQ_IT_RECON isvc_iquant_itrans_recon_4x4_with_res_output_neon; extern FT_IQ_IT_RECON isvc_iquant_itrans_recon_4x4_with_res_accumulate_neon; -extern FT_IQ_IT_RECON isvc_iquant_itrans_recon_chroma_4x4_neon; extern FT_IQ_IT_RECON isvc_iquant_itrans_recon_chroma_4x4_with_res_output_neon; extern FT_IQ_IT_RECON isvc_iquant_itrans_recon_chroma_4x4_with_res_accumulate_neon; extern FT_IQ_IT_RECON isvc_iquant_itrans_recon_4x4_dc_neon; -extern FT_IQ_IT_RECON isvc_iquant_itrans_recon_4x4_dc_with_res_output_neon; -extern FT_IQ_IT_RECON isvc_iquant_itrans_recon_4x4_dc_with_res_accumulate_neon; -extern FT_IQ_IT_RECON isvc_iquant_itrans_recon_chroma_4x4_dc_neon; extern FT_IQ_IT_RECON isvc_iquant_itrans_recon_chroma_4x4_dc_with_res_output_neon; extern FT_IQ_IT_RECON isvc_iquant_itrans_recon_chroma_4x4_dc_with_res_accumulate_neon; diff --git a/common/x86/svc/isvc_iquant_itrans_recon_sse42.c b/common/x86/svc/isvc_iquant_itrans_recon_sse42.c index 829952b2..f86ebc70 100644 --- a/common/x86/svc/isvc_iquant_itrans_recon_sse42.c +++ b/common/x86/svc/isvc_iquant_itrans_recon_sse42.c @@ -1034,63 +1034,19 @@ void isvc_iquant_itrans_recon_res_4x4_with_res_acc_sse42( *(pu4_out) = _mm_cvtsi128_si32(resq_r3); } -/* - ******************************************************************************** - * - * @brief This function reconstructs a 4x4 sub block from quantized chroma - *resiude and prediction buffer - * - * @par Description: - * The quantized residue is first inverse quantized, then inverse transformed. - * This inverse transformed content is added to the prediction buffer to recon- - * struct the end output - * - * @param[in] pi2_src - * quantized 4x4 block - * - * @param[in] pu1_pred - * prediction 4x4 block - * - * @param[out] pu1_out - * reconstructed 4x4 block - * - * @param[in] src_strd - * quantization buffer stride - * - * @param[in] i4_pred_stride, - * Prediction buffer stride - * - * @param[in] i4_out_stride - * recon buffer Stride - * - * @param[in] pu2_scaling_list - * pointer to scaling list - * - * @param[in] pu2_norm_adjust - * pointer to inverse scale matrix - * - * @param[in] u4_qp_div_6 - * Floor (qp/6) - * - * @param[in] pi4_tmp - * temporary buffer of size 1*16 - * - * @returns none - * - * @remarks none - * - ******************************************************************************* - */ -void isvc_iquant_itrans_recon_chroma_4x4_sse42( +void isvc_iquant_itrans_recon_res_chroma_4x4_sse42( buffer_container_t *ps_src, buffer_container_t *ps_pred, buffer_container_t *ps_res_pred, buffer_container_t *ps_res, buffer_container_t *ps_rec, iq_it_res_rec_constants_t *ps_iq_it_res_rec_constants, WORD16 *pi2_tmp, WORD16 *pi2_dc_src, WORD32 i4_iq_start_idx, UWORD8 u1_res_accumulate) { WORD16 *pi2_src = (WORD16 *) ps_src->pv_data; + WORD16 *pi2_res = (WORD16 *) ps_res->pv_data; + WORD16 *pi2_res_ptr = pi2_res; UWORD8 *pu1_pred = (UWORD8 *) ps_pred->pv_data; UWORD8 *pu1_out = (UWORD8 *) ps_rec->pv_data; WORD32 i4_src_stride = ps_src->i4_data_stride; + WORD32 i4_res_stride = ps_res->i4_data_stride; WORD32 i4_pred_stride = ps_pred->i4_data_stride; WORD32 i4_out_stride = ps_rec->i4_data_stride; const UWORD16 *pu2_iscal_mat = ps_iq_it_res_rec_constants->pu2_iscal_mat; @@ -1111,13 +1067,13 @@ void isvc_iquant_itrans_recon_chroma_4x4_sse42( __m128i value_32 = _mm_set1_epi32(32); __m128i chroma_mask = _mm_set1_epi16(0xFF); __m128i out_r0, out_r1, out_r2, out_r3; + __m128i res_r0, res_r1, res_r2, res_r3; ASSERT(4 == i4_src_stride); ASSERT(0 == u1_res_accumulate); UNUSED(i4_src_stride); UNUSED(u1_res_accumulate); - UNUSED(ps_res); UNUSED(ps_res_pred); UNUSED(i4_iq_start_idx); @@ -1126,27 +1082,27 @@ void isvc_iquant_itrans_recon_chroma_4x4_sse42( /* operations on platform */ /*************************************************************/ /* a00 a01 a02 a03 a10 a11 a12 a13 -- the source - matrix 0th,1st row */ + matrix 0th,1st row */ src_r0_r1 = _mm_loadu_si128((__m128i *) (pi2_src)); /* a20 a21 a22 a23 a30 a31 a32 a33 -- the - source matrix 2nd,3rd row */ + source matrix 2nd,3rd row */ src_r2_r3 = _mm_loadu_si128((__m128i *) (pi2_src + 8)); /* b00 b01 b02 b03 b10 b11 b12 b13 -- the - scaling matrix 0th,1st row */ + scaling matrix 0th,1st row */ scalemat_r0_r1 = _mm_loadu_si128((__m128i *) (pu2_iscal_mat)); /* b20 b21 b22 b23 b30 b31 b32 b33 --b12 b13 -- the - the scaling matrix 2nd,3rd row */ + the scaling matrix 2nd,3rd row */ scalemat_r2_r3 = _mm_loadu_si128((__m128i *) (pu2_iscal_mat + 8)); /* q00 q01 q02 q03 q10 q11 - q12 q13 -- all 16 bits */ + q12 q13 -- all 16 bits */ dequant_r0_r1 = _mm_loadu_si128((__m128i *) (pu2_weigh_mat)); /* q20 q21 q22 q23 q30 q31 - q32 q33 -- all 16 bits */ + q32 q33 -- all 16 bits */ dequant_r2_r3 = _mm_loadu_si128((__m128i *) (pu2_weigh_mat + 8)); temp0 = _mm_mullo_epi16(scalemat_r0_r1, @@ -1239,7 +1195,7 @@ void isvc_iquant_itrans_recon_chroma_4x4_sse42( temp2 = _mm_srai_epi32(resq_r1, 1); temp2 = _mm_sub_epi32(temp2, resq_r3); /* z3 = w1 + (w3 >> 1) */ - temp3 = _mm_srai_epi32(resq_r3, 1); //(w3>>1) + w1 + temp3 = _mm_srai_epi32(resq_r3, 1); temp3 = _mm_add_epi32(temp3, resq_r1); /*----------------------------------------------------------*/ /* x0 = z0 + z3 */ @@ -1292,8 +1248,10 @@ void isvc_iquant_itrans_recon_chroma_4x4_sse42( pred_r2 = _mm_and_si128(pred_r2, chroma_mask); pred_r3 = _mm_and_si128(pred_r3, chroma_mask); - pred_r0 = _mm_unpacklo_epi64(pred_r0, pred_r1); - pred_r1 = _mm_unpacklo_epi64(pred_r2, pred_r3); + pred_r0 = _mm_cvtepu16_epi32(pred_r0); + pred_r1 = _mm_cvtepu16_epi32(pred_r1); + pred_r2 = _mm_cvtepu16_epi32(pred_r2); + pred_r3 = _mm_cvtepu16_epi32(pred_r3); /*--------------------------------------------------------------*/ /* IDCT [ Vertical transformation] and Xij = (xij + 32)>>6 */ @@ -1333,17 +1291,50 @@ void isvc_iquant_itrans_recon_chroma_4x4_sse42( temp1 = _mm_packs_epi32(temp6, temp7); /* Saturate all values < -255 to -255 and retain the rest as it is */ - temp4 = _mm_max_epi16(temp0, neg_255_8x16b); + temp0 = _mm_max_epi16(temp0, neg_255_8x16b); /* Saturate all values > 255 to 255 and retain the rest as it is */ - temp4 = _mm_min_epi16(temp4, pos_255_8x16b); + temp0 = _mm_min_epi16(temp0, pos_255_8x16b); /* Saturate all values < -255 to -255 and retain the rest as it is */ - temp5 = _mm_max_epi16(temp1, neg_255_8x16b); + temp1 = _mm_max_epi16(temp1, neg_255_8x16b); /* Saturate all values > 255 to 255 and retain the rest as it is */ - temp5 = _mm_min_epi16(temp5, pos_255_8x16b); + temp1 = _mm_min_epi16(temp1, pos_255_8x16b); - temp0 = _mm_add_epi16(temp4, pred_r0); - temp1 = _mm_add_epi16(temp5, pred_r1); + chroma_mask = _mm_set1_epi32(0xffff0000); + out_r0 = _mm_loadu_si128((__m128i *) (&pi2_res_ptr[0 * i4_res_stride])); + out_r1 = _mm_loadu_si128((__m128i *) (&pi2_res_ptr[1 * i4_res_stride])); + out_r2 = _mm_loadu_si128((__m128i *) (&pi2_res_ptr[2 * i4_res_stride])); + out_r3 = _mm_loadu_si128((__m128i *) (&pi2_res_ptr[3 * i4_res_stride])); + + out_r0 = _mm_and_si128(out_r0, chroma_mask); + out_r1 = _mm_and_si128(out_r1, chroma_mask); + out_r2 = _mm_and_si128(out_r2, chroma_mask); + out_r3 = _mm_and_si128(out_r3, chroma_mask); + + res_r0 = _mm_cvtepu16_epi32(temp0); + res_r2 = _mm_cvtepu16_epi32(temp1); + res_r1 = _mm_srli_si128(temp0, 8); + res_r3 = _mm_srli_si128(temp1, 8); + res_r1 = _mm_cvtepu16_epi32(res_r1); + res_r3 = _mm_cvtepu16_epi32(res_r3); + + out_r0 = _mm_add_epi16(out_r0, res_r0); + out_r1 = _mm_add_epi16(out_r1, res_r1); + out_r2 = _mm_add_epi16(out_r2, res_r2); + out_r3 = _mm_add_epi16(out_r3, res_r3); + + _mm_storeu_si128((__m128i *) (&pi2_res_ptr[0 * i4_res_stride]), out_r0); + _mm_storeu_si128((__m128i *) (&pi2_res_ptr[1 * i4_res_stride]), out_r1); + _mm_storeu_si128((__m128i *) (&pi2_res_ptr[2 * i4_res_stride]), out_r2); + _mm_storeu_si128((__m128i *) (&pi2_res_ptr[3 * i4_res_stride]), out_r3); + + resq_r0 = _mm_add_epi16(pred_r0, res_r0); + resq_r1 = _mm_add_epi16(pred_r1, res_r1); + resq_r2 = _mm_add_epi16(pred_r2, res_r2); + resq_r3 = _mm_add_epi16(pred_r3, res_r3); + + temp0 = _mm_packus_epi32(resq_r0, resq_r1); + temp1 = _mm_packus_epi32(resq_r2, resq_r3); /*------------------------------------------------------------------*/ /* Clipping the results to 8 bits */ @@ -1362,7 +1353,7 @@ void isvc_iquant_itrans_recon_chroma_4x4_sse42( resq_r2 = _mm_cvtepu8_epi16(resq_r2); resq_r3 = _mm_cvtepu8_epi16(resq_r3); - chroma_mask = _mm_set1_epi16(0xFF00); + chroma_mask = _mm_set1_epi16(0xff00); out_r0 = _mm_loadl_epi64((__m128i *) (&pu1_out[0])); out_r1 = _mm_loadl_epi64((__m128i *) (&pu1_out[i4_out_stride])); out_r2 = _mm_loadl_epi64((__m128i *) (&pu1_out[2 * i4_out_stride])); @@ -1384,7 +1375,7 @@ void isvc_iquant_itrans_recon_chroma_4x4_sse42( _mm_storel_epi64((__m128i *) (&pu1_out[3 * i4_out_stride]), out_r3); } -void isvc_iquant_itrans_recon_res_chroma_4x4_sse42( +void isvc_iquant_itrans_recon_res_chroma_4x4_with_res_acc_sse42( buffer_container_t *ps_src, buffer_container_t *ps_pred, buffer_container_t *ps_res_pred, buffer_container_t *ps_res, buffer_container_t *ps_rec, iq_it_res_rec_constants_t *ps_iq_it_res_rec_constants, WORD16 *pi2_tmp, WORD16 *pi2_dc_src, @@ -1392,11 +1383,12 @@ void isvc_iquant_itrans_recon_res_chroma_4x4_sse42( { WORD16 *pi2_src = (WORD16 *) ps_src->pv_data; WORD16 *pi2_res = (WORD16 *) ps_res->pv_data; - WORD16 *pi2_res_ptr = pi2_res; + WORD16 *pi2_res_pred = (WORD16 *) ps_res_pred->pv_data; UWORD8 *pu1_pred = (UWORD8 *) ps_pred->pv_data; UWORD8 *pu1_out = (UWORD8 *) ps_rec->pv_data; WORD32 i4_src_stride = ps_src->i4_data_stride; WORD32 i4_res_stride = ps_res->i4_data_stride; + WORD32 i4_res_pred_stride = ps_res_pred->i4_data_stride; WORD32 i4_pred_stride = ps_pred->i4_data_stride; WORD32 i4_out_stride = ps_rec->i4_data_stride; const UWORD16 *pu2_iscal_mat = ps_iq_it_res_rec_constants->pu2_iscal_mat; @@ -1406,9 +1398,12 @@ void isvc_iquant_itrans_recon_res_chroma_4x4_sse42( __m128i src_r0, src_r1, src_r2, src_r3; __m128i scalemat_r0_r1, scalemat_r2_r3; __m128i pred_r0, pred_r1, pred_r2, pred_r3; - __m128i sign_reg, dequant_r0_r1, dequant_r2_r3; + __m128i res_pred_r0, res_pred_r1, res_pred_r2, res_pred_r3; + __m128i res_r0, res_r1, res_r2, res_r3; + __m128i dequant_r0_r1, dequant_r2_r3; /* all bits reset to zero */ __m128i zero_8x16b = _mm_setzero_si128(); + __m128i reg_chroma = _mm_set1_epi32(0xFFFF); __m128i neg_255_8x16b = _mm_set1_epi16(-((WORD16) UINT8_MAX)); __m128i pos_255_8x16b = _mm_set1_epi16(((WORD16) UINT8_MAX)); __m128i temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7; @@ -1417,14 +1412,13 @@ void isvc_iquant_itrans_recon_res_chroma_4x4_sse42( __m128i value_32 = _mm_set1_epi32(32); __m128i chroma_mask = _mm_set1_epi16(0xFF); __m128i out_r0, out_r1, out_r2, out_r3; - __m128i res_r0, res_r1, res_r2, res_r3; + __m128i mask_r0; ASSERT(4 == i4_src_stride); - ASSERT(0 == u1_res_accumulate); + ASSERT(1 == u1_res_accumulate); UNUSED(i4_src_stride); UNUSED(u1_res_accumulate); - UNUSED(ps_res_pred); UNUSED(i4_iq_start_idx); /*************************************************************/ @@ -1545,7 +1539,7 @@ void isvc_iquant_itrans_recon_res_chroma_4x4_sse42( temp2 = _mm_srai_epi32(resq_r1, 1); temp2 = _mm_sub_epi32(temp2, resq_r3); /* z3 = w1 + (w3 >> 1) */ - temp3 = _mm_srai_epi32(resq_r3, 1); + temp3 = _mm_srai_epi32(resq_r3, 1); //(w3>>1) + w1 temp3 = _mm_add_epi32(temp3, resq_r1); /*----------------------------------------------------------*/ /* x0 = z0 + z3 */ @@ -1598,11 +1592,6 @@ void isvc_iquant_itrans_recon_res_chroma_4x4_sse42( pred_r2 = _mm_and_si128(pred_r2, chroma_mask); pred_r3 = _mm_and_si128(pred_r3, chroma_mask); - pred_r0 = _mm_cvtepu16_epi32(pred_r0); - pred_r1 = _mm_cvtepu16_epi32(pred_r1); - pred_r2 = _mm_cvtepu16_epi32(pred_r2); - pred_r3 = _mm_cvtepu16_epi32(pred_r3); - /*--------------------------------------------------------------*/ /* IDCT [ Vertical transformation] and Xij = (xij + 32)>>6 */ /* */ @@ -1623,22 +1612,50 @@ void isvc_iquant_itrans_recon_res_chroma_4x4_sse42( temp4 = _mm_add_epi32(temp0, temp3); temp4 = _mm_add_epi32(temp4, value_32); temp4 = _mm_srai_epi32(temp4, 6); + res_r0 = temp4; /* x1j = z1j + z2j */ temp5 = _mm_add_epi32(temp1, temp2); temp5 = _mm_add_epi32(temp5, value_32); temp5 = _mm_srai_epi32(temp5, 6); + res_r1 = temp5; /* x2j = z1j - z2j */ temp6 = _mm_sub_epi32(temp1, temp2); temp6 = _mm_add_epi32(temp6, value_32); temp6 = _mm_srai_epi32(temp6, 6); + res_r2 = temp6; /* x3j = z0j - z3j */ temp7 = _mm_sub_epi32(temp0, temp3); temp7 = _mm_add_epi32(temp7, value_32); temp7 = _mm_srai_epi32(temp7, 6); + res_r3 = temp7; - /* 32-bit to 16-bit conversion */ - temp0 = _mm_packs_epi32(temp4, temp5); - temp1 = _mm_packs_epi32(temp6, temp7); + res_pred_r0 = _mm_loadu_si128((__m128i *) &pi2_res_pred[0 * i4_res_pred_stride]); + res_pred_r1 = _mm_loadu_si128((__m128i *) &pi2_res_pred[1 * i4_res_pred_stride]); + res_pred_r2 = _mm_loadu_si128((__m128i *) &pi2_res_pred[2 * i4_res_pred_stride]); + res_pred_r3 = _mm_loadu_si128((__m128i *) &pi2_res_pred[3 * i4_res_pred_stride]); + + res_pred_r0 = _mm_and_si128(res_pred_r0, reg_chroma); + res_pred_r1 = _mm_and_si128(res_pred_r1, reg_chroma); + res_pred_r2 = _mm_and_si128(res_pred_r2, reg_chroma); + res_pred_r3 = _mm_and_si128(res_pred_r3, reg_chroma); + + temp0 = _mm_packs_epi32(res_r0, res_r1); + temp1 = _mm_packs_epi32(res_r2, res_r3); + + res_r0 = _mm_cvtepu16_epi32(temp0); + res_r2 = _mm_cvtepu16_epi32(temp1); + res_r1 = _mm_srli_si128(temp0, 8); + res_r3 = _mm_srli_si128(temp1, 8); + res_r1 = _mm_cvtepu16_epi32(res_r1); + res_r3 = _mm_cvtepu16_epi32(res_r3); + + res_r0 = _mm_add_epi16(res_pred_r0, res_r0); + res_r1 = _mm_add_epi16(res_pred_r1, res_r1); + res_r2 = _mm_add_epi16(res_pred_r2, res_r2); + res_r3 = _mm_add_epi16(res_pred_r3, res_r3); + + temp0 = _mm_packus_epi32(res_r0, res_r1); + temp1 = _mm_packus_epi32(res_r2, res_r3); /* Saturate all values < -255 to -255 and retain the rest as it is */ temp0 = _mm_max_epi16(temp0, neg_255_8x16b); @@ -1650,33 +1667,39 @@ void isvc_iquant_itrans_recon_res_chroma_4x4_sse42( /* Saturate all values > 255 to 255 and retain the rest as it is */ temp1 = _mm_min_epi16(temp1, pos_255_8x16b); + res_r0 = _mm_cvtepu16_epi32(temp0); + res_r1 = _mm_srli_si128(temp0, 8); + res_r1 = _mm_cvtepu16_epi32(res_r1); + + res_r2 = _mm_cvtepu16_epi32(temp1); + res_r3 = _mm_srli_si128(temp1, 8); + res_r3 = _mm_cvtepu16_epi32(res_r3); + chroma_mask = _mm_set1_epi32(0xffff0000); - out_r0 = _mm_loadu_si128((__m128i *) (&pi2_res_ptr[0 * i4_res_stride])); - out_r1 = _mm_loadu_si128((__m128i *) (&pi2_res_ptr[1 * i4_res_stride])); - out_r2 = _mm_loadu_si128((__m128i *) (&pi2_res_ptr[2 * i4_res_stride])); - out_r3 = _mm_loadu_si128((__m128i *) (&pi2_res_ptr[3 * i4_res_stride])); + out_r0 = _mm_loadu_si128((__m128i *) (&pi2_res[0 * i4_res_stride])); + out_r1 = _mm_loadu_si128((__m128i *) (&pi2_res[1 * i4_res_stride])); + out_r2 = _mm_loadu_si128((__m128i *) (&pi2_res[2 * i4_res_stride])); + out_r3 = _mm_loadu_si128((__m128i *) (&pi2_res[3 * i4_res_stride])); out_r0 = _mm_and_si128(out_r0, chroma_mask); out_r1 = _mm_and_si128(out_r1, chroma_mask); out_r2 = _mm_and_si128(out_r2, chroma_mask); out_r3 = _mm_and_si128(out_r3, chroma_mask); - res_r0 = _mm_cvtepu16_epi32(temp0); - res_r2 = _mm_cvtepu16_epi32(temp1); - res_r1 = _mm_srli_si128(temp0, 8); - res_r3 = _mm_srli_si128(temp1, 8); - res_r1 = _mm_cvtepu16_epi32(res_r1); - res_r3 = _mm_cvtepu16_epi32(res_r3); - out_r0 = _mm_add_epi16(out_r0, res_r0); out_r1 = _mm_add_epi16(out_r1, res_r1); out_r2 = _mm_add_epi16(out_r2, res_r2); out_r3 = _mm_add_epi16(out_r3, res_r3); - _mm_storeu_si128((__m128i *) (&pi2_res_ptr[0 * i4_res_stride]), out_r0); - _mm_storeu_si128((__m128i *) (&pi2_res_ptr[1 * i4_res_stride]), out_r1); - _mm_storeu_si128((__m128i *) (&pi2_res_ptr[2 * i4_res_stride]), out_r2); - _mm_storeu_si128((__m128i *) (&pi2_res_ptr[3 * i4_res_stride]), out_r3); + _mm_storeu_si128((__m128i *) (&pi2_res[0 * i4_res_stride]), out_r0); + _mm_storeu_si128((__m128i *) (&pi2_res[1 * i4_res_stride]), out_r1); + _mm_storeu_si128((__m128i *) (&pi2_res[2 * i4_res_stride]), out_r2); + _mm_storeu_si128((__m128i *) (&pi2_res[3 * i4_res_stride]), out_r3); + + pred_r0 = _mm_cvtepu16_epi32(pred_r0); + pred_r1 = _mm_cvtepu16_epi32(pred_r1); + pred_r2 = _mm_cvtepu16_epi32(pred_r2); + pred_r3 = _mm_cvtepu16_epi32(pred_r3); resq_r0 = _mm_add_epi16(pred_r0, res_r0); resq_r1 = _mm_add_epi16(pred_r1, res_r1); @@ -1686,12 +1709,11 @@ void isvc_iquant_itrans_recon_res_chroma_4x4_sse42( temp0 = _mm_packus_epi32(resq_r0, resq_r1); temp1 = _mm_packus_epi32(resq_r2, resq_r3); - /*------------------------------------------------------------------*/ /* Clipping the results to 8 bits */ - sign_reg = _mm_cmpgt_epi16(temp0, zero_8x16b); - temp0 = _mm_and_si128(temp0, sign_reg); - sign_reg = _mm_cmpgt_epi16(temp1, zero_8x16b); - temp1 = _mm_and_si128(temp1, sign_reg); + mask_r0 = _mm_cmpgt_epi16(temp0, zero_8x16b); + temp0 = _mm_and_si128(temp0, mask_r0); + mask_r0 = _mm_cmpgt_epi16(temp1, zero_8x16b); + temp1 = _mm_and_si128(temp1, mask_r0); resq_r0 = _mm_packus_epi16(temp0, temp1); resq_r1 = _mm_srli_si128(resq_r0, 4); @@ -1703,9 +1725,9 @@ void isvc_iquant_itrans_recon_res_chroma_4x4_sse42( resq_r2 = _mm_cvtepu8_epi16(resq_r2); resq_r3 = _mm_cvtepu8_epi16(resq_r3); - chroma_mask = _mm_set1_epi16(0xff00); - out_r0 = _mm_loadl_epi64((__m128i *) (&pu1_out[0])); - out_r1 = _mm_loadl_epi64((__m128i *) (&pu1_out[i4_out_stride])); + chroma_mask = _mm_set1_epi16(0xFF00); + out_r0 = _mm_loadl_epi64((__m128i *) (&pu1_out[0 * i4_out_stride])); + out_r1 = _mm_loadl_epi64((__m128i *) (&pu1_out[1 * i4_out_stride])); out_r2 = _mm_loadl_epi64((__m128i *) (&pu1_out[2 * i4_out_stride])); out_r3 = _mm_loadl_epi64((__m128i *) (&pu1_out[3 * i4_out_stride])); @@ -1719,498 +1741,30 @@ void isvc_iquant_itrans_recon_res_chroma_4x4_sse42( out_r2 = _mm_add_epi8(out_r2, resq_r2); out_r3 = _mm_add_epi8(out_r3, resq_r3); - _mm_storel_epi64((__m128i *) (&pu1_out[0]), out_r0); - _mm_storel_epi64((__m128i *) (&pu1_out[i4_out_stride]), out_r1); + _mm_storel_epi64((__m128i *) (&pu1_out[0 * i4_out_stride]), out_r0); + _mm_storel_epi64((__m128i *) (&pu1_out[1 * i4_out_stride]), out_r1); _mm_storel_epi64((__m128i *) (&pu1_out[2 * i4_out_stride]), out_r2); _mm_storel_epi64((__m128i *) (&pu1_out[3 * i4_out_stride]), out_r3); } -void isvc_iquant_itrans_recon_res_chroma_4x4_with_res_acc_sse42( - buffer_container_t *ps_src, buffer_container_t *ps_pred, buffer_container_t *ps_res_pred, - buffer_container_t *ps_res, buffer_container_t *ps_rec, - iq_it_res_rec_constants_t *ps_iq_it_res_rec_constants, WORD16 *pi2_tmp, WORD16 *pi2_dc_src, - WORD32 i4_iq_start_idx, UWORD8 u1_res_accumulate) +void isvc_iquant_itrans_recon_dc_4x4_sse42(buffer_container_t *ps_src, buffer_container_t *ps_pred, + buffer_container_t *ps_res_pred, + buffer_container_t *ps_res, buffer_container_t *ps_rec, + iq_it_res_rec_constants_t *ps_iq_it_res_rec_constants, + WORD16 *pi2_tmp, WORD16 *pi2_dc_src, + WORD32 i4_iq_start_idx, UWORD8 u1_res_accumulate) { - WORD16 *pi2_src = (WORD16 *) ps_src->pv_data; - WORD16 *pi2_res = (WORD16 *) ps_res->pv_data; - WORD16 *pi2_res_pred = (WORD16 *) ps_res_pred->pv_data; UWORD8 *pu1_pred = (UWORD8 *) ps_pred->pv_data; UWORD8 *pu1_out = (UWORD8 *) ps_rec->pv_data; - WORD32 i4_src_stride = ps_src->i4_data_stride; - WORD32 i4_res_stride = ps_res->i4_data_stride; - WORD32 i4_res_pred_stride = ps_res_pred->i4_data_stride; WORD32 i4_pred_stride = ps_pred->i4_data_stride; WORD32 i4_out_stride = ps_rec->i4_data_stride; const UWORD16 *pu2_iscal_mat = ps_iq_it_res_rec_constants->pu2_iscal_mat; const UWORD16 *pu2_weigh_mat = ps_iq_it_res_rec_constants->pu2_weigh_mat; UWORD32 u4_qp_div_6 = ps_iq_it_res_rec_constants->u4_qp_div_6; - __m128i src_r0_r1, src_r2_r3; - __m128i src_r0, src_r1, src_r2, src_r3; - __m128i scalemat_r0_r1, scalemat_r2_r3; - __m128i pred_r0, pred_r1, pred_r2, pred_r3; - __m128i res_pred_r0, res_pred_r1, res_pred_r2, res_pred_r3; - __m128i res_r0, res_r1, res_r2, res_r3; - __m128i dequant_r0_r1, dequant_r2_r3; - /* all bits reset to zero */ - __m128i zero_8x16b = _mm_setzero_si128(); - __m128i reg_chroma = _mm_set1_epi32(0xFFFF); - __m128i neg_255_8x16b = _mm_set1_epi16(-((WORD16) UINT8_MAX)); - __m128i pos_255_8x16b = _mm_set1_epi16(((WORD16) UINT8_MAX)); - __m128i temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7; - __m128i resq_r0, resq_r1, resq_r2, resq_r3; - __m128i add_rshift = _mm_set1_epi32((u4_qp_div_6 < 4) ? (1 << (3 - u4_qp_div_6)) : 0); - __m128i value_32 = _mm_set1_epi32(32); - __m128i chroma_mask = _mm_set1_epi16(0xFF); - __m128i out_r0, out_r1, out_r2, out_r3; - __m128i mask_r0; - - ASSERT(4 == i4_src_stride); - ASSERT(1 == u1_res_accumulate); - - UNUSED(i4_src_stride); - UNUSED(u1_res_accumulate); - UNUSED(i4_iq_start_idx); - - /*************************************************************/ - /* Dequantization of coefficients. Will be replaced by SIMD */ - /* operations on platform */ - /*************************************************************/ - /* a00 a01 a02 a03 a10 a11 a12 a13 -- the source - matrix 0th,1st row */ - src_r0_r1 = _mm_loadu_si128((__m128i *) (pi2_src)); - - /* a20 a21 a22 a23 a30 a31 a32 a33 -- the - source matrix 2nd,3rd row */ - src_r2_r3 = _mm_loadu_si128((__m128i *) (pi2_src + 8)); - - /* b00 b01 b02 b03 b10 b11 b12 b13 -- the - scaling matrix 0th,1st row */ - scalemat_r0_r1 = _mm_loadu_si128((__m128i *) (pu2_iscal_mat)); - - /* b20 b21 b22 b23 b30 b31 b32 b33 --b12 b13 -- the - the scaling matrix 2nd,3rd row */ - scalemat_r2_r3 = _mm_loadu_si128((__m128i *) (pu2_iscal_mat + 8)); - - /* q00 q01 q02 q03 q10 q11 - q12 q13 -- all 16 bits */ - dequant_r0_r1 = _mm_loadu_si128((__m128i *) (pu2_weigh_mat)); - - /* q20 q21 q22 q23 q30 q31 - q32 q33 -- all 16 bits */ - dequant_r2_r3 = _mm_loadu_si128((__m128i *) (pu2_weigh_mat + 8)); - - temp0 = _mm_mullo_epi16(scalemat_r0_r1, - dequant_r0_r1); // b00*q00 b01*q01 b02*q02 b03*q03 b10*q10 b11*q11 - // b12*q12 b13*q13 -- 16 bit result - - temp1 = _mm_mullo_epi16(scalemat_r2_r3, dequant_r2_r3); - - /* b00*q00 0 b01*q01 0 b02*q02 0 b03*q03 0 -- 16 bit long */ - temp4 = _mm_unpacklo_epi16(temp0, zero_8x16b); - - /* b10*q10 0 b11*q11 0 b12*q12 0 b13*q13 0 -- 16 bit long */ - temp5 = _mm_unpackhi_epi16(temp0, zero_8x16b); - - /* b00*q00 0 b01*q01 0 b02*q02 0 b03*q03 0 -- 16 bit long */ - temp6 = _mm_unpacklo_epi16(temp1, zero_8x16b); - - /* b10*q10 0 b11*q11 0 b12*q12 0 b13*q13 0 -- 16 bit long */ - temp7 = _mm_unpackhi_epi16(temp1, zero_8x16b); - - /* a00 0 a01 0 a02 0 a03 0 -- 16 bit long */ - src_r0 = _mm_unpacklo_epi16(src_r0_r1, zero_8x16b); - /* a10 0 a11 0 a12 0 a13 0 -- 16 bit long */ - src_r1 = _mm_unpackhi_epi16(src_r0_r1, zero_8x16b); - /* a20 0 a21 0 a22 0 a23 0 -- 16 bit long */ - src_r2 = _mm_unpacklo_epi16(src_r2_r3, zero_8x16b); - /* a30 0 a31 0 a32 0 a33 0 -- 16 bit long */ - src_r3 = _mm_unpackhi_epi16(src_r2_r3, zero_8x16b); - - temp4 = _mm_madd_epi16(src_r0, temp4); - temp5 = _mm_madd_epi16(src_r1, temp5); - temp6 = _mm_madd_epi16(src_r2, temp6); - temp7 = _mm_madd_epi16(src_r3, temp7); - - if(u4_qp_div_6 >= 4) - { - resq_r0 = _mm_slli_epi32(temp4, u4_qp_div_6 - 4); - resq_r1 = _mm_slli_epi32(temp5, u4_qp_div_6 - 4); - resq_r2 = _mm_slli_epi32(temp6, u4_qp_div_6 - 4); - resq_r3 = _mm_slli_epi32(temp7, u4_qp_div_6 - 4); - } - else - { - temp4 = _mm_add_epi32(temp4, add_rshift); - temp5 = _mm_add_epi32(temp5, add_rshift); - temp6 = _mm_add_epi32(temp6, add_rshift); - temp7 = _mm_add_epi32(temp7, add_rshift); - resq_r0 = _mm_srai_epi32(temp4, 4 - u4_qp_div_6); - resq_r1 = _mm_srai_epi32(temp5, 4 - u4_qp_div_6); - resq_r2 = _mm_srai_epi32(temp6, 4 - u4_qp_div_6); - resq_r3 = _mm_srai_epi32(temp7, 4 - u4_qp_div_6); - } - - resq_r0 = _mm_insert_epi32(resq_r0, (WORD32) pi2_dc_src[0], 0); - /* Perform Inverse transform */ - /*-------------------------------------------------------------*/ - /* IDCT [ Horizontal transformation ] */ - /*-------------------------------------------------------------*/ - // Matrix transpose - /* - * a0 a1 a2 a3 - * b0 b1 b2 b3 - * c0 c1 c2 c3 - * d0 d1 d2 d3 - */ - /* a0 b0 a1 b1 */ - temp1 = _mm_unpacklo_epi32(resq_r0, resq_r1); - /* c0 d0 c1 d1 */ - temp3 = _mm_unpacklo_epi32(resq_r2, resq_r3); - /* a2 b2 a3 b3 */ - temp2 = _mm_unpackhi_epi32(resq_r0, resq_r1); - /* c2 d2 c3 d3 */ - temp4 = _mm_unpackhi_epi32(resq_r2, resq_r3); - /* a0 b0 c0 d0 */ - resq_r0 = _mm_unpacklo_epi64(temp1, temp3); - /* a1 b1 c1 d1 */ - resq_r1 = _mm_unpackhi_epi64(temp1, temp3); - /* a2 b2 c2 d2 */ - resq_r2 = _mm_unpacklo_epi64(temp2, temp4); - /* a3 b3 c3 d3 */ - resq_r3 = _mm_unpackhi_epi64(temp2, temp4); - /* Transform starts -- horizontal transform */ - - /*------------------------------------------------------------------*/ - /* z0 = w0 + w2 */ - temp0 = _mm_add_epi32(resq_r0, resq_r2); - /* z1 = w0 - w2 */ - temp1 = _mm_sub_epi32(resq_r0, resq_r2); - /* z2 = (w1 >> 1) - w3 */ - temp2 = _mm_srai_epi32(resq_r1, 1); - temp2 = _mm_sub_epi32(temp2, resq_r3); - /* z3 = w1 + (w3 >> 1) */ - temp3 = _mm_srai_epi32(resq_r3, 1); //(w3>>1) + w1 - temp3 = _mm_add_epi32(temp3, resq_r1); - /*----------------------------------------------------------*/ - /* x0 = z0 + z3 */ - resq_r0 = _mm_add_epi32(temp0, temp3); - /* x1 = z1 + z2 */ - resq_r1 = _mm_add_epi32(temp1, temp2); - /* x2 = z1 - z2 */ - resq_r2 = _mm_sub_epi32(temp1, temp2); - /* x3 = z0 - z3 */ - resq_r3 = _mm_sub_epi32(temp0, temp3); - // Matrix transpose - /* - * a0 b0 c0 d0 - * a1 b1 c1 d1 - * a2 b2 c2 d2 - * a3 b3 c3 d3 - */ - /* a0 a1 b0 b1 */ - temp1 = _mm_unpacklo_epi32(resq_r0, resq_r1); - /* a2 a3 b2 b3 */ - temp3 = _mm_unpacklo_epi32(resq_r2, resq_r3); - /* c0 c1 d0 d1 */ - temp2 = _mm_unpackhi_epi32(resq_r0, resq_r1); - /* c2 c3 d2 d3 */ - temp4 = _mm_unpackhi_epi32(resq_r2, resq_r3); - /* a0 a1 a2 a3 */ - resq_r0 = _mm_unpacklo_epi64(temp1, temp3); - /* b0 b1 b2 b3 */ - resq_r1 = _mm_unpackhi_epi64(temp1, temp3); - /* c0 c1 c2 c3 */ - resq_r2 = _mm_unpacklo_epi64(temp2, temp4); - /* d0 d1 d2 d3 */ - resq_r3 = _mm_unpackhi_epi64(temp2, temp4); - /* Transform ends -- horizontal transform */ - - temp0 = _mm_packs_epi32(resq_r0, resq_r1); - temp1 = _mm_packs_epi32(resq_r2, resq_r3); - - _mm_storeu_si128((__m128i *) (&pi2_tmp[0]), temp0); - _mm_storeu_si128((__m128i *) (&pi2_tmp[2 * 4]), temp1); - - /* Load pred buffer */ - pred_r0 = _mm_loadl_epi64((__m128i *) (&pu1_pred[0])); - pred_r1 = _mm_loadl_epi64((__m128i *) (&pu1_pred[i4_pred_stride])); - pred_r2 = _mm_loadl_epi64((__m128i *) (&pu1_pred[2 * i4_pred_stride])); - pred_r3 = _mm_loadl_epi64((__m128i *) (&pu1_pred[3 * i4_pred_stride])); - - pred_r0 = _mm_and_si128(pred_r0, chroma_mask); - pred_r1 = _mm_and_si128(pred_r1, chroma_mask); - pred_r2 = _mm_and_si128(pred_r2, chroma_mask); - pred_r3 = _mm_and_si128(pred_r3, chroma_mask); - - /*--------------------------------------------------------------*/ - /* IDCT [ Vertical transformation] and Xij = (xij + 32)>>6 */ - /* */ - /* Add the prediction and store it back to same buffer */ - /*--------------------------------------------------------------*/ - /* z0j = y0j + y2j */ - temp0 = _mm_add_epi32(resq_r0, resq_r2); - /* z1j = y0j - y2j */ - temp1 = _mm_sub_epi32(resq_r0, resq_r2); - /* z2j = (y1j>>1) - y3j */ - temp2 = _mm_srai_epi32(resq_r1, 1); - temp2 = _mm_sub_epi32(temp2, resq_r3); - /* z3j = y1j + (y3j>>1) */ - temp3 = _mm_srai_epi32(resq_r3, 1); - temp3 = _mm_add_epi32(temp3, resq_r1); - - /* x0j = z0j + z3j */ - temp4 = _mm_add_epi32(temp0, temp3); - temp4 = _mm_add_epi32(temp4, value_32); - temp4 = _mm_srai_epi32(temp4, 6); - res_r0 = temp4; - /* x1j = z1j + z2j */ - temp5 = _mm_add_epi32(temp1, temp2); - temp5 = _mm_add_epi32(temp5, value_32); - temp5 = _mm_srai_epi32(temp5, 6); - res_r1 = temp5; - /* x2j = z1j - z2j */ - temp6 = _mm_sub_epi32(temp1, temp2); - temp6 = _mm_add_epi32(temp6, value_32); - temp6 = _mm_srai_epi32(temp6, 6); - res_r2 = temp6; - /* x3j = z0j - z3j */ - temp7 = _mm_sub_epi32(temp0, temp3); - temp7 = _mm_add_epi32(temp7, value_32); - temp7 = _mm_srai_epi32(temp7, 6); - res_r3 = temp7; - - res_pred_r0 = _mm_loadu_si128((__m128i *) &pi2_res_pred[0 * i4_res_pred_stride]); - res_pred_r1 = _mm_loadu_si128((__m128i *) &pi2_res_pred[1 * i4_res_pred_stride]); - res_pred_r2 = _mm_loadu_si128((__m128i *) &pi2_res_pred[2 * i4_res_pred_stride]); - res_pred_r3 = _mm_loadu_si128((__m128i *) &pi2_res_pred[3 * i4_res_pred_stride]); - - res_pred_r0 = _mm_and_si128(res_pred_r0, reg_chroma); - res_pred_r1 = _mm_and_si128(res_pred_r1, reg_chroma); - res_pred_r2 = _mm_and_si128(res_pred_r2, reg_chroma); - res_pred_r3 = _mm_and_si128(res_pred_r3, reg_chroma); - - temp0 = _mm_packs_epi32(res_r0, res_r1); - temp1 = _mm_packs_epi32(res_r2, res_r3); - - res_r0 = _mm_cvtepu16_epi32(temp0); - res_r2 = _mm_cvtepu16_epi32(temp1); - res_r1 = _mm_srli_si128(temp0, 8); - res_r3 = _mm_srli_si128(temp1, 8); - res_r1 = _mm_cvtepu16_epi32(res_r1); - res_r3 = _mm_cvtepu16_epi32(res_r3); - - res_r0 = _mm_add_epi16(res_pred_r0, res_r0); - res_r1 = _mm_add_epi16(res_pred_r1, res_r1); - res_r2 = _mm_add_epi16(res_pred_r2, res_r2); - res_r3 = _mm_add_epi16(res_pred_r3, res_r3); - - temp0 = _mm_packus_epi32(res_r0, res_r1); - temp1 = _mm_packus_epi32(res_r2, res_r3); - - /* Saturate all values < -255 to -255 and retain the rest as it is */ - temp0 = _mm_max_epi16(temp0, neg_255_8x16b); - /* Saturate all values > 255 to 255 and retain the rest as it is */ - temp0 = _mm_min_epi16(temp0, pos_255_8x16b); - - /* Saturate all values < -255 to -255 and retain the rest as it is */ - temp1 = _mm_max_epi16(temp1, neg_255_8x16b); - /* Saturate all values > 255 to 255 and retain the rest as it is */ - temp1 = _mm_min_epi16(temp1, pos_255_8x16b); - - res_r0 = _mm_cvtepu16_epi32(temp0); - res_r1 = _mm_srli_si128(temp0, 8); - res_r1 = _mm_cvtepu16_epi32(res_r1); - - res_r2 = _mm_cvtepu16_epi32(temp1); - res_r3 = _mm_srli_si128(temp1, 8); - res_r3 = _mm_cvtepu16_epi32(res_r3); - - chroma_mask = _mm_set1_epi32(0xffff0000); - out_r0 = _mm_loadu_si128((__m128i *) (&pi2_res[0 * i4_res_stride])); - out_r1 = _mm_loadu_si128((__m128i *) (&pi2_res[1 * i4_res_stride])); - out_r2 = _mm_loadu_si128((__m128i *) (&pi2_res[2 * i4_res_stride])); - out_r3 = _mm_loadu_si128((__m128i *) (&pi2_res[3 * i4_res_stride])); - - out_r0 = _mm_and_si128(out_r0, chroma_mask); - out_r1 = _mm_and_si128(out_r1, chroma_mask); - out_r2 = _mm_and_si128(out_r2, chroma_mask); - out_r3 = _mm_and_si128(out_r3, chroma_mask); - - out_r0 = _mm_add_epi16(out_r0, res_r0); - out_r1 = _mm_add_epi16(out_r1, res_r1); - out_r2 = _mm_add_epi16(out_r2, res_r2); - out_r3 = _mm_add_epi16(out_r3, res_r3); - - _mm_storeu_si128((__m128i *) (&pi2_res[0 * i4_res_stride]), out_r0); - _mm_storeu_si128((__m128i *) (&pi2_res[1 * i4_res_stride]), out_r1); - _mm_storeu_si128((__m128i *) (&pi2_res[2 * i4_res_stride]), out_r2); - _mm_storeu_si128((__m128i *) (&pi2_res[3 * i4_res_stride]), out_r3); - - pred_r0 = _mm_cvtepu16_epi32(pred_r0); - pred_r1 = _mm_cvtepu16_epi32(pred_r1); - pred_r2 = _mm_cvtepu16_epi32(pred_r2); - pred_r3 = _mm_cvtepu16_epi32(pred_r3); - - resq_r0 = _mm_add_epi16(pred_r0, res_r0); - resq_r1 = _mm_add_epi16(pred_r1, res_r1); - resq_r2 = _mm_add_epi16(pred_r2, res_r2); - resq_r3 = _mm_add_epi16(pred_r3, res_r3); - - temp0 = _mm_packus_epi32(resq_r0, resq_r1); - temp1 = _mm_packus_epi32(resq_r2, resq_r3); - - /* Clipping the results to 8 bits */ - mask_r0 = _mm_cmpgt_epi16(temp0, zero_8x16b); - temp0 = _mm_and_si128(temp0, mask_r0); - mask_r0 = _mm_cmpgt_epi16(temp1, zero_8x16b); - temp1 = _mm_and_si128(temp1, mask_r0); - - resq_r0 = _mm_packus_epi16(temp0, temp1); - resq_r1 = _mm_srli_si128(resq_r0, 4); - resq_r2 = _mm_srli_si128(resq_r1, 4); - resq_r3 = _mm_srli_si128(resq_r2, 4); - - resq_r0 = _mm_cvtepu8_epi16(resq_r0); - resq_r1 = _mm_cvtepu8_epi16(resq_r1); - resq_r2 = _mm_cvtepu8_epi16(resq_r2); - resq_r3 = _mm_cvtepu8_epi16(resq_r3); - - chroma_mask = _mm_set1_epi16(0xFF00); - out_r0 = _mm_loadl_epi64((__m128i *) (&pu1_out[0 * i4_out_stride])); - out_r1 = _mm_loadl_epi64((__m128i *) (&pu1_out[1 * i4_out_stride])); - out_r2 = _mm_loadl_epi64((__m128i *) (&pu1_out[2 * i4_out_stride])); - out_r3 = _mm_loadl_epi64((__m128i *) (&pu1_out[3 * i4_out_stride])); - - out_r0 = _mm_and_si128(out_r0, chroma_mask); - out_r1 = _mm_and_si128(out_r1, chroma_mask); - out_r2 = _mm_and_si128(out_r2, chroma_mask); - out_r3 = _mm_and_si128(out_r3, chroma_mask); - - out_r0 = _mm_add_epi8(out_r0, resq_r0); - out_r1 = _mm_add_epi8(out_r1, resq_r1); - out_r2 = _mm_add_epi8(out_r2, resq_r2); - out_r3 = _mm_add_epi8(out_r3, resq_r3); - - _mm_storel_epi64((__m128i *) (&pu1_out[0 * i4_out_stride]), out_r0); - _mm_storel_epi64((__m128i *) (&pu1_out[1 * i4_out_stride]), out_r1); - _mm_storel_epi64((__m128i *) (&pu1_out[2 * i4_out_stride]), out_r2); - _mm_storel_epi64((__m128i *) (&pu1_out[3 * i4_out_stride]), out_r3); -} - -void isvc_iquant_itrans_recon_dc_4x4_sse42(buffer_container_t *ps_src, buffer_container_t *ps_pred, - buffer_container_t *ps_res_pred, - buffer_container_t *ps_res, buffer_container_t *ps_rec, - iq_it_res_rec_constants_t *ps_iq_it_res_rec_constants, - WORD16 *pi2_tmp, WORD16 *pi2_dc_src, - WORD32 i4_iq_start_idx, UWORD8 u1_res_accumulate) -{ - UWORD8 *pu1_pred = (UWORD8 *) ps_pred->pv_data; - UWORD8 *pu1_out = (UWORD8 *) ps_rec->pv_data; - WORD32 i4_pred_stride = ps_pred->i4_data_stride; - WORD32 i4_out_stride = ps_rec->i4_data_stride; - const UWORD16 *pu2_iscal_mat = ps_iq_it_res_rec_constants->pu2_iscal_mat; - const UWORD16 *pu2_weigh_mat = ps_iq_it_res_rec_constants->pu2_weigh_mat; - UWORD32 u4_qp_div_6 = ps_iq_it_res_rec_constants->u4_qp_div_6; - UWORD32 *pu4_out = (UWORD32 *) pu1_out; - WORD32 q0 = ((WORD16 *) (ps_src->pv_data))[0]; - WORD16 i_macro, rnd_fact = (u4_qp_div_6 < 4) ? 1 << (3 - u4_qp_div_6) : 0; - - __m128i pred_r0, pred_r1, pred_r2, pred_r3; - __m128i sign_reg; - /* all bits reset to zero */ - __m128i zero_8x16b = _mm_setzero_si128(); - __m128i temp4, temp5, temp6, temp7; - __m128i value_add; - - ASSERT(0 == u1_res_accumulate); - - UNUSED(pi2_tmp); - UNUSED(ps_res); - UNUSED(ps_res_pred); - UNUSED(u1_res_accumulate); - - INV_QUANT(q0, pu2_iscal_mat[0], pu2_weigh_mat[0], u4_qp_div_6, rnd_fact, 4); - - /* Restoring dc value for intra case */ - if(i4_iq_start_idx != 0) - { - q0 = pi2_dc_src[0]; - } - - i_macro = ((q0 + 32) >> 6); - - value_add = _mm_set1_epi16(i_macro); - - zero_8x16b = _mm_setzero_si128(); - - /* Load pred buffer */ - - /* p00 p01 p02 p03 0 0 0 0 -- all 8 bits */ - pred_r0 = _mm_loadl_epi64((__m128i *) (&pu1_pred[0])); - - /* p10 p11 p12 p13 0 0 0 0 -- all 8 bits */ - pred_r1 = _mm_loadl_epi64((__m128i *) (&pu1_pred[i4_pred_stride])); - - /* p20 p21 p22 p23 0 0 0 0 -- all 8 bits */ - pred_r2 = _mm_loadl_epi64((__m128i *) (&pu1_pred[2 * i4_pred_stride])); - - /* p30 p31 p32 p33 0 0 0 0 -- all 8 bits */ - pred_r3 = _mm_loadl_epi64((__m128i *) (&pu1_pred[3 * i4_pred_stride])); - - pred_r0 = _mm_cvtepu8_epi16(pred_r0); - pred_r1 = _mm_cvtepu8_epi16(pred_r1); - pred_r2 = _mm_cvtepu8_epi16(pred_r2); - pred_r3 = _mm_cvtepu8_epi16(pred_r3); - - pred_r0 = _mm_unpacklo_epi64(pred_r0, pred_r1); - pred_r2 = _mm_unpacklo_epi64(pred_r2, pred_r3); - - temp4 = _mm_add_epi16(value_add, pred_r0); - temp5 = _mm_add_epi16(value_add, pred_r2); - /*------------------------------------------------------------------*/ - /* Clipping the results to 8 bits */ - sign_reg = _mm_cmpgt_epi16(temp4, zero_8x16b); - temp4 = _mm_and_si128(temp4, sign_reg); - sign_reg = _mm_cmpgt_epi16(temp5, zero_8x16b); - temp5 = _mm_and_si128(temp5, sign_reg); - - temp4 = _mm_packus_epi16(temp4, temp5); - temp5 = _mm_srli_si128(temp4, 4); - temp6 = _mm_srli_si128(temp5, 4); - temp7 = _mm_srli_si128(temp6, 4); - - *pu4_out = _mm_cvtsi128_si32(temp4); - pu1_out += i4_out_stride; - pu4_out = (UWORD32 *) (pu1_out); - *(pu4_out) = _mm_cvtsi128_si32(temp5); - pu1_out += i4_out_stride; - pu4_out = (UWORD32 *) (pu1_out); - *(pu4_out) = _mm_cvtsi128_si32(temp6); - pu1_out += i4_out_stride; - pu4_out = (UWORD32 *) (pu1_out); - *(pu4_out) = _mm_cvtsi128_si32(temp7); -} - -void isvc_iquant_itrans_recon_res_dc_4x4_sse42( - buffer_container_t *ps_src, buffer_container_t *ps_pred, buffer_container_t *ps_res_pred, - buffer_container_t *ps_res, buffer_container_t *ps_rec, - iq_it_res_rec_constants_t *ps_iq_it_res_rec_constants, WORD16 *pi2_tmp, WORD16 *pi2_dc_src, - WORD32 i4_iq_start_idx, UWORD8 u1_res_accumulate) -{ - WORD16 *pi2_res = (WORD16 *) ps_res->pv_data; - WORD16 *pi2_res_ptr = pi2_res; - UWORD8 *pu1_pred = (UWORD8 *) ps_pred->pv_data; - UWORD8 *pu1_out = (UWORD8 *) ps_rec->pv_data; - WORD32 i4_res_stride = ps_res->i4_data_stride; - WORD32 i4_pred_stride = ps_pred->i4_data_stride; - WORD32 i4_out_stride = ps_rec->i4_data_stride; - const UWORD16 *pu2_iscal_mat = ps_iq_it_res_rec_constants->pu2_iscal_mat; - const UWORD16 *pu2_weigh_mat = ps_iq_it_res_rec_constants->pu2_weigh_mat; - UWORD32 u4_qp_div_6 = ps_iq_it_res_rec_constants->u4_qp_div_6; - UWORD32 *pu4_out = (UWORD32 *) pu1_out; - WORD32 q0 = ((WORD16 *) (ps_src->pv_data))[0]; - WORD16 i_macro, rnd_fact = (u4_qp_div_6 < 4) ? 1 << (3 - u4_qp_div_6) : 0; - + UWORD32 *pu4_out = (UWORD32 *) pu1_out; + WORD32 q0 = ((WORD16 *) (ps_src->pv_data))[0]; + WORD16 i_macro, rnd_fact = (u4_qp_div_6 < 4) ? 1 << (3 - u4_qp_div_6) : 0; + __m128i pred_r0, pred_r1, pred_r2, pred_r3; __m128i sign_reg; /* all bits reset to zero */ @@ -2221,17 +1775,21 @@ void isvc_iquant_itrans_recon_res_dc_4x4_sse42( ASSERT(0 == u1_res_accumulate); UNUSED(pi2_tmp); + UNUSED(ps_res); UNUSED(ps_res_pred); UNUSED(u1_res_accumulate); INV_QUANT(q0, pu2_iscal_mat[0], pu2_weigh_mat[0], u4_qp_div_6, rnd_fact, 4); /* Restoring dc value for intra case */ - if(i4_iq_start_idx != 0) q0 = pi2_dc_src[0]; + if(i4_iq_start_idx != 0) + { + q0 = pi2_dc_src[0]; + } i_macro = ((q0 + 32) >> 6); - value_add = _mm_set1_epi16(isvc_get_residue(i_macro, 0, 0)); + value_add = _mm_set1_epi16(i_macro); zero_8x16b = _mm_setzero_si128(); @@ -2259,141 +1817,6 @@ void isvc_iquant_itrans_recon_res_dc_4x4_sse42( temp4 = _mm_add_epi16(value_add, pred_r0); temp5 = _mm_add_epi16(value_add, pred_r2); - - _mm_storeu_si128((__m128i *) (&pi2_res_ptr[0]), value_add); - _mm_storeu_si128((__m128i *) (&pi2_res_ptr[i4_res_stride]), value_add); - _mm_storeu_si128((__m128i *) (&pi2_res_ptr[2 * i4_res_stride]), value_add); - _mm_storeu_si128((__m128i *) (&pi2_res_ptr[3 * i4_res_stride]), value_add); - /*------------------------------------------------------------------*/ - /* Clipping the results to 8 bits */ - sign_reg = _mm_cmpgt_epi16(temp4, zero_8x16b); - temp4 = _mm_and_si128(temp4, sign_reg); - sign_reg = _mm_cmpgt_epi16(temp5, zero_8x16b); - temp5 = _mm_and_si128(temp5, sign_reg); - - temp4 = _mm_packus_epi16(temp4, temp5); - temp5 = _mm_srli_si128(temp4, 4); - temp6 = _mm_srli_si128(temp5, 4); - temp7 = _mm_srli_si128(temp6, 4); - - *pu4_out = _mm_cvtsi128_si32(temp4); - pu1_out += i4_out_stride; - pu4_out = (UWORD32 *) (pu1_out); - *(pu4_out) = _mm_cvtsi128_si32(temp5); - pu1_out += i4_out_stride; - pu4_out = (UWORD32 *) (pu1_out); - *(pu4_out) = _mm_cvtsi128_si32(temp6); - pu1_out += i4_out_stride; - pu4_out = (UWORD32 *) (pu1_out); - *(pu4_out) = _mm_cvtsi128_si32(temp7); -} - -void isvc_iquant_itrans_recon_res_dc_with_res_acc_4x4_sse42( - buffer_container_t *ps_src, buffer_container_t *ps_pred, buffer_container_t *ps_res_pred, - buffer_container_t *ps_res, buffer_container_t *ps_rec, - iq_it_res_rec_constants_t *ps_iq_it_res_rec_constants, WORD16 *pi2_tmp, WORD16 *pi2_dc_src, - WORD32 i4_iq_start_idx, UWORD8 u1_res_accumulate) -{ - WORD16 *pi2_res = (WORD16 *) ps_res->pv_data; - WORD16 *pi2_res_ptr = pi2_res; - WORD16 *pi2_res_pred = (WORD16 *) ps_res_pred->pv_data; - WORD16 *pi2_res_pred_ptr = pi2_res_pred; - UWORD8 *pu1_pred = (UWORD8 *) ps_pred->pv_data; - UWORD8 *pu1_out = (UWORD8 *) ps_rec->pv_data; - WORD32 i4_res_stride = ps_res->i4_data_stride; - WORD32 i4_res_pred_stride = ps_res_pred->i4_data_stride; - WORD32 i4_pred_stride = ps_pred->i4_data_stride; - WORD32 i4_out_stride = ps_rec->i4_data_stride; - const UWORD16 *pu2_iscal_mat = ps_iq_it_res_rec_constants->pu2_iscal_mat; - const UWORD16 *pu2_weigh_mat = ps_iq_it_res_rec_constants->pu2_weigh_mat; - UWORD32 u4_qp_div_6 = ps_iq_it_res_rec_constants->u4_qp_div_6; - UWORD32 *pu4_out = (UWORD32 *) pu1_out; - WORD32 q0 = ((WORD16 *) (ps_src->pv_data))[0]; - WORD16 i_macro, rnd_fact = (u4_qp_div_6 < 4) ? 1 << (3 - u4_qp_div_6) : 0; - - __m128i pred_r0, pred_r1, pred_r2, pred_r3; - __m128i sign_reg; - /* all bits reset to zero */ - __m128i zero_8x16b = _mm_setzero_si128(); - __m128i temp4, temp5, temp6, temp7; - __m128i value_add; - __m128i res_pred_r0, res_pred_r1, res_pred_r2, res_pred_r3; - __m128i temp0, temp1; - __m128i neg_255_8x16b = _mm_set1_epi16(-((WORD16) UINT8_MAX)); - __m128i pos_255_8x16b = _mm_set1_epi16(((WORD16) UINT8_MAX)); - - ASSERT(1 == u1_res_accumulate); - - UNUSED(pi2_tmp); - UNUSED(u1_res_accumulate); - - INV_QUANT(q0, pu2_iscal_mat[0], pu2_weigh_mat[0], u4_qp_div_6, rnd_fact, 4); - - /* Restoring dc value for intra case */ - if(i4_iq_start_idx != 0) q0 = pi2_dc_src[0]; - - i_macro = ((q0 + 32) >> 6); - - value_add = _mm_set1_epi16(i_macro); - - zero_8x16b = _mm_setzero_si128(); - - /* Load pred buffer */ - - /* p00 p01 p02 p03 0 0 0 0 -- all 8 bits */ - pred_r0 = _mm_loadl_epi64((__m128i *) (&pu1_pred[0])); - - /* p10 p11 p12 p13 0 0 0 0 -- all 8 bits */ - pred_r1 = _mm_loadl_epi64((__m128i *) (&pu1_pred[i4_pred_stride])); - - /* p20 p21 p22 p23 0 0 0 0 -- all 8 bits */ - pred_r2 = _mm_loadl_epi64((__m128i *) (&pu1_pred[2 * i4_pred_stride])); - - /* p30 p31 p32 p33 0 0 0 0 -- all 8 bits */ - pred_r3 = _mm_loadl_epi64((__m128i *) (&pu1_pred[3 * i4_pred_stride])); - - pred_r0 = _mm_cvtepu8_epi16(pred_r0); - pred_r1 = _mm_cvtepu8_epi16(pred_r1); - pred_r2 = _mm_cvtepu8_epi16(pred_r2); - pred_r3 = _mm_cvtepu8_epi16(pred_r3); - - pred_r0 = _mm_unpacklo_epi64(pred_r0, pred_r1); - pred_r2 = _mm_unpacklo_epi64(pred_r2, pred_r3); - - /* Accumulating res */ - res_pred_r0 = _mm_loadl_epi64((__m128i *) &pi2_res_pred_ptr[0]); - res_pred_r1 = _mm_loadl_epi64((__m128i *) &pi2_res_pred_ptr[i4_res_pred_stride]); - res_pred_r2 = _mm_loadl_epi64((__m128i *) &pi2_res_pred_ptr[2 * i4_res_pred_stride]); - res_pred_r3 = _mm_loadl_epi64((__m128i *) &pi2_res_pred_ptr[3 * i4_res_pred_stride]); - - res_pred_r0 = _mm_unpacklo_epi64(res_pred_r0, res_pred_r1); - res_pred_r1 = _mm_unpacklo_epi64(res_pred_r2, res_pred_r3); - - temp0 = _mm_add_epi16(value_add, res_pred_r0); - temp1 = _mm_add_epi16(value_add, res_pred_r1); - - /* Saturate all values < -255 to -255 and retain the rest as it is */ - temp0 = _mm_max_epi16(temp0, neg_255_8x16b); - /* Saturate all values > 255 to 255 and retain the rest as it is */ - temp0 = _mm_min_epi16(temp0, pos_255_8x16b); - - /* Saturate all values < -255 to -255 and retain the rest as it is */ - temp1 = _mm_max_epi16(temp1, neg_255_8x16b); - /* Saturate all values > 255 to 255 and retain the rest as it is */ - temp1 = _mm_min_epi16(temp1, pos_255_8x16b); - - _mm_storeu_si128((__m128i *) (&pi2_res_ptr[0]), temp0); - _mm_storeu_si128((__m128i *) (&pi2_res_ptr[2 * i4_res_stride]), temp1); - - temp4 = _mm_add_epi16(temp0, pred_r0); - temp5 = _mm_add_epi16(temp1, pred_r2); - - temp0 = _mm_srli_si128(temp0, 8); - temp1 = _mm_srli_si128(temp1, 8); - - _mm_storeu_si128((__m128i *) (&pi2_res_ptr[i4_res_stride]), temp0); - _mm_storeu_si128((__m128i *) (&pi2_res_ptr[3 * i4_res_stride]), temp1); - /*------------------------------------------------------------------*/ /* Clipping the results to 8 bits */ sign_reg = _mm_cmpgt_epi16(temp4, zero_8x16b); @@ -2418,110 +1841,6 @@ void isvc_iquant_itrans_recon_res_dc_with_res_acc_4x4_sse42( *(pu4_out) = _mm_cvtsi128_si32(temp7); } -void isvc_iquant_itrans_recon_chroma_4x4_dc_sse42( - buffer_container_t *ps_src, buffer_container_t *ps_pred, buffer_container_t *ps_res_pred, - buffer_container_t *ps_res, buffer_container_t *ps_rec, - iq_it_res_rec_constants_t *ps_iq_it_res_rec_constants, WORD16 *pi2_tmp, WORD16 *pi2_dc_src, - WORD32 i4_iq_start_idx, UWORD8 u1_res_accumulate) -{ - WORD16 *pi2_src = (WORD16 *) ps_src->pv_data; - UWORD8 *pu1_pred = (UWORD8 *) ps_pred->pv_data; - UWORD8 *pu1_out = (UWORD8 *) ps_rec->pv_data; - WORD32 i4_pred_stride = ps_pred->i4_data_stride; - WORD32 i4_out_stride = ps_rec->i4_data_stride; - const UWORD16 *pu2_iscal_mat = ps_iq_it_res_rec_constants->pu2_iscal_mat; - const UWORD16 *pu2_weigh_mat = ps_iq_it_res_rec_constants->pu2_weigh_mat; - UWORD32 u4_qp_div_6 = ps_iq_it_res_rec_constants->u4_qp_div_6; - /* DC value won't be dequantized for chroma - inverse transform */ - WORD16 q0 = pi2_dc_src[0]; - WORD16 i_macro = ((q0 + 32) >> 6); - - __m128i pred_r0, pred_r1, pred_r2, pred_r3; - /* all bits reset to zero */ - __m128i zero_8x16b = _mm_setzero_si128(); - __m128i chroma_mask = _mm_set1_epi16(0xFF); - __m128i value_add = _mm_set1_epi16(i_macro); - __m128i out_r0, out_r1, out_r2, out_r3; - - ASSERT(0 == u1_res_accumulate); - - UNUSED(pi2_src); - UNUSED(pu2_iscal_mat); - UNUSED(pu2_weigh_mat); - UNUSED(u4_qp_div_6); - UNUSED(pi2_tmp); - UNUSED(ps_res_pred); - UNUSED(ps_res); - UNUSED(i4_iq_start_idx); - UNUSED(u1_res_accumulate); - - /* Load pred buffer */ - pred_r0 = _mm_loadl_epi64((__m128i *) (&pu1_pred[0])); - - pred_r1 = _mm_loadl_epi64((__m128i *) (&pu1_pred[i4_pred_stride])); - - pred_r2 = _mm_loadl_epi64((__m128i *) (&pu1_pred[2 * i4_pred_stride])); - - pred_r3 = _mm_loadl_epi64((__m128i *) (&pu1_pred[3 * i4_pred_stride])); - - /* Mask alternate pred values from the interleaved pred buf */ - pred_r0 = _mm_and_si128(pred_r0, chroma_mask); - pred_r1 = _mm_and_si128(pred_r1, chroma_mask); - pred_r2 = _mm_and_si128(pred_r2, chroma_mask); - pred_r3 = _mm_and_si128(pred_r3, chroma_mask); - - /* Pack the first four 16 bit values of 2 regs into a single reg*/ - pred_r0 = _mm_unpacklo_epi64(pred_r0, pred_r1); - pred_r2 = _mm_unpacklo_epi64(pred_r2, pred_r3); - - /* Compute out pixel by adding res to pred */ - pred_r0 = _mm_add_epi16(value_add, pred_r0); - pred_r2 = _mm_add_epi16(value_add, pred_r2); - /*------------------------------------------------------------------*/ - /* Clipping the results to 8 bits */ - pred_r0 = _mm_packus_epi16(pred_r0, pred_r2); - pred_r1 = _mm_srli_si128(pred_r0, 4); - pred_r2 = _mm_srli_si128(pred_r1, 4); - pred_r3 = _mm_srli_si128(pred_r2, 4); - - /* p00 p01 p02 p03 -- all 16 bits */ - pred_r0 = _mm_unpacklo_epi8(pred_r0, zero_8x16b); - /* p10 p11 p12 p13 -- all 16 bits */ - pred_r1 = _mm_unpacklo_epi8(pred_r1, zero_8x16b); - /* p20 p21 p22 p23 -- all 16 bits */ - pred_r2 = _mm_unpacklo_epi8(pred_r2, zero_8x16b); - /* p30 p31 p32 p33 -- all 16 bits */ - pred_r3 = _mm_unpacklo_epi8(pred_r3, zero_8x16b); - - /* Load interleaved out buffer */ - out_r0 = _mm_loadl_epi64((__m128i *) (&pu1_out[0])); - out_r1 = _mm_loadl_epi64((__m128i *) (&pu1_out[i4_out_stride])); - out_r2 = _mm_loadl_epi64((__m128i *) (&pu1_out[2 * i4_out_stride])); - out_r3 = _mm_loadl_epi64((__m128i *) (&pu1_out[3 * i4_out_stride])); - - /* Mask the interleaved out buf in order to save the U/V out pixel computed in - this function call without thrashing the U/V out pixel that was saved - during an earlier function call */ - chroma_mask = _mm_set1_epi16(0xFF00); - - out_r0 = _mm_and_si128(out_r0, chroma_mask); - out_r1 = _mm_and_si128(out_r1, chroma_mask); - out_r2 = _mm_and_si128(out_r2, chroma_mask); - out_r3 = _mm_and_si128(out_r3, chroma_mask); - - /* Save the out pixels in alternate locations */ - out_r0 = _mm_add_epi8(out_r0, pred_r0); - out_r1 = _mm_add_epi8(out_r1, pred_r1); - out_r2 = _mm_add_epi8(out_r2, pred_r2); - out_r3 = _mm_add_epi8(out_r3, pred_r3); - - _mm_storel_epi64((__m128i *) (&pu1_out[0]), out_r0); - _mm_storel_epi64((__m128i *) (&pu1_out[i4_out_stride]), out_r1); - _mm_storel_epi64((__m128i *) (&pu1_out[2 * i4_out_stride]), out_r2); - _mm_storel_epi64((__m128i *) (&pu1_out[3 * i4_out_stride]), out_r3); -} - void isvc_iquant_itrans_recon_res_chroma_4x4_dc_sse42( buffer_container_t *ps_src, buffer_container_t *ps_pred, buffer_container_t *ps_res_pred, buffer_container_t *ps_res, buffer_container_t *ps_rec, diff --git a/encoder/arm/svc/isvce_function_selector_a9q.c b/encoder/arm/svc/isvce_function_selector_a9q.c index b5f8ba4d..6a1abb17 100644 --- a/encoder/arm/svc/isvce_function_selector_a9q.c +++ b/encoder/arm/svc/isvce_function_selector_a9q.c @@ -161,35 +161,23 @@ void isvce_init_function_ptr_neon_a9q(isvce_codec_t *ps_codec) isvc_resi_trans_quant_chroma_4x4_with_residual_sub_neon; /* Init inverse transform fn ptr */ - ps_enc_loop_fxns->apf_iquant_itrans_recon_8x8[0] = isvc_iquant_itrans_recon_8x8; - ps_enc_loop_fxns->apf_iquant_itrans_recon_8x8[1] = isvc_iquant_itrans_recon_8x8; - ps_enc_loop_fxns->apf_iquant_itrans_recon_8x8[2] = isvc_iquant_itrans_recon_8x8; - ps_enc_loop_fxns->apf_iquant_itrans_recon_4x4[0] = isvc_iquant_itrans_recon_4x4_with_res_output_neon; ps_enc_loop_fxns->apf_iquant_itrans_recon_4x4[1] = isvc_iquant_itrans_recon_4x4_with_res_accumulate_neon; ps_enc_loop_fxns->apf_iquant_itrans_recon_4x4[2] = isvc_iquant_itrans_recon_4x4_neon; - ps_enc_loop_fxns->apf_iquant_itrans_recon_4x4_dc[0] = - isvc_iquant_itrans_recon_4x4_dc_with_res_output_neon; - ps_enc_loop_fxns->apf_iquant_itrans_recon_4x4_dc[1] = - isvc_iquant_itrans_recon_4x4_dc_with_res_accumulate_neon; ps_enc_loop_fxns->apf_iquant_itrans_recon_4x4_dc[2] = isvc_iquant_itrans_recon_4x4_dc_neon; ps_enc_loop_fxns->apf_iquant_itrans_recon_chroma_4x4[0] = isvc_iquant_itrans_recon_chroma_4x4_with_res_output_neon; ps_enc_loop_fxns->apf_iquant_itrans_recon_chroma_4x4[1] = isvc_iquant_itrans_recon_chroma_4x4_with_res_accumulate_neon; - ps_enc_loop_fxns->apf_iquant_itrans_recon_chroma_4x4[2] = - isvc_iquant_itrans_recon_chroma_4x4_neon; ps_enc_loop_fxns->apf_iquant_itrans_recon_chroma_4x4_dc[0] = isvc_iquant_itrans_recon_chroma_4x4_dc_with_res_output_neon; ps_enc_loop_fxns->apf_iquant_itrans_recon_chroma_4x4_dc[1] = isvc_iquant_itrans_recon_chroma_4x4_dc_with_res_accumulate_neon; - ps_enc_loop_fxns->apf_iquant_itrans_recon_chroma_4x4_dc[2] = - isvc_iquant_itrans_recon_chroma_4x4_dc_neon; ps_enc_loop_fxns->pf_ihadamard_scaling_4x4 = ih264_ihadamard_scaling_4x4_a9; ps_enc_loop_fxns->pf_ihadamard_scaling_2x2_uv = ih264_ihadamard_scaling_2x2_uv_a9; @@ -243,9 +231,7 @@ void isvce_init_function_ptr_neon_a9q(isvce_codec_t *ps_codec) ps_codec->pf_compute_sad_16x8 = ime_compute_sad_16x8_a9q; /* memor handling operations */ - ps_mem_fxns->pf_mem_cpy = ih264_memcpy_a9q; ps_mem_fxns->pf_mem_cpy_mul8 = ih264_memcpy_mul_8_a9q; - ps_mem_fxns->pf_mem_set = ih264_memset_a9q; ps_mem_fxns->pf_mem_set_mul8 = ih264_memset_mul_8_a9q; /* sad me level functions */ diff --git a/encoder/arm/svc/isvce_function_selector_av8.c b/encoder/arm/svc/isvce_function_selector_av8.c index 16c08bb9..c517c03f 100644 --- a/encoder/arm/svc/isvce_function_selector_av8.c +++ b/encoder/arm/svc/isvce_function_selector_av8.c @@ -161,35 +161,23 @@ void isvce_init_function_ptr_neon_av8(isvce_codec_t *ps_codec) isvc_resi_trans_quant_chroma_4x4_with_residual_sub_neon; /* Init inverse transform fn ptr */ - ps_enc_loop_fxns->apf_iquant_itrans_recon_8x8[0] = isvc_iquant_itrans_recon_8x8; - ps_enc_loop_fxns->apf_iquant_itrans_recon_8x8[1] = isvc_iquant_itrans_recon_8x8; - ps_enc_loop_fxns->apf_iquant_itrans_recon_8x8[2] = isvc_iquant_itrans_recon_8x8; - ps_enc_loop_fxns->apf_iquant_itrans_recon_4x4[0] = isvc_iquant_itrans_recon_4x4_with_res_output_neon; ps_enc_loop_fxns->apf_iquant_itrans_recon_4x4[1] = isvc_iquant_itrans_recon_4x4_with_res_accumulate_neon; ps_enc_loop_fxns->apf_iquant_itrans_recon_4x4[2] = isvc_iquant_itrans_recon_4x4_neon; - - ps_enc_loop_fxns->apf_iquant_itrans_recon_4x4_dc[0] = - isvc_iquant_itrans_recon_4x4_dc_with_res_output_neon; - ps_enc_loop_fxns->apf_iquant_itrans_recon_4x4_dc[1] = - isvc_iquant_itrans_recon_4x4_dc_with_res_accumulate_neon; + ; ps_enc_loop_fxns->apf_iquant_itrans_recon_4x4_dc[2] = isvc_iquant_itrans_recon_4x4_dc_neon; ps_enc_loop_fxns->apf_iquant_itrans_recon_chroma_4x4[0] = isvc_iquant_itrans_recon_chroma_4x4_with_res_output_neon; ps_enc_loop_fxns->apf_iquant_itrans_recon_chroma_4x4[1] = isvc_iquant_itrans_recon_chroma_4x4_with_res_accumulate_neon; - ps_enc_loop_fxns->apf_iquant_itrans_recon_chroma_4x4[2] = - isvc_iquant_itrans_recon_chroma_4x4_neon; ps_enc_loop_fxns->apf_iquant_itrans_recon_chroma_4x4_dc[0] = isvc_iquant_itrans_recon_chroma_4x4_dc_with_res_output_neon; ps_enc_loop_fxns->apf_iquant_itrans_recon_chroma_4x4_dc[1] = isvc_iquant_itrans_recon_chroma_4x4_dc_with_res_accumulate_neon; - ps_enc_loop_fxns->apf_iquant_itrans_recon_chroma_4x4_dc[2] = - isvc_iquant_itrans_recon_chroma_4x4_dc_neon; ps_enc_loop_fxns->pf_ihadamard_scaling_4x4 = ih264_ihadamard_scaling_4x4_av8; ps_enc_loop_fxns->pf_ihadamard_scaling_2x2_uv = ih264_ihadamard_scaling_2x2_uv_av8; @@ -243,9 +231,7 @@ void isvce_init_function_ptr_neon_av8(isvce_codec_t *ps_codec) ps_codec->pf_compute_sad_16x8 = ime_compute_sad_16x8_av8; /* memor handling operations */ - ps_mem_fxns->pf_mem_cpy = ih264_memcpy_av8; ps_mem_fxns->pf_mem_cpy_mul8 = ih264_memcpy_mul_8_av8; - ps_mem_fxns->pf_mem_set = ih264_memset_av8; ps_mem_fxns->pf_mem_set_mul8 = ih264_memset_mul_8_av8; /* sad me level functions */ diff --git a/encoder/svc/isvce_api.c b/encoder/svc/isvce_api.c index 2004dbaf..d44d7e72 100644 --- a/encoder/svc/isvce_api.c +++ b/encoder/svc/isvce_api.c @@ -3342,10 +3342,14 @@ static WORD32 isvce_init_mem_rec(iv_obj_t *ps_codec_obj, void *pv_api_ip, void * WORD32 max_wd_luma, max_ht_luma; WORD32 max_mb_rows, max_mb_cols, max_mb_cnt; - /* temp var */ WORD32 i, j; WORD32 status = IV_SUCCESS; + if(MAX_CTXT_SETS != 1) + { + return IV_FAIL; + } + /* mem records */ ps_mem_rec_base = ps_ip->s_ive_ip.ps_mem_rec; @@ -3493,117 +3497,59 @@ static WORD32 isvce_init_mem_rec(iv_obj_t *ps_codec_obj, void *pv_api_ip, void * ps_mem_rec = &ps_mem_rec_base[ISVCE_MEM_REC_ENTROPY]; { /* temp var */ - WORD32 size = 0, offset; + WORD32 size = 0; for(i = 0; i < MAX_PROCESS_CTXT; i++) { - if(i < MAX_PROCESS_CTXT / MAX_CTXT_SETS) - { - /* base ptr */ - UWORD8 *pu1_buf = ps_mem_rec->pv_base; - - /* reset size */ - size = 0; - - /* skip mb run */ - ps_codec->as_process[i].s_entropy.pi4_mb_skip_run = (WORD32 *) (pu1_buf + size); - size += sizeof(WORD32); - size = ALIGN8(size); - - /* entropy map */ - ps_codec->as_process[i].s_entropy.pu1_entropy_map = - (UWORD8 *) (pu1_buf + size + max_mb_cols); - /* size in bytes to store entropy status of an entire frame */ - size += (max_mb_cols * max_mb_rows); - /* add an additional 1 row of bytes to evade the special case of row 0 - */ - size += max_mb_cols; - size = ALIGN128(size); - - /* bit stream ptr */ - ps_codec->as_process[i].s_entropy.ps_bitstrm = (bitstrm_t *) (pu1_buf + size); - size += sizeof(ps_codec->as_process[i].s_entropy.ps_bitstrm[0]); - size = ALIGN128(size); - -#if ENABLE_RE_ENC_AS_SKIP - ps_codec->as_process[i].s_entropy.ps_bitstrm_after_slice_hdr = - (bitstrm_t *) (pu1_buf + size); - size += sizeof(ps_codec->as_process[i].s_entropy.ps_bitstrm_after_slice_hdr[0]); - size = ALIGN128(size); -#endif - - /* nnz luma */ - ps_codec->as_process[i].s_entropy.pu1_top_nnz_luma = (UWORD8(*)[4])(pu1_buf + size); - size += (max_mb_cols * 4 * sizeof(UWORD8)); - size = ALIGN128(size); - - /* nnz chroma */ - ps_codec->as_process[i].s_entropy.pu1_top_nnz_cbcr = (UWORD8(*)[4])(pu1_buf + size); - size += (max_mb_cols * 4 * sizeof(UWORD8)); - size = ALIGN128(size); - - /* ps_mb_qp_ctxt */ - ps_codec->as_process[i].s_entropy.ps_mb_qp_ctxt = (mb_qp_ctxt_t *) (pu1_buf + size); - size += ALIGN128(sizeof(ps_codec->as_process[i].s_entropy.ps_mb_qp_ctxt[0])); - - offset = size; - - /* cabac Context */ - ps_codec->as_process[i].s_entropy.ps_cabac = ps_cabac; - } - else - { - /* base ptr */ - UWORD8 *pu1_buf = ps_mem_rec->pv_base; - - /* reset size */ - size = offset; - - /* skip mb run */ - ps_codec->as_process[i].s_entropy.pi4_mb_skip_run = (WORD32 *) (pu1_buf + size); - size += sizeof(WORD32); - size = ALIGN8(size); - - /* entropy map */ - ps_codec->as_process[i].s_entropy.pu1_entropy_map = - (UWORD8 *) (pu1_buf + size + max_mb_cols); - /* size in bytes to store entropy status of an entire frame */ - size += (max_mb_cols * max_mb_rows); - /* add an additional 1 row of bytes to evade the special case of row 0 - */ - size += max_mb_cols; - size = ALIGN128(size); - - /* bit stream ptr */ - ps_codec->as_process[i].s_entropy.ps_bitstrm = (bitstrm_t *) (pu1_buf + size); - size += sizeof(ps_codec->as_process[i].s_entropy.ps_bitstrm[0]); - size = ALIGN128(size); + /* base ptr */ + UWORD8 *pu1_buf = ps_mem_rec->pv_base; + + /* reset size */ + size = 0; + + /* skip mb run */ + ps_codec->as_process[i].s_entropy.pi4_mb_skip_run = (WORD32 *) (pu1_buf + size); + size += sizeof(WORD32); + size = ALIGN8(size); + + /* entropy map */ + ps_codec->as_process[i].s_entropy.pu1_entropy_map = + (UWORD8 *) (pu1_buf + size + max_mb_cols); + /* size in bytes to store entropy status of an entire frame */ + size += (max_mb_cols * max_mb_rows); + /* add an additional 1 row of bytes to evade the special case of row 0 + */ + size += max_mb_cols; + size = ALIGN128(size); + + /* bit stream ptr */ + ps_codec->as_process[i].s_entropy.ps_bitstrm = (bitstrm_t *) (pu1_buf + size); + size += sizeof(ps_codec->as_process[i].s_entropy.ps_bitstrm[0]); + size = ALIGN128(size); #if ENABLE_RE_ENC_AS_SKIP - ps_codec->as_process[i].s_entropy.ps_bitstrm_after_slice_hdr = - (bitstrm_t *) (pu1_buf + size); - size += sizeof(ps_codec->as_process[i].s_entropy.ps_bitstrm_after_slice_hdr[0]); - size = ALIGN128(size); + ps_codec->as_process[i].s_entropy.ps_bitstrm_after_slice_hdr = + (bitstrm_t *) (pu1_buf + size); + size += sizeof(ps_codec->as_process[i].s_entropy.ps_bitstrm_after_slice_hdr[0]); + size = ALIGN128(size); #endif - /* nnz luma */ - ps_codec->as_process[i].s_entropy.pu1_top_nnz_luma = - (UWORD8(*)[4])(UWORD8(*)[4])(pu1_buf + size); - size += (max_mb_cols * 4 * sizeof(UWORD8)); - size = ALIGN128(size); + /* nnz luma */ + ps_codec->as_process[i].s_entropy.pu1_top_nnz_luma = (UWORD8(*)[4])(pu1_buf + size); + size += (max_mb_cols * 4 * sizeof(UWORD8)); + size = ALIGN128(size); - /* nnz chroma */ - ps_codec->as_process[i].s_entropy.pu1_top_nnz_cbcr = (UWORD8(*)[4])(pu1_buf + size); - size += (max_mb_cols * 4 * sizeof(UWORD8)); - size = ALIGN128(size); + /* nnz chroma */ + ps_codec->as_process[i].s_entropy.pu1_top_nnz_cbcr = (UWORD8(*)[4])(pu1_buf + size); + size += (max_mb_cols * 4 * sizeof(UWORD8)); + size = ALIGN128(size); - /* ps_mb_qp_ctxt */ - ps_codec->as_process[i].s_entropy.ps_mb_qp_ctxt = (mb_qp_ctxt_t *) (pu1_buf + size); - size = ALIGN128(sizeof(ps_codec->as_process[i].s_entropy.ps_mb_qp_ctxt[0])); + /* ps_mb_qp_ctxt */ + ps_codec->as_process[i].s_entropy.ps_mb_qp_ctxt = (mb_qp_ctxt_t *) (pu1_buf + size); + size += ALIGN128(sizeof(ps_codec->as_process[i].s_entropy.ps_mb_qp_ctxt[0])); - /* cabac Context */ - ps_codec->as_process[i].s_entropy.ps_cabac = ps_cabac; - } + /* cabac Context */ + ps_codec->as_process[i].s_entropy.ps_cabac = ps_cabac; } ps_codec->as_process[0].s_entropy.ps_cabac->ps_mb_map_ctxt_inc_base = ps_mb_map_ctxt_inc; } @@ -3631,16 +3577,8 @@ static WORD32 isvce_init_mem_rec(iv_obj_t *ps_codec_obj, void *pv_api_ip, void * for(i = 0; i < MAX_PROCESS_CTXT; i++) { - if(i < MAX_PROCESS_CTXT / MAX_CTXT_SETS) - { - ps_codec->as_process[i].pv_pic_mb_coeff_data = pu1_buf; - ps_codec->as_process[i].s_entropy.pv_pic_mb_coeff_data = pu1_buf; - } - else - { - ps_codec->as_process[i].pv_pic_mb_coeff_data = pu1_buf + size; - ps_codec->as_process[i].s_entropy.pv_pic_mb_coeff_data = pu1_buf + size; - } + ps_codec->as_process[i].pv_pic_mb_coeff_data = pu1_buf; + ps_codec->as_process[i].s_entropy.pv_pic_mb_coeff_data = pu1_buf; } } @@ -3667,16 +3605,8 @@ static WORD32 isvce_init_mem_rec(iv_obj_t *ps_codec_obj, void *pv_api_ip, void * for(i = 0; i < MAX_PROCESS_CTXT; i++) { - if(i < MAX_PROCESS_CTXT / MAX_CTXT_SETS) - { - ps_codec->as_process[i].pv_pic_mb_header_data = pu1_buf; - ps_codec->as_process[i].s_entropy.pv_pic_mb_header_data = pu1_buf; - } - else - { - ps_codec->as_process[i].pv_pic_mb_header_data = pu1_buf + size; - ps_codec->as_process[i].s_entropy.pv_pic_mb_header_data = pu1_buf + size; - } + ps_codec->as_process[i].pv_pic_mb_header_data = pu1_buf; + ps_codec->as_process[i].s_entropy.pv_pic_mb_header_data = pu1_buf; } } @@ -3729,17 +3659,7 @@ static WORD32 isvce_init_mem_rec(iv_obj_t *ps_codec_obj, void *pv_api_ip, void * for(i = 0; i < MAX_PROCESS_CTXT; i++) { - if(i < MAX_PROCESS_CTXT / MAX_CTXT_SETS) - { - ps_codec->as_process[i].ps_svc_nalu_ext_base = ps_mem_rec->pv_base; - } - else - { - WORD32 size = SVC_MAX_SLICE_HDR_CNT * sizeof(slice_header_t); - void *pv_buf = (UWORD8 *) ps_mem_rec->pv_base + size; - - ps_codec->as_process[i].ps_svc_nalu_ext_base = pv_buf; - } + ps_codec->as_process[i].ps_svc_nalu_ext_base = ps_mem_rec->pv_base; } } @@ -3757,18 +3677,7 @@ static WORD32 isvce_init_mem_rec(iv_obj_t *ps_codec_obj, void *pv_api_ip, void * for(i = 0; i < MAX_PROCESS_CTXT; i++) { - if(i < MAX_PROCESS_CTXT / MAX_CTXT_SETS) - { - ps_codec->as_process[i].ps_slice_hdr_base = ps_mem_rec->pv_base; - } - else - { - /* temp var */ - WORD32 size = SVC_MAX_SLICE_HDR_CNT * sizeof(slice_header_t); - void *pv_buf = (UWORD8 *) ps_mem_rec->pv_base + size; - - ps_codec->as_process[i].ps_slice_hdr_base = pv_buf; - } + ps_codec->as_process[i].ps_slice_hdr_base = ps_mem_rec->pv_base; } } @@ -3789,14 +3698,7 @@ static WORD32 isvce_init_mem_rec(iv_obj_t *ps_codec_obj, void *pv_api_ip, void * for(i = 0; i < MAX_PROCESS_CTXT; i++) { - if(i < MAX_PROCESS_CTXT / MAX_CTXT_SETS) - { - ps_codec->as_process[i].pu1_is_intra_coded = pu1_buf; - } - else - { - ps_codec->as_process[i].pu1_is_intra_coded = pu1_buf + max_mb_cnt; - } + ps_codec->as_process[i].pu1_is_intra_coded = pu1_buf; } ps_codec->pu2_intr_rfrsh_map = (UWORD16 *) (pu1_buf + max_mb_cnt * MAX_CTXT_SETS); @@ -3805,22 +3707,14 @@ static WORD32 isvce_init_mem_rec(iv_obj_t *ps_codec_obj, void *pv_api_ip, void * ps_mem_rec = &ps_mem_rec_base[ISVCE_MEM_REC_SLICE_MAP]; { /* pointer to storage space */ - UWORD8 *pu1_buf_ping, *pu1_buf_pong; + UWORD8 *pu1_buf_ping; /* init pointer */ pu1_buf_ping = ps_mem_rec->pv_base; - pu1_buf_pong = pu1_buf_ping + ALIGN64(max_mb_cnt); for(i = 0; i < MAX_PROCESS_CTXT; i++) { - if(i < MAX_PROCESS_CTXT / MAX_CTXT_SETS) - { - ps_codec->as_process[i].pu1_slice_idx = pu1_buf_ping; - } - else - { - ps_codec->as_process[i].pu1_slice_idx = pu1_buf_pong; - } + ps_codec->as_process[i].pu1_slice_idx = pu1_buf_ping; } } @@ -3862,25 +3756,9 @@ static WORD32 isvce_init_mem_rec(iv_obj_t *ps_codec_obj, void *pv_api_ip, void * /* pointer to storage space */ UWORD8 *pu1_buf = ps_mem_rec->pv_base; - /* total size of the mem record */ - WORD32 total_size = 0; - - /* size in bytes to mb core coding status of an entire frame */ - total_size = max_mb_cnt; - - /* add an additional 1 row of bytes to evade the special case of row 0 */ - total_size += max_mb_cols; - for(i = 0; i < MAX_PROCESS_CTXT; i++) { - if(i < MAX_PROCESS_CTXT / MAX_CTXT_SETS) - { - ps_codec->as_process[i].pu1_proc_map = pu1_buf + max_mb_cols; - } - else - { - ps_codec->as_process[i].pu1_proc_map = pu1_buf + total_size + max_mb_cols; - } + ps_codec->as_process[i].pu1_proc_map = pu1_buf + max_mb_cols; } } @@ -3903,14 +3781,7 @@ static WORD32 isvce_init_mem_rec(iv_obj_t *ps_codec_obj, void *pv_api_ip, void * for(i = 0; i < MAX_PROCESS_CTXT; i++) { - if(i < MAX_PROCESS_CTXT / MAX_CTXT_SETS) - { - ps_codec->as_process[i].pu1_deblk_map = pu1_buf + max_mb_cols; - } - else - { - ps_codec->as_process[i].pu1_deblk_map = pu1_buf + total_size + max_mb_cols; - } + ps_codec->as_process[i].pu1_deblk_map = pu1_buf + max_mb_cols; } } @@ -3919,25 +3790,9 @@ static WORD32 isvce_init_mem_rec(iv_obj_t *ps_codec_obj, void *pv_api_ip, void * /* pointer to storage space */ UWORD8 *pu1_buf = (UWORD8 *) ps_mem_rec->pv_base; - /* total size of the mem record */ - WORD32 total_size = 0; - - /* size in bytes to mb core coding status of an entire frame */ - total_size = max_mb_cnt; - - /* add an additional 1 row of bytes to evade the special case of row 0 */ - total_size += max_mb_cols; - for(i = 0; i < MAX_PROCESS_CTXT; i++) { - if(i < MAX_PROCESS_CTXT / MAX_CTXT_SETS) - { - ps_codec->as_process[i].pu1_me_map = pu1_buf + max_mb_cols; - } - else - { - ps_codec->as_process[i].pu1_me_map = pu1_buf + total_size + max_mb_cols; - } + ps_codec->as_process[i].pu1_me_map = pu1_buf + max_mb_cols; } } diff --git a/encoder/svc/isvce_encode.c b/encoder/svc/isvce_encode.c index 8c6aa114..7d57ced2 100644 --- a/encoder/svc/isvce_encode.c +++ b/encoder/svc/isvce_encode.c @@ -108,39 +108,6 @@ /* Function Definitions */ /*****************************************************************************/ -/** -****************************************************************************** -* -* @brief This function puts the current thread to sleep for a duration -* of sleep_us -* -* @par Description -* ithread_yield() method causes the calling thread to yield execution to -*another thread that is ready to run on the current processor. The operating -*system selects the thread to yield to. ithread_usleep blocks the current thread -*for the specified number of milliseconds. In other words, yield just says, end -*my timeslice prematurely, look around for other threads to run. If there is -*nothing better than me, continue. Sleep says I don't want to run for x -* milliseconds. Even if no other thread wants to run, don't make me run. -* -* @param[in] sleep_us -* thread sleep duration -* -* @returns error_status -* -****************************************************************************** -*/ -IH264E_ERROR_T isvce_wait_for_thread(UWORD32 sleep_us) -{ - /* yield thread */ - ithread_yield(); - - /* put thread to sleep */ - ithread_sleep(sleep_us); - - return IH264E_SUCCESS; -} - /** ****************************************************************************** * diff --git a/encoder/svc/isvce_function_selector_generic.c b/encoder/svc/isvce_function_selector_generic.c index 044bbeb0..c3a48bfa 100644 --- a/encoder/svc/isvce_function_selector_generic.c +++ b/encoder/svc/isvce_function_selector_generic.c @@ -265,14 +265,10 @@ void isvce_init_function_ptr_generic(isvce_codec_t *ps_codec) ps_codec->pf_compute_sad_16x8 = ime_compute_sad_16x8; /* memory handling operations */ - ps_mem_fxns->pf_mem_cpy = ih264_memcpy; ps_mem_fxns->pf_mem_cpy_mul8 = ih264_memcpy_mul_8; - ps_mem_fxns->pf_mem_set = ih264_memset; ps_mem_fxns->pf_mem_set_mul8 = ih264_memset_mul_8; ps_mem_fxns->pf_copy_2d = isvc_copy_2d; ps_mem_fxns->pf_memset_2d = isvc_memset_2d; - ps_mem_fxns->pf_16bit_interleaved_copy = isvc_16bit_interleaved_copy; - ps_mem_fxns->pf_16bit_interleaved_memset = isvc_16bit_interleaved_memset; ps_mem_fxns->pf_nonzero_checker = isvc_is_nonzero_blk; /* sad me level functions */ diff --git a/encoder/svc/isvce_ilp_mv.c b/encoder/svc/isvce_ilp_mv.c index 9aa45a36..e9a7c172 100644 --- a/encoder/svc/isvce_ilp_mv.c +++ b/encoder/svc/isvce_ilp_mv.c @@ -204,7 +204,7 @@ void isvce_ilp_mv_ctxt_init(isvce_codec_t *ps_codec, iv_mem_rec_t *ps_mem_rec) if(u1_num_spatial_layers > 1) { - ilp_mv_layer_state_t *ps_layer_states; + ilp_mv_layer_state_t *ps_layer_states = NULL; ilp_mv_mb_state_t *aps_luma_mb_states[MAX_NUM_SPATIAL_LAYERS]; DOUBLE d_spatial_res_ratio = ps_codec->s_cfg.s_svc_params.d_spatial_res_ratio; diff --git a/encoder/svc/isvce_structs.h b/encoder/svc/isvce_structs.h index 9d8f6c52..68619bc3 100644 --- a/encoder/svc/isvce_structs.h +++ b/encoder/svc/isvce_structs.h @@ -1958,10 +1958,6 @@ typedef struct inter_pred_fxns_t typedef struct mem_fxns_t { - FT_MEMCPY *pf_mem_cpy; - - FT_MEMSET *pf_mem_set; - FT_MEMCPY *pf_mem_cpy_mul8; FT_MEMSET *pf_mem_set_mul8; @@ -1970,10 +1966,6 @@ typedef struct mem_fxns_t FT_MEMSET_2D *pf_memset_2d; - FT_16BIT_INTERLEAVED_COPY *pf_16bit_interleaved_copy; - - FT_16BIT_INTERLEAVED_MEMSET *pf_16bit_interleaved_memset; - FT_NONZERO_CHECKER *pf_nonzero_checker; } mem_fxns_t; diff --git a/encoder/x86/svc/isvce_function_selector_sse42.c b/encoder/x86/svc/isvce_function_selector_sse42.c index 709155f0..03f1cf0a 100644 --- a/encoder/x86/svc/isvce_function_selector_sse42.c +++ b/encoder/x86/svc/isvce_function_selector_sse42.c @@ -128,20 +128,13 @@ void isvce_init_function_ptr_sse42(isvce_codec_t *ps_codec) isvc_iquant_itrans_recon_res_chroma_4x4_sse42; ps_enc_loop_fxns->apf_iquant_itrans_recon_chroma_4x4[1] = isvc_iquant_itrans_recon_res_chroma_4x4_with_res_acc_sse42; - ps_enc_loop_fxns->apf_iquant_itrans_recon_chroma_4x4[2] = - isvc_iquant_itrans_recon_chroma_4x4_sse42; - ps_enc_loop_fxns->apf_iquant_itrans_recon_4x4_dc[0] = isvc_iquant_itrans_recon_res_dc_4x4_sse42; - ps_enc_loop_fxns->apf_iquant_itrans_recon_4x4_dc[1] = - isvc_iquant_itrans_recon_res_dc_with_res_acc_4x4_sse42; ps_enc_loop_fxns->apf_iquant_itrans_recon_4x4_dc[2] = isvc_iquant_itrans_recon_dc_4x4_sse42; ps_enc_loop_fxns->apf_iquant_itrans_recon_chroma_4x4_dc[0] = isvc_iquant_itrans_recon_res_chroma_4x4_dc_sse42; ps_enc_loop_fxns->apf_iquant_itrans_recon_chroma_4x4_dc[1] = isvc_iquant_itrans_recon_res_chroma_4x4_dc_with_res_acc_sse42; - ps_enc_loop_fxns->apf_iquant_itrans_recon_chroma_4x4_dc[2] = - isvc_iquant_itrans_recon_chroma_4x4_dc_sse42; ps_enc_loop_fxns->pf_ihadamard_scaling_4x4 = ih264_ihadamard_scaling_4x4_sse42;