/* encoder/arm/ihevce_subpel_neon.c - platform/external/libhevc - Git at Google */

 /******************************************************************************
  *
  * Copyright (C) 2018 The Android Open Source Project
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
  * You may obtain a copy of the License at:
  *
  * http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  *
  *****************************************************************************
  * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
 */
 /**
 ******************************************************************************
 * @file
 *  ihevce_subpel_neon.c
 *
 * @brief
 *  Subpel refinement modules for ME algo
 *
 * @author
 *  Ittiam
 *
 * @par List of Functions:
 *
 * @remarks
 *  None
 *
 ********************************************************************************
 */

 /*****************************************************************************/
 /* File Includes                                                             */
 /*****************************************************************************/
 /* System include files */
 #include <stdio.h>
 #include <string.h>
 #include <assert.h>
 #include <arm_neon.h>

 /* User include files */
 #include "ihevc_typedefs.h"
 #include "itt_video_api.h"
 #include "ihevc_cmn_utils_neon.h"
 #include "ihevc_chroma_itrans_recon.h"
 #include "ihevc_chroma_intra_pred.h"
 #include "ihevc_debug.h"
 #include "ihevc_deblk.h"
 #include "ihevc_defs.h"
 #include "ihevc_itrans_recon.h"
 #include "ihevc_intra_pred.h"
 #include "ihevc_inter_pred.h"
 #include "ihevc_macros.h"
 #include "ihevc_mem_fns.h"
 #include "ihevc_padding.h"
 #include "ihevc_quant_iquant_ssd.h"
 #include "ihevc_resi_trans.h"
 #include "ihevc_sao.h"
 #include "ihevc_structs.h"
 #include "ihevc_weighted_pred.h"

 #include "rc_cntrl_param.h"
 #include "rc_frame_info_collector.h"
 #include "rc_look_ahead_params.h"

 #include "ihevce_api.h"
 #include "ihevce_defs.h"
 #include "ihevce_lap_enc_structs.h"
 #include "ihevce_multi_thrd_structs.h"
 #include "ihevce_function_selector.h"
 #include "ihevce_me_common_defs.h"
 #include "ihevce_enc_structs.h"
 #include "ihevce_had_satd.h"
 #include "ihevce_ipe_instr_set_router.h"
 #include "ihevce_global_tables.h"

 #include "hme_datatype.h"
 #include "hme_common_defs.h"
 #include "hme_interface.h"
 #include "hme_defs.h"

 #include "ihevce_me_instr_set_router.h"

 /*****************************************************************************/
 /* Function Declarations                                                     */
 /*****************************************************************************/
 /* Forward declaration of the SATD evaluation routine defined later in this
  * file, written through the FT_CALC_SATD_AND_RESULT type (presumably a
  * function typedef supplied by one of the included headers). */
 FT_CALC_SATD_AND_RESULT hme_evalsatd_update_1_best_result_pt_pu_16x16_neon;

 /* Prototype of a routine implemented outside this file. This file calls it
  * once per 8x8 sub-block of a 16x16 block, stores the return value as that
  * sub-block's SATD and points pi4_hsad into a local table of 4x4 SATDs.
  * NOTE(review): how pi4_hsad / hsad_stride are used to lay out the 4x4 values
  * is not visible here - confirm against the definition. */
 WORD32 ihevce_had4_4x4_neon(
     UWORD8 *pu1_src,
     WORD32 src_strd,
     UWORD8 *pu1_pred,
     WORD32 pred_strd,
     WORD16 *pi2_dst4x4,
     WORD32 dst_strd,
     WORD32 *pi4_hsad,
     WORD32 hsad_stride,
     WORD32 i4_frm_qstep);

 /*****************************************************************************/
 /* Function Definitions                                                      */
 /*****************************************************************************/

 /**
  * @brief Writes the rounded average of two 4x4 byte blocks to a third block.
  *
  * Each source block is gathered into one 16-lane vector by
  * load_unaligned_u8q(), the two vectors are merged with vrhaddq_u8()
  * (per-lane rounding halving add, (a + b + 1) >> 1) and the result is written
  * back through store_unaligned_u8q().
  * NOTE(review): the load/store helpers are presumed to move four rows of four
  * bytes using the given stride - confirm in ihevc_cmn_utils_neon.h.
  *
  * @param pu1_src_a   first source block
  * @param pu1_src_b   second source block
  * @param src_a_strd  stride handed to the loader for pu1_src_a
  * @param src_b_strd  stride handed to the loader for pu1_src_b
  * @param pu1_dst     destination block
  * @param dst_strd    stride handed to the store helper for pu1_dst
  */
 static void hme_4x4_qpel_interp_avg_neon(
     UWORD8 *pu1_src_a,
     UWORD8 *pu1_src_b,
     WORD32 src_a_strd,
     WORD32 src_b_strd,
     UWORD8 *pu1_dst,
     WORD32 dst_strd)
 {
     const uint8x16_t blk_a = load_unaligned_u8q(pu1_src_a, src_a_strd);
     const uint8x16_t blk_b = load_unaligned_u8q(pu1_src_b, src_b_strd);

     store_unaligned_u8q(pu1_dst, dst_strd, vrhaddq_u8(blk_a, blk_b));
 }

 /**
  * @brief Rounded average of two 8-byte-wide blocks, one row at a time.
  *
  * For each of the ht rows, 8 bytes are read from each source, combined with
  * vrhadd_u8() (per-lane (a + b + 1) >> 1) and 8 bytes are written to the
  * destination. Nothing is done when ht is zero or negative.
  *
  * @param pu1_src_a   first source, advanced by src_a_strd per row
  * @param pu1_src_b   second source, advanced by src_b_strd per row
  * @param src_a_strd  row stride of pu1_src_a in bytes
  * @param src_b_strd  row stride of pu1_src_b in bytes
  * @param pu1_dst     destination, advanced by dst_strd per row
  * @param dst_strd    row stride of pu1_dst in bytes
  * @param ht          number of rows to process
  */
 static void hme_8xn_qpel_interp_avg_neon(
     UWORD8 *pu1_src_a,
     UWORD8 *pu1_src_b,
     WORD32 src_a_strd,
     WORD32 src_b_strd,
     UWORD8 *pu1_dst,
     WORD32 dst_strd,
     WORD32 ht)
 {
     WORD32 rows_left;

     for(rows_left = ht; rows_left > 0; rows_left--)
     {
         /* Both source rows are read before the destination row is written */
         const uint8x8_t row_a = vld1_u8(pu1_src_a);
         const uint8x8_t row_b = vld1_u8(pu1_src_b);

         vst1_u8(pu1_dst, vrhadd_u8(row_a, row_b));

         pu1_src_a += src_a_strd;
         pu1_src_b += src_b_strd;
         pu1_dst += dst_strd;
     }
 }

 /**
  * @brief Rounded average of two 16-byte-wide blocks, one row at a time.
  *
  * For each of the ht rows, 16 bytes are read from each source, combined with
  * vrhaddq_u8() (per-lane (a + b + 1) >> 1) and 16 bytes are written to the
  * destination. Nothing is done when ht is zero or negative.
  *
  * @param pu1_src_a   first source, advanced by src_a_strd per row
  * @param pu1_src_b   second source, advanced by src_b_strd per row
  * @param src_a_strd  row stride of pu1_src_a in bytes
  * @param src_b_strd  row stride of pu1_src_b in bytes
  * @param pu1_dst     destination, advanced by dst_strd per row
  * @param dst_strd    row stride of pu1_dst in bytes
  * @param ht          number of rows to process
  */
 static void hme_16xn_qpel_interp_avg_neon(
     UWORD8 *pu1_src_a,
     UWORD8 *pu1_src_b,
     WORD32 src_a_strd,
     WORD32 src_b_strd,
     UWORD8 *pu1_dst,
     WORD32 dst_strd,
     WORD32 ht)
 {
     WORD32 row = 0;

     while(row < ht)
     {
         /* Both source rows are read before the destination row is written */
         const uint8x16_t row_a = vld1q_u8(pu1_src_a);
         const uint8x16_t row_b = vld1q_u8(pu1_src_b);

         vst1q_u8(pu1_dst, vrhaddq_u8(row_a, row_b));

         pu1_src_a += src_a_strd;
         pu1_src_b += src_b_strd;
         pu1_dst += dst_strd;
         row++;
     }
 }

 /**
  * @brief Rounded average of two 32-byte-wide blocks, one row at a time.
  *
  * Every row is handled as two 16-byte halves. All four source vectors of a
  * row are read before either destination half is written, then each half is
  * combined with vrhaddq_u8() (per-lane (a + b + 1) >> 1). Nothing is done
  * when ht is zero or negative.
  *
  * @param pu1_src_a   first source, advanced by src_a_strd per row
  * @param pu1_src_b   second source, advanced by src_b_strd per row
  * @param src_a_strd  row stride of pu1_src_a in bytes
  * @param src_b_strd  row stride of pu1_src_b in bytes
  * @param pu1_dst     destination, advanced by dst_strd per row
  * @param dst_strd    row stride of pu1_dst in bytes
  * @param ht          number of rows to process
  */
 static void hme_32xn_qpel_interp_avg_neon(
     UWORD8 *pu1_src_a,
     UWORD8 *pu1_src_b,
     WORD32 src_a_strd,
     WORD32 src_b_strd,
     UWORD8 *pu1_dst,
     WORD32 dst_strd,
     WORD32 ht)
 {
     WORD32 row;

     for(row = 0; row < ht; row++)
     {
         /* Bytes 0..15 and 16..31 of the current row of each source */
         const uint8x16_t a_lo = vld1q_u8(pu1_src_a);
         const uint8x16_t a_hi = vld1q_u8(pu1_src_a + 16);
         const uint8x16_t b_lo = vld1q_u8(pu1_src_b);
         const uint8x16_t b_hi = vld1q_u8(pu1_src_b + 16);

         vst1q_u8(pu1_dst, vrhaddq_u8(a_lo, b_lo));
         vst1q_u8(pu1_dst + 16, vrhaddq_u8(a_hi, b_hi));

         pu1_src_a += src_a_strd;
         pu1_src_b += src_b_strd;
         pu1_dst += dst_strd;
     }
 }

 /**
  * @brief Rounded average of two blocks whose width and height are multiples
  *        of 4.
  *
  * The block is walked in bands of 4 rows. Inside a band the columns are
  * consumed left to right with the widest kernel that still fits in the
  * remaining width: 32, then 16, then 8, and finally the 4x4 kernel.
  *
  * @param pu1_src_a   first source block
  * @param pu1_src_b   second source block
  * @param src_a_strd  row stride of pu1_src_a in bytes
  * @param src_b_strd  row stride of pu1_src_b in bytes
  * @param pu1_dst     destination block
  * @param dst_strd    row stride of pu1_dst in bytes
  * @param blk_wd      block width; asserted to be a multiple of 4
  * @param blk_ht      block height; asserted to be a multiple of 4
  */
 static void hme_4mx4n_qpel_interp_avg_neon(
     UWORD8 *pu1_src_a,
     UWORD8 *pu1_src_b,
     WORD32 src_a_strd,
     WORD32 src_b_strd,
     UWORD8 *pu1_dst,
     WORD32 dst_strd,
     WORD32 blk_wd,
     WORD32 blk_ht)
 {
     WORD32 row, col;

     assert(blk_wd % 4 == 0);
     assert(blk_ht % 4 == 0);

     for(row = 0; row < blk_ht; row += 4)
     {
         col = 0;

         while(col < blk_wd)
         {
             UWORD8 *pu1_a = pu1_src_a + col;
             UWORD8 *pu1_b = pu1_src_b + col;
             UWORD8 *pu1_out = pu1_dst + col;
             const WORD32 cols_left = blk_wd - col;

             if(cols_left >= 32)
             {
                 hme_32xn_qpel_interp_avg_neon(
                     pu1_a, pu1_b, src_a_strd, src_b_strd, pu1_out, dst_strd, 4);
                 col += 32;
                 continue;
             }

             if(cols_left >= 16)
             {
                 hme_16xn_qpel_interp_avg_neon(
                     pu1_a, pu1_b, src_a_strd, src_b_strd, pu1_out, dst_strd, 4);
                 col += 16;
                 continue;
             }

             if(cols_left >= 8)
             {
                 hme_8xn_qpel_interp_avg_neon(
                     pu1_a, pu1_b, src_a_strd, src_b_strd, pu1_out, dst_strd, 4);
                 col += 8;
                 continue;
             }

             /* Fewer than 8 columns remain: use the 4x4 kernel */
             hme_4x4_qpel_interp_avg_neon(pu1_a, pu1_b, src_a_strd, src_b_strd, pu1_out, dst_strd);
             col += 4;
         }

         /* Step every pointer down to the next band of 4 rows */
         pu1_src_a += (4 * src_a_strd);
         pu1_src_b += (4 * src_b_strd);
         pu1_dst += (4 * dst_strd);
     }
 }

 /**
  * @brief Produces the prediction block for a motion vector by selecting one
  *        reference plane directly or averaging two of them.
  *
  * The low two bits of each MV component index gas_qpel_inp_buf_cfg; the
  * remaining bits give a whole-sample displacement into the reference planes.
  * When the table entry names the same plane twice (the original author's note
  * calls this the fxfy/hxfy/fxhy/hxhy case) no averaging is done and the output
  * simply points into that plane. Otherwise the two planes are averaged into
  * ps_prms->apu1_interp_out[i4_buf_id].
  *
  * @param ps_prms    interpolation parameters; pu1_final_out and
  *                   i4_final_out_stride are written on return
  * @param i4_mv_x    horizontal MV component
  * @param i4_mv_y    vertical MV component
  * @param i4_buf_id  index of the output buffer used when averaging is needed
  */
 void hme_qpel_interp_avg_neon(interp_prms_t *ps_prms, S32 i4_mv_x, S32 i4_mv_y, S32 i4_buf_id)
 {
     qpel_input_buf_cfg_t *ps_cfg;
     U08 *pu1_first, *pu1_second, *pu1_out;
     const S32 i4_stride = ps_prms->i4_ref_stride;
     const S32 i4_whole_off = (i4_mv_x >> 2) + (i4_mv_y >> 2) * i4_stride;

     /* Table entry holding plane ids and x/y offsets for this MV phase */
     ps_cfg = &gas_qpel_inp_buf_cfg[i4_mv_y & 3][i4_mv_x & 3];

     pu1_first = ps_prms->ppu1_ref[ps_cfg->i1_buf_id1];
     pu1_first += ps_cfg->i1_buf_xoff1 + i4_whole_off;
     pu1_first += (ps_cfg->i1_buf_yoff1 * i4_stride);

     if(ps_cfg->i1_buf_id1 == ps_cfg->i1_buf_id2)
     {
         /* Single plane: hand back a pointer into it, nothing is copied */
         ps_prms->pu1_final_out = pu1_first;
         ps_prms->i4_final_out_stride = i4_stride;

         return;
     }

     pu1_second = ps_prms->ppu1_ref[ps_cfg->i1_buf_id2];
     pu1_second += ps_cfg->i1_buf_xoff2 + i4_whole_off;
     pu1_second += (ps_cfg->i1_buf_yoff2 * i4_stride);

     pu1_out = ps_prms->apu1_interp_out[i4_buf_id];

     hme_4mx4n_qpel_interp_avg_neon(
         pu1_first,
         pu1_second,
         i4_stride,
         i4_stride,
         pu1_out,
         ps_prms->i4_out_stride,
         ps_prms->i4_blk_wd,
         ps_prms->i4_blk_ht);

     ps_prms->pu1_final_out = pu1_out;
     ps_prms->i4_final_out_stride = ps_prms->i4_out_stride;
 }

 // TODO: Can this function and above function be unified
 /**
  * @brief Averages the two reference planes selected for a motion vector into
  *        one output buffer and reports that buffer through caller arrays.
  *
  * The low two bits of each MV component index gas_qpel_inp_buf_cfg; the
  * remaining bits give a whole-sample displacement into the reference planes.
  * Unlike hme_qpel_interp_avg_neon() there is no shortcut when both table
  * entries name the same plane: the averaging kernel always runs.
  *
  * @param ps_prms           interpolation parameters (read only here)
  * @param i4_mv_x           horizontal MV component
  * @param i4_mv_y           vertical MV component
  * @param i4_buf_id         index of the output buffer and of the slots
  *                          written in the two output arrays
  * @param ppu1_final        [i4_buf_id] receives the output buffer pointer
  * @param pi4_final_stride  [i4_buf_id] receives the output stride
  */
 void hme_qpel_interp_avg_1pt_neon(
     interp_prms_t *ps_prms,
     S32 i4_mv_x,
     S32 i4_mv_y,
     S32 i4_buf_id,
     U08 **ppu1_final,
     S32 *pi4_final_stride)
 {
     qpel_input_buf_cfg_t *ps_cfg;
     U08 *pu1_first, *pu1_second, *pu1_out;
     const S32 i4_stride = ps_prms->i4_ref_stride;
     const S32 i4_whole_off = (i4_mv_x >> 2) + (i4_mv_y >> 2) * i4_stride;

     /* Table entry holding plane ids and x/y offsets for this MV phase */
     ps_cfg = &gas_qpel_inp_buf_cfg[i4_mv_y & 3][i4_mv_x & 3];

     pu1_first = ps_prms->ppu1_ref[ps_cfg->i1_buf_id1];
     pu1_first += ps_cfg->i1_buf_xoff1 + i4_whole_off;
     pu1_first += (ps_cfg->i1_buf_yoff1 * i4_stride);

     pu1_second = ps_prms->ppu1_ref[ps_cfg->i1_buf_id2];
     pu1_second += ps_cfg->i1_buf_xoff2 + i4_whole_off;
     pu1_second += (ps_cfg->i1_buf_yoff2 * i4_stride);

     pu1_out = ps_prms->apu1_interp_out[i4_buf_id];

     hme_4mx4n_qpel_interp_avg_neon(
         pu1_first,
         pu1_second,
         i4_stride,
         i4_stride,
         pu1_out,
         ps_prms->i4_out_stride,
         ps_prms->i4_blk_wd,
         ps_prms->i4_blk_ht);

     ppu1_final[i4_buf_id] = pu1_out;
     pi4_final_stride[i4_buf_id] = ps_prms->i4_out_stride;
 }

 /**
  * @brief Interpolates the two vertical neighbours of a motion vector.
  *
  * The point at (i4_mv_x, i4_mv_y + 1) is produced first into output slot 3,
  * then the point at (i4_mv_x, i4_mv_y - 1) into output slot 1.
  *
  * @param ps_prms           interpolation parameters
  * @param i4_mv_x           horizontal component of the centre MV
  * @param i4_mv_y           vertical component of the centre MV
  * @param ppu1_final        slots 3 and 1 receive the output buffer pointers
  * @param pi4_final_stride  slots 3 and 1 receive the output strides
  */
 void hme_qpel_interp_avg_2pt_vert_with_reuse_neon(
     interp_prms_t *ps_prms, S32 i4_mv_x, S32 i4_mv_y, U08 **ppu1_final, S32 *pi4_final_stride)
 {
     const S32 i4_mv_y_plus = i4_mv_y + 1;
     const S32 i4_mv_y_minus = i4_mv_y - 1;

     hme_qpel_interp_avg_1pt_neon(ps_prms, i4_mv_x, i4_mv_y_plus, 3, ppu1_final, pi4_final_stride);
     hme_qpel_interp_avg_1pt_neon(ps_prms, i4_mv_x, i4_mv_y_minus, 1, ppu1_final, pi4_final_stride);
 }

 /**
  * @brief Interpolates the two horizontal neighbours of a motion vector.
  *
  * The point at (i4_mv_x + 1, i4_mv_y) is produced first into output slot 2,
  * then the point at (i4_mv_x - 1, i4_mv_y) into output slot 0.
  *
  * @param ps_prms           interpolation parameters
  * @param i4_mv_x           horizontal component of the centre MV
  * @param i4_mv_y           vertical component of the centre MV
  * @param ppu1_final        slots 2 and 0 receive the output buffer pointers
  * @param pi4_final_stride  slots 2 and 0 receive the output strides
  */
 void hme_qpel_interp_avg_2pt_horz_with_reuse_neon(
     interp_prms_t *ps_prms, S32 i4_mv_x, S32 i4_mv_y, U08 **ppu1_final, S32 *pi4_final_stride)
 {
     const S32 i4_mv_x_plus = i4_mv_x + 1;
     const S32 i4_mv_x_minus = i4_mv_x - 1;

     hme_qpel_interp_avg_1pt_neon(ps_prms, i4_mv_x_plus, i4_mv_y, 2, ppu1_final, pi4_final_stride);
     hme_qpel_interp_avg_1pt_neon(ps_prms, i4_mv_x_minus, i4_mv_y, 0, ppu1_final, pi4_final_stride);
 }

 /**
  * @brief Computes SATD costs of a 16x16 block for every partition id and
  *        updates the stored result of each valid partition.
  *
  * Steps:
  *  1. ihevce_had4_4x4_neon() is called once per 8x8 quadrant; its return value
  *     is kept as the quadrant SATD and it also fills a table of 4x4 SATDs.
  *  2. The quadrant and 4x4 values are summed into ps_prms->pi4_sad_grid for
  *     the 2Nx2N, NxN, Nx2N, 2NxN and AMP partition ids.
  *  3. For each valid partition the clipped SATD plus the stored MV cost is
  *     compared with the stored total cost; when strictly smaller, the entry at
  *     index 0 of the cost / MV / reference-index arrays is overwritten. Only
  *     index 0 is touched here.
  *
  * @param ps_prms         source / reference pointers, strides and SATD grid
  * @param ps_result_prms  candidate MV, reference index and refinement context
  */
 void hme_evalsatd_update_1_best_result_pt_pu_16x16_neon(
     err_prms_t *ps_prms, result_upd_prms_t *ps_result_prms)
 {
     mv_refine_ctxt_t *ps_ctxt = ps_result_prms->ps_subpel_refine_ctxt;
     S32 *pi4_grid = ps_prms->pi4_sad_grid;
     S32 *pi4_part_ids = &ps_ctxt->ai4_part_id[0];

     U08 *pu1_cur = ps_prms->pu1_inp;
     U08 *pu1_pred_base = ps_prms->pu1_ref;
     S32 cur_strd = ps_prms->i4_inp_stride;
     S32 pred_strd = ps_prms->i4_ref_stride;

     S32 ai4_satd_4x4[16];
     S32 ai4_satd_8x8[4];
     S32 satd_16x16, satd_nl_l, satd_nr_r, satd_nu_t, satd_nd_b;
     S32 num_parts, slot_is_part_id;
     S32 blk, n;

     /* One call per 8x8 quadrant, visited as TL, TR, BL, BR */
     for(blk = 0; blk < 4; blk++)
     {
         S32 blk_x = blk & 0x1;
         S32 blk_y = blk >> 1;
         U08 *pu1_src = pu1_cur + blk_x * 8 + blk_y * cur_strd * 8;
         U08 *pu1_pred = pu1_pred_base + blk_x * 8 + blk_y * pred_strd * 8;
         /* First 4x4 table slot for this quadrant: 0, 2, 8, 10 */
         S32 first_4x4 = blk_x * 2 + blk_y * 8;

         ai4_satd_8x8[blk] = ihevce_had4_4x4_neon(
             pu1_src, cur_strd, pu1_pred, pred_strd, NULL, 0, &ai4_satd_4x4[first_4x4], 4, 0);
     }

     /* 16x16 and its four 8x8 quadrants */
     satd_16x16 = ai4_satd_8x8[0] + ai4_satd_8x8[1] + ai4_satd_8x8[2] + ai4_satd_8x8[3];

     pi4_grid[PART_ID_2Nx2N] = satd_16x16;
     pi4_grid[PART_ID_NxN_TL] = ai4_satd_8x8[0];
     pi4_grid[PART_ID_NxN_TR] = ai4_satd_8x8[1];
     pi4_grid[PART_ID_NxN_BL] = ai4_satd_8x8[2];
     pi4_grid[PART_ID_NxN_BR] = ai4_satd_8x8[3];

     /* 8x16 and 16x8 halves are sums of two quadrants */
     pi4_grid[PART_ID_Nx2N_L] = ai4_satd_8x8[0] + ai4_satd_8x8[2];
     pi4_grid[PART_ID_Nx2N_R] = ai4_satd_8x8[1] + ai4_satd_8x8[3];
     pi4_grid[PART_ID_2NxN_T] = ai4_satd_8x8[0] + ai4_satd_8x8[1];
     pi4_grid[PART_ID_2NxN_B] = ai4_satd_8x8[2] + ai4_satd_8x8[3];

     /* AMP: the narrow part is a sum of four 4x4 values, the wide part is the
      * 16x16 total minus the narrow part.
      * NOTE(review): the table is filled starting at slots 0, 2, 8, 10 with a
      * stride argument of 4, while the index sets below look like they expect
      * each quadrant to own four consecutive slots - confirm against how
      * ihevce_had4_4x4_neon() writes pi4_hsad before relying on these sums. */
     satd_nl_l = ai4_satd_4x4[0] + ai4_satd_4x4[2] + ai4_satd_4x4[8] + ai4_satd_4x4[10];
     satd_nr_r = ai4_satd_4x4[5] + ai4_satd_4x4[7] + ai4_satd_4x4[13] + ai4_satd_4x4[15];
     satd_nu_t = ai4_satd_4x4[0] + ai4_satd_4x4[1] + ai4_satd_4x4[4] + ai4_satd_4x4[5];
     satd_nd_b = ai4_satd_4x4[10] + ai4_satd_4x4[11] + ai4_satd_4x4[14] + ai4_satd_4x4[15];

     pi4_grid[PART_ID_nLx2N_L] = satd_nl_l;
     pi4_grid[PART_ID_nRx2N_R] = satd_nr_r;
     pi4_grid[PART_ID_2NxnU_T] = satd_nu_t;
     pi4_grid[PART_ID_2NxnD_B] = satd_nd_b;

     pi4_grid[PART_ID_nLx2N_R] = satd_16x16 - satd_nl_l;
     pi4_grid[PART_ID_nRx2N_L] = satd_16x16 - satd_nr_r;
     pi4_grid[PART_ID_2NxnU_B] = satd_16x16 - satd_nu_t;
     pi4_grid[PART_ID_2NxnD_T] = satd_16x16 - satd_nd_b;

     /* With more than 8 valid partitions the result arrays are indexed by the
      * partition id, otherwise by the position in the valid-partition list */
     num_parts = ps_ctxt->i4_num_valid_parts;
     slot_is_part_id = (num_parts > 8);

     for(n = 0; n < num_parts; n++)
     {
         S32 part_id = pi4_part_ids[n];
         S32 slot = slot_is_part_id ? part_id : n;
         S32 i4_mv_cost = ps_ctxt->i2_mv_cost[0][slot];
         S32 i4_sad = CLIP3(pi4_grid[part_id], 0, 0x7fff);
         S32 i4_tot_cost = CLIP_S16(i4_sad + i4_mv_cost);
         S32 i4_stored_cost = CLIP_S16(ps_ctxt->i2_tot_cost[0][slot]);

         if(i4_tot_cost >= i4_stored_cost)
         {
             /* Not strictly better than what is stored: keep the old entry */
             continue;
         }

         ps_ctxt->i2_tot_cost[0][slot] = i4_tot_cost;
         ps_ctxt->i2_mv_cost[0][slot] = i4_mv_cost;
         ps_ctxt->i2_mv_x[0][slot] = ps_result_prms->i2_mv_x;
         ps_ctxt->i2_mv_y[0][slot] = ps_result_prms->i2_mv_y;
         ps_ctxt->i2_ref_idx[0][slot] = ps_result_prms->i1_ref_idx;
     }
 }
	/******************************************************************************
	*
	* Copyright (C) 2018 The Android Open Source Project
	*
	* Licensed under the Apache License, Version 2.0 (the "License");
	* you may not use this file except in compliance with the License.
	* You may obtain a copy of the License at:
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*
	*****************************************************************************
	* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
	*/
	/**
	******************************************************************************
	* @file
	* ihevce_subpel_neon.c
	*
	* @brief
	* Subpel refinement modules for ME algo
	*
	* @author
	* Ittiam
	*
	* @par List of Functions:
	*
	* @remarks
	* None
	*
	********************************************************************************
	*/

	/*****************************************************************************/
	/* File Includes */
	/*****************************************************************************/
	/* System include files */
	#include <stdio.h>
	#include <string.h>
	#include <assert.h>
	#include <arm_neon.h>

	/* User include files */
	#include "ihevc_typedefs.h"
	#include "itt_video_api.h"
	#include "ihevc_cmn_utils_neon.h"
	#include "ihevc_chroma_itrans_recon.h"
	#include "ihevc_chroma_intra_pred.h"
	#include "ihevc_debug.h"
	#include "ihevc_deblk.h"
	#include "ihevc_defs.h"
	#include "ihevc_itrans_recon.h"
	#include "ihevc_intra_pred.h"
	#include "ihevc_inter_pred.h"
	#include "ihevc_macros.h"
	#include "ihevc_mem_fns.h"
	#include "ihevc_padding.h"
	#include "ihevc_quant_iquant_ssd.h"
	#include "ihevc_resi_trans.h"
	#include "ihevc_sao.h"
	#include "ihevc_structs.h"
	#include "ihevc_weighted_pred.h"

	#include "rc_cntrl_param.h"
	#include "rc_frame_info_collector.h"
	#include "rc_look_ahead_params.h"

	#include "ihevce_api.h"
	#include "ihevce_defs.h"
	#include "ihevce_lap_enc_structs.h"
	#include "ihevce_multi_thrd_structs.h"
	#include "ihevce_function_selector.h"
	#include "ihevce_me_common_defs.h"
	#include "ihevce_enc_structs.h"
	#include "ihevce_had_satd.h"
	#include "ihevce_ipe_instr_set_router.h"
	#include "ihevce_global_tables.h"

	#include "hme_datatype.h"
	#include "hme_common_defs.h"
	#include "hme_interface.h"
	#include "hme_defs.h"

	#include "ihevce_me_instr_set_router.h"

	/*****************************************************************************/
	/* Function Declarations */
	/*****************************************************************************/
	/* Forward declaration of the SATD evaluation routine defined later in this
	 * file, written through the FT_CALC_SATD_AND_RESULT type (presumably a
	 * function typedef supplied by one of the included headers). */
	FT_CALC_SATD_AND_RESULT hme_evalsatd_update_1_best_result_pt_pu_16x16_neon;

	/* Prototype of a routine implemented outside this file. This file calls it
	 * once per 8x8 sub-block of a 16x16 block, stores the return value as that
	 * sub-block's SATD and points pi4_hsad into a local table of 4x4 SATDs.
	 * NOTE(review): how pi4_hsad / hsad_stride are used to lay out the 4x4 values
	 * is not visible here - confirm against the definition. */
	WORD32 ihevce_had4_4x4_neon(
	UWORD8 *pu1_src,
	WORD32 src_strd,
	UWORD8 *pu1_pred,
	WORD32 pred_strd,
	WORD16 *pi2_dst4x4,
	WORD32 dst_strd,
	WORD32 *pi4_hsad,
	WORD32 hsad_stride,
	WORD32 i4_frm_qstep);

	/*****************************************************************************/
	/* Function Definitions */
	/*****************************************************************************/

	/**
	 * @brief Writes the rounded average of two 4x4 byte blocks to a third block.
	 *
	 * Each source block is gathered into one 16-lane vector by
	 * load_unaligned_u8q(), the two vectors are merged with vrhaddq_u8()
	 * (per-lane rounding halving add, (a + b + 1) >> 1) and the result is written
	 * back through store_unaligned_u8q().
	 * NOTE(review): the load/store helpers are presumed to move four rows of four
	 * bytes using the given stride - confirm in ihevc_cmn_utils_neon.h.
	 *
	 * @param pu1_src_a   first source block
	 * @param pu1_src_b   second source block
	 * @param src_a_strd  stride handed to the loader for pu1_src_a
	 * @param src_b_strd  stride handed to the loader for pu1_src_b
	 * @param pu1_dst     destination block
	 * @param dst_strd    stride handed to the store helper for pu1_dst
	 */
	static void hme_4x4_qpel_interp_avg_neon(
	    UWORD8 *pu1_src_a,
	    UWORD8 *pu1_src_b,
	    WORD32 src_a_strd,
	    WORD32 src_b_strd,
	    UWORD8 *pu1_dst,
	    WORD32 dst_strd)
	{
	    const uint8x16_t blk_a = load_unaligned_u8q(pu1_src_a, src_a_strd);
	    const uint8x16_t blk_b = load_unaligned_u8q(pu1_src_b, src_b_strd);

	    store_unaligned_u8q(pu1_dst, dst_strd, vrhaddq_u8(blk_a, blk_b));
	}

	/**
	 * @brief Rounded average of two 8-byte-wide blocks, one row at a time.
	 *
	 * For each of the ht rows, 8 bytes are read from each source, combined with
	 * vrhadd_u8() (per-lane (a + b + 1) >> 1) and 8 bytes are written to the
	 * destination. Nothing is done when ht is zero or negative.
	 *
	 * @param pu1_src_a   first source, advanced by src_a_strd per row
	 * @param pu1_src_b   second source, advanced by src_b_strd per row
	 * @param src_a_strd  row stride of pu1_src_a in bytes
	 * @param src_b_strd  row stride of pu1_src_b in bytes
	 * @param pu1_dst     destination, advanced by dst_strd per row
	 * @param dst_strd    row stride of pu1_dst in bytes
	 * @param ht          number of rows to process
	 */
	static void hme_8xn_qpel_interp_avg_neon(
	    UWORD8 *pu1_src_a,
	    UWORD8 *pu1_src_b,
	    WORD32 src_a_strd,
	    WORD32 src_b_strd,
	    UWORD8 *pu1_dst,
	    WORD32 dst_strd,
	    WORD32 ht)
	{
	    WORD32 rows_left;

	    for(rows_left = ht; rows_left > 0; rows_left--)
	    {
	        /* Both source rows are read before the destination row is written */
	        const uint8x8_t row_a = vld1_u8(pu1_src_a);
	        const uint8x8_t row_b = vld1_u8(pu1_src_b);

	        vst1_u8(pu1_dst, vrhadd_u8(row_a, row_b));

	        pu1_src_a += src_a_strd;
	        pu1_src_b += src_b_strd;
	        pu1_dst += dst_strd;
	    }
	}

	/**
	 * @brief Rounded average of two 16-byte-wide blocks, one row at a time.
	 *
	 * For each of the ht rows, 16 bytes are read from each source, combined with
	 * vrhaddq_u8() (per-lane (a + b + 1) >> 1) and 16 bytes are written to the
	 * destination. Nothing is done when ht is zero or negative.
	 *
	 * @param pu1_src_a   first source, advanced by src_a_strd per row
	 * @param pu1_src_b   second source, advanced by src_b_strd per row
	 * @param src_a_strd  row stride of pu1_src_a in bytes
	 * @param src_b_strd  row stride of pu1_src_b in bytes
	 * @param pu1_dst     destination, advanced by dst_strd per row
	 * @param dst_strd    row stride of pu1_dst in bytes
	 * @param ht          number of rows to process
	 */
	static void hme_16xn_qpel_interp_avg_neon(
	    UWORD8 *pu1_src_a,
	    UWORD8 *pu1_src_b,
	    WORD32 src_a_strd,
	    WORD32 src_b_strd,
	    UWORD8 *pu1_dst,
	    WORD32 dst_strd,
	    WORD32 ht)
	{
	    WORD32 row = 0;

	    while(row < ht)
	    {
	        /* Both source rows are read before the destination row is written */
	        const uint8x16_t row_a = vld1q_u8(pu1_src_a);
	        const uint8x16_t row_b = vld1q_u8(pu1_src_b);

	        vst1q_u8(pu1_dst, vrhaddq_u8(row_a, row_b));

	        pu1_src_a += src_a_strd;
	        pu1_src_b += src_b_strd;
	        pu1_dst += dst_strd;
	        row++;
	    }
	}

	/**
	 * @brief Rounded average of two 32-byte-wide blocks, one row at a time.
	 *
	 * Every row is handled as two 16-byte halves. All four source vectors of a
	 * row are read before either destination half is written, then each half is
	 * combined with vrhaddq_u8() (per-lane (a + b + 1) >> 1). Nothing is done
	 * when ht is zero or negative.
	 *
	 * @param pu1_src_a   first source, advanced by src_a_strd per row
	 * @param pu1_src_b   second source, advanced by src_b_strd per row
	 * @param src_a_strd  row stride of pu1_src_a in bytes
	 * @param src_b_strd  row stride of pu1_src_b in bytes
	 * @param pu1_dst     destination, advanced by dst_strd per row
	 * @param dst_strd    row stride of pu1_dst in bytes
	 * @param ht          number of rows to process
	 */
	static void hme_32xn_qpel_interp_avg_neon(
	    UWORD8 *pu1_src_a,
	    UWORD8 *pu1_src_b,
	    WORD32 src_a_strd,
	    WORD32 src_b_strd,
	    UWORD8 *pu1_dst,
	    WORD32 dst_strd,
	    WORD32 ht)
	{
	    WORD32 row;

	    for(row = 0; row < ht; row++)
	    {
	        /* Bytes 0..15 and 16..31 of the current row of each source */
	        const uint8x16_t a_lo = vld1q_u8(pu1_src_a);
	        const uint8x16_t a_hi = vld1q_u8(pu1_src_a + 16);
	        const uint8x16_t b_lo = vld1q_u8(pu1_src_b);
	        const uint8x16_t b_hi = vld1q_u8(pu1_src_b + 16);

	        vst1q_u8(pu1_dst, vrhaddq_u8(a_lo, b_lo));
	        vst1q_u8(pu1_dst + 16, vrhaddq_u8(a_hi, b_hi));

	        pu1_src_a += src_a_strd;
	        pu1_src_b += src_b_strd;
	        pu1_dst += dst_strd;
	    }
	}

	/**
	 * @brief Rounded average of two blocks whose width and height are multiples
	 *        of 4.
	 *
	 * The block is walked in bands of 4 rows. Inside a band the columns are
	 * consumed left to right with the widest kernel that still fits in the
	 * remaining width: 32, then 16, then 8, and finally the 4x4 kernel.
	 *
	 * @param pu1_src_a   first source block
	 * @param pu1_src_b   second source block
	 * @param src_a_strd  row stride of pu1_src_a in bytes
	 * @param src_b_strd  row stride of pu1_src_b in bytes
	 * @param pu1_dst     destination block
	 * @param dst_strd    row stride of pu1_dst in bytes
	 * @param blk_wd      block width; asserted to be a multiple of 4
	 * @param blk_ht      block height; asserted to be a multiple of 4
	 */
	static void hme_4mx4n_qpel_interp_avg_neon(
	    UWORD8 *pu1_src_a,
	    UWORD8 *pu1_src_b,
	    WORD32 src_a_strd,
	    WORD32 src_b_strd,
	    UWORD8 *pu1_dst,
	    WORD32 dst_strd,
	    WORD32 blk_wd,
	    WORD32 blk_ht)
	{
	    WORD32 row, col;

	    assert(blk_wd % 4 == 0);
	    assert(blk_ht % 4 == 0);

	    for(row = 0; row < blk_ht; row += 4)
	    {
	        col = 0;

	        while(col < blk_wd)
	        {
	            UWORD8 *pu1_a = pu1_src_a + col;
	            UWORD8 *pu1_b = pu1_src_b + col;
	            UWORD8 *pu1_out = pu1_dst + col;
	            const WORD32 cols_left = blk_wd - col;

	            if(cols_left >= 32)
	            {
	                hme_32xn_qpel_interp_avg_neon(
	                    pu1_a, pu1_b, src_a_strd, src_b_strd, pu1_out, dst_strd, 4);
	                col += 32;
	                continue;
	            }

	            if(cols_left >= 16)
	            {
	                hme_16xn_qpel_interp_avg_neon(
	                    pu1_a, pu1_b, src_a_strd, src_b_strd, pu1_out, dst_strd, 4);
	                col += 16;
	                continue;
	            }

	            if(cols_left >= 8)
	            {
	                hme_8xn_qpel_interp_avg_neon(
	                    pu1_a, pu1_b, src_a_strd, src_b_strd, pu1_out, dst_strd, 4);
	                col += 8;
	                continue;
	            }

	            /* Fewer than 8 columns remain: use the 4x4 kernel */
	            hme_4x4_qpel_interp_avg_neon(pu1_a, pu1_b, src_a_strd, src_b_strd, pu1_out, dst_strd);
	            col += 4;
	        }

	        /* Step every pointer down to the next band of 4 rows */
	        pu1_src_a += (4 * src_a_strd);
	        pu1_src_b += (4 * src_b_strd);
	        pu1_dst += (4 * dst_strd);
	    }
	}

	/**
	 * @brief Produces the prediction block for a motion vector by selecting one
	 *        reference plane directly or averaging two of them.
	 *
	 * The low two bits of each MV component index gas_qpel_inp_buf_cfg; the
	 * remaining bits give a whole-sample displacement into the reference planes.
	 * When the table entry names the same plane twice no averaging is done and
	 * the output simply points into that plane. Otherwise the two planes are
	 * averaged into ps_prms->apu1_interp_out[i4_buf_id].
	 *
	 * @param ps_prms    interpolation parameters; pu1_final_out and
	 *                   i4_final_out_stride are written on return
	 * @param i4_mv_x    horizontal MV component
	 * @param i4_mv_y    vertical MV component
	 * @param i4_buf_id  index of the output buffer used when averaging is needed
	 */
	void hme_qpel_interp_avg_neon(interp_prms_t *ps_prms, S32 i4_mv_x, S32 i4_mv_y, S32 i4_buf_id)
	{
	    /* All three are byte pointers: they are assigned from ppu1_ref[] /
	     * apu1_interp_out[] entries and passed to the averaging kernel */
	    U08 *pu1_src1, *pu1_src2, *pu1_dst;
	    qpel_input_buf_cfg_t *ps_inp_cfg;
	    S32 i4_mv_x_frac, i4_mv_y_frac, i4_offset;
	    S32 i4_ref_stride = ps_prms->i4_ref_stride;

	    i4_mv_x_frac = i4_mv_x & 3;
	    i4_mv_y_frac = i4_mv_y & 3;

	    i4_offset = (i4_mv_x >> 2) + (i4_mv_y >> 2) * i4_ref_stride;

	    /* Derive the descriptor that has all offset and size info */
	    ps_inp_cfg = &gas_qpel_inp_buf_cfg[i4_mv_y_frac][i4_mv_x_frac];

	    if(ps_inp_cfg->i1_buf_id1 == ps_inp_cfg->i1_buf_id2)
	    {
	        /* This is case for fxfy/hxfy/fxhy/hxhy */
	        ps_prms->pu1_final_out = ps_prms->ppu1_ref[ps_inp_cfg->i1_buf_id1];
	        ps_prms->pu1_final_out += ps_inp_cfg->i1_buf_xoff1 + i4_offset;
	        ps_prms->pu1_final_out += (ps_inp_cfg->i1_buf_yoff1 * ps_prms->i4_ref_stride);
	        ps_prms->i4_final_out_stride = i4_ref_stride;

	        return;
	    }

	    pu1_src1 = ps_prms->ppu1_ref[ps_inp_cfg->i1_buf_id1];
	    pu1_src1 += ps_inp_cfg->i1_buf_xoff1 + i4_offset;
	    pu1_src1 += (ps_inp_cfg->i1_buf_yoff1 * i4_ref_stride);

	    pu1_src2 = ps_prms->ppu1_ref[ps_inp_cfg->i1_buf_id2];
	    pu1_src2 += ps_inp_cfg->i1_buf_xoff2 + i4_offset;
	    pu1_src2 += (ps_inp_cfg->i1_buf_yoff2 * i4_ref_stride);

	    pu1_dst = ps_prms->apu1_interp_out[i4_buf_id];

	    hme_4mx4n_qpel_interp_avg_neon(
	        pu1_src1,
	        pu1_src2,
	        ps_prms->i4_ref_stride,
	        ps_prms->i4_ref_stride,
	        pu1_dst,
	        ps_prms->i4_out_stride,
	        ps_prms->i4_blk_wd,
	        ps_prms->i4_blk_ht);
	    ps_prms->pu1_final_out = pu1_dst;
	    ps_prms->i4_final_out_stride = ps_prms->i4_out_stride;
	}

	// TODO: Can this function and above function be unified
	/**
	 * @brief Averages the two reference planes selected for a motion vector into
	 *        one output buffer and reports that buffer through caller arrays.
	 *
	 * The low two bits of each MV component index gas_qpel_inp_buf_cfg; the
	 * remaining bits give a whole-sample displacement into the reference planes.
	 * There is no shortcut when both table entries name the same plane: the
	 * averaging kernel always runs.
	 *
	 * @param ps_prms           interpolation parameters (read only here)
	 * @param i4_mv_x           horizontal MV component
	 * @param i4_mv_y           vertical MV component
	 * @param i4_buf_id         index of the output buffer and of the slots
	 *                          written in the two output arrays
	 * @param ppu1_final        [i4_buf_id] receives the output buffer pointer
	 * @param pi4_final_stride  [i4_buf_id] receives the output stride
	 */
	void hme_qpel_interp_avg_1pt_neon(
	    interp_prms_t *ps_prms,
	    S32 i4_mv_x,
	    S32 i4_mv_y,
	    S32 i4_buf_id,
	    U08 **ppu1_final,
	    S32 *pi4_final_stride)
	{
	    /* All three are byte pointers: they are assigned from ppu1_ref[] /
	     * apu1_interp_out[] entries and passed to the averaging kernel */
	    U08 *pu1_src1, *pu1_src2, *pu1_dst;
	    qpel_input_buf_cfg_t *ps_inp_cfg;
	    S32 i4_mv_x_frac, i4_mv_y_frac, i4_offset;
	    S32 i4_ref_stride = ps_prms->i4_ref_stride;

	    i4_mv_x_frac = i4_mv_x & 3;
	    i4_mv_y_frac = i4_mv_y & 3;

	    i4_offset = (i4_mv_x >> 2) + (i4_mv_y >> 2) * i4_ref_stride;

	    /* Derive the descriptor that has all offset and size info */
	    ps_inp_cfg = &gas_qpel_inp_buf_cfg[i4_mv_y_frac][i4_mv_x_frac];

	    pu1_src1 = ps_prms->ppu1_ref[ps_inp_cfg->i1_buf_id1];
	    pu1_src1 += ps_inp_cfg->i1_buf_xoff1 + i4_offset;
	    pu1_src1 += (ps_inp_cfg->i1_buf_yoff1 * i4_ref_stride);

	    pu1_src2 = ps_prms->ppu1_ref[ps_inp_cfg->i1_buf_id2];
	    pu1_src2 += ps_inp_cfg->i1_buf_xoff2 + i4_offset;
	    pu1_src2 += (ps_inp_cfg->i1_buf_yoff2 * i4_ref_stride);

	    pu1_dst = ps_prms->apu1_interp_out[i4_buf_id];

	    hme_4mx4n_qpel_interp_avg_neon(
	        pu1_src1,
	        pu1_src2,
	        ps_prms->i4_ref_stride,
	        ps_prms->i4_ref_stride,
	        pu1_dst,
	        ps_prms->i4_out_stride,
	        ps_prms->i4_blk_wd,
	        ps_prms->i4_blk_ht);
	    ppu1_final[i4_buf_id] = pu1_dst;
	    pi4_final_stride[i4_buf_id] = ps_prms->i4_out_stride;
	}

	/**
	 * @brief Interpolates the two vertical neighbours of a motion vector.
	 *
	 * The point at (i4_mv_x, i4_mv_y + 1) is produced first into output slot 3,
	 * then the point at (i4_mv_x, i4_mv_y - 1) into output slot 1. The pointer
	 * parameters are forwarded unchanged to hme_qpel_interp_avg_1pt_neon(), so
	 * their types match that function's parameters.
	 *
	 * @param ps_prms           interpolation parameters
	 * @param i4_mv_x           horizontal component of the centre MV
	 * @param i4_mv_y           vertical component of the centre MV
	 * @param ppu1_final        slots 3 and 1 receive the output buffer pointers
	 * @param pi4_final_stride  slots 3 and 1 receive the output strides
	 */
	void hme_qpel_interp_avg_2pt_vert_with_reuse_neon(
	    interp_prms_t *ps_prms, S32 i4_mv_x, S32 i4_mv_y, U08 **ppu1_final, S32 *pi4_final_stride)
	{
	    hme_qpel_interp_avg_1pt_neon(ps_prms, i4_mv_x, i4_mv_y + 1, 3, ppu1_final, pi4_final_stride);

	    hme_qpel_interp_avg_1pt_neon(ps_prms, i4_mv_x, i4_mv_y - 1, 1, ppu1_final, pi4_final_stride);
	}

	/**
	 * @brief Interpolates the two horizontal neighbours of a motion vector.
	 *
	 * The point at (i4_mv_x + 1, i4_mv_y) is produced first into output slot 2,
	 * then the point at (i4_mv_x - 1, i4_mv_y) into output slot 0. The pointer
	 * parameters are forwarded unchanged to hme_qpel_interp_avg_1pt_neon(), so
	 * their types match that function's parameters.
	 *
	 * @param ps_prms           interpolation parameters
	 * @param i4_mv_x           horizontal component of the centre MV
	 * @param i4_mv_y           vertical component of the centre MV
	 * @param ppu1_final        slots 2 and 0 receive the output buffer pointers
	 * @param pi4_final_stride  slots 2 and 0 receive the output strides
	 */
	void hme_qpel_interp_avg_2pt_horz_with_reuse_neon(
	    interp_prms_t *ps_prms, S32 i4_mv_x, S32 i4_mv_y, U08 **ppu1_final, S32 *pi4_final_stride)
	{
	    hme_qpel_interp_avg_1pt_neon(ps_prms, i4_mv_x + 1, i4_mv_y, 2, ppu1_final, pi4_final_stride);

	    hme_qpel_interp_avg_1pt_neon(ps_prms, i4_mv_x - 1, i4_mv_y, 0, ppu1_final, pi4_final_stride);
	}

	/**
	 * @brief Computes SATD costs of a 16x16 block for every partition id and
	 *        updates the stored result of each valid partition.
	 *
	 * Steps:
	 *  1. ihevce_had4_4x4_neon() is called once per 8x8 quadrant; its return value
	 *     is kept as the quadrant SATD and it also fills a table of 4x4 SATDs.
	 *  2. The quadrant and 4x4 values are summed into ps_prms->pi4_sad_grid for
	 *     the 2Nx2N, NxN, Nx2N, 2NxN and AMP partition ids.
	 *  3. For each valid partition the clipped SATD plus the stored MV cost is
	 *     compared with the stored total cost; when strictly smaller, the entry at
	 *     index 0 of the cost / MV / reference-index arrays is overwritten. Only
	 *     index 0 is touched here.
	 *
	 * @param ps_prms         source / reference pointers, strides and SATD grid
	 * @param ps_result_prms  candidate MV, reference index and refinement context
	 */
	void hme_evalsatd_update_1_best_result_pt_pu_16x16_neon(
	    err_prms_t *ps_prms, result_upd_prms_t *ps_result_prms)
	{
	    mv_refine_ctxt_t *refine_ctxt = ps_result_prms->ps_subpel_refine_ctxt;
	    S32 *pi4_sad_grid = ps_prms->pi4_sad_grid;
	    S32 *pi4_valid_part_ids = &refine_ctxt->ai4_part_id[0];

	    S32 ai4_satd_4x4[16];
	    S32 ai4_satd_8x8[4];

	    U08 *pu1_inp = ps_prms->pu1_inp;
	    U08 *pu1_ref = ps_prms->pu1_ref;

	    S32 inp_stride = ps_prms->i4_inp_stride;
	    S32 ref_stride = ps_prms->i4_ref_stride;

	    S32 i;

	    /* One call per 8x8 quadrant (TL, TR, BL, BR); fills the 8x8 and 4x4 SATDs */
	    for(i = 0; i < 4; i++)
	    {
	        /* Quadrant origin: 8 columns right for odd i, 8 rows down for i >= 2 */
	        U08 *pu1_src = pu1_inp + (i & 0x1) * 8 + (i >> 1) * inp_stride * 8;
	        U08 *pu1_pred = pu1_ref + (i & 0x1) * 8 + (i >> 1) * ref_stride * 8;
	        /* First 4x4 table slot for this quadrant: 0, 2, 8, 10 */
	        S16 idx = (i & 0x1) * 2 + (i >> 1) * 8;

	        ai4_satd_8x8[i] = ihevce_had4_4x4_neon(
	            pu1_src, inp_stride, pu1_pred, ref_stride, NULL, 0, &ai4_satd_4x4[idx], 4, 0);
	    }

	    /* Update 16x16 SATDs */
	    pi4_sad_grid[PART_ID_2Nx2N] =
	        ai4_satd_8x8[0] + ai4_satd_8x8[1] + ai4_satd_8x8[2] + ai4_satd_8x8[3];

	    pi4_sad_grid[PART_ID_NxN_TL] = ai4_satd_8x8[0];
	    pi4_sad_grid[PART_ID_NxN_TR] = ai4_satd_8x8[1];
	    pi4_sad_grid[PART_ID_NxN_BL] = ai4_satd_8x8[2];
	    pi4_sad_grid[PART_ID_NxN_BR] = ai4_satd_8x8[3];

	    /* Update 8x16 / 16x8 SATDs */
	    pi4_sad_grid[PART_ID_Nx2N_L] = ai4_satd_8x8[0] + ai4_satd_8x8[2];
	    pi4_sad_grid[PART_ID_Nx2N_R] = ai4_satd_8x8[1] + ai4_satd_8x8[3];
	    pi4_sad_grid[PART_ID_2NxN_T] = ai4_satd_8x8[0] + ai4_satd_8x8[1];
	    pi4_sad_grid[PART_ID_2NxN_B] = ai4_satd_8x8[2] + ai4_satd_8x8[3];

	    /* Update AMP SATDs 16x12,16x4, 12x16,4x16.
	     * NOTE(review): the table is filled starting at slots 0, 2, 8, 10 with a
	     * stride argument of 4, while the index sets below look like they expect
	     * each quadrant to own four consecutive slots - confirm against how
	     * ihevce_had4_4x4_neon() writes pi4_hsad before relying on these sums. */
	    pi4_sad_grid[PART_ID_nLx2N_L] =
	        ai4_satd_4x4[0] + ai4_satd_4x4[2] + ai4_satd_4x4[8] + ai4_satd_4x4[10];
	    pi4_sad_grid[PART_ID_nRx2N_R] =
	        ai4_satd_4x4[5] + ai4_satd_4x4[7] + ai4_satd_4x4[13] + ai4_satd_4x4[15];
	    pi4_sad_grid[PART_ID_2NxnU_T] =
	        ai4_satd_4x4[0] + ai4_satd_4x4[1] + ai4_satd_4x4[4] + ai4_satd_4x4[5];
	    pi4_sad_grid[PART_ID_2NxnD_B] =
	        ai4_satd_4x4[10] + ai4_satd_4x4[11] + ai4_satd_4x4[14] + ai4_satd_4x4[15];

	    /* The complementary AMP part is the 16x16 total minus the narrow part */
	    pi4_sad_grid[PART_ID_nLx2N_R] = pi4_sad_grid[PART_ID_2Nx2N] - pi4_sad_grid[PART_ID_nLx2N_L];
	    pi4_sad_grid[PART_ID_nRx2N_L] = pi4_sad_grid[PART_ID_2Nx2N] - pi4_sad_grid[PART_ID_nRx2N_R];
	    pi4_sad_grid[PART_ID_2NxnU_B] = pi4_sad_grid[PART_ID_2Nx2N] - pi4_sad_grid[PART_ID_2NxnU_T];
	    pi4_sad_grid[PART_ID_2NxnD_T] = pi4_sad_grid[PART_ID_2Nx2N] - pi4_sad_grid[PART_ID_2NxnD_B];

	    /* For each valid partition, overwrite the stored index-0 result when the
	     * new total cost is strictly smaller than the stored one */
	    for(i = 0; i < refine_ctxt->i4_num_valid_parts; i++)
	    {
	        S32 part_id = pi4_valid_part_ids[i];
	        /* With more than 8 valid partitions the result arrays are indexed by
	         * partition id, otherwise by position in the valid-partition list */
	        S32 id = (refine_ctxt->i4_num_valid_parts > 8) ? part_id : i;
	        S32 i4_mv_cost = refine_ctxt->i2_mv_cost[0][id];
	        S32 i4_sad = CLIP3(pi4_sad_grid[part_id], 0, 0x7fff);
	        S32 i4_tot_cost = CLIP_S16(i4_sad + i4_mv_cost);
	        S32 best_node_cost = CLIP_S16(refine_ctxt->i2_tot_cost[0][id]);

	        if(i4_tot_cost < best_node_cost)
	        {
	            refine_ctxt->i2_tot_cost[0][id] = i4_tot_cost;
	            refine_ctxt->i2_mv_cost[0][id] = i4_mv_cost;
	            refine_ctxt->i2_mv_x[0][id] = ps_result_prms->i2_mv_x;
	            refine_ctxt->i2_mv_y[0][id] = ps_result_prms->i2_mv_y;
	            refine_ctxt->i2_ref_idx[0][id] = ps_result_prms->i1_ref_idx;
	        }
	    }
	}