/******************************************************************************
*
* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at:
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
******************************************************************************/
/**
*******************************************************************************
* @file
* ihevc_weighted_pred_neon_intr.c
*
* @brief
* Contains function definitions for weighted prediction used in inter
* prediction
*
* @author
* Parthiban V
*
* @par List of Functions:
* - ihevc_weighted_pred_uni_neonintr()
* - ihevc_weighted_pred_chroma_uni_neonintr()
* - ihevc_weighted_pred_bi_neonintr()
* - ihevc_weighted_pred_chroma_bi_neonintr()
* - ihevc_weighted_pred_bi_default_neonintr()
* - ihevc_weighted_pred_chroma_bi_default_neonintr()
*
* @remarks
* None
*
*******************************************************************************
*/
/*****************************************************************************/
/* File Includes */
/*****************************************************************************/
#include "ihevc_typedefs.h"
#include "ihevc_defs.h"
#include "ihevc_macros.h"
#include "ihevc_func_selector.h"
#include "ihevc_inter_pred.h"
#include "arm_neon.h"
/**
*******************************************************************************
*
* @brief
* Does uni-weighted prediction on the array pointed to by pi2_src and stores
* the result at the location pointed to by pu1_dst. Assumptions: the function
* is optimized assuming that width and height are multiples of 2.
*
* @par Description:
* dst = ( ( (src + lvl_shift) * wgt0 + (1 << (shift - 1)) ) >> shift ) + off0
*
* @param[in] pi2_src
* Pointer to the source
*
* @param[out] pu1_dst
* Pointer to the destination
*
* @param[in] src_strd
* Source stride
*
* @param[in] dst_strd
* Destination stride
*
* @param[in] wgt0
* weight to be multiplied to the source
*
* @param[in] off0
* offset to be added after rounding and shifting
*
* @param[in] shift
* (14 - Bit depth) + log2_weight_denominator
*
* @param[in] lvl_shift
* added before shift and offset
*
* @param[in] ht
* height of the source
*
* @param[in] wd
* width of the source
*
* @returns
*
* @remarks
* None
*
*******************************************************************************
*/
void ihevc_weighted_pred_uni_neonintr(WORD16 *pi2_src,
UWORD8 *pu1_dst,
WORD32 src_strd,
WORD32 dst_strd,
WORD32 wgt0,
WORD32 off0,
WORD32 shift,
WORD32 lvl_shift,
WORD32 ht,
WORD32 wd)
{
WORD32 row, col;
int16x4_t pi2_src_val1;
int16x4_t pi2_src_val2;
int32x4_t i4_tmp1_t;
int32x4_t i4_tmp2_t;
int32x4_t sto_res_tmp1;
uint16x4_t sto_res_tmp2;
uint16x8_t sto_res_tmp3;
uint8x8_t sto_res;
int32x4_t tmp_lvl_shift_t;
WORD32 tmp_shift = 0 - shift;
int32x4_t tmp_shift_t;
WORD16 *pi2_src_tmp;
UWORD8 *pu1_dst_tmp;
WORD32 tmp_lvl_shift = lvl_shift * wgt0 + (off0 << shift);
tmp_lvl_shift += (1 << (shift - 1));
tmp_lvl_shift_t = vmovq_n_s32(tmp_lvl_shift);
tmp_shift_t = vmovq_n_s32(tmp_shift);
/* i4_tmp1_t and i4_tmp2_t are used to process 2 rows at a time. */
/* The height loop is unrolled, hence 2 rows are processed per iteration */
/* and the stores likewise cover both rows. */
/* vcombine_u16 is used because the first narrowing yields a uint16x4 value, */
/* and the saturating narrow to 8 bits (vqmovn_u16) requires a uint16x8. */
for(row = ht; row > 0; row -= 2)
{
for(col = wd; col > 0; col -= 4)
{
pi2_src_tmp = pi2_src + src_strd;
pu1_dst_tmp = pu1_dst + dst_strd;
pi2_src_val1 = vld1_s16((int16_t *)pi2_src);
pi2_src += 4;
pi2_src_val2 = vld1_s16((int16_t *)pi2_src_tmp);
i4_tmp1_t = vmull_n_s16(pi2_src_val1, (int16_t)wgt0);
i4_tmp1_t = vaddq_s32(i4_tmp1_t, tmp_lvl_shift_t);
i4_tmp2_t = vmull_n_s16(pi2_src_val2, (int16_t)wgt0);
sto_res_tmp1 = vshlq_s32(i4_tmp1_t, tmp_shift_t);
i4_tmp2_t = vaddq_s32(i4_tmp2_t, tmp_lvl_shift_t);
sto_res_tmp2 = vqmovun_s32(sto_res_tmp1);
sto_res_tmp3 = vcombine_u16(sto_res_tmp2, sto_res_tmp2);
sto_res_tmp1 = vshlq_s32(i4_tmp2_t, tmp_shift_t);
sto_res = vqmovn_u16(sto_res_tmp3);
sto_res_tmp2 = vqmovun_s32(sto_res_tmp1);
sto_res_tmp3 = vcombine_u16(sto_res_tmp2, sto_res_tmp2);
vst1_lane_u32((uint32_t *)pu1_dst, vreinterpret_u32_u8(sto_res), 0);
pu1_dst += 4;
sto_res = vqmovn_u16(sto_res_tmp3);
vst1_lane_u32((uint32_t *)pu1_dst_tmp, vreinterpret_u32_u8(sto_res), 0);
}
pi2_src += 2 * src_strd - wd;
pu1_dst += 2 * dst_strd - wd;
}
}
//WEIGHTED_PRED_UNI
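/* The NEON path above folds lvl_shift * wgt0, off0 << shift and the rounding
 * term into one constant (tmp_lvl_shift). A minimal scalar sketch of the same
 * computation follows for reference; it is compiled out and the helper name
 * is ours, not part of the library API. Assumes 8-bit output samples. */
#if 0
static void ihevc_weighted_pred_uni_scalar(WORD16 *pi2_src,
                                           UWORD8 *pu1_dst,
                                           WORD32 src_strd,
                                           WORD32 dst_strd,
                                           WORD32 wgt0,
                                           WORD32 off0,
                                           WORD32 shift,
                                           WORD32 lvl_shift,
                                           WORD32 ht,
                                           WORD32 wd)
{
    WORD32 row, col;
    for(row = 0; row < ht; row++)
    {
        for(col = 0; col < wd; col++)
        {
            /* dst = ( ((src + lvl_shift) * wgt0 + round) >> shift ) + off0 */
            WORD32 val = (pi2_src[col] + lvl_shift) * wgt0;
            val = ((val + (1 << (shift - 1))) >> shift) + off0;
            /* clamp to 8 bits, as the saturating narrows do in the NEON path */
            if(val < 0) val = 0;
            if(val > 255) val = 255;
            pu1_dst[col] = (UWORD8)val;
        }
        pi2_src += src_strd;
        pu1_dst += dst_strd;
    }
}
#endif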
/**
*******************************************************************************
*
* @brief
* Does chroma uni-weighted prediction on the array pointed to by pi2_src and
* stores the result at the location pointed to by pu1_dst. wd is the width
* per chroma component; each row holds 2 * wd interleaved Cb/Cr samples.
* Assumptions: the function is optimized assuming that width and height are
* multiples of 2.
*
* @par Description:
* dst = ( ( (src + lvl_shift) * wgt0 + (1 << (shift - 1)) ) >> shift ) + off0
*
* @param[in] pi2_src
* Pointer to the source
*
* @param[out] pu1_dst
* Pointer to the destination
*
* @param[in] src_strd
* Source stride
*
* @param[in] dst_strd
* Destination stride
*
* @param[in] wgt0_cb
* weight to be multiplied to the cb source
*
* @param[in] wgt0_cr
* weight to be multiplied to the cr source
*
* @param[in] off0_cb
* offset for cb, added after rounding and shifting
*
* @param[in] off0_cr
* offset for cr, added after rounding and shifting
*
* @param[in] shift
* (14 - Bit depth) + log2_weight_denominator
*
* @param[in] lvl_shift
* added before shift and offset
*
* @param[in] ht
* height of the source
*
* @param[in] wd
* width of the source
*
* @returns
*
* @remarks
* None
*
*******************************************************************************
*/
void ihevc_weighted_pred_chroma_uni_neonintr(WORD16 *pi2_src,
UWORD8 *pu1_dst,
WORD32 src_strd,
WORD32 dst_strd,
WORD32 wgt0_cb,
WORD32 wgt0_cr,
WORD32 off0_cb,
WORD32 off0_cr,
WORD32 shift,
WORD32 lvl_shift,
WORD32 ht,
WORD32 wd)
{
WORD32 row, col;
int16x4_t pi2_src_val1;
int16x4_t pi2_src_val2;
int32x4_t i4_tmp1_t;
int32x4_t i4_tmp2_t;
int32x4_t sto_res_tmp1;
uint16x4_t sto_res_tmp2;
uint16x8_t sto_res_tmp3;
uint8x8_t sto_res;
int32x4_t tmp_lvl_shift_t_u, tmp_lvl_shift_t_v;
int32x4x2_t tmp_lvl_shift_t;
WORD32 tmp_shift = 0 - shift;
int32x4_t tmp_shift_t;
int16x4_t tmp_wgt0_u, tmp_wgt0_v;
int16x4x2_t wgt0;
WORD16 *pi2_src_tmp;
UWORD8 *pu1_dst_tmp;
WORD32 tmp_lvl_shift = lvl_shift * wgt0_cb + (off0_cb << shift);
tmp_lvl_shift += (1 << (shift - 1));
tmp_lvl_shift_t_u = vmovq_n_s32(tmp_lvl_shift);
tmp_lvl_shift = lvl_shift * wgt0_cr + (off0_cr << shift);
tmp_lvl_shift += (1 << (shift - 1));
tmp_lvl_shift_t_v = vmovq_n_s32(tmp_lvl_shift);
tmp_lvl_shift_t = vzipq_s32(tmp_lvl_shift_t_u, tmp_lvl_shift_t_v);
tmp_shift_t = vmovq_n_s32(tmp_shift);
tmp_wgt0_u = vdup_n_s16(wgt0_cb);
tmp_wgt0_v = vdup_n_s16(wgt0_cr);
wgt0 = vzip_s16(tmp_wgt0_u, tmp_wgt0_v);
/* i4_tmp1_t and i4_tmp2_t are used to process 2 rows at a time. */
/* The height loop is unrolled, hence 2 rows are processed per iteration */
/* and the stores likewise cover both rows. */
/* vcombine_u16 is used because the first narrowing yields a uint16x4 value, */
/* and the saturating narrow to 8 bits (vqmovn_u16) requires a uint16x8. */
for(row = ht; row > 0; row -= 2)
{
for(col = 2 * wd; col > 0; col -= 4)
{
pi2_src_tmp = pi2_src + src_strd;
pu1_dst_tmp = pu1_dst + dst_strd;
pi2_src_val1 = vld1_s16((int16_t *)pi2_src);
pi2_src += 4;
pi2_src_val2 = vld1_s16((int16_t *)pi2_src_tmp);
i4_tmp1_t = vmull_s16(pi2_src_val1, wgt0.val[0]);
i4_tmp1_t = vaddq_s32(i4_tmp1_t, tmp_lvl_shift_t.val[0]);
i4_tmp2_t = vmull_s16(pi2_src_val2, wgt0.val[0]);
sto_res_tmp1 = vshlq_s32(i4_tmp1_t, tmp_shift_t);
i4_tmp2_t = vaddq_s32(i4_tmp2_t, tmp_lvl_shift_t.val[0]);
sto_res_tmp2 = vqmovun_s32(sto_res_tmp1);
sto_res_tmp3 = vcombine_u16(sto_res_tmp2, sto_res_tmp2);
sto_res_tmp1 = vshlq_s32(i4_tmp2_t, tmp_shift_t);
sto_res = vqmovn_u16(sto_res_tmp3);
sto_res_tmp2 = vqmovun_s32(sto_res_tmp1);
sto_res_tmp3 = vcombine_u16(sto_res_tmp2, sto_res_tmp2);
vst1_lane_u32((uint32_t *)pu1_dst, vreinterpret_u32_u8(sto_res), 0);
pu1_dst += 4;
sto_res = vqmovn_u16(sto_res_tmp3);
vst1_lane_u32((uint32_t *)pu1_dst_tmp, vreinterpret_u32_u8(sto_res), 0);
}
pi2_src += 2 * src_strd - 2 * wd;
pu1_dst += 2 * dst_strd - 2 * wd;
}
}
//WEIGHTED_PRED_CHROMA_UNI
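/* The chroma source is interleaved Cb,Cr,Cb,Cr..., which is why the weights
 * and per-plane rounding constants are zipped above. A compiled-out sketch of
 * the lane layout this produces (helper name is ours, purely illustrative): */
#if 0
static void show_chroma_weight_layout(WORD32 wgt0_cb, WORD32 wgt0_cr)
{
    int16x4_t u = vdup_n_s16((int16_t)wgt0_cb); /* { wcb, wcb, wcb, wcb } */
    int16x4_t v = vdup_n_s16((int16_t)wgt0_cr); /* { wcr, wcr, wcr, wcr } */
    int16x4x2_t z = vzip_s16(u, v);
    /* z.val[0] = { wcb, wcr, wcb, wcr }: the lanes line up with the
     * Cb,Cr,Cb,Cr source, so a single vmull_s16 weights four interleaved
     * chroma samples at once. */
    (void)z;
}
#endif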
/**
*******************************************************************************
*
* @brief
* Does bi-weighted prediction on the arrays pointed to by pi2_src1 and
* pi2_src2 and stores the result at the location pointed to by pu1_dst.
* Assumptions: the function is optimized assuming that width and height
* are multiples of 2.
*
* @par Description:
* dst = ( (src1 + lvl_shift1) * wgt0 + (src2 + lvl_shift2) * wgt1 +
* ( (off0 + off1 + 1) << (shift - 1) ) ) >> shift
*
* @param[in] pi2_src1
* Pointer to source 1
*
* @param[in] pi2_src2
* Pointer to source 2
*
* @param[out] pu1_dst
* Pointer to destination
*
* @param[in] src_strd1
* Source stride 1
*
* @param[in] src_strd2
* Source stride 2
*
* @param[in] dst_strd
* Destination stride
*
* @param[in] wgt0
* weight to be multiplied to source 1
*
* @param[in] off0
* offset 0
*
* @param[in] wgt1
* weight to be multiplied to source 2
*
* @param[in] off1
* offset 1
*
* @param[in] shift
* (14 - Bit depth) + log2_weight_denominator
*
* @param[in] lvl_shift1
* added before shift and offset
*
* @param[in] lvl_shift2
* added before shift and offset
*
* @param[in] ht
* height of the source
*
* @param[in] wd
* width of the source
*
* @returns
*
* @remarks
* None
*
*******************************************************************************
*/
void ihevc_weighted_pred_bi_neonintr(WORD16 *pi2_src1,
WORD16 *pi2_src2,
UWORD8 *pu1_dst,
WORD32 src_strd1,
WORD32 src_strd2,
WORD32 dst_strd,
WORD32 wgt0,
WORD32 off0,
WORD32 wgt1,
WORD32 off1,
WORD32 shift,
WORD32 lvl_shift1,
WORD32 lvl_shift2,
WORD32 ht,
WORD32 wd)
{
WORD32 row, col;
int16x4_t pi2_src1_val1;
int16x4_t pi2_src1_val2;
int16x4_t pi2_src2_val1;
int16x4_t pi2_src2_val2;
int32x4_t i4_tmp1_t1;
int32x4_t i4_tmp1_t2;
int32x4_t i4_tmp2_t1;
int32x4_t i4_tmp2_t2;
int32x4_t sto_res_tmp1;
uint16x4_t sto_res_tmp2;
uint16x8_t sto_res_tmp3;
uint8x8_t sto_res;
int32x4_t tmp_lvl_shift_t;
WORD32 tmp_shift = 0 - shift;
int32x4_t tmp_shift_t;
WORD16 *pi2_src_tmp1;
WORD16 *pi2_src_tmp2;
UWORD8 *pu1_dst_tmp;
WORD32 tmp_lvl_shift = (lvl_shift1 * wgt0) + (lvl_shift2 * wgt1);
tmp_lvl_shift += ((off0 + off1 + 1) << (shift - 1));
tmp_lvl_shift_t = vmovq_n_s32(tmp_lvl_shift);
tmp_shift_t = vmovq_n_s32(tmp_shift);
/* i4_tmp1_t1/t2 and i4_tmp2_t1/t2 are used to process 2 rows at a time. */
/* The height loop is unrolled, hence 2 rows are processed per iteration */
/* and the stores likewise cover both rows. */
/* vcombine_u16 is used because the first narrowing yields a uint16x4 value, */
/* and the saturating narrow to 8 bits (vqmovn_u16) requires a uint16x8. */
for(row = ht; row > 0; row -= 2)
{
for(col = wd; col > 0; col -= 4)
{
pi2_src_tmp1 = pi2_src1 + src_strd1;
pi2_src_tmp2 = pi2_src2 + src_strd2;
pi2_src1_val1 = vld1_s16((int16_t *)pi2_src1);
pi2_src1 += 4;
pu1_dst_tmp = pu1_dst + dst_strd;
pi2_src2_val1 = vld1_s16((int16_t *)pi2_src2);
pi2_src2 += 4;
i4_tmp1_t1 = vmull_n_s16(pi2_src1_val1, (int16_t)wgt0);
pi2_src1_val2 = vld1_s16((int16_t *)pi2_src_tmp1);
i4_tmp1_t2 = vmull_n_s16(pi2_src2_val1, (int16_t)wgt1);
pi2_src2_val2 = vld1_s16((int16_t *)pi2_src_tmp2);
i4_tmp1_t1 = vaddq_s32(i4_tmp1_t1, i4_tmp1_t2);
i4_tmp2_t1 = vmull_n_s16(pi2_src1_val2, (int16_t)wgt0);
i4_tmp1_t1 = vaddq_s32(i4_tmp1_t1, tmp_lvl_shift_t);
i4_tmp2_t2 = vmull_n_s16(pi2_src2_val2, (int16_t)wgt1);
sto_res_tmp1 = vshlq_s32(i4_tmp1_t1, tmp_shift_t);
i4_tmp2_t1 = vaddq_s32(i4_tmp2_t1, i4_tmp2_t2);
sto_res_tmp2 = vqmovun_s32(sto_res_tmp1);
i4_tmp2_t1 = vaddq_s32(i4_tmp2_t1, tmp_lvl_shift_t);
sto_res_tmp3 = vcombine_u16(sto_res_tmp2, sto_res_tmp2);
sto_res_tmp1 = vshlq_s32(i4_tmp2_t1, tmp_shift_t);
sto_res = vqmovn_u16(sto_res_tmp3);
sto_res_tmp2 = vqmovun_s32(sto_res_tmp1);
sto_res_tmp3 = vcombine_u16(sto_res_tmp2, sto_res_tmp2);
vst1_lane_u32((uint32_t *)pu1_dst, vreinterpret_u32_u8(sto_res), 0);
pu1_dst += 4;
sto_res = vqmovn_u16(sto_res_tmp3);
vst1_lane_u32((uint32_t *)pu1_dst_tmp, vreinterpret_u32_u8(sto_res), 0);
}
pi2_src1 += 2 * src_strd1 - wd;
pi2_src2 += 2 * src_strd2 - wd;
pu1_dst += 2 * dst_strd - wd;
}
}
//WEIGHTED_PRED_BI
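/* A compiled-out scalar sketch of the bi-weighted formula implemented above
 * (helper name is ours; argument semantics assumed identical to the NEON
 * routine; 8-bit output assumed). */
#if 0
static void ihevc_weighted_pred_bi_scalar(WORD16 *pi2_src1,
                                          WORD16 *pi2_src2,
                                          UWORD8 *pu1_dst,
                                          WORD32 src_strd1,
                                          WORD32 src_strd2,
                                          WORD32 dst_strd,
                                          WORD32 wgt0,
                                          WORD32 off0,
                                          WORD32 wgt1,
                                          WORD32 off1,
                                          WORD32 shift,
                                          WORD32 lvl_shift1,
                                          WORD32 lvl_shift2,
                                          WORD32 ht,
                                          WORD32 wd)
{
    WORD32 row, col;
    for(row = 0; row < ht; row++)
    {
        for(col = 0; col < wd; col++)
        {
            /* both weighted sources plus the combined offset/rounding term */
            WORD32 val = (pi2_src1[col] + lvl_shift1) * wgt0
                       + (pi2_src2[col] + lvl_shift2) * wgt1
                       + ((off0 + off1 + 1) << (shift - 1));
            val >>= shift;
            if(val < 0) val = 0;
            if(val > 255) val = 255;
            pu1_dst[col] = (UWORD8)val;
        }
        pi2_src1 += src_strd1;
        pi2_src2 += src_strd2;
        pu1_dst += dst_strd;
    }
}
#endif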
/**
*******************************************************************************
*
* @brief
* Does chroma bi-weighted prediction on the arrays pointed to by pi2_src1
* and pi2_src2 and stores the result at the location pointed to by pu1_dst.
* wd is the width per chroma component; each row holds 2 * wd interleaved
* Cb/Cr samples. Assumptions: the function is optimized assuming that width
* and height are multiples of 2.
*
* @par Description:
* dst = ( (src1 + lvl_shift1) * wgt0 + (src2 + lvl_shift2) * wgt1 +
* ( (off0 + off1 + 1) << (shift - 1) ) ) >> shift
*
* @param[in] pi2_src1
* Pointer to source 1
*
* @param[in] pi2_src2
* Pointer to source 2
*
* @param[out] pu1_dst
* Pointer to destination
*
* @param[in] src_strd1
* Source stride 1
*
* @param[in] src_strd2
* Source stride 2
*
* @param[in] dst_strd
* Destination stride
*
* @param[in] wgt0_cb
* weight to be multiplied to the cb samples of source 1
*
* @param[in] wgt0_cr
* weight to be multiplied to the cr samples of source 1
*
* @param[in] off0_cb
* offset 0 for cb
*
* @param[in] off0_cr
* offset 0 for cr
*
* @param[in] wgt1_cb
* weight to be multiplied to the cb samples of source 2
*
* @param[in] wgt1_cr
* weight to be multiplied to the cr samples of source 2
*
* @param[in] off1_cb
* offset 1 for cb
*
* @param[in] off1_cr
* offset 1 for cr
*
* @param[in] shift
* (14 - Bit depth) + log2_weight_denominator
*
* @param[in] lvl_shift1
* added before shift and offset
*
* @param[in] lvl_shift2
* added before shift and offset
*
* @param[in] ht
* height of the source
*
* @param[in] wd
* width of the source
*
* @returns
*
* @remarks
* None
*
*******************************************************************************
*/
void ihevc_weighted_pred_chroma_bi_neonintr(WORD16 *pi2_src1,
WORD16 *pi2_src2,
UWORD8 *pu1_dst,
WORD32 src_strd1,
WORD32 src_strd2,
WORD32 dst_strd,
WORD32 wgt0_cb,
WORD32 wgt0_cr,
WORD32 off0_cb,
WORD32 off0_cr,
WORD32 wgt1_cb,
WORD32 wgt1_cr,
WORD32 off1_cb,
WORD32 off1_cr,
WORD32 shift,
WORD32 lvl_shift1,
WORD32 lvl_shift2,
WORD32 ht,
WORD32 wd)
{
WORD32 row, col;
int16x4_t pi2_src1_val1;
int16x4_t pi2_src1_val2;
int16x4_t pi2_src2_val1;
int16x4_t pi2_src2_val2;
int32x4_t i4_tmp1_t1;
int32x4_t i4_tmp1_t2;
int32x4_t i4_tmp2_t1;
int32x4_t i4_tmp2_t2;
int32x4_t sto_res_tmp1;
uint16x4_t sto_res_tmp2;
uint16x8_t sto_res_tmp3;
uint8x8_t sto_res;
int32x4_t tmp_lvl_shift_t_u, tmp_lvl_shift_t_v;
int32x4x2_t tmp_lvl_shift_t;
WORD32 tmp_shift = 0 - shift;
int32x4_t tmp_shift_t;
int16x4_t tmp_wgt0_u, tmp_wgt0_v, tmp_wgt1_u, tmp_wgt1_v;
int16x4x2_t wgt0, wgt1;
WORD16 *pi2_src_tmp1;
WORD16 *pi2_src_tmp2;
UWORD8 *pu1_dst_tmp;
WORD32 tmp_lvl_shift = (lvl_shift1 * wgt0_cb) + (lvl_shift2 * wgt1_cb);
tmp_lvl_shift += ((off0_cb + off1_cb + 1) << (shift - 1));
tmp_lvl_shift_t_u = vmovq_n_s32(tmp_lvl_shift);
tmp_lvl_shift = (lvl_shift1 * wgt0_cr) + (lvl_shift2 * wgt1_cr);
tmp_lvl_shift += ((off0_cr + off1_cr + 1) << (shift - 1));
tmp_lvl_shift_t_v = vmovq_n_s32(tmp_lvl_shift);
tmp_lvl_shift_t = vzipq_s32(tmp_lvl_shift_t_u, tmp_lvl_shift_t_v);
tmp_shift_t = vmovq_n_s32(tmp_shift);
tmp_wgt0_u = vdup_n_s16(wgt0_cb);
tmp_wgt0_v = vdup_n_s16(wgt0_cr);
wgt0 = vzip_s16(tmp_wgt0_u, tmp_wgt0_v);
tmp_wgt1_u = vdup_n_s16(wgt1_cb);
tmp_wgt1_v = vdup_n_s16(wgt1_cr);
wgt1 = vzip_s16(tmp_wgt1_u, tmp_wgt1_v);
/* i4_tmp1_t1/t2 and i4_tmp2_t1/t2 are used to process 2 rows at a time. */
/* The height loop is unrolled, hence 2 rows are processed per iteration */
/* and the stores likewise cover both rows. */
/* vcombine_u16 is used because the first narrowing yields a uint16x4 value, */
/* and the saturating narrow to 8 bits (vqmovn_u16) requires a uint16x8. */
for(row = ht; row > 0; row -= 2)
{
for(col = 2 * wd; col > 0; col -= 4)
{
pi2_src_tmp1 = pi2_src1 + src_strd1;
pi2_src_tmp2 = pi2_src2 + src_strd2;
pi2_src1_val1 = vld1_s16((int16_t *)pi2_src1);
pi2_src1 += 4;
pu1_dst_tmp = pu1_dst + dst_strd;
pi2_src2_val1 = vld1_s16((int16_t *)pi2_src2);
pi2_src2 += 4;
i4_tmp1_t1 = vmull_s16(pi2_src1_val1, wgt0.val[0]);
pi2_src1_val2 = vld1_s16((int16_t *)pi2_src_tmp1);
i4_tmp1_t2 = vmull_s16(pi2_src2_val1, wgt1.val[0]);
pi2_src2_val2 = vld1_s16((int16_t *)pi2_src_tmp2);
i4_tmp1_t1 = vaddq_s32(i4_tmp1_t1, i4_tmp1_t2);
i4_tmp2_t1 = vmull_s16(pi2_src1_val2, wgt0.val[0]);
i4_tmp1_t1 = vaddq_s32(i4_tmp1_t1, tmp_lvl_shift_t.val[0]);
i4_tmp2_t2 = vmull_s16(pi2_src2_val2, wgt1.val[0]);
sto_res_tmp1 = vshlq_s32(i4_tmp1_t1, tmp_shift_t);
i4_tmp2_t1 = vaddq_s32(i4_tmp2_t1, i4_tmp2_t2);
sto_res_tmp2 = vqmovun_s32(sto_res_tmp1);
i4_tmp2_t1 = vaddq_s32(i4_tmp2_t1, tmp_lvl_shift_t.val[0]);
sto_res_tmp3 = vcombine_u16(sto_res_tmp2, sto_res_tmp2);
sto_res_tmp1 = vshlq_s32(i4_tmp2_t1, tmp_shift_t);
sto_res = vqmovn_u16(sto_res_tmp3);
sto_res_tmp2 = vqmovun_s32(sto_res_tmp1);
sto_res_tmp3 = vcombine_u16(sto_res_tmp2, sto_res_tmp2);
vst1_lane_u32((uint32_t *)pu1_dst, vreinterpret_u32_u8(sto_res), 0);
pu1_dst += 4;
sto_res = vqmovn_u16(sto_res_tmp3);
vst1_lane_u32((uint32_t *)pu1_dst_tmp, vreinterpret_u32_u8(sto_res), 0);
}
pi2_src1 += 2 * src_strd1 - 2 * wd;
pi2_src2 += 2 * src_strd2 - 2 * wd;
pu1_dst += 2 * dst_strd - 2 * wd;
}
}
//WEIGHTED_PRED_CHROMA_BI
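/* Note: the chroma bi path above is the bi-weighted computation applied to
 * Cb,Cr-interleaved data; the zipped weight and rounding-constant vectors
 * (cf. the layout sketch after ihevc_weighted_pred_chroma_uni_neonintr)
 * take the place of the scalar wgt0/wgt1 broadcasts used in the luma path. */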
/**
*******************************************************************************
*
* @brief
* Does default bi-weighted prediction on the arrays pointed to by pi2_src1
* and pi2_src2 and stores the result at the location pointed to by pu1_dst.
* Assumptions: the function is optimized assuming that width and height
* are multiples of 2.
*
* @par Description:
* dst = ( (src1 + lvl_shift1) + (src2 + lvl_shift2) + (1 << (shift - 1)) )
* >> shift, where shift = 15 - BitDepth
*
* @param[in] pi2_src1
* Pointer to source 1
*
* @param[in] pi2_src2
* Pointer to source 2
*
* @param[out] pu1_dst
* Pointer to destination
*
* @param[in] src_strd1
* Source stride 1
*
* @param[in] src_strd2
* Source stride 2
*
* @param[in] dst_strd
* Destination stride
*
* @param[in] lvl_shift1
* added before shift and offset
*
* @param[in] lvl_shift2
* added before shift and offset
*
* @param[in] ht
* height of the source
*
* @param[in] wd
* width of the source
*
* @returns
*
* @remarks
* None
*
*******************************************************************************
*/
void ihevc_weighted_pred_bi_default_neonintr(WORD16 *pi2_src1,
WORD16 *pi2_src2,
UWORD8 *pu1_dst,
WORD32 src_strd1,
WORD32 src_strd2,
WORD32 dst_strd,
WORD32 lvl_shift1,
WORD32 lvl_shift2,
WORD32 ht,
WORD32 wd)
{
WORD32 row, col;
int16x4_t pi2_src1_val1;
int16x4_t pi2_src1_val2;
int16x4_t pi2_src2_val1;
int16x4_t pi2_src2_val2;
int32x4_t i4_tmp1_t1;
int32x4_t i4_tmp1_t2;
int32x4_t i4_tmp2_t1;
int32x4_t i4_tmp2_t2;
int32x4_t sto_res_tmp1;
uint16x4_t sto_res_tmp2;
uint16x8_t sto_res_tmp3;
uint8x8_t sto_res;
int32x4_t tmp_lvl_shift_t;
int32x4_t tmp_shift_t;
WORD16 *pi2_src_tmp1;
WORD16 *pi2_src_tmp2;
UWORD8 *pu1_dst_tmp;
WORD32 shift;
WORD32 tmp_shift;
WORD32 tmp_lvl_shift;
int16x4_t lvl_shift1_t;
int16x4_t lvl_shift2_t;
shift = SHIFT_14_MINUS_BIT_DEPTH + 1;
tmp_shift = 0 - shift;
tmp_lvl_shift = 1 << (shift - 1);
tmp_lvl_shift_t = vmovq_n_s32(tmp_lvl_shift);
tmp_shift_t = vmovq_n_s32(tmp_shift);
lvl_shift1_t = vmov_n_s16((int16_t)lvl_shift1);
lvl_shift2_t = vmov_n_s16((int16_t)lvl_shift2);
/* i4_tmp1_t1/t2 and i4_tmp2_t1/t2 are used to process 2 rows at a time. */
/* The height loop is unrolled, hence 2 rows are processed per iteration */
/* and the stores likewise cover both rows. */
/* vcombine_u16 is used because the first narrowing yields a uint16x4 value, */
/* and the saturating narrow to 8 bits (vqmovn_u16) requires a uint16x8. */
for(row = ht; row > 0; row -= 2)
{
for(col = wd; col > 0; col -= 4)
{
pi2_src_tmp1 = pi2_src1 + src_strd1;
pi2_src_tmp2 = pi2_src2 + src_strd2;
pi2_src1_val1 = vld1_s16((int16_t *)pi2_src1);
pi2_src1 += 4;
pu1_dst_tmp = pu1_dst + dst_strd;
pi2_src2_val1 = vld1_s16((int16_t *)pi2_src2);
pi2_src2 += 4;
i4_tmp1_t1 = vaddl_s16(pi2_src1_val1, lvl_shift1_t);
pi2_src1_val2 = vld1_s16((int16_t *)pi2_src_tmp1);
i4_tmp1_t2 = vaddl_s16(pi2_src2_val1, lvl_shift2_t);
pi2_src2_val2 = vld1_s16((int16_t *)pi2_src_tmp2);
i4_tmp1_t1 = vaddq_s32(i4_tmp1_t1, i4_tmp1_t2);
i4_tmp2_t1 = vaddl_s16(pi2_src1_val2, lvl_shift1_t);
i4_tmp1_t1 = vaddq_s32(i4_tmp1_t1, tmp_lvl_shift_t);
i4_tmp2_t2 = vaddl_s16(pi2_src2_val2, lvl_shift2_t);
sto_res_tmp1 = vshlq_s32(i4_tmp1_t1, tmp_shift_t);
i4_tmp2_t1 = vaddq_s32(i4_tmp2_t1, i4_tmp2_t2);
sto_res_tmp2 = vqmovun_s32(sto_res_tmp1);
i4_tmp2_t1 = vaddq_s32(i4_tmp2_t1, tmp_lvl_shift_t);
sto_res_tmp3 = vcombine_u16(sto_res_tmp2, sto_res_tmp2);
sto_res_tmp1 = vshlq_s32(i4_tmp2_t1, tmp_shift_t);
sto_res = vqmovn_u16(sto_res_tmp3);
sto_res_tmp2 = vqmovun_s32(sto_res_tmp1);
sto_res_tmp3 = vcombine_u16(sto_res_tmp2, sto_res_tmp2);
vst1_lane_u32((uint32_t *)pu1_dst, vreinterpret_u32_u8(sto_res), 0);
pu1_dst += 4;
sto_res = vqmovn_u16(sto_res_tmp3);
vst1_lane_u32((uint32_t *)pu1_dst_tmp, vreinterpret_u32_u8(sto_res), 0);
}
pi2_src1 += 2 * src_strd1 - wd;
pi2_src2 += 2 * src_strd2 - wd;
pu1_dst += 2 * dst_strd - wd;
}
}
//WEIGHTED_PRED_BI_DEFAULT
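/* A compiled-out scalar sketch of the default (equal-weight) bi-prediction
 * above, with shift = SHIFT_14_MINUS_BIT_DEPTH + 1 as in the NEON routine
 * (helper name is ours; 8-bit output assumed). */
#if 0
static void ihevc_weighted_pred_bi_default_scalar(WORD16 *pi2_src1,
                                                  WORD16 *pi2_src2,
                                                  UWORD8 *pu1_dst,
                                                  WORD32 src_strd1,
                                                  WORD32 src_strd2,
                                                  WORD32 dst_strd,
                                                  WORD32 lvl_shift1,
                                                  WORD32 lvl_shift2,
                                                  WORD32 ht,
                                                  WORD32 wd)
{
    WORD32 row, col;
    WORD32 shift = SHIFT_14_MINUS_BIT_DEPTH + 1;
    for(row = 0; row < ht; row++)
    {
        for(col = 0; col < wd; col++)
        {
            /* plain average of the two level-shifted sources, with rounding */
            WORD32 val = (pi2_src1[col] + lvl_shift1)
                       + (pi2_src2[col] + lvl_shift2)
                       + (1 << (shift - 1));
            val >>= shift;
            if(val < 0) val = 0;
            if(val > 255) val = 255;
            pu1_dst[col] = (UWORD8)val;
        }
        pi2_src1 += src_strd1;
        pi2_src2 += src_strd2;
        pu1_dst += dst_strd;
    }
}
#endif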
/**
*******************************************************************************
*
* @brief
* Does chroma default bi-weighted prediction on the arrays pointed to by
* pi2_src1 and pi2_src2 and stores the result at the location pointed to by
* pu1_dst. wd is the width per chroma component; each row holds 2 * wd
* interleaved Cb/Cr samples. Assumptions: the function is optimized assuming
* that width and height are multiples of 2.
*
* @par Description:
* dst = ( (src1 + lvl_shift1) + (src2 + lvl_shift2) + (1 << (shift - 1)) )
* >> shift, where shift = 15 - BitDepth
*
* @param[in] pi2_src1
* Pointer to source 1
*
* @param[in] pi2_src2
* Pointer to source 2
*
* @param[out] pu1_dst
* Pointer to destination
*
* @param[in] src_strd1
* Source stride 1
*
* @param[in] src_strd2
* Source stride 2
*
* @param[in] dst_strd
* Destination stride
*
* @param[in] lvl_shift1
* added before shift and offset
*
* @param[in] lvl_shift2
* added before shift and offset
*
* @param[in] ht
* height of the source
*
* @param[in] wd
* width of the source
*
* @returns
*
* @remarks
* None
*
*******************************************************************************
*/
void ihevc_weighted_pred_chroma_bi_default_neonintr(WORD16 *pi2_src1,
WORD16 *pi2_src2,
UWORD8 *pu1_dst,
WORD32 src_strd1,
WORD32 src_strd2,
WORD32 dst_strd,
WORD32 lvl_shift1,
WORD32 lvl_shift2,
WORD32 ht,
WORD32 wd)
{
WORD32 row, col;
int16x4_t pi2_src1_val1;
int16x4_t pi2_src1_val2;
int16x4_t pi2_src2_val1;
int16x4_t pi2_src2_val2;
int32x4_t i4_tmp1_t1;
int32x4_t i4_tmp1_t2;
int32x4_t i4_tmp2_t1;
int32x4_t i4_tmp2_t2;
int32x4_t sto_res_tmp1;
uint16x4_t sto_res_tmp2;
uint16x8_t sto_res_tmp3;
uint8x8_t sto_res;
int32x4_t tmp_lvl_shift_t;
int32x4_t tmp_shift_t;
WORD16 *pi2_src_tmp1;
WORD16 *pi2_src_tmp2;
UWORD8 *pu1_dst_tmp;
WORD32 shift;
WORD32 tmp_shift;
WORD32 tmp_lvl_shift;
int16x4_t lvl_shift1_t;
int16x4_t lvl_shift2_t;
shift = SHIFT_14_MINUS_BIT_DEPTH + 1;
tmp_shift = 0 - shift;
tmp_lvl_shift = 1 << (shift - 1);
tmp_lvl_shift_t = vmovq_n_s32(tmp_lvl_shift);
tmp_shift_t = vmovq_n_s32(tmp_shift);
lvl_shift1_t = vmov_n_s16((int16_t)lvl_shift1);
lvl_shift2_t = vmov_n_s16((int16_t)lvl_shift2);
/* i4_tmp1_t1/t2 and i4_tmp2_t1/t2 are used to process 2 rows at a time. */
/* The height loop is unrolled, hence 2 rows are processed per iteration */
/* and the stores likewise cover both rows. */
/* vcombine_u16 is used because the first narrowing yields a uint16x4 value, */
/* and the saturating narrow to 8 bits (vqmovn_u16) requires a uint16x8. */
for(row = ht; row > 0; row -= 2)
{
for(col = 2 * wd; col > 0; col -= 4)
{
pi2_src_tmp1 = pi2_src1 + src_strd1;
pi2_src_tmp2 = pi2_src2 + src_strd2;
pi2_src1_val1 = vld1_s16((int16_t *)pi2_src1);
pi2_src1 += 4;
pu1_dst_tmp = pu1_dst + dst_strd;
pi2_src2_val1 = vld1_s16((int16_t *)pi2_src2);
pi2_src2 += 4;
i4_tmp1_t1 = vaddl_s16(pi2_src1_val1, lvl_shift1_t);
pi2_src1_val2 = vld1_s16((int16_t *)pi2_src_tmp1);
i4_tmp1_t2 = vaddl_s16(pi2_src2_val1, lvl_shift2_t);
pi2_src2_val2 = vld1_s16((int16_t *)pi2_src_tmp2);
i4_tmp1_t1 = vaddq_s32(i4_tmp1_t1, i4_tmp1_t2);
i4_tmp2_t1 = vaddl_s16(pi2_src1_val2, lvl_shift1_t);
i4_tmp1_t1 = vaddq_s32(i4_tmp1_t1, tmp_lvl_shift_t);
i4_tmp2_t2 = vaddl_s16(pi2_src2_val2, lvl_shift2_t);
sto_res_tmp1 = vshlq_s32(i4_tmp1_t1, tmp_shift_t);
i4_tmp2_t1 = vaddq_s32(i4_tmp2_t1, i4_tmp2_t2);
sto_res_tmp2 = vqmovun_s32(sto_res_tmp1);
i4_tmp2_t1 = vaddq_s32(i4_tmp2_t1, tmp_lvl_shift_t);
sto_res_tmp3 = vcombine_u16(sto_res_tmp2, sto_res_tmp2);
sto_res_tmp1 = vshlq_s32(i4_tmp2_t1, tmp_shift_t);
sto_res = vqmovn_u16(sto_res_tmp3);
sto_res_tmp2 = vqmovun_s32(sto_res_tmp1);
sto_res_tmp3 = vcombine_u16(sto_res_tmp2, sto_res_tmp2);
vst1_lane_u32((uint32_t *)pu1_dst, vreinterpret_u32_u8(sto_res), 0);
pu1_dst += 4;
sto_res = vqmovn_u16(sto_res_tmp3);
vst1_lane_u32((uint32_t *)pu1_dst_tmp, vreinterpret_u32_u8(sto_res), 0);
}
pi2_src1 += 2 * src_strd1 - 2 * wd;
pi2_src2 += 2 * src_strd2 - 2 * wd;
pu1_dst += 2 * dst_strd - 2 * wd;
}
}
//WEIGHTED_PRED_CHROMA_BI_DEFAULT