/* encoder/x86/ih264e_half_pel_ssse3.c - platform/external/libavc - Git at Google */

 /******************************************************************************
  *
  * Copyright (C) 2015 The Android Open Source Project
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
  * You may obtain a copy of the License at:
  *
  * http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  *
  *****************************************************************************
  * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
 */
 /**
  *******************************************************************************
  * @file
  *  ih264e_half_pel_ssse3.c
  *
  * @brief
  *  Contains the x86 intrinsic function definitions for 6-tap horizontal filter
  *  and cascaded 2D filter used in motion estimation in H264 encoder.
  *
  * @author
  *  Ittiam
  *
  * @par List of Functions:
  *  ih264e_sixtapfilter_horz_ssse3
  *  ih264e_sixtap_filter_2dvh_vert_ssse3
  *
  * @remarks
  *  None
  *
  *******************************************************************************
  */

 /*****************************************************************************/
 /* File Includes                                                             */
 /*****************************************************************************/

 /* System include files */
 #include <stdio.h>
 #include <assert.h>
 #include <limits.h>

 /* User include files */
 #include "ih264_typedefs.h"
 #include "ithread.h"
 #include "ih264_platform_macros.h"
 #include "ih264_defs.h"
 #include "ih264e_half_pel.h"
 #include "ih264_macros.h"
 #include "ih264e_debug.h"
 #include "ih264_inter_pred_filters.h"
 #include "ih264_mem_fns.h"
 #include "ih264_padding.h"
 #include "ih264_intra_pred_filters.h"
 #include "ih264_deblk_edge_filters.h"


 /*****************************************************************************/
 /* Function Definitions                                                      */
 /*****************************************************************************/
 /*
 *******************************************************************************
 *
 * @brief
 *  Inter prediction luma filter for horizontal half-pel positions (filter run
 *  for width = 17 and height = 16)
 *
 * @par Description:
 *  Applies the 6 tap filter (1, -5, 20, 20, -5, 1) horizontally, adds 16,
 *  shifts right by 5 and clips the result to 8 bits. See sec. 8.4.2.2.1 titled
 *  "Luma sample interpolation process".
 *  For each of the 16 rows, output columns 0..15 are computed with SSSE3
 *  intrinsics (two groups of 8) and column 16 is computed with scalar code.
 *  Each row reads pu1_src[-2] .. pu1_src[21] and writes pu1_dst[0] ..
 *  pu1_dst[16].
 *
 * @param[in] pu1_src
 *  UWORD8 pointer to the source
 *
 * @param[out] pu1_dst
 *  UWORD8 pointer to the destination
 *
 * @param[in] src_strd
 *  integer source stride
 *
 * @param[in] dst_strd
 *  integer destination stride
 *
 * @returns
 *  None
 *
 * @remarks
 *  The scalar path uses multiplications instead of left shifts because the
 *  intermediate value can be negative, and left-shifting a negative signed
 *  value is undefined behaviour in C.
 *
 *******************************************************************************
 */
 void ih264e_sixtapfilter_horz_ssse3(UWORD8 *pu1_src,
                                     UWORD8 *pu1_dst,
                                     WORD32 src_strd,
                                     WORD32 dst_strd)
 {
     WORD32 ht;
     WORD32 tmp;

     __m128i src_r0_16x8b, src_r1_16x8b, src_r0_sht_16x8b, src_r1_sht_16x8b;
     __m128i src_r0_t1_16x8b, src_r1_t1_16x8b;

     __m128i res_r0_t1_8x16b, res_r0_t2_8x16b, res_r0_t3_8x16b;
     __m128i res_r1_t1_8x16b, res_r1_t2_8x16b, res_r1_t3_8x16b;

     __m128i coeff0_1_16x8b, coeff2_3_16x8b, coeff4_5_16x8b;
     __m128i const_val16_8x16b;

     ht = 16;
     pu1_src -= 2; // the filter input starts from x[-2] (till x[3])

     // Filter taps packed as byte pairs: c0 = c5 = 1, c1 = c4 = -5, c2 = c3 = 20
     coeff0_1_16x8b = _mm_set1_epi32(0xFB01FB01); //c0 c1 repeated 8 times
     coeff2_3_16x8b = _mm_set1_epi32(0x14141414); //c2 c3 repeated 8 times
     coeff4_5_16x8b = _mm_set1_epi32(0x01FB01FB); //c4 c5 repeated 8 times
     const_val16_8x16b = _mm_set1_epi16(16);      //rounding offset before >> 5

     //Row0 : a0 a1 a2 a3 a4 a5 a6 a7 a8 a9.....
     //Row0 :                         b0 b1 b2 b3 b4 b5 b6 b7 b8 b9.....
     //b0 is same a8. Similarly other bn pixels are same as a(n+8) pixels.
     //The 'a' registers yield output columns 0..7, the 'b' registers 8..15.

     do
     {
         src_r0_16x8b = _mm_loadu_si128((__m128i *)pu1_src);       //a0 a1 ... a15
         src_r1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + 8)); //b0 b1 ... b15

         src_r0_sht_16x8b = _mm_srli_si128(src_r0_16x8b, 1);       //a1 a2 ... a15 0
         src_r1_sht_16x8b = _mm_srli_si128(src_r1_16x8b, 1);       //b1 b2 ... b15 0

         //a0 a1 a1 a2 a2 a3 a3 a4 a4 a5 a5 a6 a6 a7 a7 a8 (likewise for b)
         src_r0_t1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r0_sht_16x8b);
         src_r1_t1_16x8b = _mm_unpacklo_epi8(src_r1_16x8b, src_r1_sht_16x8b);

         //a(n)*c0 + a(n+1)*c1 for n = 0..7 (likewise for b)
         res_r0_t1_8x16b = _mm_maddubs_epi16(src_r0_t1_16x8b, coeff0_1_16x8b);
         res_r1_t1_8x16b = _mm_maddubs_epi16(src_r1_t1_16x8b, coeff0_1_16x8b);

         src_r0_16x8b = _mm_srli_si128(src_r0_16x8b, 2);           //a2 a3 ... a15 0 0
         src_r1_16x8b = _mm_srli_si128(src_r1_16x8b, 2);           //b2 b3 ... b15 0 0

         src_r0_sht_16x8b = _mm_srli_si128(src_r0_sht_16x8b, 2);   //a3 a4 ... a15 0 0 0
         src_r1_sht_16x8b = _mm_srli_si128(src_r1_sht_16x8b, 2);   //b3 b4 ... b15 0 0 0

         //a2 a3 a3 a4 a4 a5 a5 a6 a6 a7 a7 a8 a8 a9 a9 a10 (likewise for b)
         src_r0_t1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r0_sht_16x8b);
         src_r1_t1_16x8b = _mm_unpacklo_epi8(src_r1_16x8b, src_r1_sht_16x8b);

         //a(n+2)*c2 + a(n+3)*c3 for n = 0..7 (likewise for b)
         res_r0_t2_8x16b = _mm_maddubs_epi16(src_r0_t1_16x8b, coeff2_3_16x8b);
         res_r1_t2_8x16b = _mm_maddubs_epi16(src_r1_t1_16x8b, coeff2_3_16x8b);

         src_r0_16x8b = _mm_srli_si128(src_r0_16x8b, 2);           //a4 a5 ... a15 0 0 0 0
         src_r1_16x8b = _mm_srli_si128(src_r1_16x8b, 2);           //b4 b5 ... b15 0 0 0 0

         src_r0_sht_16x8b = _mm_srli_si128(src_r0_sht_16x8b, 2);   //a5 a6 ... a15 0 0 0 0 0
         src_r1_sht_16x8b = _mm_srli_si128(src_r1_sht_16x8b, 2);   //b5 b6 ... b15 0 0 0 0 0

         //a4 a5 a5 a6 a6 a7 a7 a8 a8 a9 a9 a10 a10 a11 a11 a12 (likewise for b)
         src_r0_t1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r0_sht_16x8b);
         src_r1_t1_16x8b = _mm_unpacklo_epi8(src_r1_16x8b, src_r1_sht_16x8b);

         //a(n+4)*c4 + a(n+5)*c5 for n = 0..7 (likewise for b)
         res_r0_t3_8x16b = _mm_maddubs_epi16(src_r0_t1_16x8b, coeff4_5_16x8b);
         res_r1_t3_8x16b = _mm_maddubs_epi16(src_r1_t1_16x8b, coeff4_5_16x8b);

         // Sum of the three partial products plus the rounding offset
         res_r0_t1_8x16b = _mm_add_epi16(res_r0_t1_8x16b, res_r0_t2_8x16b);
         res_r1_t1_8x16b = _mm_add_epi16(res_r1_t1_8x16b, res_r1_t2_8x16b);
         res_r0_t3_8x16b = _mm_add_epi16(res_r0_t3_8x16b, const_val16_8x16b);
         res_r1_t3_8x16b = _mm_add_epi16(res_r1_t3_8x16b, const_val16_8x16b);
         res_r0_t1_8x16b = _mm_add_epi16(res_r0_t1_8x16b, res_r0_t3_8x16b);
         res_r1_t1_8x16b = _mm_add_epi16(res_r1_t1_8x16b, res_r1_t3_8x16b);

         // Scalar 6-tap filter for output column 16:
         // x16 - 5*x17 + 20*x18 + 20*x19 - 5*x20 + x21.
         // The first intermediate can be negative, so multiply rather than
         // left-shift (left shift of a negative value is undefined).
         tmp = ((pu1_src[18] + pu1_src[19]) * 4) - pu1_src[17] - pu1_src[20];
         tmp = pu1_src[16] + pu1_src[21] + (tmp * 5);

         res_r0_t1_8x16b = _mm_srai_epi16(res_r0_t1_8x16b, 5);     //shifting right by 5 bits.
         res_r1_t1_8x16b = _mm_srai_epi16(res_r1_t1_8x16b, 5);
         tmp = (tmp + 16) >> 5;

         // Pack with unsigned saturation: clips columns 0..15 to 8 bits
         src_r0_16x8b = _mm_packus_epi16(res_r0_t1_8x16b, res_r1_t1_8x16b);
         pu1_dst[16] = CLIP_U8(tmp);

         _mm_storeu_si128((__m128i *)pu1_dst, src_r0_16x8b);

         ht--;
         pu1_src += src_strd;
         pu1_dst += dst_strd;
     }
     while(ht > 0);
 }

 /*
 *******************************************************************************
 *
 * @brief
 *   This function implements a two stage cascaded six tap filter. It
 *    applies the six tap filter in the vertical direction on the
 *    predictor values, followed by applying the same filter in the
 *    horizontal direction on the output of the first stage. The six tap
 *    filtering operation is described in sec 8.4.2.2.1 titled "Luma sample
 *    interpolation process" (Filter run for width = 17 and height = 17)
 *
 * @par Description:
 *    Stage 1 filters 17 rows of 22 columns vertically and stores the
 *    unnormalized sums with 16 bit precision in the buffer pointed to by
 *    pi4_pred1 (accessed as WORD16). Stage 2 reads that buffer row by row and
 *    produces two outputs of 17 columns each:
 *     - pu1_dst1[x] = clip8((pred[x + 2] + 16) >> 5), the vertically filtered
 *       output;
 *     - pu1_dst2[x] = clip8((6-tap over pred[x .. x + 5] + 512) >> 10), the
 *       (1/2,1/2) output.
 *    Source rows -2 .. 19 and columns -2 .. 19 relative to pu1_src are read.
 *
 * @param[in] pu1_src
 *  UWORD8 pointer to the source
 *
 * @param[out] pu1_dst1
 *  UWORD8 pointer to the destination(Vertical filtered output)
 *
 * @param[out] pu1_dst2
 *  UWORD8 pointer to the destination(output after applying horizontal filter
 *  to the intermediate vertical output)
 *
 * @param[in] src_strd
 *  integer source stride
 *
 * @param[in] dst_strd
 *  integer destination stride, used for both pu1_dst1 and pu1_dst2
 *
 * @param[in,out] pi4_pred1
 *  Pointer to the intermediate buffer; written by stage 1 and read by stage 2
 *  as 16 bit values, 22 values per row
 *
 * @param[in] pred1_strd
 *  integer stride of pi4_pred1; it is doubled before being used as the row
 *  step of the WORD16 view (presumably it is given in WORD32 elements)
 *
 * @returns
 *  None
 *
 * @remarks
 *  The scalar path for column 16 uses multiplications instead of left shifts
 *  because the 16 bit intermediates can be negative, and left-shifting a
 *  negative signed value is undefined behaviour in C.
 *
 *******************************************************************************
 */
 void ih264e_sixtap_filter_2dvh_vert_ssse3(UWORD8 *pu1_src,
                                           UWORD8 *pu1_dst1,
                                           UWORD8 *pu1_dst2,
                                           WORD32 src_strd,
                                           WORD32 dst_strd,
                                           WORD32 *pi4_pred1,
                                           WORD32 pred1_strd)
 {
     WORD32 ht;
     WORD16 *pi2_pred1;

     ht = 17;
     pi2_pred1 = (WORD16 *)pi4_pred1;
     pred1_strd = pred1_strd << 1; // row step for the WORD16 view of the buffer

     // Vertical 6-tap filter
     {
         __m128i src1_r0_16x8b, src1_r1_16x8b, src1_r2_16x8b;
         __m128i src1_r3_16x8b, src1_r4_16x8b, src1_r5_16x8b;
         __m128i src2_r0_16x8b, src2_r1_16x8b, src2_r2_16x8b;
         __m128i src2_r3_16x8b, src2_r4_16x8b, src2_r5_16x8b;

         __m128i src_r0r1_16x8b, src_r2r3_16x8b, src_r4r5_16x8b;

         __m128i res_t1_8x16b, res_t2_8x16b, res_t3_8x16b;
         __m128i coeff0_1_16x8b, coeff2_3_16x8b, coeff4_5_16x8b;

         // Filter taps packed as byte pairs: c0 = c5 = 1, c1 = c4 = -5, c2 = c3 = 20
         coeff0_1_16x8b = _mm_set1_epi32(0xFB01FB01); //c0 c1 repeated 8 times
         coeff2_3_16x8b = _mm_set1_epi32(0x14141414); //c2 c3 repeated 8 times
         coeff4_5_16x8b = _mm_set1_epi32(0x01FB01FB); //c4 c5 repeated 8 times

         pu1_src -= 2;
         pu1_src -= src_strd << 1; // the filter input starts from x[-2] (till x[3])

         // Loading first five rows to start first row processing.
         // 22 values loaded in each row: src1 holds columns 0..15,
         // src2 holds columns 14..21 in its low 8 bytes.
         src1_r0_16x8b = _mm_loadu_si128((__m128i *)pu1_src);
         src2_r0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 14));
         pu1_src += src_strd;

         src1_r1_16x8b = _mm_loadu_si128((__m128i *)pu1_src);
         src2_r1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 14));
         pu1_src += src_strd;

         src1_r2_16x8b = _mm_loadu_si128((__m128i *)pu1_src);
         src2_r2_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 14));
         pu1_src += src_strd;

         src1_r3_16x8b = _mm_loadu_si128((__m128i *)pu1_src);
         src2_r3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 14));
         pu1_src += src_strd;

         src1_r4_16x8b = _mm_loadu_si128((__m128i *)pu1_src);
         src2_r4_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 14));
         pu1_src += src_strd;

         do
         {
             // Sixth input row of the current filter window
             src1_r5_16x8b = _mm_loadu_si128((__m128i *)pu1_src);
             src2_r5_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 14));

             // Columns 0..7: interleave row pairs so that maddubs yields
             // r0*c0 + r1*c1, r2*c2 + r3*c3 and r4*c4 + r5*c5 per column
             src_r0r1_16x8b = _mm_unpacklo_epi8(src1_r0_16x8b, src1_r1_16x8b);
             src_r2r3_16x8b = _mm_unpacklo_epi8(src1_r2_16x8b, src1_r3_16x8b);
             src_r4r5_16x8b = _mm_unpacklo_epi8(src1_r4_16x8b, src1_r5_16x8b);

             res_t1_8x16b = _mm_maddubs_epi16(src_r0r1_16x8b, coeff0_1_16x8b);
             res_t2_8x16b = _mm_maddubs_epi16(src_r2r3_16x8b, coeff2_3_16x8b);
             res_t3_8x16b = _mm_maddubs_epi16(src_r4r5_16x8b, coeff4_5_16x8b);

             res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t2_8x16b);
             res_t1_8x16b = _mm_add_epi16(res_t3_8x16b, res_t1_8x16b);

             _mm_storeu_si128((__m128i *)pi2_pred1, res_t1_8x16b);

             // Columns 8..15
             src_r0r1_16x8b = _mm_unpackhi_epi8(src1_r0_16x8b, src1_r1_16x8b);
             src_r2r3_16x8b = _mm_unpackhi_epi8(src1_r2_16x8b, src1_r3_16x8b);
             src_r4r5_16x8b = _mm_unpackhi_epi8(src1_r4_16x8b, src1_r5_16x8b);

             res_t1_8x16b = _mm_maddubs_epi16(src_r0r1_16x8b, coeff0_1_16x8b);
             res_t2_8x16b = _mm_maddubs_epi16(src_r2r3_16x8b, coeff2_3_16x8b);
             res_t3_8x16b = _mm_maddubs_epi16(src_r4r5_16x8b, coeff4_5_16x8b);

             res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t2_8x16b);
             res_t1_8x16b = _mm_add_epi16(res_t3_8x16b, res_t1_8x16b);

             _mm_storeu_si128((__m128i *)(pi2_pred1 + 8), res_t1_8x16b);

             // Columns 14..21; rewrites columns 14 and 15 stored just above
             src_r0r1_16x8b = _mm_unpacklo_epi8(src2_r0_16x8b, src2_r1_16x8b);
             src_r2r3_16x8b = _mm_unpacklo_epi8(src2_r2_16x8b, src2_r3_16x8b);
             src_r4r5_16x8b = _mm_unpacklo_epi8(src2_r4_16x8b, src2_r5_16x8b);

             res_t1_8x16b = _mm_maddubs_epi16(src_r0r1_16x8b, coeff0_1_16x8b);
             res_t2_8x16b = _mm_maddubs_epi16(src_r2r3_16x8b, coeff2_3_16x8b);
             res_t3_8x16b = _mm_maddubs_epi16(src_r4r5_16x8b, coeff4_5_16x8b);

             res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t2_8x16b);
             res_t1_8x16b = _mm_add_epi16(res_t3_8x16b, res_t1_8x16b);

             _mm_storeu_si128((__m128i *)(pi2_pred1 + 14), res_t1_8x16b);

             // Slide the six-row window down by one row
             src1_r0_16x8b = src1_r1_16x8b;
             src1_r1_16x8b = src1_r2_16x8b;
             src1_r2_16x8b = src1_r3_16x8b;
             src1_r3_16x8b = src1_r4_16x8b;
             src1_r4_16x8b = src1_r5_16x8b;

             src2_r0_16x8b = src2_r1_16x8b;
             src2_r1_16x8b = src2_r2_16x8b;
             src2_r2_16x8b = src2_r3_16x8b;
             src2_r3_16x8b = src2_r4_16x8b;
             src2_r4_16x8b = src2_r5_16x8b;

             ht--;
             pu1_src += src_strd;
             pi2_pred1 += pred1_strd;
         }
         while(ht > 0);
     }

     // Rewind to the first row of the intermediate buffer for the second stage
     ht = 17;
     pi2_pred1 = (WORD16 *)pi4_pred1;

     // Horizontal 6-tap filter
     {
         WORD32 temp;

         __m128i src_r0_8x16b, src_r1_8x16b, src_r2_8x16b, src_r3_8x16b;
         __m128i src_r4_8x16b, src_r5_8x16b;
         __m128i src_r0r1_8x16b, src_r2r3_8x16b, src_r4r5_8x16b;
         __m128i res_vert1_8x16b, res_vert2_8x16b, res_16x8b;

         __m128i res_t0_4x32b, res_t1_4x32b, res_t2_4x32b, res_t3_4x32b;
         __m128i res_c0_8x16b, res_c1_8x16b;

         __m128i coeff0_1_8x16b, coeff2_3_8x16b, coeff4_5_8x16b;
         __m128i const_val512_4x32b, const_val16_8x16b;

         // Filter taps packed as 16 bit pairs: c0 = c5 = 1, c1 = c4 = -5, c2 = c3 = 20
         coeff0_1_8x16b = _mm_set1_epi32(0xFFFB0001); //c0 c1 repeated 4 times
         coeff2_3_8x16b = _mm_set1_epi32(0x00140014); //c2 c3 repeated 4 times
         coeff4_5_8x16b = _mm_set1_epi32(0x0001FFFB); //c4 c5 repeated 4 times
         const_val512_4x32b = _mm_set1_epi32(512);    //rounding offset before >> 10
         const_val16_8x16b = _mm_set1_epi16(16);      //rounding offset before >> 5

         do
         {
             // Output columns 0..7: six shifted views of the intermediate row
             src_r0_8x16b = _mm_loadu_si128((__m128i *)(pi2_pred1));
             src_r1_8x16b = _mm_loadu_si128((__m128i *)(pi2_pred1 + 1));
             src_r2_8x16b = _mm_loadu_si128((__m128i *)(pi2_pred1 + 2));
             src_r3_8x16b = _mm_loadu_si128((__m128i *)(pi2_pred1 + 3));
             src_r4_8x16b = _mm_loadu_si128((__m128i *)(pi2_pred1 + 4));
             src_r5_8x16b = _mm_loadu_si128((__m128i *)(pi2_pred1 + 5));

             // Vertical-only output for columns 0..7: (pred[x + 2] + 16) >> 5
             res_vert1_8x16b = _mm_add_epi16(src_r2_8x16b, const_val16_8x16b);
             res_vert1_8x16b = _mm_srai_epi16(res_vert1_8x16b, 5); //shifting right by 5 bits.

             // Columns 0..3 in 32 bit precision
             src_r0r1_8x16b = _mm_unpacklo_epi16(src_r0_8x16b, src_r1_8x16b);
             src_r2r3_8x16b = _mm_unpacklo_epi16(src_r2_8x16b, src_r3_8x16b);
             src_r4r5_8x16b = _mm_unpacklo_epi16(src_r4_8x16b, src_r5_8x16b);

             res_t1_4x32b = _mm_madd_epi16(src_r0r1_8x16b, coeff0_1_8x16b);
             res_t2_4x32b = _mm_madd_epi16(src_r2r3_8x16b, coeff2_3_8x16b);
             res_t3_4x32b = _mm_madd_epi16(src_r4r5_8x16b, coeff4_5_8x16b);

             res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t2_4x32b);
             res_t3_4x32b = _mm_add_epi32(res_t3_4x32b, const_val512_4x32b);
             res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t3_4x32b);
             res_t0_4x32b = _mm_srai_epi32(res_t1_4x32b, 10);

             // Columns 4..7
             src_r0r1_8x16b = _mm_unpackhi_epi16(src_r0_8x16b, src_r1_8x16b);
             src_r2r3_8x16b = _mm_unpackhi_epi16(src_r2_8x16b, src_r3_8x16b);
             src_r4r5_8x16b = _mm_unpackhi_epi16(src_r4_8x16b, src_r5_8x16b);

             res_t1_4x32b = _mm_madd_epi16(src_r0r1_8x16b, coeff0_1_8x16b);
             res_t2_4x32b = _mm_madd_epi16(src_r2r3_8x16b, coeff2_3_8x16b);
             res_t3_4x32b = _mm_madd_epi16(src_r4r5_8x16b, coeff4_5_8x16b);

             res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t2_4x32b);
             res_t3_4x32b = _mm_add_epi32(res_t3_4x32b, const_val512_4x32b);
             res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t3_4x32b);
             res_t1_4x32b = _mm_srai_epi32(res_t1_4x32b, 10);

             res_c0_8x16b = _mm_packs_epi32(res_t0_4x32b, res_t1_4x32b);

             // Output columns 8..15: same steps on the row shifted by 8
             src_r0_8x16b = _mm_loadu_si128((__m128i *)(pi2_pred1 + 8));
             src_r1_8x16b = _mm_loadu_si128((__m128i *)(pi2_pred1 + 8 + 1));
             src_r2_8x16b = _mm_loadu_si128((__m128i *)(pi2_pred1 + 8 + 2));
             src_r3_8x16b = _mm_loadu_si128((__m128i *)(pi2_pred1 + 8 + 3));
             src_r4_8x16b = _mm_loadu_si128((__m128i *)(pi2_pred1 + 8 + 4));
             src_r5_8x16b = _mm_loadu_si128((__m128i *)(pi2_pred1 + 8 + 5));

             // Vertical-only output for columns 8..15
             res_vert2_8x16b = _mm_add_epi16(src_r2_8x16b, const_val16_8x16b);
             res_vert2_8x16b = _mm_srai_epi16(res_vert2_8x16b, 5); //shifting right by 5 bits.

             // Columns 8..11
             src_r0r1_8x16b = _mm_unpacklo_epi16(src_r0_8x16b, src_r1_8x16b);
             src_r2r3_8x16b = _mm_unpacklo_epi16(src_r2_8x16b, src_r3_8x16b);
             src_r4r5_8x16b = _mm_unpacklo_epi16(src_r4_8x16b, src_r5_8x16b);

             res_t1_4x32b = _mm_madd_epi16(src_r0r1_8x16b, coeff0_1_8x16b);
             res_t2_4x32b = _mm_madd_epi16(src_r2r3_8x16b, coeff2_3_8x16b);
             res_t3_4x32b = _mm_madd_epi16(src_r4r5_8x16b, coeff4_5_8x16b);

             res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t2_4x32b);
             res_t3_4x32b = _mm_add_epi32(res_t3_4x32b, const_val512_4x32b);
             res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t3_4x32b);
             res_t0_4x32b = _mm_srai_epi32(res_t1_4x32b, 10);

             // Columns 12..15
             src_r0r1_8x16b = _mm_unpackhi_epi16(src_r0_8x16b, src_r1_8x16b);
             src_r2r3_8x16b = _mm_unpackhi_epi16(src_r2_8x16b, src_r3_8x16b);
             src_r4r5_8x16b = _mm_unpackhi_epi16(src_r4_8x16b, src_r5_8x16b);

             res_t1_4x32b = _mm_madd_epi16(src_r0r1_8x16b, coeff0_1_8x16b);
             res_t2_4x32b = _mm_madd_epi16(src_r2r3_8x16b, coeff2_3_8x16b);
             res_t3_4x32b = _mm_madd_epi16(src_r4r5_8x16b, coeff4_5_8x16b);

             res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t2_4x32b);
             res_t3_4x32b = _mm_add_epi32(res_t3_4x32b, const_val512_4x32b);
             res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t3_4x32b);
             res_t1_4x32b = _mm_srai_epi32(res_t1_4x32b, 10);

             res_c1_8x16b = _mm_packs_epi32(res_t0_4x32b, res_t1_4x32b);

             // Vertical output: pack with unsigned saturation, then the
             // scalar 17th column
             res_16x8b = _mm_packus_epi16(res_vert1_8x16b, res_vert2_8x16b);
             _mm_storeu_si128((__m128i *)pu1_dst1, res_16x8b);
             pu1_dst1[16] = CLIP_U8((pi2_pred1[18] + 16) >> 5);

             // 2D output: pack with unsigned saturation, then the scalar 17th
             // column: p16 - 5*p17 + 20*p18 + 20*p19 - 5*p20 + p21.
             // The intermediates can be negative, so multiply rather than
             // left-shift (left shift of a negative value is undefined).
             res_16x8b = _mm_packus_epi16(res_c0_8x16b, res_c1_8x16b);
             _mm_storeu_si128((__m128i *)pu1_dst2, res_16x8b);
             temp = ((pi2_pred1[18] + pi2_pred1[19]) * 4) - pi2_pred1[17] - pi2_pred1[20];
             temp = pi2_pred1[16] + pi2_pred1[21] + (temp * 5);
             pu1_dst2[16] = CLIP_U8((temp + 512) >> 10);

             ht--;
             pi2_pred1 += pred1_strd;
             pu1_dst1 += dst_strd;
             pu1_dst2 += dst_strd;
         }
         while(ht > 0);
     }
 }
	/******************************************************************************
	*
	* Copyright (C) 2015 The Android Open Source Project
	*
	* Licensed under the Apache License, Version 2.0 (the "License");
	* you may not use this file except in compliance with the License.
	* You may obtain a copy of the License at:
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*
	*****************************************************************************
	* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
	*/
	/**
	*******************************************************************************
	* @file
	* ih264e_half_pel_ssse3.c
	*
	* @brief
	* Contains the x86 intrinsic function definitions for 6-tap horizontal filter
	* and cascaded 2D filter used in motion estimation in H264 encoder.
	*
	* @author
	* Ittiam
	*
	* @par List of Functions:
	* ih264e_sixtapfilter_horz_ssse3
	* ih264e_sixtap_filter_2dvh_vert_ssse3
	*
	* @remarks
	* None
	*
	*******************************************************************************
	*/

	/*****************************************************************************/
	/* File Includes */
	/*****************************************************************************/

	/* System include files */
	#include <stdio.h>
	#include <assert.h>
	#include <limits.h>

	/* User include files */
	#include "ih264_typedefs.h"
	#include "ithread.h"
	#include "ih264_platform_macros.h"
	#include "ih264_defs.h"
	#include "ih264e_half_pel.h"
	#include "ih264_macros.h"
	#include "ih264e_debug.h"
	#include "ih264_inter_pred_filters.h"
	#include "ih264_mem_fns.h"
	#include "ih264_padding.h"
	#include "ih264_intra_pred_filters.h"
	#include "ih264_deblk_edge_filters.h"


	/*****************************************************************************/
	/* Function Definitions */
	/*****************************************************************************/
	/*
	*******************************************************************************
	*
	* @brief
	* Inter prediction luma filter for horizontal half-pel positions (filter run
	* for width = 17 and height = 16)
	*
	* @par Description:
	* Applies the 6 tap filter (1, -5, 20, 20, -5, 1) horizontally, adds 16,
	* shifts right by 5 and clips the result to 8 bits. See sec. 8.4.2.2.1 titled
	* "Luma sample interpolation process".
	* For each of the 16 rows, output columns 0..15 are computed with SSSE3
	* intrinsics (two groups of 8) and column 16 is computed with scalar code.
	* Each row reads pu1_src[-2] .. pu1_src[21] and writes pu1_dst[0] ..
	* pu1_dst[16].
	*
	* @param[in] pu1_src
	* UWORD8 pointer to the source
	*
	* @param[out] pu1_dst
	* UWORD8 pointer to the destination
	*
	* @param[in] src_strd
	* integer source stride
	*
	* @param[in] dst_strd
	* integer destination stride
	*
	* @returns
	* None
	*
	* @remarks
	* The scalar path uses multiplications instead of left shifts because the
	* intermediate value can be negative, and left-shifting a negative signed
	* value is undefined behaviour in C.
	*
	*******************************************************************************
	*/
	void ih264e_sixtapfilter_horz_ssse3(UWORD8 *pu1_src,
	                                    UWORD8 *pu1_dst,
	                                    WORD32 src_strd,
	                                    WORD32 dst_strd)
	{
	    WORD32 ht;
	    WORD32 tmp;

	    __m128i src_r0_16x8b, src_r1_16x8b, src_r0_sht_16x8b, src_r1_sht_16x8b;
	    __m128i src_r0_t1_16x8b, src_r1_t1_16x8b;

	    __m128i res_r0_t1_8x16b, res_r0_t2_8x16b, res_r0_t3_8x16b;
	    __m128i res_r1_t1_8x16b, res_r1_t2_8x16b, res_r1_t3_8x16b;

	    __m128i coeff0_1_16x8b, coeff2_3_16x8b, coeff4_5_16x8b;
	    __m128i const_val16_8x16b;

	    ht = 16;
	    pu1_src -= 2; // the filter input starts from x[-2] (till x[3])

	    // Filter taps packed as byte pairs: c0 = c5 = 1, c1 = c4 = -5, c2 = c3 = 20
	    coeff0_1_16x8b = _mm_set1_epi32(0xFB01FB01); //c0 c1 repeated 8 times
	    coeff2_3_16x8b = _mm_set1_epi32(0x14141414); //c2 c3 repeated 8 times
	    coeff4_5_16x8b = _mm_set1_epi32(0x01FB01FB); //c4 c5 repeated 8 times
	    const_val16_8x16b = _mm_set1_epi16(16);      //rounding offset before >> 5

	    //Row0 : a0 a1 a2 a3 a4 a5 a6 a7 a8 a9.....
	    //Row0 :                         b0 b1 b2 b3 b4 b5 b6 b7 b8 b9.....
	    //b0 is same a8. Similarly other bn pixels are same as a(n+8) pixels.
	    //The 'a' registers yield output columns 0..7, the 'b' registers 8..15.

	    do
	    {
	        src_r0_16x8b = _mm_loadu_si128((__m128i *)pu1_src);       //a0 a1 ... a15
	        src_r1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + 8)); //b0 b1 ... b15

	        src_r0_sht_16x8b = _mm_srli_si128(src_r0_16x8b, 1);       //a1 a2 ... a15 0
	        src_r1_sht_16x8b = _mm_srli_si128(src_r1_16x8b, 1);       //b1 b2 ... b15 0

	        //a0 a1 a1 a2 a2 a3 a3 a4 a4 a5 a5 a6 a6 a7 a7 a8 (likewise for b)
	        src_r0_t1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r0_sht_16x8b);
	        src_r1_t1_16x8b = _mm_unpacklo_epi8(src_r1_16x8b, src_r1_sht_16x8b);

	        //a(n)*c0 + a(n+1)*c1 for n = 0..7 (likewise for b)
	        res_r0_t1_8x16b = _mm_maddubs_epi16(src_r0_t1_16x8b, coeff0_1_16x8b);
	        res_r1_t1_8x16b = _mm_maddubs_epi16(src_r1_t1_16x8b, coeff0_1_16x8b);

	        src_r0_16x8b = _mm_srli_si128(src_r0_16x8b, 2);           //a2 a3 ... a15 0 0
	        src_r1_16x8b = _mm_srli_si128(src_r1_16x8b, 2);           //b2 b3 ... b15 0 0

	        src_r0_sht_16x8b = _mm_srli_si128(src_r0_sht_16x8b, 2);   //a3 a4 ... a15 0 0 0
	        src_r1_sht_16x8b = _mm_srli_si128(src_r1_sht_16x8b, 2);   //b3 b4 ... b15 0 0 0

	        //a2 a3 a3 a4 a4 a5 a5 a6 a6 a7 a7 a8 a8 a9 a9 a10 (likewise for b)
	        src_r0_t1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r0_sht_16x8b);
	        src_r1_t1_16x8b = _mm_unpacklo_epi8(src_r1_16x8b, src_r1_sht_16x8b);

	        //a(n+2)*c2 + a(n+3)*c3 for n = 0..7 (likewise for b)
	        res_r0_t2_8x16b = _mm_maddubs_epi16(src_r0_t1_16x8b, coeff2_3_16x8b);
	        res_r1_t2_8x16b = _mm_maddubs_epi16(src_r1_t1_16x8b, coeff2_3_16x8b);

	        src_r0_16x8b = _mm_srli_si128(src_r0_16x8b, 2);           //a4 a5 ... a15 0 0 0 0
	        src_r1_16x8b = _mm_srli_si128(src_r1_16x8b, 2);           //b4 b5 ... b15 0 0 0 0

	        src_r0_sht_16x8b = _mm_srli_si128(src_r0_sht_16x8b, 2);   //a5 a6 ... a15 0 0 0 0 0
	        src_r1_sht_16x8b = _mm_srli_si128(src_r1_sht_16x8b, 2);   //b5 b6 ... b15 0 0 0 0 0

	        //a4 a5 a5 a6 a6 a7 a7 a8 a8 a9 a9 a10 a10 a11 a11 a12 (likewise for b)
	        src_r0_t1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r0_sht_16x8b);
	        src_r1_t1_16x8b = _mm_unpacklo_epi8(src_r1_16x8b, src_r1_sht_16x8b);

	        //a(n+4)*c4 + a(n+5)*c5 for n = 0..7 (likewise for b)
	        res_r0_t3_8x16b = _mm_maddubs_epi16(src_r0_t1_16x8b, coeff4_5_16x8b);
	        res_r1_t3_8x16b = _mm_maddubs_epi16(src_r1_t1_16x8b, coeff4_5_16x8b);

	        // Sum of the three partial products plus the rounding offset
	        res_r0_t1_8x16b = _mm_add_epi16(res_r0_t1_8x16b, res_r0_t2_8x16b);
	        res_r1_t1_8x16b = _mm_add_epi16(res_r1_t1_8x16b, res_r1_t2_8x16b);
	        res_r0_t3_8x16b = _mm_add_epi16(res_r0_t3_8x16b, const_val16_8x16b);
	        res_r1_t3_8x16b = _mm_add_epi16(res_r1_t3_8x16b, const_val16_8x16b);
	        res_r0_t1_8x16b = _mm_add_epi16(res_r0_t1_8x16b, res_r0_t3_8x16b);
	        res_r1_t1_8x16b = _mm_add_epi16(res_r1_t1_8x16b, res_r1_t3_8x16b);

	        // Scalar 6-tap filter for output column 16:
	        // x16 - 5*x17 + 20*x18 + 20*x19 - 5*x20 + x21.
	        // The first intermediate can be negative, so multiply rather than
	        // left-shift (left shift of a negative value is undefined).
	        tmp = ((pu1_src[18] + pu1_src[19]) * 4) - pu1_src[17] - pu1_src[20];
	        tmp = pu1_src[16] + pu1_src[21] + (tmp * 5);

	        res_r0_t1_8x16b = _mm_srai_epi16(res_r0_t1_8x16b, 5);     //shifting right by 5 bits.
	        res_r1_t1_8x16b = _mm_srai_epi16(res_r1_t1_8x16b, 5);
	        tmp = (tmp + 16) >> 5;

	        // Pack with unsigned saturation: clips columns 0..15 to 8 bits
	        src_r0_16x8b = _mm_packus_epi16(res_r0_t1_8x16b, res_r1_t1_8x16b);
	        pu1_dst[16] = CLIP_U8(tmp);

	        _mm_storeu_si128((__m128i *)pu1_dst, src_r0_16x8b);

	        ht--;
	        pu1_src += src_strd;
	        pu1_dst += dst_strd;
	    }
	    while(ht > 0);
	}

	/*
	*******************************************************************************
	*
	* @brief
	* This function implements a two stage cascaded six tap filter. It
	* applies the six tap filter in the vertical direction on the
	* predictor values, followed by applying the same filter in the
	* horizontal direction on the output of the first stage. The six tap
	* filtering operation is described in sec 8.4.2.2.1 titled "Luma sample
	* interpolation process" (Filter run for width = 17 and height =17)
	*
	* @par Description:
	* The function interpolates the predictors first in the vertical direction
	* and then in the horizontal direction to output the (1/2,1/2). The output
	* of the first stage of the filter is stored in the buffer pointed to by
	* pi16_pred1(only in C) in 16 bit precision.
	*
	* @param[in] pu1_src
	* UWORD8 pointer to the source
	*
	* @param[out] pu1_dst1
	* UWORD8 pointer to the destination(Vertical filtered output)
	*
	* @param[out] pu1_dst2
	* UWORD8 pointer to the destination(out put after applying horizontal filter
	* to the intermediate vertical output)
	*
	* @param[in] src_strd
	* integer source stride

	* @param[in] dst_strd
	* integer destination stride of pu1_dst
	*
	* @param[in] pi4_pred1
	* Pointer to the intermediate buffer (accessed as 16 bit values)
	*
	* @param[in] pred1_strd
	* integer stride of pi4_pred1
	*
	* @returns
	* None
	*
	* @remarks
	* None
	*
	*******************************************************************************
	*/
	void ih264e_sixtap_filter_2dvh_vert_ssse3(UWORD8 *pu1_src,
	UWORD8 *pu1_dst1,
	UWORD8 *pu1_dst2,
	WORD32 src_strd,
	WORD32 dst_strd,
	WORD32 *pi4_pred1,
	WORD32 pred1_strd)
	{
	WORD32 ht;
	WORD16 *pi2_pred1;

	ht = 17;
	pi2_pred1 = (WORD16 *)pi4_pred1;
	pred1_strd = pred1_strd << 1;

	// Vertical 6-tap filter
	{
	__m128i src1_r0_16x8b, src1_r1_16x8b, src1_r2_16x8b;
	__m128i src1_r3_16x8b, src1_r4_16x8b, src1_r5_16x8b;
	__m128i src2_r0_16x8b, src2_r1_16x8b, src2_r2_16x8b;
	__m128i src2_r3_16x8b, src2_r4_16x8b, src2_r5_16x8b;

	__m128i src_r0r1_16x8b, src_r2r3_16x8b, src_r4r5_16x8b;

	__m128i res_t1_8x16b, res_t2_8x16b, res_t3_8x16b;
	__m128i coeff0_1_16x8b, coeff2_3_16x8b, coeff4_5_16x8b;

	coeff0_1_16x8b = _mm_set1_epi32(0xFB01FB01); //c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1
	coeff2_3_16x8b = _mm_set1_epi32(0x14141414); //c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3
	coeff4_5_16x8b = _mm_set1_epi32(0x01FB01FB); //c4 c5 c4 c5 c4 c5 c4 c5 c4 c5 c4 c5 c4 c5 c4 c5
	//c0 = c5 = 1, c1 = c4 = -5, c2 = c3 = 20

	pu1_src -= 2;
	pu1_src -= src_strd << 1; // the filter input starts from x[-2] (till x[3])

	// Loading first five rows to start first row processing.
	// 22 values loaded in each row.
	src1_r0_16x8b = _mm_loadu_si128((__m128i *)pu1_src);
	src2_r0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 14));
	pu1_src += src_strd;

	src1_r1_16x8b = _mm_loadu_si128((__m128i *)pu1_src);
	src2_r1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 14));
	pu1_src += src_strd;

	src1_r2_16x8b = _mm_loadu_si128((__m128i *)pu1_src);
	src2_r2_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 14));
	pu1_src += src_strd;

	src1_r3_16x8b = _mm_loadu_si128((__m128i *)pu1_src);
	src2_r3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 14));
	pu1_src += src_strd;

	src1_r4_16x8b = _mm_loadu_si128((__m128i *)pu1_src);
	src2_r4_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 14));
	pu1_src += src_strd;

	do
	{
	src1_r5_16x8b = _mm_loadu_si128((__m128i *)pu1_src);
	src2_r5_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 14));

	src_r0r1_16x8b = _mm_unpacklo_epi8(src1_r0_16x8b, src1_r1_16x8b);
	src_r2r3_16x8b = _mm_unpacklo_epi8(src1_r2_16x8b, src1_r3_16x8b);
	src_r4r5_16x8b = _mm_unpacklo_epi8(src1_r4_16x8b, src1_r5_16x8b);

	res_t1_8x16b = _mm_maddubs_epi16(src_r0r1_16x8b, coeff0_1_16x8b);
	res_t2_8x16b = _mm_maddubs_epi16(src_r2r3_16x8b, coeff2_3_16x8b);
	res_t3_8x16b = _mm_maddubs_epi16(src_r4r5_16x8b, coeff4_5_16x8b);

	res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t2_8x16b);
	res_t1_8x16b = _mm_add_epi16(res_t3_8x16b, res_t1_8x16b);

	_mm_storeu_si128((__m128i *)pi2_pred1, res_t1_8x16b);

	src_r0r1_16x8b = _mm_unpackhi_epi8(src1_r0_16x8b, src1_r1_16x8b);
	src_r2r3_16x8b = _mm_unpackhi_epi8(src1_r2_16x8b, src1_r3_16x8b);
	src_r4r5_16x8b = _mm_unpackhi_epi8(src1_r4_16x8b, src1_r5_16x8b);

	res_t1_8x16b = _mm_maddubs_epi16(src_r0r1_16x8b, coeff0_1_16x8b);
	res_t2_8x16b = _mm_maddubs_epi16(src_r2r3_16x8b, coeff2_3_16x8b);
	res_t3_8x16b = _mm_maddubs_epi16(src_r4r5_16x8b, coeff4_5_16x8b);

	res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t2_8x16b);
	res_t1_8x16b = _mm_add_epi16(res_t3_8x16b, res_t1_8x16b);

	_mm_storeu_si128((__m128i *)(pi2_pred1 + 8), res_t1_8x16b);

	src_r0r1_16x8b = _mm_unpacklo_epi8(src2_r0_16x8b, src2_r1_16x8b);
	src_r2r3_16x8b = _mm_unpacklo_epi8(src2_r2_16x8b, src2_r3_16x8b);
	src_r4r5_16x8b = _mm_unpacklo_epi8(src2_r4_16x8b, src2_r5_16x8b);

	res_t1_8x16b = _mm_maddubs_epi16(src_r0r1_16x8b, coeff0_1_16x8b);
	res_t2_8x16b = _mm_maddubs_epi16(src_r2r3_16x8b, coeff2_3_16x8b);
	res_t3_8x16b = _mm_maddubs_epi16(src_r4r5_16x8b, coeff4_5_16x8b);

	res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t2_8x16b);
	res_t1_8x16b = _mm_add_epi16(res_t3_8x16b, res_t1_8x16b);

	_mm_storeu_si128((__m128i *)(pi2_pred1 + 14), res_t1_8x16b);

	src1_r0_16x8b = src1_r1_16x8b;
	src1_r1_16x8b = src1_r2_16x8b;
	src1_r2_16x8b = src1_r3_16x8b;
	src1_r3_16x8b = src1_r4_16x8b;
	src1_r4_16x8b = src1_r5_16x8b;

	src2_r0_16x8b = src2_r1_16x8b;
	src2_r1_16x8b = src2_r2_16x8b;
	src2_r2_16x8b = src2_r3_16x8b;
	src2_r3_16x8b = src2_r4_16x8b;
	src2_r4_16x8b = src2_r5_16x8b;

	ht--;
	pu1_src += src_strd;
	pi2_pred1 += pred1_strd;
	}
	while(ht > 0);
	}

	ht = 17;
	pi2_pred1 = (WORD16 *)pi4_pred1;

	// Horizontal 6-tap filter
	{
	WORD32 temp;

	__m128i src_r0_8x16b, src_r1_8x16b, src_r2_8x16b, src_r3_8x16b;
	__m128i src_r4_8x16b, src_r5_8x16b;
	__m128i src_r0r1_8x16b, src_r2r3_8x16b, src_r4r5_8x16b;
	__m128i res_vert1_8x16b, res_vert2_8x16b, res_16x8b;

	__m128i res_t0_4x32b, res_t1_4x32b, res_t2_4x32b, res_t3_4x32b;
	__m128i res_c0_8x16b, res_c1_8x16b;

	__m128i coeff0_1_8x16b, coeff2_3_8x16b, coeff4_5_8x16b;
	__m128i const_val512_4x32b, const_val16_8x16b;

	coeff0_1_8x16b = _mm_set1_epi32(0xFFFB0001); //c0 c1 c0 c1 c0 c1 c0 c1
	coeff2_3_8x16b = _mm_set1_epi32(0x00140014); //c2 c3 c2 c3 c2 c3 c2 c3
	coeff4_5_8x16b = _mm_set1_epi32(0x0001FFFB); //c4 c5 c4 c5 c4 c5 c4 c5
	//c0 = c5 = 1, c1 = c4 = -5, c2 = c3 = 20
	const_val512_4x32b = _mm_set1_epi32(512);
	const_val16_8x16b = _mm_set1_epi16(16);

	do
	{
	src_r0_8x16b = _mm_loadu_si128((__m128i *)(pi2_pred1));
	src_r1_8x16b = _mm_loadu_si128((__m128i *)(pi2_pred1 + 1));
	src_r2_8x16b = _mm_loadu_si128((__m128i *)(pi2_pred1 + 2));
	src_r3_8x16b = _mm_loadu_si128((__m128i *)(pi2_pred1 + 3));
	src_r4_8x16b = _mm_loadu_si128((__m128i *)(pi2_pred1 + 4));
	src_r5_8x16b = _mm_loadu_si128((__m128i *)(pi2_pred1 + 5));

	res_vert1_8x16b = _mm_add_epi16(src_r2_8x16b, const_val16_8x16b);
	res_vert1_8x16b = _mm_srai_epi16(res_vert1_8x16b, 5); //shifting right by 5 bits.

	src_r0r1_8x16b = _mm_unpacklo_epi16(src_r0_8x16b, src_r1_8x16b);
	src_r2r3_8x16b = _mm_unpacklo_epi16(src_r2_8x16b, src_r3_8x16b);
	src_r4r5_8x16b = _mm_unpacklo_epi16(src_r4_8x16b, src_r5_8x16b);

	res_t1_4x32b = _mm_madd_epi16(src_r0r1_8x16b, coeff0_1_8x16b);
	res_t2_4x32b = _mm_madd_epi16(src_r2r3_8x16b, coeff2_3_8x16b);
	res_t3_4x32b = _mm_madd_epi16(src_r4r5_8x16b, coeff4_5_8x16b);

	res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t2_4x32b);
	res_t3_4x32b = _mm_add_epi32(res_t3_4x32b, const_val512_4x32b);
	res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t3_4x32b);
	res_t0_4x32b = _mm_srai_epi32(res_t1_4x32b, 10);

	src_r0r1_8x16b = _mm_unpackhi_epi16(src_r0_8x16b, src_r1_8x16b);
	src_r2r3_8x16b = _mm_unpackhi_epi16(src_r2_8x16b, src_r3_8x16b);
	src_r4r5_8x16b = _mm_unpackhi_epi16(src_r4_8x16b, src_r5_8x16b);

	res_t1_4x32b = _mm_madd_epi16(src_r0r1_8x16b, coeff0_1_8x16b);
	res_t2_4x32b = _mm_madd_epi16(src_r2r3_8x16b, coeff2_3_8x16b);
	res_t3_4x32b = _mm_madd_epi16(src_r4r5_8x16b, coeff4_5_8x16b);

	res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t2_4x32b);
	res_t3_4x32b = _mm_add_epi32(res_t3_4x32b, const_val512_4x32b);
	res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t3_4x32b);
	res_t1_4x32b = _mm_srai_epi32(res_t1_4x32b, 10);

	res_c0_8x16b = _mm_packs_epi32(res_t0_4x32b, res_t1_4x32b);

	src_r0_8x16b = _mm_loadu_si128((__m128i *)(pi2_pred1 + 8));
	src_r1_8x16b = _mm_loadu_si128((__m128i *)(pi2_pred1 + 8 + 1));
	src_r2_8x16b = _mm_loadu_si128((__m128i *)(pi2_pred1 + 8 + 2));
	src_r3_8x16b = _mm_loadu_si128((__m128i *)(pi2_pred1 + 8 + 3));
	src_r4_8x16b = _mm_loadu_si128((__m128i *)(pi2_pred1 + 8 + 4));
	src_r5_8x16b = _mm_loadu_si128((__m128i *)(pi2_pred1 + 8 + 5));

	res_vert2_8x16b = _mm_add_epi16(src_r2_8x16b, const_val16_8x16b);
	res_vert2_8x16b = _mm_srai_epi16(res_vert2_8x16b, 5); //shifting right by 5 bits.

	src_r0r1_8x16b = _mm_unpacklo_epi16(src_r0_8x16b, src_r1_8x16b);
	src_r2r3_8x16b = _mm_unpacklo_epi16(src_r2_8x16b, src_r3_8x16b);
	src_r4r5_8x16b = _mm_unpacklo_epi16(src_r4_8x16b, src_r5_8x16b);

	res_t1_4x32b = _mm_madd_epi16(src_r0r1_8x16b, coeff0_1_8x16b);
	res_t2_4x32b = _mm_madd_epi16(src_r2r3_8x16b, coeff2_3_8x16b);
	res_t3_4x32b = _mm_madd_epi16(src_r4r5_8x16b, coeff4_5_8x16b);

	res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t2_4x32b);
	res_t3_4x32b = _mm_add_epi32(res_t3_4x32b, const_val512_4x32b);
	res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t3_4x32b);
	res_t0_4x32b = _mm_srai_epi32(res_t1_4x32b ,10);

	src_r0r1_8x16b = _mm_unpackhi_epi16(src_r0_8x16b, src_r1_8x16b);
	src_r2r3_8x16b = _mm_unpackhi_epi16(src_r2_8x16b, src_r3_8x16b);
	src_r4r5_8x16b = _mm_unpackhi_epi16(src_r4_8x16b, src_r5_8x16b);

	res_t1_4x32b = _mm_madd_epi16(src_r0r1_8x16b, coeff0_1_8x16b);
	res_t2_4x32b = _mm_madd_epi16(src_r2r3_8x16b, coeff2_3_8x16b);
	res_t3_4x32b = _mm_madd_epi16(src_r4r5_8x16b, coeff4_5_8x16b);

	res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t2_4x32b);
	res_t3_4x32b = _mm_add_epi32(res_t3_4x32b, const_val512_4x32b);
	res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t3_4x32b);
	res_t1_4x32b = _mm_srai_epi32(res_t1_4x32b, 10);

	res_c1_8x16b = _mm_packs_epi32(res_t0_4x32b, res_t1_4x32b);

	res_16x8b = _mm_packus_epi16(res_vert1_8x16b, res_vert2_8x16b);
	_mm_storeu_si128((__m128i *)pu1_dst1, res_16x8b);
	pu1_dst1[16] = CLIP_U8((pi2_pred1[18] + 16) >> 5);

	res_16x8b = _mm_packus_epi16(res_c0_8x16b, res_c1_8x16b);
	_mm_storeu_si128((__m128i *)pu1_dst2, res_16x8b);
	temp = ((pi2_pred1[18] + pi2_pred1[19]) << 2) - pi2_pred1[17] - pi2_pred1[20];
	temp = pi2_pred1[16] + pi2_pred1[21] + (temp << 2) + temp;
	pu1_dst2[16] = CLIP_U8((temp + 512) >> 10);

	ht--;
	pi2_pred1 += pred1_strd;
	pu1_dst1 += dst_strd;
	pu1_dst2 += dst_strd;
	}
	while(ht > 0);
	}
	}