/******************************************************************************
 *
 * Copyright (C) 2015 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 *****************************************************************************
 * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
*/
/*****************************************************************************/
/*                                                                           */
/*  File Name         : ih264_inter_pred_filters_intr_ssse3.c                */
/*                                                                           */
/*  Description       : Contains function definitions for inter prediction   */
/*                      filters in x86 SSSE3 intrinsics                      */
/*                                                                           */
/*  List of Functions : ih264_inter_pred_luma_copy_ssse3()                   */
/*                      ih264_inter_pred_luma_horz_ssse3()                   */
/*                      ih264_inter_pred_luma_vert_ssse3()                   */
/*                      ih264_inter_pred_luma_horz_hpel_vert_hpel_ssse3()    */
/*                      ih264_inter_pred_luma_horz_qpel_ssse3()              */
/*                      ih264_inter_pred_luma_vert_qpel_ssse3()              */
/*                      ih264_inter_pred_luma_horz_qpel_vert_qpel_ssse3()    */
/*                      ih264_inter_pred_luma_horz_hpel_vert_qpel_ssse3()    */
/*                      ih264_inter_pred_luma_horz_qpel_vert_hpel_ssse3()    */
/*                      ih264_inter_pred_chroma_ssse3()                      */
/*                                                                           */
/*  Issues / Problems : None                                                 */
/*                                                                           */
/*  Revision History  :                                                      */
/*                                                                           */
/*         DD MM YYYY   Author(s)       Changes                              */
/*         13 02 2015   Kaushik         Initial version                      */
/*                      Senthoor                                             */
/*                                                                           */
/*****************************************************************************/
/*****************************************************************************/
/* File Includes                                                             */
/*****************************************************************************/

#include <string.h>
#include <immintrin.h>
#include "ih264_typedefs.h"
#include "ih264_macros.h"
#include "ih264_platform_macros.h"
#include "ih264_inter_pred_filters.h"

/*****************************************************************************/
/* Constant Data variables                                                   */
/*****************************************************************************/

/* coefficients for 6 tap filtering*/
//const WORD32 ih264_g_six_tap[3] ={1,-5,20};
/*****************************************************************************/
/*  Function definitions .                                                   */
/*****************************************************************************/
/*****************************************************************************/
/*                                                                           */
/*  Function Name : ih264_inter_pred_luma_copy_ssse3                         */
/*                                                                           */
/*  Description   : This function copies the contents of ht x wd block from  */
/*                  source to destination. (ht,wd) can be (4,4), (8,4),      */
/*                  (4,8), (8,8), (16,8), (8,16) or (16,16).                 */
/*                  Rows are handled 4 at a time for wd 4 and 8, and 8 at a  */
/*                  time otherwise (any other wd is treated as 16); ht is    */
/*                  expected to be a positive multiple of that count.        */
/*                                                                           */
/*  Inputs        : pu1_src  - pointer to source                             */
/*                  pu1_dst  - pointer to destination                        */
/*                  src_strd - stride for source                             */
/*                  dst_strd - stride for destination                        */
/*                  ht       - height of the block                           */
/*                  wd       - width of the block                            */
/*                  pu1_tmp  - unused                                        */
/*                  dydx     - unused                                        */
/*                                                                           */
/*  Issues        : None                                                     */
/*                                                                           */
/*  Revision History:                                                        */
/*                                                                           */
/*         DD MM YYYY   Author(s)       Changes                              */
/*         13 02 2015   Kaushik         Initial Version                      */
/*                      Senthoor                                             */
/*                                                                           */
/*****************************************************************************/
void ih264_inter_pred_luma_copy_ssse3(UWORD8 *pu1_src,
                                      UWORD8 *pu1_dst,
                                      WORD32 src_strd,
                                      WORD32 dst_strd,
                                      WORD32 ht,
                                      WORD32 wd,
                                      UWORD8* pu1_tmp,
                                      WORD32 dydx)
{
    __m128i y_0_16x8b, y_1_16x8b, y_2_16x8b, y_3_16x8b;

    WORD32 src_strd2, src_strd3, src_strd4, dst_strd2, dst_strd3, dst_strd4;
    UNUSED(pu1_tmp);
    UNUSED(dydx);

    // byte offsets of rows 2, 3 and 4 relative to the current row
    src_strd2 = src_strd << 1;
    dst_strd2 = dst_strd << 1;
    src_strd4 = src_strd << 2;
    dst_strd4 = dst_strd << 2;
    src_strd3 = src_strd2 + src_strd;
    dst_strd3 = dst_strd2 + dst_strd;

    if(wd == 4)
    {
        // sizeof(int) is 4 on x86, so one int holds exactly one 4-pixel row.
        // memcpy keeps the accesses alignment- and aliasing-safe and touches
        // only the 4 bytes that belong to the block.
        int i4_row0, i4_row1, i4_row2, i4_row3;

        do
        {
            // read all four rows first, then write them (same order as the
            // wider paths below)
            memcpy(&i4_row0, pu1_src, sizeof(i4_row0));
            memcpy(&i4_row1, pu1_src + src_strd, sizeof(i4_row1));
            memcpy(&i4_row2, pu1_src + src_strd2, sizeof(i4_row2));
            memcpy(&i4_row3, pu1_src + src_strd3, sizeof(i4_row3));

            memcpy(pu1_dst, &i4_row0, sizeof(i4_row0));
            memcpy(pu1_dst + dst_strd, &i4_row1, sizeof(i4_row1));
            memcpy(pu1_dst + dst_strd2, &i4_row2, sizeof(i4_row2));
            memcpy(pu1_dst + dst_strd3, &i4_row3, sizeof(i4_row3));

            ht -= 4;
            pu1_src += src_strd4;
            pu1_dst += dst_strd4;
        }
        while(ht > 0);
    }
    else if(wd == 8)
    {
        do
        {
            // 64-bit loads/stores: exactly one 8-pixel row each
            y_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src);
            y_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + src_strd));
            y_2_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + src_strd2));
            y_3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + src_strd3));

            _mm_storel_epi64((__m128i *)pu1_dst, y_0_16x8b);
            _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd), y_1_16x8b);
            _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd2), y_2_16x8b);
            _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd3), y_3_16x8b);

            ht -= 4;
            pu1_src += src_strd4;
            pu1_dst += dst_strd4;
        }
        while(ht > 0);
    }
    else // wd == 16
    {
        WORD32 src_strd5, src_strd6, src_strd7, src_strd8;
        WORD32 dst_strd5, dst_strd6, dst_strd7, dst_strd8;

        __m128i y_4_16x8b, y_5_16x8b, y_6_16x8b, y_7_16x8b;

        // byte offsets of rows 5 to 8 relative to the current row
        src_strd5 = src_strd2 + src_strd3;
        dst_strd5 = dst_strd2 + dst_strd3;
        src_strd6 = src_strd3 << 1;
        dst_strd6 = dst_strd3 << 1;
        src_strd7 = src_strd3 + src_strd4;
        dst_strd7 = dst_strd3 + dst_strd4;
        src_strd8 = src_strd << 3;
        dst_strd8 = dst_strd << 3;

        do
        {
            // unaligned 128-bit loads/stores: one 16-pixel row each
            y_0_16x8b = _mm_loadu_si128((__m128i *)pu1_src);
            y_1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + src_strd));
            y_2_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + src_strd2));
            y_3_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + src_strd3));
            y_4_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + src_strd4));
            y_5_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + src_strd5));
            y_6_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + src_strd6));
            y_7_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + src_strd7));

            _mm_storeu_si128((__m128i *)pu1_dst, y_0_16x8b);
            _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), y_1_16x8b);
            _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd2), y_2_16x8b);
            _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd3), y_3_16x8b);
            _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd4), y_4_16x8b);
            _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd5), y_5_16x8b);
            _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd6), y_6_16x8b);
            _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd7), y_7_16x8b);

            ht -= 8;
            pu1_src += src_strd8;
            pu1_dst += dst_strd8;
        }
        while(ht > 0);
    }
}

/*****************************************************************************/
/*                                                                           */
/*  Function Name : ih264_inter_pred_luma_horz_ssse3                         */
/*                                                                           */
/*  Description   : This function applies a horizontal 6-tap filter on       */
/*                  ht x wd block as mentioned in sec. 8.4.2.2.1 titled      */
/*                  "Luma sample interpolation process". (ht,wd) can be      */
/*                  (4,4), (8,4), (4,8), (8,8), (16,8), (8,16) or (16,16).   */
/*                  Each output is (x[-2] - 5*x[-1] + 20*x[0] + 20*x[1]      */
/*                  - 5*x[2] + x[3] + 16) >> 5, clipped to 0..255.           */
/*                  wd 4 and 8 process two rows per iteration; any other wd  */
/*                  is treated as 16 and processes one row per iteration.    */
/*                  Every row is read with 16-byte loads starting at         */
/*                  pu1_src - 2 (and at pu1_src + 6 as well for wd 16).      */
/*                                                                           */
/*  Inputs        : pu1_src  - pointer to source                             */
/*                  pu1_dst  - pointer to destination                        */
/*                  src_strd - stride for source                             */
/*                  dst_strd - stride for destination                        */
/*                  ht       - height of the block                           */
/*                  wd       - width of the block                            */
/*                  pu1_tmp  - unused                                        */
/*                  dydx     - unused                                        */
/*                                                                           */
/*  Issues        : None                                                     */
/*                                                                           */
/*  Revision History:                                                        */
/*                                                                           */
/*         DD MM YYYY   Author(s)       Changes                              */
/*         13 02 2015   Kaushik         Initial Version                      */
/*                      Senthoor                                             */
/*                                                                           */
/*****************************************************************************/
void ih264_inter_pred_luma_horz_ssse3(UWORD8 *pu1_src,
                                      UWORD8 *pu1_dst,
                                      WORD32 src_strd,
                                      WORD32 dst_strd,
                                      WORD32 ht,
                                      WORD32 wd,
                                      UWORD8* pu1_tmp,
                                      WORD32 dydx)
{
    __m128i coeff0_1_16x8b, coeff2_3_16x8b, coeff4_5_16x8b;
    __m128i const_val16_8x16b;

    UNUSED(pu1_tmp);
    UNUSED(dydx);

    pu1_src -= 2; // the filter input starts from x[-2] (till x[3])

    // Signed byte coefficient pairs for _mm_maddubs_epi16 (low byte first):
    // c0 = c5 = 1, c1 = c4 = -5 (0xFB), c2 = c3 = 20 (0x14)
    coeff0_1_16x8b = _mm_set1_epi32(0xFB01FB01); //c0 c1 c0 c1 ... c0 c1
    coeff2_3_16x8b = _mm_set1_epi32(0x14141414); //c2 c3 c2 c3 ... c2 c3
    coeff4_5_16x8b = _mm_set1_epi32(0x01FB01FB); //c4 c5 c4 c5 ... c4 c5
    const_val16_8x16b = _mm_set1_epi16(16);      //rounding offset added before >> 5

    if(wd == 4)
    {
        __m128i src_r0_16x8b, src_r1_16x8b, src_r0r1_16x8b;
        __m128i src_r0_sht_16x8b, src_r1_sht_16x8b;

        __m128i res_r0r1_t1_8x16b, res_r0r1_t2_8x16b, res_r0r1_t3_8x16b;
        __m128i res_r0r1_16x8b;

        // 4-pixel output rows as returned by _mm_cvtsi128_si32
        int i4_res_r0, i4_res_r1;

        //Row0 : a0 a1 a2 a3 a4 a5 a6 a7 a8 a9.....
        //Row1 : b0 b1 b2 b3 b4 b5 b6 b7 b8 b9.....
        //Two rows are packed into one register: row0 in the low 64 bits,
        //row1 in the high 64 bits.

        do
        {
            src_r0_16x8b = _mm_loadu_si128((__m128i *)pu1_src);                     //a0 a1 a2 a3 a4 a5 a6 a7 a8 a9....a15
            src_r1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + src_strd));        //b0 b1 b2 b3 b4 b5 b6 b7 b8 b9....b15

            src_r0_sht_16x8b = _mm_srli_si128(src_r0_16x8b, 1);                     //a1 a2 a3 a4 a5 a6 a7 a8 a9....a15 0
            src_r1_sht_16x8b = _mm_srli_si128(src_r1_16x8b, 1);                     //b1 b2 b3 b4 b5 b6 b7 b8 b9....b15 0

            src_r0_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r0_sht_16x8b);       //a0 a1 a1 a2 a2 a3 a3 a4 a4 a5 a5 a6 a6 a7 a7 a8
            src_r1_16x8b = _mm_unpacklo_epi8(src_r1_16x8b, src_r1_sht_16x8b);       //b0 b1 b1 b2 b2 b3 b3 b4 b4 b5 b5 b6 b6 b7 b7 b8

            src_r0r1_16x8b = _mm_unpacklo_epi64(src_r0_16x8b, src_r1_16x8b);        //a0 a1 a1 a2 a2 a3 a3 a4 b0 b1 b1 b2 b2 b3 b3 b4
            res_r0r1_t1_8x16b = _mm_maddubs_epi16(src_r0r1_16x8b, coeff0_1_16x8b);  //a0*c0+a1*c1 a1*c0+a2*c1 a2*c0+a3*c1 a3*c0+a4*c1
                                                                                    //b0*c0+b1*c1 b1*c0+b2*c1 b2*c0+b3*c1 b3*c0+b4*c1

            src_r0_16x8b = _mm_srli_si128(src_r0_16x8b, 4);                         //a2 a3 a3 a4 a4 a5 a5 a6 a6 a7 a7 a8  0  0  0  0
            src_r1_16x8b = _mm_srli_si128(src_r1_16x8b, 4);                         //b2 b3 b3 b4 b4 b5 b5 b6 b6 b7 b7 b8  0  0  0  0

            src_r0r1_16x8b = _mm_unpacklo_epi64(src_r0_16x8b, src_r1_16x8b);        //a2 a3 a3 a4 a4 a5 a5 a6 b2 b3 b3 b4 b4 b5 b5 b6
            res_r0r1_t2_8x16b = _mm_maddubs_epi16(src_r0r1_16x8b, coeff2_3_16x8b);  //a2*c2+a3*c3 a3*c2+a4*c3 a4*c2+a5*c3 a5*c2+a6*c3
                                                                                    //b2*c2+b3*c3 b3*c2+b4*c3 b4*c2+b5*c3 b5*c2+b6*c3

            src_r0_16x8b = _mm_srli_si128(src_r0_16x8b, 4);                         //a4 a5 a5 a6 a6 a7 a7 a8  0  0  0  0  0  0  0  0
            src_r1_16x8b = _mm_srli_si128(src_r1_16x8b, 4);                         //b4 b5 b5 b6 b6 b7 b7 b8  0  0  0  0  0  0  0  0

            src_r0r1_16x8b = _mm_unpacklo_epi64(src_r0_16x8b, src_r1_16x8b);        //a4 a5 a5 a6 a6 a7 a7 a8 b4 b5 b5 b6 b6 b7 b7 b8
            res_r0r1_t3_8x16b = _mm_maddubs_epi16(src_r0r1_16x8b, coeff4_5_16x8b);  //a4*c4+a5*c5 a5*c4+a6*c5 a6*c4+a7*c5 a7*c4+a8*c5
                                                                                    //b4*c4+b5*c5 b5*c4+b6*c5 b6*c4+b7*c5 b7*c4+b8*c5

            res_r0r1_t1_8x16b = _mm_add_epi16(res_r0r1_t1_8x16b, res_r0r1_t2_8x16b);
            res_r0r1_t3_8x16b = _mm_add_epi16(const_val16_8x16b, res_r0r1_t3_8x16b);
            res_r0r1_t1_8x16b = _mm_add_epi16(res_r0r1_t1_8x16b, res_r0r1_t3_8x16b); //a0*c0+a1*c1+a2*c2+a3*c3+a4*c4+a5*c5 + 16;
                                                                                     //a1*c0+a2*c1+a3*c2+a4*c3+a5*c4+a6*c5 + 16;
                                                                                     //a2*c0+a3*c1+a4*c2+a5*c3+a6*c4+a7*c5 + 16;
                                                                                     //a3*c0+a4*c1+a5*c2+a6*c3+a7*c4+a8*c5 + 16;
                                                                                     //b0*c0+b1*c1+b2*c2+b3*c3+b4*c4+b5*c5 + 16;
                                                                                     //b1*c0+b2*c1+b3*c2+b4*c3+b5*c4+b6*c5 + 16;
                                                                                     //b2*c0+b3*c1+b4*c2+b5*c3+b6*c4+b7*c5 + 16;
                                                                                     //b3*c0+b4*c1+b5*c2+b6*c3+b7*c4+b8*c5 + 16;

            res_r0r1_t1_8x16b = _mm_srai_epi16(res_r0r1_t1_8x16b, 5);                //shifting right by 5 bits.

            //saturate to 0..255: bytes 0-3 = row0 result, bytes 4-7 = row1 result
            res_r0r1_16x8b = _mm_packus_epi16(res_r0r1_t1_8x16b, res_r0r1_t1_8x16b);

            //Plain 4-byte stores (memcpy is alignment-safe) instead of the
            //byte-masked non-temporal _mm_maskmoveu_si128.
            i4_res_r0 = _mm_cvtsi128_si32(res_r0r1_16x8b);
            memcpy(pu1_dst, &i4_res_r0, sizeof(i4_res_r0));
            res_r0r1_16x8b = _mm_srli_si128(res_r0r1_16x8b, 4);
            i4_res_r1 = _mm_cvtsi128_si32(res_r0r1_16x8b);
            memcpy(pu1_dst + dst_strd, &i4_res_r1, sizeof(i4_res_r1));

            ht -= 2;
            pu1_src += src_strd << 1;
            pu1_dst += dst_strd << 1;
        }
        while(ht > 0);
    }
    else if(wd == 8)
    {
        __m128i src_r0_16x8b, src_r1_16x8b, src_r0_sht_16x8b, src_r1_sht_16x8b;
        __m128i src_r0_t1_16x8b, src_r1_t1_16x8b;

        __m128i res_r0_t1_8x16b, res_r0_t2_8x16b, res_r0_t3_8x16b;
        __m128i res_r1_t1_8x16b, res_r1_t2_8x16b, res_r1_t3_8x16b;

        //Row0 : a0 a1 a2 a3 a4 a5 a6 a7 a8 a9.....
        //Row1 : b0 b1 b2 b3 b4 b5 b6 b7 b8 b9.....

        do
        {
            src_r0_16x8b = _mm_loadu_si128((__m128i *)pu1_src);                   //a0 a1 a2 a3 a4 a5 a6 a7 a8 a9....a15
            src_r1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + src_strd));      //b0 b1 b2 b3 b4 b5 b6 b7 b8 b9....b15

            src_r0_sht_16x8b = _mm_srli_si128(src_r0_16x8b, 1);                   //a1 a2 a3 a4 a5 a6 a7 a8 a9....a15 0
            src_r1_sht_16x8b = _mm_srli_si128(src_r1_16x8b, 1);                   //b1 b2 b3 b4 b5 b6 b7 b8 b9....b15 0

            src_r0_t1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r0_sht_16x8b);  //a0 a1 a1 a2 a2 a3 a3 a4 a4 a5 a5 a6 a6 a7 a7 a8
            src_r1_t1_16x8b = _mm_unpacklo_epi8(src_r1_16x8b, src_r1_sht_16x8b);  //b0 b1 b1 b2 b2 b3 b3 b4 b4 b5 b5 b6 b6 b7 b7 b8

            res_r0_t1_8x16b = _mm_maddubs_epi16(src_r0_t1_16x8b, coeff0_1_16x8b); //a0*c0+a1*c1 a1*c0+a2*c1 a2*c0+a3*c1 a3*c0+a4*c1
                                                                                  //a4*c0+a5*c1 a5*c0+a6*c1 a6*c0+a7*c1 a7*c0+a8*c1
            res_r1_t1_8x16b = _mm_maddubs_epi16(src_r1_t1_16x8b, coeff0_1_16x8b); //b0*c0+b1*c1 b1*c0+b2*c1 b2*c0+b3*c1 b3*c0+b4*c1
                                                                                  //b4*c0+b5*c1 b5*c0+b6*c1 b6*c0+b7*c1 b7*c0+b8*c1

            src_r0_16x8b = _mm_srli_si128(src_r0_16x8b, 2);                       //a2 a3 a4 a5 a6 a7 a8 a9....a15 0 0
            src_r1_16x8b = _mm_srli_si128(src_r1_16x8b, 2);                       //b2 b3 b4 b5 b6 b7 b8 b9....b15 0 0

            src_r0_sht_16x8b = _mm_srli_si128(src_r0_sht_16x8b, 2);               //a3 a4 a5 a6 a7 a8 a9....a15 0  0  0
            src_r1_sht_16x8b = _mm_srli_si128(src_r1_sht_16x8b, 2);               //b3 b4 b5 b6 b7 b8 b9....b15 0  0  0

            src_r0_t1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r0_sht_16x8b);  //a2 a3 a3 a4 a4 a5 a5 a6 a6 a7 a7 a8 a8 a9 a9 a10
            src_r1_t1_16x8b = _mm_unpacklo_epi8(src_r1_16x8b, src_r1_sht_16x8b);  //b2 b3 b3 b4 b4 b5 b5 b6 b6 b7 b7 b8 b8 b9 b9 b10

            res_r0_t2_8x16b = _mm_maddubs_epi16(src_r0_t1_16x8b, coeff2_3_16x8b); //a2*c2+a3*c3 a3*c2+a4*c3 a4*c2+a5*c3 a5*c2+a6*c3
                                                                                  //a6*c2+a7*c3 a7*c2+a8*c3 a8*c2+a9*c3 a9*c2+a10*c3
            res_r1_t2_8x16b = _mm_maddubs_epi16(src_r1_t1_16x8b, coeff2_3_16x8b); //b2*c2+b3*c3 b3*c2+b4*c3 b4*c2+b5*c3 b5*c2+b6*c3
                                                                                  //b6*c2+b7*c3 b7*c2+b8*c3 b8*c2+b9*c3 b9*c2+b10*c3

            src_r0_16x8b = _mm_srli_si128(src_r0_16x8b, 2);                       //a4 a5 a6 a7 a8 a9....a15 0  0  0  0
            src_r1_16x8b = _mm_srli_si128(src_r1_16x8b, 2);                       //b4 b5 b6 b7 b8 b9....b15 0  0  0  0

            src_r0_sht_16x8b = _mm_srli_si128(src_r0_sht_16x8b, 2);               //a5 a6 a7 a8 a9....a15 0  0  0  0  0
            src_r1_sht_16x8b = _mm_srli_si128(src_r1_sht_16x8b, 2);               //b5 b6 b7 b8 b9....b15 0  0  0  0  0

            src_r0_t1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r0_sht_16x8b);  //a4 a5 a5 a6 a6 a7 a7 a8 a8 a9 a9 a10 a10 a11 a11 a12
            src_r1_t1_16x8b = _mm_unpacklo_epi8(src_r1_16x8b, src_r1_sht_16x8b);  //b4 b5 b5 b6 b6 b7 b7 b8 b8 b9 b9 b10 b10 b11 b11 b12

            res_r0_t3_8x16b = _mm_maddubs_epi16(src_r0_t1_16x8b, coeff4_5_16x8b); //a4*c4+a5*c5 a5*c4+a6*c5 a6*c4+a7*c5 a7*c4+a8*c5
                                                                                  //a8*c4+a9*c5 a9*c4+a10*c5 a10*c4+a11*c5 a11*c4+a12*c5
            res_r1_t3_8x16b = _mm_maddubs_epi16(src_r1_t1_16x8b, coeff4_5_16x8b); //b4*c4+b5*c5 b5*c4+b6*c5 b6*c4+b7*c5 b7*c4+b8*c5
                                                                                  //b8*c4+b9*c5 b9*c4+b10*c5 b10*c4+b11*c5 b11*c4+b12*c5

            //sum the three partial products and the rounding offset
            res_r0_t1_8x16b = _mm_add_epi16(res_r0_t1_8x16b, res_r0_t2_8x16b);
            res_r1_t1_8x16b = _mm_add_epi16(res_r1_t1_8x16b, res_r1_t2_8x16b);
            res_r0_t3_8x16b = _mm_add_epi16(res_r0_t3_8x16b, const_val16_8x16b);
            res_r1_t3_8x16b = _mm_add_epi16(res_r1_t3_8x16b, const_val16_8x16b);
            res_r0_t1_8x16b = _mm_add_epi16(res_r0_t1_8x16b, res_r0_t3_8x16b);
            res_r1_t1_8x16b = _mm_add_epi16(res_r1_t1_8x16b, res_r1_t3_8x16b);

            res_r0_t1_8x16b = _mm_srai_epi16(res_r0_t1_8x16b, 5);                 //shifting right by 5 bits.
            res_r1_t1_8x16b = _mm_srai_epi16(res_r1_t1_8x16b, 5);

            //saturate to 0..255; only the low 8 bytes of each register are stored
            src_r0_16x8b = _mm_packus_epi16(res_r0_t1_8x16b, res_r0_t1_8x16b);
            src_r1_16x8b = _mm_packus_epi16(res_r1_t1_8x16b, res_r1_t1_8x16b);

            _mm_storel_epi64((__m128i *)pu1_dst, src_r0_16x8b);
            _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd), src_r1_16x8b);

            ht -= 2;
            pu1_src += src_strd << 1;
            pu1_dst += dst_strd << 1;
        }
        while(ht > 0);
    }
    else // wd == 16
    {
        __m128i src_r0_16x8b, src_r1_16x8b, src_r0_sht_16x8b, src_r1_sht_16x8b;
        __m128i src_r0_t1_16x8b, src_r1_t1_16x8b;

        __m128i res_r0_t1_8x16b, res_r0_t2_8x16b, res_r0_t3_8x16b;
        __m128i res_r1_t1_8x16b, res_r1_t2_8x16b, res_r1_t3_8x16b;

        //One row per iteration, filtered as two 8-pixel halves:
        //Row0 (first load)  : a0 a1 a2 a3 a4 a5 a6 a7 a8 a9.....
        //Row0 (second load) :                         b0 b1 b2 b3 b4 b5 b6 b7 b8 b9.....
        //b0 is same as a8. Similarly other bn pixels are same as a(n+8) pixels.

        do
        {
            src_r0_16x8b = _mm_loadu_si128((__m128i *)pu1_src);                  //a0 a1 a2 a3 a4 a5 a6 a7 a8 a9....a15
            src_r1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + 8));            //b0 b1 b2 b3 b4 b5 b6 b7 b8 b9....b15

            src_r0_sht_16x8b = _mm_srli_si128(src_r0_16x8b, 1);                   //a1 a2 a3 a4 a5 a6 a7 a8 a9....a15 0
            src_r1_sht_16x8b = _mm_srli_si128(src_r1_16x8b, 1);                   //b1 b2 b3 b4 b5 b6 b7 b8 b9....b15 0

            src_r0_t1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r0_sht_16x8b);  //a0 a1 a1 a2 a2 a3 a3 a4 a4 a5 a5 a6 a6 a7 a7 a8
            src_r1_t1_16x8b = _mm_unpacklo_epi8(src_r1_16x8b, src_r1_sht_16x8b);  //b0 b1 b1 b2 b2 b3 b3 b4 b4 b5 b5 b6 b6 b7 b7 b8

            res_r0_t1_8x16b = _mm_maddubs_epi16(src_r0_t1_16x8b, coeff0_1_16x8b); //a0*c0+a1*c1 a1*c0+a2*c1 a2*c0+a3*c1 a3*c0+a4*c1
                                                                                  //a4*c0+a5*c1 a5*c0+a6*c1 a6*c0+a7*c1 a7*c0+a8*c1
            res_r1_t1_8x16b = _mm_maddubs_epi16(src_r1_t1_16x8b, coeff0_1_16x8b); //b0*c0+b1*c1 b1*c0+b2*c1 b2*c0+b3*c1 b3*c0+b4*c1
                                                                                  //b4*c0+b5*c1 b5*c0+b6*c1 b6*c0+b7*c1 b7*c0+b8*c1

            src_r0_16x8b = _mm_srli_si128(src_r0_16x8b, 2);                       //a2 a3 a4 a5 a6 a7 a8 a9....a15 0 0
            src_r1_16x8b = _mm_srli_si128(src_r1_16x8b, 2);                       //b2 b3 b4 b5 b6 b7 b8 b9....b15 0 0

            src_r0_sht_16x8b = _mm_srli_si128(src_r0_sht_16x8b, 2);               //a3 a4 a5 a6 a7 a8 a9....a15 0  0  0
            src_r1_sht_16x8b = _mm_srli_si128(src_r1_sht_16x8b, 2);               //b3 b4 b5 b6 b7 b8 b9....b15 0  0  0

            src_r0_t1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r0_sht_16x8b);  //a2 a3 a3 a4 a4 a5 a5 a6 a6 a7 a7 a8 a8 a9 a9 a10
            src_r1_t1_16x8b = _mm_unpacklo_epi8(src_r1_16x8b, src_r1_sht_16x8b);  //b2 b3 b3 b4 b4 b5 b5 b6 b6 b7 b7 b8 b8 b9 b9 b10

            res_r0_t2_8x16b = _mm_maddubs_epi16(src_r0_t1_16x8b, coeff2_3_16x8b); //a2*c2+a3*c3 a3*c2+a4*c3 a4*c2+a5*c3 a5*c2+a6*c3
                                                                                  //a6*c2+a7*c3 a7*c2+a8*c3 a8*c2+a9*c3 a9*c2+a10*c3
            res_r1_t2_8x16b = _mm_maddubs_epi16(src_r1_t1_16x8b, coeff2_3_16x8b); //b2*c2+b3*c3 b3*c2+b4*c3 b4*c2+b5*c3 b5*c2+b6*c3
                                                                                  //b6*c2+b7*c3 b7*c2+b8*c3 b8*c2+b9*c3 b9*c2+b10*c3

            src_r0_16x8b = _mm_srli_si128(src_r0_16x8b, 2);                       //a4 a5 a6 a7 a8 a9....a15 0  0  0  0
            src_r1_16x8b = _mm_srli_si128(src_r1_16x8b, 2);                       //b4 b5 b6 b7 b8 b9....b15 0  0  0  0

            src_r0_sht_16x8b = _mm_srli_si128(src_r0_sht_16x8b, 2);               //a5 a6 a7 a8 a9....a15 0  0  0  0  0
            src_r1_sht_16x8b = _mm_srli_si128(src_r1_sht_16x8b, 2);               //b5 b6 b7 b8 b9....b15 0  0  0  0  0

            src_r0_t1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r0_sht_16x8b);  //a4 a5 a5 a6 a6 a7 a7 a8 a8 a9 a9 a10 a10 a11 a11 a12
            src_r1_t1_16x8b = _mm_unpacklo_epi8(src_r1_16x8b, src_r1_sht_16x8b);  //b4 b5 b5 b6 b6 b7 b7 b8 b8 b9 b9 b10 b10 b11 b11 b12

            res_r0_t3_8x16b = _mm_maddubs_epi16(src_r0_t1_16x8b, coeff4_5_16x8b); //a4*c4+a5*c5 a5*c4+a6*c5 a6*c4+a7*c5 a7*c4+a8*c5
                                                                                  //a8*c4+a9*c5 a9*c4+a10*c5 a10*c4+a11*c5 a11*c4+a12*c5
            res_r1_t3_8x16b = _mm_maddubs_epi16(src_r1_t1_16x8b, coeff4_5_16x8b); //b4*c4+b5*c5 b5*c4+b6*c5 b6*c4+b7*c5 b7*c4+b8*c5
                                                                                  //b8*c4+b9*c5 b9*c4+b10*c5 b10*c4+b11*c5 b11*c4+b12*c5

            //sum the three partial products and the rounding offset
            res_r0_t1_8x16b = _mm_add_epi16(res_r0_t1_8x16b, res_r0_t2_8x16b);
            res_r1_t1_8x16b = _mm_add_epi16(res_r1_t1_8x16b, res_r1_t2_8x16b);
            res_r0_t3_8x16b = _mm_add_epi16(res_r0_t3_8x16b, const_val16_8x16b);
            res_r1_t3_8x16b = _mm_add_epi16(res_r1_t3_8x16b, const_val16_8x16b);
            res_r0_t1_8x16b = _mm_add_epi16(res_r0_t1_8x16b, res_r0_t3_8x16b);
            res_r1_t1_8x16b = _mm_add_epi16(res_r1_t1_8x16b, res_r1_t3_8x16b);

            res_r0_t1_8x16b = _mm_srai_epi16(res_r0_t1_8x16b, 5);                 //shifting right by 5 bits.
            res_r1_t1_8x16b = _mm_srai_epi16(res_r1_t1_8x16b, 5);

            //saturate to 0..255: left half in bytes 0-7, right half in bytes 8-15
            src_r0_16x8b = _mm_packus_epi16(res_r0_t1_8x16b, res_r1_t1_8x16b);
            _mm_storeu_si128((__m128i *)pu1_dst, src_r0_16x8b);

            ht--;
            pu1_src += src_strd;
            pu1_dst += dst_strd;
        }
        while(ht > 0);
    }
}

/*****************************************************************************/
/*                                                                           */
/*  Function Name : ih264_inter_pred_luma_vert_ssse3                         */
/*                                                                           */
/*  Description   : This function applies a vertical 6-tap filter on         */
/*                  ht x wd block as mentioned in sec. 8.4.2.2.1 titled      */
/*                  "Luma sample interpolation process". (ht,wd) can be      */
/*                  (4,4), (8,4), (4,8), (8,8), (16,8), (8,16) or (16,16).   */
/*                                                                           */
/*  Inputs        : pu1_src  - pointer to source                             */
/*                  pu1_dst  - pointer to destination                        */
/*                  src_strd - stride for source                             */
/*                  dst_strd - stride for destination                        */
/*                  ht       - height of the block                           */
/*                  wd       - width of the block                            */
/*                  pu1_tmp  - marked UNUSED                                 */
/*                  dydx     - marked UNUSED                                 */
/*                                                                           */
/*  Issues        : None                                                     */
/*                                                                           */
/*  Revision History:                                                        */
/*                                                                           */
/*         DD MM YYYY   Author(s)       Changes                              */
/*         13 02 2015   Kaushik         Initial Version                      */
/*                      Senthoor                                             */
/*                                                                           */
/*****************************************************************************/
void ih264_inter_pred_luma_vert_ssse3(UWORD8 *pu1_src,
                                      UWORD8 *pu1_dst,
                                      WORD32 src_strd,
                                      WORD32 dst_strd,
                                      WORD32 ht,
                                      WORD32 wd,
                                      UWORD8* pu1_tmp,
                                      WORD32 dydx)
{
    __m128i src_r0_16x8b, src_r1_16x8b, src_r2_16x8b, src_r3_16x8b, src_r4_16x8b;
    __m128i src_r5_16x8b, src_r6_16x8b;
    __m128i src_r0r1_16x8b, src_r2r3_16x8b, src_r4r5_16x8b;

    __m128i res_16x8b, res_t1_8x16b, res_t2_8x16b, res_t3_8x16b;

    __m128i coeff0_1_16x8b, coeff2_3_16x8b, coeff4_5_16x8b;
    __m128i const_val16_8x16b;

    UNUSED(pu1_tmp);
    UNUSED(dydx);

    coeff0_1_16x8b = _mm_set1_epi32(0xFB01FB01); //c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1
    coeff2_3_16x8b = _mm_set1_epi32(0x14141414); //c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3
    coeff4_5_16x8b = _mm_set1_epi32(0x01FB01FB); //c4 c5 c4 c5 c4 c5 c4 c5 c4 c5 c4 c5 c4 c5 c4 c5
                                                 //c0 = c5 = 1, c1 = c4 = -5, c2 = c3 = 20
    const_val16_8x16b = _mm_set1_epi16(16);

    pu1_src -= src_strd << 1; // the filter input starts from x[-2] (till x[3])

    if(wd == 4)
    {
        __m128i mask_low32b;

        mask_low32b = _mm_set1_epi8(0xff);

        //Epilogue: Load all the pred rows except sixth and seventh row
        //          for the first and second row processing.
        src_r0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src);
        pu1_src += src_strd;
        src_r1_16x8b = _mm_loadl_epi64((__m128i *)pu1_src);
        pu1_src += src_strd;
        src_r2_16x8b = _mm_loadl_epi64((__m128i *)pu1_src);
        pu1_src += src_strd;
        src_r3_16x8b = _mm_loadl_epi64((__m128i *)pu1_src);
        pu1_src += src_strd;
        src_r4_16x8b = _mm_loadl_epi64((__m128i *)pu1_src);
        pu1_src += src_strd;

        mask_low32b = _mm_srli_si128(mask_low32b, 12); // mask for first four bytes

        src_r0_16x8b = _mm_unpacklo_epi32(src_r0_16x8b, src_r1_16x8b);
        src_r1_16x8b = _mm_unpacklo_epi32(src_r1_16x8b, src_r2_16x8b);
        src_r2_16x8b = _mm_unpacklo_epi32(src_r2_16x8b, src_r3_16x8b);
        src_r3_16x8b = _mm_unpacklo_epi32(src_r3_16x8b, src_r4_16x8b);

        do
        {
            src_r5_16x8b = _mm_loadl_epi64((__m128i *)pu1_src);
            src_r6_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + src_strd));

            src_r4_16x8b = _mm_unpacklo_epi32(src_r4_16x8b, src_r5_16x8b);
            src_r5_16x8b = _mm_unpacklo_epi32(src_r5_16x8b, src_r6_16x8b);

            src_r0r1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r1_16x8b);
            src_r2r3_16x8b = _mm_unpacklo_epi8(src_r2_16x8b, src_r3_16x8b);
            src_r4r5_16x8b = _mm_unpacklo_epi8(src_r4_16x8b, src_r5_16x8b);

            res_t1_8x16b = _mm_maddubs_epi16(src_r0r1_16x8b, coeff0_1_16x8b);
            res_t2_8x16b = _mm_maddubs_epi16(src_r2r3_16x8b, coeff2_3_16x8b);
            res_t3_8x16b = _mm_maddubs_epi16(src_r4r5_16x8b, coeff4_5_16x8b);

            res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t2_8x16b);
            res_t3_8x16b = _mm_add_epi16(res_t3_8x16b, const_val16_8x16b);
            res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t3_8x16b);

            res_t1_8x16b = _mm_srai_epi16(res_t1_8x16b, 5); //shifting right by 5 bits.
            res_16x8b = _mm_packus_epi16(res_t1_8x16b, res_t1_8x16b);

            _mm_maskmoveu_si128(res_16x8b, mask_low32b, (char*)pu1_dst);
            res_16x8b = _mm_srli_si128(res_16x8b, 4);
            _mm_maskmoveu_si128(res_16x8b, mask_low32b, (char*)(pu1_dst + dst_strd));

            src_r0_16x8b = src_r2_16x8b;
            src_r1_16x8b = src_r3_16x8b;
            src_r2_16x8b = src_r4_16x8b;
            src_r3_16x8b = src_r5_16x8b;
            src_r4_16x8b = src_r6_16x8b;

            ht -= 2;
            pu1_src += src_strd << 1;
            pu1_dst += dst_strd << 1;
        }
        while(ht > 0);
    }

    else if(wd == 8)
    {
        //Epilogue: Load all the pred rows except sixth and seventh row
        //          for the first and second row processing.
        src_r0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src);
        pu1_src += src_strd;
        src_r1_16x8b = _mm_loadl_epi64((__m128i *)pu1_src);
        pu1_src += src_strd;
        src_r2_16x8b = _mm_loadl_epi64((__m128i *)pu1_src);
        pu1_src += src_strd;
        src_r3_16x8b = _mm_loadl_epi64((__m128i *)pu1_src);
        pu1_src += src_strd;
        src_r4_16x8b = _mm_loadl_epi64((__m128i *)pu1_src);
        pu1_src += src_strd;

        src_r0_16x8b = _mm_unpacklo_epi64(src_r0_16x8b, src_r1_16x8b);
        src_r1_16x8b = _mm_unpacklo_epi64(src_r1_16x8b, src_r2_16x8b);
        src_r2_16x8b = _mm_unpacklo_epi64(src_r2_16x8b, src_r3_16x8b);
        src_r3_16x8b = _mm_unpacklo_epi64(src_r3_16x8b, src_r4_16x8b);

        do
        {
            src_r5_16x8b = _mm_loadl_epi64((__m128i *)pu1_src);
            src_r6_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + src_strd));

            src_r4_16x8b = _mm_unpacklo_epi64(src_r4_16x8b, src_r5_16x8b);
            src_r5_16x8b = _mm_unpacklo_epi64(src_r5_16x8b, src_r6_16x8b);

            src_r0r1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r1_16x8b);
            src_r2r3_16x8b = _mm_unpacklo_epi8(src_r2_16x8b, src_r3_16x8b);
            src_r4r5_16x8b = _mm_unpacklo_epi8(src_r4_16x8b, src_r5_16x8b);

            res_t1_8x16b = _mm_maddubs_epi16(src_r0r1_16x8b, coeff0_1_16x8b);
            res_t2_8x16b = _mm_maddubs_epi16(src_r2r3_16x8b, coeff2_3_16x8b);
            res_t3_8x16b = _mm_maddubs_epi16(src_r4r5_16x8b, coeff4_5_16x8b);

            res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t2_8x16b);
            res_t3_8x16b = _mm_add_epi16(res_t3_8x16b, const_val16_8x16b);
            res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t3_8x16b);

            res_t1_8x16b = _mm_srai_epi16(res_t1_8x16b, 5); //shifting right by 5 bits.
            res_16x8b = _mm_packus_epi16(res_t1_8x16b, res_t1_8x16b);

            _mm_storel_epi64((__m128i *)pu1_dst, res_16x8b);

            src_r0r1_16x8b = _mm_unpackhi_epi8(src_r0_16x8b, src_r1_16x8b);
            src_r2r3_16x8b = _mm_unpackhi_epi8(src_r2_16x8b, src_r3_16x8b);
            src_r4r5_16x8b = _mm_unpackhi_epi8(src_r4_16x8b, src_r5_16x8b);

            res_t1_8x16b = _mm_maddubs_epi16(src_r0r1_16x8b, coeff0_1_16x8b);
            res_t2_8x16b = _mm_maddubs_epi16(src_r2r3_16x8b, coeff2_3_16x8b);
            res_t3_8x16b = _mm_maddubs_epi16(src_r4r5_16x8b, coeff4_5_16x8b);

            res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t2_8x16b);
            res_t3_8x16b = _mm_add_epi16(res_t3_8x16b, const_val16_8x16b);
            res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t3_8x16b);

            res_t1_8x16b = _mm_srai_epi16(res_t1_8x16b, 5); //shifting right by 5 bits.
            res_16x8b = _mm_packus_epi16(res_t1_8x16b, res_t1_8x16b);

            _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd), res_16x8b);

            src_r0_16x8b = src_r2_16x8b;
            src_r1_16x8b = src_r3_16x8b;
            src_r2_16x8b = src_r4_16x8b;
            src_r3_16x8b = src_r5_16x8b;
            src_r4_16x8b = src_r6_16x8b;

            ht -= 2;
            pu1_src += src_strd << 1;
            pu1_dst += dst_strd << 1;
        }
        while(ht > 0);
    }
    else // wd == 16
    {
        __m128i res_t0_8x16b;

        //Epilogue: Load all the pred rows except sixth and seventh row
        //          for the first and second row processing.
        src_r0_16x8b = _mm_loadu_si128((__m128i *)pu1_src);
        pu1_src += src_strd;
        src_r1_16x8b = _mm_loadu_si128((__m128i *)pu1_src);
        pu1_src += src_strd;
        src_r2_16x8b = _mm_loadu_si128((__m128i *)pu1_src);
        pu1_src += src_strd;
        src_r3_16x8b = _mm_loadu_si128((__m128i *)pu1_src);
        pu1_src += src_strd;
        src_r4_16x8b = _mm_loadu_si128((__m128i *)pu1_src);
        pu1_src += src_strd;

        do
        {
            src_r5_16x8b  = _mm_loadu_si128((__m128i *)pu1_src);
            src_r6_16x8b  = _mm_loadu_si128((__m128i *)(pu1_src + src_strd));

            src_r0r1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r1_16x8b);
            src_r2r3_16x8b = _mm_unpacklo_epi8(src_r2_16x8b, src_r3_16x8b);
            src_r4r5_16x8b = _mm_unpacklo_epi8(src_r4_16x8b, src_r5_16x8b);

            res_t1_8x16b = _mm_maddubs_epi16(src_r0r1_16x8b, coeff0_1_16x8b);
            res_t2_8x16b = _mm_maddubs_epi16(src_r2r3_16x8b, coeff2_3_16x8b);
            res_t3_8x16b = _mm_maddubs_epi16(src_r4r5_16x8b, coeff4_5_16x8b);

            res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t2_8x16b);
            res_t3_8x16b = _mm_add_epi16(res_t3_8x16b, const_val16_8x16b);
            res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t3_8x16b);
            res_t0_8x16b = _mm_srai_epi16(res_t1_8x16b, 5); //shifting right by 5 bits.

            src_r0r1_16x8b = _mm_unpackhi_epi8(src_r0_16x8b, src_r1_16x8b);
            src_r2r3_16x8b = _mm_unpackhi_epi8(src_r2_16x8b, src_r3_16x8b);
            src_r4r5_16x8b = _mm_unpackhi_epi8(src_r4_16x8b, src_r5_16x8b);

            res_t1_8x16b = _mm_maddubs_epi16(src_r0r1_16x8b, coeff0_1_16x8b);
            res_t2_8x16b = _mm_maddubs_epi16(src_r2r3_16x8b, coeff2_3_16x8b);
            res_t3_8x16b = _mm_maddubs_epi16(src_r4r5_16x8b, coeff4_5_16x8b);

            res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t2_8x16b);
            res_t3_8x16b = _mm_add_epi16(res_t3_8x16b, const_val16_8x16b);
            res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t3_8x16b);
            res_t1_8x16b = _mm_srai_epi16(res_t1_8x16b, 5); //shifting right by 5 bits.

            res_16x8b = _mm_packus_epi16(res_t0_8x16b, res_t1_8x16b);

            _mm_storeu_si128((__m128i *)pu1_dst, res_16x8b);

            src_r0r1_16x8b = _mm_unpacklo_epi8(src_r1_16x8b, src_r2_16x8b);
            src_r2r3_16x8b = _mm_unpacklo_epi8(src_r3_16x8b, src_r4_16x8b);
            src_r4r5_16x8b = _mm_unpacklo_epi8(src_r5_16x8b, src_r6_16x8b);

            res_t1_8x16b = _mm_maddubs_epi16(src_r0r1_16x8b, coeff0_1_16x8b);
            res_t2_8x16b = _mm_maddubs_epi16(src_r2r3_16x8b, coeff2_3_16x8b);
            res_t3_8x16b = _mm_maddubs_epi16(src_r4r5_16x8b, coeff4_5_16x8b);

            res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t2_8x16b);
            res_t3_8x16b = _mm_add_epi16(res_t3_8x16b, const_val16_8x16b);
            res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t3_8x16b);
            res_t0_8x16b = _mm_srai_epi16(res_t1_8x16b, 5); //shifting right by 5 bits.

            src_r0r1_16x8b = _mm_unpackhi_epi8(src_r1_16x8b, src_r2_16x8b);
            src_r2r3_16x8b = _mm_unpackhi_epi8(src_r3_16x8b, src_r4_16x8b);
            src_r4r5_16x8b = _mm_unpackhi_epi8(src_r5_16x8b, src_r6_16x8b);

            res_t1_8x16b = _mm_maddubs_epi16(src_r0r1_16x8b, coeff0_1_16x8b);
            res_t2_8x16b = _mm_maddubs_epi16(src_r2r3_16x8b, coeff2_3_16x8b);
            res_t3_8x16b = _mm_maddubs_epi16(src_r4r5_16x8b, coeff4_5_16x8b);

            res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t2_8x16b);
            res_t3_8x16b = _mm_add_epi16(res_t3_8x16b, const_val16_8x16b);
            res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t3_8x16b);
            res_t1_8x16b = _mm_srai_epi16(res_t1_8x16b, 5); //shifting right by 5 bits.

            res_16x8b = _mm_packus_epi16(res_t0_8x16b, res_t1_8x16b);

            _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), res_16x8b);

            src_r0_16x8b = src_r2_16x8b;
            src_r1_16x8b = src_r3_16x8b;
            src_r2_16x8b = src_r4_16x8b;
            src_r3_16x8b = src_r5_16x8b;
            src_r4_16x8b = src_r6_16x8b;

            ht -= 2;
            pu1_src += src_strd << 1;
            pu1_dst += dst_strd << 1;
        }
        while(ht > 0);
    }
}

/*****************************************************************************/
/*                                                                           */
/*  Function Name : ih264_inter_pred_luma_horz_hpel_vert_hpel_ssse3          */
/*                                                                           */
/*  Description   : This function implements a two stage cascaded six tap    */
/*                  filter, horizontally and then vertically on ht x wd      */
/*                  block as mentioned in sec. 8.4.2.2.1 titled "Luma sample */
/*                  interpolation process". (ht,wd) can be (4,4), (8,4),     */
/*                  (4,8), (8,8), (16,8), (8,16) or (16,16).                 */
/*                                                                           */
/*  Inputs        : pu1_src  - pointer to source                             */
/*                  pu1_dst  - pointer to destination                        */
/*                  src_strd - stride for source                             */
/*                  dst_strd - stride for destination                        */
/*                  ht       - height of the block                           */
/*                  wd       - width of the block                            */
/*                  pu1_tmp  - pointer to temporary buffer                   */
/*                  dydx     - not used (marked UNUSED)                      */
/*                                                                           */
/*  Issues        : None                                                     */
/*                                                                           */
/*  Revision History:                                                        */
/*                                                                           */
/*         DD MM YYYY   Author(s)       Changes                              */
/*         13 02 2015   Kaushik         Initial Version                      */
/*                      Senthoor                                             */
/*                                                                           */
/*****************************************************************************/
void ih264_inter_pred_luma_horz_hpel_vert_hpel_ssse3(UWORD8 *pu1_src,
                                                     UWORD8 *pu1_dst,
                                                     WORD32 src_strd,
                                                     WORD32 dst_strd,
                                                     WORD32 ht,
                                                     WORD32 wd,
                                                     UWORD8* pu1_tmp,
                                                     WORD32 dydx)
{
    UNUSED(dydx);

    if(wd == 4)
    {
        WORD16 *pi2_temp;

        pu1_tmp += 4;
        pu1_src -= src_strd << 1;
        pi2_temp = (WORD16 *)pu1_tmp;
        pu1_src -= 2; // the filter input starts from x[-2] (till x[3])

        // Horizontal 6-tap filtering
        {
            WORD32 ht_tmp = ht + 4;

            __m128i src_r0_16x8b, src_r1_16x8b;
            __m128i src_r0_sht_16x8b, src_r1_sht_16x8b;
            __m128i src_r0r1_t1_16x8b;
            __m128i res_r0r1_t1_8x16b, res_r0r1_t2_8x16b, res_r0r1_t3_8x16b;
            __m128i coeff0_1_16x8b, coeff2_3_16x8b, coeff4_5_16x8b;

            coeff0_1_16x8b = _mm_set1_epi32(0xFB01FB01);  //c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1
            coeff2_3_16x8b = _mm_set1_epi32(0x14141414);  //c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3
            coeff4_5_16x8b = _mm_set1_epi32(0x01FB01FB);  //c4 c5 c5 c5 c4 c5 c5 c5 c4 c5 c5 c5 c4 c5 c5 c5
                                                          //c0 = c5 = 1, c1 = c4 = -5, c2 = c3 = 20
            //Row0 : a0 a1 a2 a3 a4 a5 a6 a7 a8 a9.....
            //Row1 : b0 b1 b2 b3 b4 b5 b6 b7 b8 b9.....

            do
            {
                src_r0_16x8b = _mm_loadu_si128((__m128i *)pu1_src);                       //a0 a1 a2 a3 a4 a5 a6 a7 a8 a9....a15
                src_r1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + src_strd));          //b0 b1 b2 b3 b4 b5 b6 b7 b8 b9....b15

                src_r0_sht_16x8b = _mm_srli_si128(src_r0_16x8b, 1);                       //a1 a2 a3 a4 a5 a6 a7 a8 a9....a15 0
                src_r1_sht_16x8b = _mm_srli_si128(src_r1_16x8b, 1);                       //b1 b2 b3 b4 b5 b6 b7 b8 b9....b15 0

                src_r0_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r0_sht_16x8b);         //a0 a1 a1 a2 a2 a3 a3 a4 a4 a5 a5 a6 a6 a7 a7 a8
                src_r1_16x8b = _mm_unpacklo_epi8(src_r1_16x8b, src_r1_sht_16x8b);         //b0 b1 b1 b2 b2 b3 b3 b4 b4 b5 b5 b6 b6 b7 b7 b8

                src_r0r1_t1_16x8b = _mm_unpacklo_epi64(src_r0_16x8b, src_r1_16x8b);       //a0 a1 a1 a2 a2 a3 a3 a4 b0 b1 b1 b2 b2 b3 b3 b4
                res_r0r1_t1_8x16b = _mm_maddubs_epi16(src_r0r1_t1_16x8b, coeff0_1_16x8b); //a0*c0+a1*c1 a1*c0+a2*c1 a2*c0+a3*c1 a3*c0+a4*c1
                                                                                          //b0*c0+b1*c1 b1*c0+b2*c1 b2*c0+b3*c1 b3*c0+b4*c1

                src_r0_16x8b = _mm_srli_si128(src_r0_16x8b, 4);                           //a2 a3 a3 a4 a4 a5 a5 a6 a6 a7 a7 a8  0  0  0  0
                src_r1_16x8b = _mm_srli_si128(src_r1_16x8b, 4);                           //b2 b3 b3 b4 b4 b5 b5 b6 b6 b7 b7 b8  0  0  0  0

                src_r0r1_t1_16x8b = _mm_unpacklo_epi64(src_r0_16x8b, src_r1_16x8b);       //a2 a3 a3 a4 a4 a5 a5 a6 b2 b3 b3 b4 b4 b5 b5 b6
                res_r0r1_t2_8x16b = _mm_maddubs_epi16(src_r0r1_t1_16x8b, coeff2_3_16x8b); //a2*c2+a3*c3 a3*c2+a4*c3 a4*c2+a5*c3 a5*c2+a6*c3
                                                                                          //b2*c2+b3*c3 b3*c2+b4*c3 b4*c2+b5*c3 b5*c2+b6*c3

                src_r0_16x8b = _mm_srli_si128(src_r0_16x8b, 4);                           //a4 a5 a5 a6 a6 a7 a7 a8  0  0  0  0  0  0  0  0
                src_r1_16x8b = _mm_srli_si128(src_r1_16x8b, 4);                           //b4 b5 b5 b6 b6 b7 b7 b8  0  0  0  0  0  0  0  0

                src_r0r1_t1_16x8b = _mm_unpacklo_epi64(src_r0_16x8b, src_r1_16x8b);       //a4 a5 a5 a6 a6 a7 a7 a8 b4 b5 b5 b6 b6 b7 b7 b8
                res_r0r1_t3_8x16b = _mm_maddubs_epi16(src_r0r1_t1_16x8b, coeff4_5_16x8b); //a4*c4+a5*c5 a5*c4+a6*c5 a6*c4+a7*c5 a7*c4+a8*c5
                                                                                          //b4*c4+b5*c5 b5*c4+b6*c5 b4*c6+b7*c5 b7*c4+b8*c5
                res_r0r1_t1_8x16b = _mm_add_epi16(res_r0r1_t1_8x16b, res_r0r1_t2_8x16b);
                res_r0r1_t1_8x16b = _mm_add_epi16(res_r0r1_t3_8x16b, res_r0r1_t1_8x16b);

                _mm_storeu_si128((__m128i *)pi2_temp, res_r0r1_t1_8x16b);

                ht_tmp -= 2;
                pu1_src += src_strd << 1;
                pi2_temp += 8;
            }
            while(ht_tmp > 0);

            src_r0_16x8b = _mm_loadu_si128((__m128i *)pu1_src);                           //a0 a1 a2 a3 a4 a5 a6 a7 a8 a9....a15
            src_r0_sht_16x8b = _mm_srli_si128(src_r0_16x8b, 1);                           //a1 a2 a3 a4 a5 a6 a7 a8 a9....a15 0

            src_r0_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r0_sht_16x8b);             //a0 a1 a1 a2 a2 a3 a3 a4 a4 a5 a5 a6 a6 a7 a7 a8
            res_r0r1_t1_8x16b = _mm_maddubs_epi16(src_r0_16x8b, coeff0_1_16x8b);          //a0*c0+a1*c1 a1*c0+a2*c1 a2*c0+a3*c1 a3*c0+a4*c1

            src_r0_16x8b = _mm_srli_si128(src_r0_16x8b,4);                                //a2 a3 a3 a4 a4 a5 a5 a6 a6 a7 a7 a8  0  0  0  0
            res_r0r1_t2_8x16b = _mm_maddubs_epi16(src_r0_16x8b, coeff2_3_16x8b);          //a2*c2+a3*c3 a3*c2+a4*c3 a4*c2+a5*c3 a5*c2+a6*c3

            src_r0_16x8b = _mm_srli_si128(src_r0_16x8b,4);                                //a4 a5 a5 a6 a6 a7 a7 a8  0  0  0  0  0  0  0  0
            res_r0r1_t3_8x16b = _mm_maddubs_epi16(src_r0_16x8b, coeff4_5_16x8b);          //a4*c4+a5*c5 a5*c4+a6*c5 a6*c4+a7*c5 a7*c4+a8*c5

            res_r0r1_t1_8x16b = _mm_add_epi16(res_r0r1_t1_8x16b, res_r0r1_t2_8x16b);
            res_r0r1_t1_8x16b = _mm_add_epi16(res_r0r1_t3_8x16b, res_r0r1_t1_8x16b);

            _mm_storel_epi64((__m128i *)pi2_temp, res_r0r1_t1_8x16b);
        }

        pi2_temp = (WORD16 *)pu1_tmp;

        // Vertical 6-tap filtering
        {
            __m128i src_r0_8x16b, src_r1_8x16b, src_r2_8x16b, src_r3_8x16b,
                            src_r4_8x16b;
            __m128i src_r5_8x16b, src_r6_8x16b;
            __m128i src_t1_8x16b, src_t2_8x16b;

            __m128i res_t0_4x32b, res_t1_4x32b, res_t2_4x32b, res_t3_4x32b;
            __m128i res_8x16b, res_16x8b;

            __m128i coeff0_1_8x16b, coeff2_3_8x16b, coeff4_5_8x16b;
            __m128i const_val512_4x32b, mask_low32b;

            mask_low32b = _mm_set1_epi8(0xff);

            coeff0_1_8x16b = _mm_set1_epi32(0xFFFB0001);
            coeff2_3_8x16b = _mm_set1_epi32(0x00140014);
            coeff4_5_8x16b = _mm_set1_epi32(0x0001FFFB);

            mask_low32b = _mm_srli_si128(mask_low32b, 12);
            const_val512_4x32b = _mm_set1_epi32(512);

            src_r0_8x16b = _mm_loadl_epi64((__m128i *)(pi2_temp));
            src_r1_8x16b = _mm_loadl_epi64((__m128i *)(pi2_temp + 4));
            src_r2_8x16b = _mm_loadl_epi64((__m128i *)(pi2_temp + 8));
            src_r3_8x16b = _mm_loadl_epi64((__m128i *)(pi2_temp + 12));
            src_r4_8x16b = _mm_loadl_epi64((__m128i *)(pi2_temp + 16));
            pi2_temp += 20;

            do
            {
                src_r5_8x16b = _mm_loadl_epi64((__m128i *)pi2_temp);
                src_r6_8x16b = _mm_loadl_epi64((__m128i *)(pi2_temp + 4));

                src_r0_8x16b = _mm_unpacklo_epi16(src_r0_8x16b, src_r1_8x16b);
                src_t1_8x16b = _mm_unpacklo_epi16(src_r2_8x16b, src_r3_8x16b);
                src_t2_8x16b = _mm_unpacklo_epi16(src_r4_8x16b, src_r5_8x16b);

                res_t1_4x32b = _mm_madd_epi16(src_r0_8x16b, coeff0_1_8x16b);
                res_t2_4x32b = _mm_madd_epi16(src_t1_8x16b, coeff2_3_8x16b);
                res_t3_4x32b = _mm_madd_epi16(src_t2_8x16b, coeff4_5_8x16b);

                res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t2_4x32b);
                res_t3_4x32b = _mm_add_epi32(res_t3_4x32b, const_val512_4x32b);
                res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t3_4x32b);

                res_t0_4x32b = _mm_srai_epi32(res_t1_4x32b, 10);

                src_r1_8x16b = _mm_unpacklo_epi16(src_r1_8x16b, src_r2_8x16b);
                src_t1_8x16b = _mm_unpacklo_epi16(src_r3_8x16b, src_r4_8x16b);
                src_t2_8x16b = _mm_unpacklo_epi16(src_r5_8x16b, src_r6_8x16b);

                res_t1_4x32b = _mm_madd_epi16(src_r1_8x16b, coeff0_1_8x16b);
                res_t2_4x32b = _mm_madd_epi16(src_t1_8x16b, coeff2_3_8x16b);
                res_t3_4x32b = _mm_madd_epi16(src_t2_8x16b, coeff4_5_8x16b);

                res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t2_4x32b);
                res_t3_4x32b = _mm_add_epi32(res_t3_4x32b, const_val512_4x32b);
                res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t3_4x32b);

                res_t1_4x32b = _mm_srai_epi32(res_t1_4x32b, 10);

                res_8x16b = _mm_packs_epi32(res_t0_4x32b, res_t1_4x32b);
                res_16x8b = _mm_packus_epi16(res_8x16b, res_8x16b);

                _mm_maskmoveu_si128(res_16x8b, mask_low32b, (char*)pu1_dst);
                res_16x8b = _mm_srli_si128(res_16x8b, 4);
                _mm_maskmoveu_si128(res_16x8b, mask_low32b, (char*)(pu1_dst + dst_strd));

                src_r0_8x16b = src_r2_8x16b;
                src_r1_8x16b = src_r3_8x16b;
                src_r2_8x16b = src_r4_8x16b;
                src_r3_8x16b = src_r5_8x16b;
                src_r4_8x16b = src_r6_8x16b;

                ht -= 2;
                pi2_temp += 8;
                pu1_dst += dst_strd << 1;
            }
            while(ht > 0);
        }
    }
    else if(wd == 8)
    {
        WORD16 *pi2_temp;

        pu1_tmp += 4;
        pu1_src -= src_strd << 1;
        pi2_temp = (WORD16 *)pu1_tmp;
        pu1_src -= 2; // the filter input starts from x[-2] (till x[3])

        // Horizontal 6-tap filtering
        {
            WORD32 ht_tmp = ht + 4;

            __m128i src_r0_16x8b, src_r1_16x8b;
            __m128i src_r0_sht_16x8b, src_r1_sht_16x8b;
            __m128i src_r0_t1_16x8b, src_r1_t1_16x8b;
            __m128i res_r0_t1_8x16b, res_r0_t2_8x16b, res_r0_t3_8x16b;
            __m128i res_r1_t1_8x16b, res_r1_t2_8x16b, res_r1_t3_8x16b;
            __m128i coeff0_1_16x8b, coeff2_3_16x8b, coeff4_5_16x8b;

            coeff0_1_16x8b = _mm_set1_epi32(0xFB01FB01);  //c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1
            coeff2_3_16x8b = _mm_set1_epi32(0x14141414);  //c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3
            coeff4_5_16x8b = _mm_set1_epi32(0x01FB01FB);  //c4 c5 c5 c5 c4 c5 c5 c5 c4 c5 c5 c5 c4 c5 c5 c5
                                                          //c0 = c5 = 1, c1 = c4 = -5, c2 = c3 = 20
            //Row0 : a0 a1 a2 a3 a4 a5 a6 a7 a8 a9.....
            //Row1 : b0 b1 b2 b3 b4 b5 b6 b7 b8 b9.....

            do
            {
                src_r0_16x8b = _mm_loadu_si128((__m128i *)pu1_src);                      //a0 a1 a2 a3 a4 a5 a6 a7 a8 a9 a10 a11 a12 a13 a14 a15
                src_r1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + src_strd));         //b0 b1 b2 b3 b4 b5 b6 b7 b8 b9 b10 b11 b12 b13 b14 b15

                src_r0_sht_16x8b = _mm_srli_si128(src_r0_16x8b, 1);                      //a1 a2 a3 a4 a5 a6 a7 a8 a9....a15 0
                src_r1_sht_16x8b = _mm_srli_si128(src_r1_16x8b, 1);                      //b1 b2 b3 b4 b5 b6 b7 b8 b9....b15 0

                src_r0_t1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r0_sht_16x8b);     //a0 a1 a1 a2 a2 a3 a3 a4 a4 a5 a5 a6 a6 a7 a7 a8
                src_r1_t1_16x8b = _mm_unpacklo_epi8(src_r1_16x8b, src_r1_sht_16x8b);     //b0 b1 b1 b2 b2 b3 b3 b4 b4 b5 b5 b6 b6 b7 b7 b8

                res_r0_t1_8x16b = _mm_maddubs_epi16(src_r0_t1_16x8b, coeff0_1_16x8b);    //a0*c0+a1*c1 a1*c0+a2*c1 a2*c0+a3*c1 a3*c0+a4*c1
                                                                                         //a4*c0+a5*c1 a5*c0+a6*c1 a6*c0+a7*c1 a7*c0+a8*c1
                res_r1_t1_8x16b = _mm_maddubs_epi16(src_r1_t1_16x8b, coeff0_1_16x8b);    //b0*c0+b1*c1 b1*c0+b2*c1 b2*c0+b3*c1 b3*c0+b4*c1
                                                                                         //b4*c0+b5*c1 b5*c0+b6*c1 b6*c0+b7*c1 b7*c0+b8*c1

                src_r0_16x8b = _mm_srli_si128(src_r0_16x8b, 2);                          //a2 a3 a4 a5 a6 a7 a8 a9....a15 0 0
                src_r1_16x8b = _mm_srli_si128(src_r1_16x8b, 2);                          //b2 b3 b4 b5 b6 b7 b8 b9....b15 0 0

                src_r0_sht_16x8b = _mm_srli_si128(src_r0_sht_16x8b, 2);                  //a3 a4 a5 a6 a7 a8 a9....a15 0  0  0
                src_r1_sht_16x8b = _mm_srli_si128(src_r1_sht_16x8b, 2);                  //b3 b4 b5 b6 b7 b8 b9....b15 0  0  0

                src_r0_t1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r0_sht_16x8b);     //a2 a3 a3 a4 a4 a5 a5 a6 a6 a7 a7 a8 a8 a9 a9 a10
                src_r1_t1_16x8b = _mm_unpacklo_epi8(src_r1_16x8b, src_r1_sht_16x8b);     //b2 b3 b3 b4 b4 b5 b5 b6 b6 b7 b7 b8 a8 a9 a9 a10

                res_r0_t2_8x16b = _mm_maddubs_epi16(src_r0_t1_16x8b, coeff2_3_16x8b);    //a2*c2+a3*c3 a3*c2+a4*c3 a4*c2+a5*c3 a5*c2+a6*c3
                                                                                         //a6*c2+a7*c3 a7*c2+a8*c3 a8*c2+a9*c3 a9*c2+a10*c3
                res_r1_t2_8x16b = _mm_maddubs_epi16(src_r1_t1_16x8b, coeff2_3_16x8b);    //b2*c2+b3*c3 b3*c2+b4*c3 b2*c4+b5*c3 b5*c2+b6*c3
                                                                                         //b6*c2+b7*c3 b7*c2+b8*c3 b8*c2+b9*c3 b9*c2+b10*c3

                src_r0_16x8b = _mm_srli_si128(src_r0_16x8b, 2);                          //a4 a5 a6 a7 a8 a9....a15 0  0  0  0
                src_r1_16x8b = _mm_srli_si128(src_r1_16x8b, 2);                          //b4 b5 b6 b7 b8 b9....b15 0  0  0  0

                src_r0_sht_16x8b = _mm_srli_si128(src_r0_sht_16x8b, 2);                  //a5 a6 a7 a8 a9....a15 0  0  0  0  0
                src_r1_sht_16x8b = _mm_srli_si128(src_r1_sht_16x8b, 2);                  //b5 b6 b7 b8 b9....b15 0  0  0  0  0

                src_r0_t1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r0_sht_16x8b);     //a4 a5 a5 a6 a6 a7 a7 a8 a8 a9 a9 a10 a10 a11 a11 a12
                src_r1_t1_16x8b = _mm_unpacklo_epi8(src_r1_16x8b, src_r1_sht_16x8b);     //b4 b5 b5 b6 b6 b7 b7 b8 b8 b9 b9 b10 b10 b11 b11 b12

                res_r0_t3_8x16b = _mm_maddubs_epi16(src_r0_t1_16x8b, coeff4_5_16x8b);    //a4*c4+a5*c5 a5*c4+a6*c5 a6*c4+a7*c5 a7*c4+a8*c5
                                                                                         //a8*c4+a9*c5 a9*c4+a10*c5 a10*c4+a11*c5 a11*c4+a12*c5
                res_r1_t3_8x16b = _mm_maddubs_epi16(src_r1_t1_16x8b, coeff4_5_16x8b);    //b4*c4+b5*c5 b5*c4+b6*c5 b6*c4+b7*c5 b7*c4+b8*c5
                                                                                         //b8*c4+b9*c5 b9*c4+b10*c5 b10*c4+b11*c5 b11*c4+b12*c5
                res_r0_t1_8x16b = _mm_add_epi16(res_r0_t1_8x16b, res_r0_t2_8x16b);
                res_r0_t1_8x16b = _mm_add_epi16(res_r0_t1_8x16b, res_r0_t3_8x16b);

                res_r1_t1_8x16b = _mm_add_epi16(res_r1_t1_8x16b, res_r1_t2_8x16b);
                res_r1_t1_8x16b = _mm_add_epi16(res_r1_t1_8x16b, res_r1_t3_8x16b);

                _mm_storeu_si128((__m128i *)pi2_temp, res_r0_t1_8x16b);
                _mm_storeu_si128((__m128i *)(pi2_temp + 8), res_r1_t1_8x16b);

                ht_tmp -= 2;
                pu1_src += src_strd << 1;
                pi2_temp += 16;
            }
            while(ht_tmp > 0);

            src_r0_16x8b = _mm_loadu_si128((__m128i *)pu1_src);                          //a0 a1 a2 a3 a4 a5 a6 a7 a8 a9 a10 a11 a12 a13 a14 a15
            src_r0_sht_16x8b = _mm_srli_si128(src_r0_16x8b, 1);                          //a1 a2 a3 a4 a5 a6 a7 a8 a9....a15 0

            src_r0_t1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b,src_r0_sht_16x8b);          //a0 a1 a1 a2 a2 a3 a3 a4 a4 a5 a5 a6 a6 a7 a7 a8
            res_r0_t1_8x16b = _mm_maddubs_epi16(src_r0_t1_16x8b,coeff0_1_16x8b);         //a0*c0+a1*c1 a1*c0+a2*c1 a2*c0+a3*c1 a3*c0+a4*c1
                                                                                         //a4*c0+a5*c1 a5*c0+a6*c1 a6*c0+a7*c1 a7*c0+a8*c1

            src_r0_16x8b = _mm_srli_si128(src_r0_16x8b, 2);                              //a2 a3 a4 a5 a6 a7 a8 a9....a15 0 0
            src_r0_sht_16x8b = _mm_srli_si128(src_r0_sht_16x8b, 2);                      //a3 a4 a5 a6 a7 a8 a9....a15 0  0  0

            src_r0_t1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r0_sht_16x8b);         //a2 a3 a3 a4 a4 a5 a5 a6 a6 a7 a7 a8 a8 a9 a9 a10
            res_r0_t2_8x16b = _mm_maddubs_epi16(src_r0_t1_16x8b, coeff2_3_16x8b);        //a2*c2+a3*c3 a3*c2+a4*c3 a4*c2+a5*c3 a5*c2+a6*c3
                                                                                         //a6*c2+a7*c3 a7*c2+a8*c3 a8*c2+a9*c3 a9*c2+a10*c3

            src_r0_16x8b = _mm_srli_si128(src_r0_16x8b, 2);                              //a4 a5 a6 a7 a8 a9....a15 0  0  0  0
            src_r0_sht_16x8b = _mm_srli_si128(src_r0_sht_16x8b, 2);                      //a5 a6 a7 a8 a9....a15 0  0  0  0  0

            src_r0_t1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r0_sht_16x8b);         //a4 a5 a5 a6 a6 a7 a7 a8 a8 a9 a9 a10 a10 a11 a11 a12
            res_r0_t3_8x16b = _mm_maddubs_epi16(src_r0_t1_16x8b, coeff4_5_16x8b);        //a4*c4+a5*c5 a5*c4+a6*c5 a6*c4+a7*c5 a7*c4+a8*c5
                                                                                         //a8*c4+a9*c5 a9*c4+a10*c5 a10*c4+a11*c5 a11*c4+a12*c5
            res_r0_t1_8x16b = _mm_add_epi16(res_r0_t1_8x16b, res_r0_t2_8x16b);
            res_r0_t1_8x16b = _mm_add_epi16(res_r0_t1_8x16b, res_r0_t3_8x16b);

            _mm_storeu_si128((__m128i *)pi2_temp, res_r0_t1_8x16b);
        }

        pi2_temp = (WORD16 *)pu1_tmp;

        // Vertical 6-tap filtering
        {
            __m128i src_r0_8x16b, src_r1_8x16b, src_r2_8x16b, src_r3_8x16b,
                            src_r4_8x16b;
            __m128i src_r5_8x16b, src_r6_8x16b;
            __m128i src_r0r1_8x16b, src_r2r3_8x16b, src_r4r5_8x16b;

            __m128i res_t1_4x32b, res_t2_4x32b, res_t3_4x32b;
            __m128i res_c0_4x32b, res_c1_4x32b;
            __m128i res_8x16b, res_16x8b;

            __m128i coeff0_1_8x16b, coeff2_3_8x16b, coeff4_5_8x16b;
            __m128i const_val512_4x32b;

            coeff0_1_8x16b = _mm_set1_epi32(0xFFFB0001);
            coeff2_3_8x16b = _mm_set1_epi32(0x00140014);
            coeff4_5_8x16b = _mm_set1_epi32(0x0001FFFB);

            const_val512_4x32b = _mm_set1_epi32(512);

            src_r0_8x16b = _mm_loadu_si128((__m128i *)pi2_temp);
            src_r1_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp + 8));
            src_r2_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp + 16));
            src_r3_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp + 24));
            src_r4_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp + 32));
            pi2_temp += 40;

            do
            {
                src_r5_8x16b = _mm_loadu_si128((__m128i *)pi2_temp);
                src_r6_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp + 8));

                src_r0r1_8x16b = _mm_unpacklo_epi16(src_r0_8x16b, src_r1_8x16b);
                src_r2r3_8x16b = _mm_unpacklo_epi16(src_r2_8x16b, src_r3_8x16b);
                src_r4r5_8x16b = _mm_unpacklo_epi16(src_r4_8x16b, src_r5_8x16b);

                res_t1_4x32b = _mm_madd_epi16(src_r0r1_8x16b, coeff0_1_8x16b);
                res_t2_4x32b = _mm_madd_epi16(src_r2r3_8x16b, coeff2_3_8x16b);
                res_t3_4x32b = _mm_madd_epi16(src_r4r5_8x16b, coeff4_5_8x16b);

                res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t2_4x32b);
                res_t3_4x32b = _mm_add_epi32(res_t3_4x32b, const_val512_4x32b);
                res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t3_4x32b);
                res_c0_4x32b = _mm_srai_epi32(res_t1_4x32b, 10);

                src_r0r1_8x16b = _mm_unpackhi_epi16(src_r0_8x16b, src_r1_8x16b);
                src_r2r3_8x16b = _mm_unpackhi_epi16(src_r2_8x16b, src_r3_8x16b);
                src_r4r5_8x16b = _mm_unpackhi_epi16(src_r4_8x16b, src_r5_8x16b);

                res_t1_4x32b = _mm_madd_epi16(src_r0r1_8x16b, coeff0_1_8x16b);
                res_t2_4x32b = _mm_madd_epi16(src_r2r3_8x16b, coeff2_3_8x16b);
                res_t3_4x32b = _mm_madd_epi16(src_r4r5_8x16b, coeff4_5_8x16b);

                res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t2_4x32b);
                res_t3_4x32b = _mm_add_epi32(res_t3_4x32b, const_val512_4x32b);
                res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t3_4x32b);
                res_c1_4x32b = _mm_srai_epi32(res_t1_4x32b, 10);

                res_8x16b = _mm_packs_epi32(res_c0_4x32b, res_c1_4x32b);
                res_16x8b = _mm_packus_epi16(res_8x16b, res_8x16b);

                _mm_storel_epi64((__m128i *)pu1_dst, res_16x8b);

                src_r0r1_8x16b = _mm_unpacklo_epi16(src_r1_8x16b, src_r2_8x16b);
                src_r2r3_8x16b = _mm_unpacklo_epi16(src_r3_8x16b, src_r4_8x16b);
                src_r4r5_8x16b = _mm_unpacklo_epi16(src_r5_8x16b, src_r6_8x16b);

                res_t1_4x32b = _mm_madd_epi16(src_r0r1_8x16b, coeff0_1_8x16b);
                res_t2_4x32b = _mm_madd_epi16(src_r2r3_8x16b, coeff2_3_8x16b);
                res_t3_4x32b = _mm_madd_epi16(src_r4r5_8x16b, coeff4_5_8x16b);

                res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t2_4x32b);
                res_t3_4x32b = _mm_add_epi32(res_t3_4x32b, const_val512_4x32b);
                res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t3_4x32b);
                res_c0_4x32b = _mm_srai_epi32(res_t1_4x32b, 10);

                src_r0r1_8x16b = _mm_unpackhi_epi16(src_r1_8x16b, src_r2_8x16b);
                src_r2r3_8x16b = _mm_unpackhi_epi16(src_r3_8x16b, src_r4_8x16b);
                src_r4r5_8x16b = _mm_unpackhi_epi16(src_r5_8x16b, src_r6_8x16b);

                res_t1_4x32b = _mm_madd_epi16(src_r0r1_8x16b, coeff0_1_8x16b);
                res_t2_4x32b = _mm_madd_epi16(src_r2r3_8x16b, coeff2_3_8x16b);
                res_t3_4x32b = _mm_madd_epi16(src_r4r5_8x16b, coeff4_5_8x16b);

                res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t2_4x32b);
                res_t3_4x32b = _mm_add_epi32(res_t3_4x32b, const_val512_4x32b);
                res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t3_4x32b);
                res_c1_4x32b = _mm_srai_epi32(res_t1_4x32b, 10);

                res_8x16b = _mm_packs_epi32(res_c0_4x32b, res_c1_4x32b);
                res_16x8b = _mm_packus_epi16(res_8x16b, res_8x16b);

                _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd), res_16x8b);

                src_r0_8x16b = src_r2_8x16b;
                src_r1_8x16b = src_r3_8x16b;
                src_r2_8x16b = src_r4_8x16b;
                src_r3_8x16b = src_r5_8x16b;
                src_r4_8x16b = src_r6_8x16b;

                ht -= 2;
                pi2_temp += 16;
                pu1_dst += dst_strd << 1;
            }
            while(ht > 0);
        }
    }
    else // wd == 16
    {
        WORD16 *pi2_temp;
        WORD32 ht_tmp;

        pu1_tmp += 4;
        pu1_src -= src_strd << 1;
        pi2_temp = (WORD16 *)pu1_tmp;
        pu1_src -= 2; // the filter input starts from x[-2] (till x[3])

        // Horizontal 6-tap filtering
        {
            ht_tmp = ht + 5;

            __m128i src_r0_16x8b, src_r1_16x8b, src_r0_sht_16x8b, src_r1_sht_16x8b;
            __m128i src_r0_t1_16x8b, src_r1_t1_16x8b;

            __m128i res_r0_t1_8x16b, res_r0_t2_8x16b, res_r0_t3_8x16b;
            __m128i res_r1_t1_8x16b, res_r1_t2_8x16b, res_r1_t3_8x16b;

            __m128i coeff0_1_16x8b, coeff2_3_16x8b, coeff4_5_16x8b;

            coeff0_1_16x8b = _mm_set1_epi32(0xFB01FB01);  //c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1
            coeff2_3_16x8b = _mm_set1_epi32(0x14141414);  //c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3
            coeff4_5_16x8b = _mm_set1_epi32(0x01FB01FB);  //c4 c5 c5 c5 c4 c5 c5 c5 c4 c5 c5 c5 c4 c5 c5 c5
                                                          //c0 = c5 = 1, c1 = c4 = -5, c2 = c3 = 20
            //Row0 : a0 a1 a2 a3 a4 a5 a6 a7 a8 a9.....
            //Row0 :                         b0 b1 b2 b3 b4 b5 b6 b7 b8 b9.....
            //b0 is same a8. Similarly other bn pixels are same as a(n+8) pixels.

            do
            {
                src_r0_16x8b = _mm_loadu_si128((__m128i *)pu1_src);                      //a0 a1 a2 a3 a4 a5 a6 a7 a8 a9....a15
                src_r1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + 8));                //b0 b1 b2 b3 b4 b5 b6 b7 b8 b9....b15

                src_r0_sht_16x8b = _mm_srli_si128(src_r0_16x8b, 1);                      //a1 a2 a3 a4 a5 a6 a7 a8 a9....a15 0
                src_r1_sht_16x8b = _mm_srli_si128(src_r1_16x8b, 1);                      //b1 b2 b3 b4 b5 b6 b7 b8 b9....b15 0

                src_r0_t1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r0_sht_16x8b);     //a0 a1 a1 a2 a2 a3 a3 a4 a4 a5 a5 a6 a6 a7 a7 a8
                src_r1_t1_16x8b = _mm_unpacklo_epi8(src_r1_16x8b, src_r1_sht_16x8b);     //b0 b1 b1 b2 b2 b3 b3 b4 b4 b5 b5 b6 b6 b7 b7 b8

                res_r0_t1_8x16b = _mm_maddubs_epi16(src_r0_t1_16x8b, coeff0_1_16x8b);    //a0*c0+a1*c1 a1*c0+a2*c1 a2*c0+a3*c1 a3*c0+a4*c1
                                                                                         //a4*c0+a5*c1 a5*c0+a6*c1 a6*c0+a7*c1 a7*c0+a8*c1
                res_r1_t1_8x16b = _mm_maddubs_epi16(src_r1_t1_16x8b, coeff0_1_16x8b);    //b0*c0+b1*c1 b1*c0+b2*c1 b2*c0+b3*c1 b3*c0+b4*c1
                                                                                         //b4*c0+b5*c1 b5*c0+b6*c1 b6*c0+b7*c1 b7*c0+b8*c1

                src_r0_16x8b = _mm_srli_si128(src_r0_16x8b, 2);                          //a2 a3 a4 a5 a6 a7 a8 a9....a15 0 0
                src_r1_16x8b = _mm_srli_si128(src_r1_16x8b, 2);                          //b2 b3 b4 b5 b6 b7 b8 b9....b15 0 0

                src_r0_sht_16x8b = _mm_srli_si128(src_r0_sht_16x8b, 2);                  //a3 a4 a5 a6 a7 a8 a9....a15 0  0  0
                src_r1_sht_16x8b = _mm_srli_si128(src_r1_sht_16x8b, 2);                  //b3 b4 b5 b6 b7 b8 b9....b15 0  0  0

                src_r0_t1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r0_sht_16x8b);     //a2 a3 a3 a4 a4 a5 a5 a6 a6 a7 a7 a8 a8 a9 a9 a10
                src_r1_t1_16x8b = _mm_unpacklo_epi8(src_r1_16x8b, src_r1_sht_16x8b);     //b2 b3 b3 b4 b4 b5 b5 b6 b6 b7 b7 b8 a8 a9 a9 a10

                res_r0_t2_8x16b = _mm_maddubs_epi16(src_r0_t1_16x8b, coeff2_3_16x8b);    //a2*c2+a3*c3 a3*c2+a4*c3 a4*c2+a5*c3 a5*c2+a6*c3
                                                                                         //a6*c2+a7*c3 a7*c2+a8*c3 a8*c2+a9*c3 a9*c2+a10*c3
                res_r1_t2_8x16b = _mm_maddubs_epi16(src_r1_t1_16x8b, coeff2_3_16x8b);    //b2*c2+b3*c3 b3*c2+b4*c3 b2*c4+b5*c3 b5*c2+b6*c3
                                                                                         //b6*c2+b7*c3 b7*c2+b8*c3 b8*c2+b9*c3 b9*c2+b10*c3

                src_r0_16x8b = _mm_srli_si128(src_r0_16x8b, 2);                          //a4 a5 a6 a7 a8 a9....a15 0  0  0  0
                src_r1_16x8b = _mm_srli_si128(src_r1_16x8b, 2);                          //b4 b5 b6 b7 b8 b9....b15 0  0  0  0

                src_r0_sht_16x8b = _mm_srli_si128(src_r0_sht_16x8b, 2);                  //a5 a6 a7 a8 a9....a15 0  0  0  0  0
                src_r1_sht_16x8b = _mm_srli_si128(src_r1_sht_16x8b, 2);                  //b5 b6 b7 b8 b9....b15 0  0  0  0  0

                src_r0_t1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r0_sht_16x8b);     //a4 a5 a5 a6 a6 a7 a7 a8 a8 a9 a9 a10 a10 a11 a11 a12
                src_r1_t1_16x8b = _mm_unpacklo_epi8(src_r1_16x8b, src_r1_sht_16x8b);     //b4 b5 b5 b6 b6 b7 b7 b8 b8 b9 b9 b10 b10 b11 b11 b12

                res_r0_t3_8x16b = _mm_maddubs_epi16(src_r0_t1_16x8b, coeff4_5_16x8b);    //a4*c4+a5*c5 a5*c4+a6*c5 a6*c4+a7*c5 a7*c4+a8*c5
                                                                                         //a8*c4+a9*c5 a9*c4+a10*c5 a10*c4+a11*c5 a11*c4+a12*c5
                res_r1_t3_8x16b = _mm_maddubs_epi16(src_r1_t1_16x8b, coeff4_5_16x8b);    //b4*c4+b5*c5 b5*c4+b6*c5 b6*c4+b7*c5 b7*c4+b8*c5
                                                                                         //b8*c4+b9*c5 b9*c4+b10*c5 b10*c4+b11*c5 b11*c4+b12*c5
                res_r0_t1_8x16b = _mm_add_epi16(res_r0_t1_8x16b, res_r0_t2_8x16b);
                res_r0_t1_8x16b = _mm_add_epi16(res_r0_t1_8x16b, res_r0_t3_8x16b);

                res_r1_t1_8x16b = _mm_add_epi16(res_r1_t1_8x16b, res_r1_t2_8x16b);
                res_r1_t1_8x16b = _mm_add_epi16(res_r1_t1_8x16b, res_r1_t3_8x16b);

                _mm_storeu_si128((__m128i *)pi2_temp, res_r0_t1_8x16b);
                _mm_storeu_si128((__m128i *)(pi2_temp + 8), res_r1_t1_8x16b);

                ht_tmp--;
                pu1_src += src_strd;
                pi2_temp += 16;
            }
            while(ht_tmp > 0);
        }

        pi2_temp = (WORD16 *)pu1_tmp;

        // Vertical 6-tap filtering
        {
            WORD16 *pi2_temp2;
            UWORD8 *pu1_dst2;
            WORD32 ht_tmp;

            __m128i src_r0_8x16b, src_r1_8x16b, src_r2_8x16b, src_r3_8x16b, src_r4_8x16b;
            __m128i src_r5_8x16b, src_r6_8x16b;
            __m128i src_r0r1_8x16b, src_r2r3_8x16b, src_r4r5_8x16b;

            __m128i res_t1_4x32b, res_t2_4x32b, res_t3_4x32b;
            __m128i res_c0_4x32b, res_c1_4x32b;
            __m128i res_8x16b, res_16x8b;

            __m128i coeff0_1_8x16b, coeff2_3_8x16b, coeff4_5_8x16b;
            __m128i const_val512_4x32b;

            coeff0_1_8x16b = _mm_set1_epi32(0xFFFB0001);
            coeff2_3_8x16b = _mm_set1_epi32(0x00140014);
            coeff4_5_8x16b = _mm_set1_epi32(0x0001FFFB);

            const_val512_4x32b = _mm_set1_epi32(512);

            pi2_temp2 = pi2_temp + 8;
            pu1_dst2 = pu1_dst + 8;
            ht_tmp = ht;

            /**********************************************************/
            /*     Do first height x 8 block                          */
            /**********************************************************/
            src_r0_8x16b = _mm_loadu_si128((__m128i *)pi2_temp);
            src_r1_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp + 16));
            src_r2_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp + 32));
            src_r3_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp + 48));
            src_r4_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp + 64));
            pi2_temp += 80;

            do
            {
                src_r5_8x16b = _mm_loadu_si128((__m128i *)pi2_temp);
                src_r6_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp + 16));

                src_r0r1_8x16b = _mm_unpacklo_epi16(src_r0_8x16b, src_r1_8x16b);
                src_r2r3_8x16b = _mm_unpacklo_epi16(src_r2_8x16b, src_r3_8x16b);
                src_r4r5_8x16b = _mm_unpacklo_epi16(src_r4_8x16b, src_r5_8x16b);

                res_t1_4x32b = _mm_madd_epi16(src_r0r1_8x16b, coeff0_1_8x16b);
                res_t2_4x32b = _mm_madd_epi16(src_r2r3_8x16b, coeff2_3_8x16b);
                res_t3_4x32b = _mm_madd_epi16(src_r4r5_8x16b, coeff4_5_8x16b);

                res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t2_4x32b);
                res_t3_4x32b = _mm_add_epi32(res_t3_4x32b, const_val512_4x32b);
                res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t3_4x32b);
                res_c0_4x32b = _mm_srai_epi32(res_t1_4x32b, 10);

                src_r0r1_8x16b = _mm_unpackhi_epi16(src_r0_8x16b, src_r1_8x16b);
                src_r2r3_8x16b = _mm_unpackhi_epi16(src_r2_8x16b, src_r3_8x16b);
                src_r4r5_8x16b = _mm_unpackhi_epi16(src_r4_8x16b, src_r5_8x16b);

                res_t1_4x32b = _mm_madd_epi16(src_r0r1_8x16b, coeff0_1_8x16b);
                res_t2_4x32b = _mm_madd_epi16(src_r2r3_8x16b, coeff2_3_8x16b);
                res_t3_4x32b = _mm_madd_epi16(src_r4r5_8x16b, coeff4_5_8x16b);

                res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t2_4x32b);
                res_t3_4x32b = _mm_add_epi32(res_t3_4x32b, const_val512_4x32b);
                res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t3_4x32b);
                res_c1_4x32b = _mm_srai_epi32(res_t1_4x32b, 10);

                res_8x16b = _mm_packs_epi32(res_c0_4x32b, res_c1_4x32b);
                res_16x8b = _mm_packus_epi16(res_8x16b, res_8x16b);

                _mm_storel_epi64((__m128i *)pu1_dst, res_16x8b);

                src_r0r1_8x16b = _mm_unpacklo_epi16(src_r1_8x16b, src_r2_8x16b);
                src_r2r3_8x16b = _mm_unpacklo_epi16(src_r3_8x16b, src_r4_8x16b);
                src_r4r5_8x16b = _mm_unpacklo_epi16(src_r5_8x16b, src_r6_8x16b);

                res_t1_4x32b = _mm_madd_epi16(src_r0r1_8x16b, coeff0_1_8x16b);
                res_t2_4x32b = _mm_madd_epi16(src_r2r3_8x16b, coeff2_3_8x16b);
                res_t3_4x32b = _mm_madd_epi16(src_r4r5_8x16b, coeff4_5_8x16b);

                res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t2_4x32b);
                res_t3_4x32b = _mm_add_epi32(res_t3_4x32b, const_val512_4x32b);
                res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t3_4x32b);
                res_c0_4x32b = _mm_srai_epi32(res_t1_4x32b, 10);

                src_r0r1_8x16b = _mm_unpackhi_epi16(src_r1_8x16b, src_r2_8x16b);
                src_r2r3_8x16b = _mm_unpackhi_epi16(src_r3_8x16b, src_r4_8x16b);
                src_r4r5_8x16b = _mm_unpackhi_epi16(src_r5_8x16b, src_r6_8x16b);

                res_t1_4x32b = _mm_madd_epi16(src_r0r1_8x16b, coeff0_1_8x16b);
                res_t2_4x32b = _mm_madd_epi16(src_r2r3_8x16b, coeff2_3_8x16b);
                res_t3_4x32b = _mm_madd_epi16(src_r4r5_8x16b, coeff4_5_8x16b);

                res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t2_4x32b);
                res_t3_4x32b = _mm_add_epi32(res_t3_4x32b, const_val512_4x32b);
                res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t3_4x32b);
                res_c1_4x32b = _mm_srai_epi32(res_t1_4x32b, 10);

                res_8x16b = _mm_packs_epi32(res_c0_4x32b, res_c1_4x32b);
                res_16x8b = _mm_packus_epi16(res_8x16b, res_8x16b);

                _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd), res_16x8b);

                src_r0_8x16b = src_r2_8x16b;
                src_r1_8x16b = src_r3_8x16b;
                src_r2_8x16b = src_r4_8x16b;
                src_r3_8x16b = src_r5_8x16b;
                src_r4_8x16b = src_r6_8x16b;

                ht_tmp -= 2;
                pi2_temp += 32;
                pu1_dst += dst_strd << 1;
            }
            while(ht_tmp > 0);

            /**********************************************************/
            /*     Do second height x 8 block                         */
            /**********************************************************/
            src_r0_8x16b = _mm_loadu_si128((__m128i *)pi2_temp2);
            src_r1_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp2 + 16));
            src_r2_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp2 + 32));
            src_r3_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp2 + 48));
            src_r4_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp2 + 64));
            pi2_temp2 += 80;

            do
            {
                src_r5_8x16b = _mm_loadu_si128((__m128i *)pi2_temp2);
                src_r6_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp2 + 16));

                src_r0r1_8x16b = _mm_unpacklo_epi16(src_r0_8x16b, src_r1_8x16b);
                src_r2r3_8x16b = _mm_unpacklo_epi16(src_r2_8x16b, src_r3_8x16b);
                src_r4r5_8x16b = _mm_unpacklo_epi16(src_r4_8x16b, src_r5_8x16b);

                res_t1_4x32b = _mm_madd_epi16(src_r0r1_8x16b, coeff0_1_8x16b);
                res_t2_4x32b = _mm_madd_epi16(src_r2r3_8x16b, coeff2_3_8x16b);
                res_t3_4x32b = _mm_madd_epi16(src_r4r5_8x16b, coeff4_5_8x16b);

                res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t2_4x32b);
                res_t3_4x32b = _mm_add_epi32(res_t3_4x32b, const_val512_4x32b);
                res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t3_4x32b);
                res_c0_4x32b = _mm_srai_epi32(res_t1_4x32b, 10);

                src_r0r1_8x16b = _mm_unpackhi_epi16(src_r0_8x16b, src_r1_8x16b);
                src_r2r3_8x16b = _mm_unpackhi_epi16(src_r2_8x16b, src_r3_8x16b);
                src_r4r5_8x16b = _mm_unpackhi_epi16(src_r4_8x16b, src_r5_8x16b);

                res_t1_4x32b = _mm_madd_epi16(src_r0r1_8x16b, coeff0_1_8x16b);
                res_t2_4x32b = _mm_madd_epi16(src_r2r3_8x16b, coeff2_3_8x16b);
                res_t3_4x32b = _mm_madd_epi16(src_r4r5_8x16b, coeff4_5_8x16b);

                res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t2_4x32b);
                res_t3_4x32b = _mm_add_epi32(res_t3_4x32b, const_val512_4x32b);
                res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t3_4x32b);
                res_c1_4x32b = _mm_srai_epi32(res_t1_4x32b, 10);

                res_8x16b = _mm_packs_epi32(res_c0_4x32b, res_c1_4x32b);
                res_16x8b = _mm_packus_epi16(res_8x16b, res_8x16b);

                _mm_storel_epi64((__m128i *)pu1_dst2, res_16x8b);

                src_r0r1_8x16b = _mm_unpacklo_epi16(src_r1_8x16b, src_r2_8x16b);
                src_r2r3_8x16b = _mm_unpacklo_epi16(src_r3_8x16b, src_r4_8x16b);
                src_r4r5_8x16b = _mm_unpacklo_epi16(src_r5_8x16b, src_r6_8x16b);

                res_t1_4x32b = _mm_madd_epi16(src_r0r1_8x16b, coeff0_1_8x16b);
                res_t2_4x32b = _mm_madd_epi16(src_r2r3_8x16b, coeff2_3_8x16b);
                res_t3_4x32b = _mm_madd_epi16(src_r4r5_8x16b, coeff4_5_8x16b);

                res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t2_4x32b);
                res_t3_4x32b = _mm_add_epi32(res_t3_4x32b, const_val512_4x32b);
                res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t3_4x32b);
                res_c0_4x32b = _mm_srai_epi32(res_t1_4x32b, 10);

                src_r0r1_8x16b = _mm_unpackhi_epi16(src_r1_8x16b, src_r2_8x16b);
                src_r2r3_8x16b = _mm_unpackhi_epi16(src_r3_8x16b, src_r4_8x16b);
                src_r4r5_8x16b = _mm_unpackhi_epi16(src_r5_8x16b, src_r6_8x16b);

                res_t1_4x32b = _mm_madd_epi16(src_r0r1_8x16b, coeff0_1_8x16b);
                res_t2_4x32b = _mm_madd_epi16(src_r2r3_8x16b, coeff2_3_8x16b);
                res_t3_4x32b = _mm_madd_epi16(src_r4r5_8x16b, coeff4_5_8x16b);

                res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t2_4x32b);
                res_t3_4x32b = _mm_add_epi32(res_t3_4x32b, const_val512_4x32b);
                res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t3_4x32b);
                res_c1_4x32b = _mm_srai_epi32(res_t1_4x32b, 10);

                res_8x16b = _mm_packs_epi32(res_c0_4x32b, res_c1_4x32b);
                res_16x8b = _mm_packus_epi16(res_8x16b, res_8x16b);

                _mm_storel_epi64((__m128i *)(pu1_dst2 + dst_strd), res_16x8b);

                src_r0_8x16b = src_r2_8x16b;
                src_r1_8x16b = src_r3_8x16b;
                src_r2_8x16b = src_r4_8x16b;
                src_r3_8x16b = src_r5_8x16b;
                src_r4_8x16b = src_r6_8x16b;

                ht -= 2;
                pi2_temp2 += 32;
                pu1_dst2 += dst_strd << 1;
            }
            while(ht > 0);
        }
    }
}

/*****************************************************************************/
/*                                                                           */
/*  Function Name : ih264_inter_pred_luma_horz_qpel_ssse3                    */
/*                                                                           */
/*  Description   : This function implements a six-tap filter horizontally   */
/*                  on ht x wd block and averages the values with the source */
/*                  pixels to calculate horizontal quarter-pel as mentioned  */
/*                  in sec. 8.4.2.2.1 titled "Luma sample interpolation      */
/*                  process". (ht,wd) can be (4,4), (8,4), (4,8), (8,8),     */
/*                  (16,8), (8,16) or (16,16).                               */
/*                                                                           */
/*  Inputs        : pu1_src  - pointer to source                             */
/*                  pu1_dst  - pointer to destination                        */
/*                  src_strd - stride for source                             */
/*                  dst_strd - stride for destination                        */
/*                  ht       - height of the block                           */
/*                  wd       - width of the block                            */
/*                  pu1_tmp  - pointer to temporary buffer (unused)          */
/*                  dydx     - x and y reference offset for q-pel            */
/*                             calculations                                  */
/*                                                                           */
/*  Issues        : None                                                     */
/*                                                                           */
/*  Revision History:                                                        */
/*                                                                           */
/*         DD MM YYYY   Author(s)       Changes                              */
/*         13 02 2015   Kaushik         Initial Version                      */
/*                      Senthoor                                             */
/*                                                                           */
/*****************************************************************************/
void ih264_inter_pred_luma_horz_qpel_ssse3(UWORD8 *pu1_src,
                                           UWORD8 *pu1_dst,
                                           WORD32 src_strd,
                                           WORD32 dst_strd,
                                           WORD32 ht,
                                           WORD32 wd,
                                           UWORD8* pu1_tmp,
                                           WORD32 dydx)
{
    WORD32 x_offset;
    UWORD8 *pu1_pred1;

    __m128i src_r0_16x8b, src_r1_16x8b;
    __m128i src_r0_sht_16x8b, src_r1_sht_16x8b;
    __m128i coeff0_1_16x8b, coeff2_3_16x8b, coeff4_5_16x8b;
    __m128i const_val16_8x16b;

    UNUSED(pu1_tmp);

    x_offset = dydx & 3;

    coeff0_1_16x8b = _mm_set1_epi32(0xFB01FB01); //c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1
    coeff2_3_16x8b = _mm_set1_epi32(0x14141414); //c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3
    coeff4_5_16x8b = _mm_set1_epi32(0x01FB01FB); //c4 c5 c4 c5 c4 c5 c4 c5 c4 c5 c4 c5 c4 c5 c4 c5
                                                 //c0 = c5 = 1, c1 = c4 = -5, c2 = c3 = 20
    pu1_pred1 = pu1_src + (x_offset >> 1);

    const_val16_8x16b = _mm_set1_epi16(16);

    pu1_src -= 2; // the filter input starts from x[-2] (till x[3])

    if(wd == 4)
    {
        __m128i src_r0r1_16x8b;

        __m128i res_r0r1_t1_8x16b, res_r0r1_t2_8x16b, res_r0r1_t3_8x16b;
        __m128i res_r0r1_16x8b;

        __m128i mask_full_16x8b, mask_low32b;

        mask_full_16x8b = _mm_set1_epi8(0xff);
        mask_low32b = _mm_srli_si128(mask_full_16x8b, 12); // mask for first four bytes

        //Row0 : a0 a1 a2 a3 a4 a5 a6 a7 a8 a9.....
        //Row1 : b0 b1 b2 b3 b4 b5 b6 b7 b8 b9.....

        do
        {
            src_r0_16x8b = _mm_loadu_si128((__m128i *)pu1_src);                         //a0 a1 a2 a3 a4 a5 a6 a7 a8 a9....a15
            src_r1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + src_strd));            //b0 b1 b2 b3 b4 b5 b6 b7 b8 b9....b15

            src_r0_sht_16x8b = _mm_srli_si128(src_r0_16x8b, 1);                         //a1 a2 a3 a4 a5 a6 a7 a8 a9....a15 0
            src_r1_sht_16x8b = _mm_srli_si128(src_r1_16x8b, 1);                         //b1 b2 b3 b4 b5 b6 b7 b8 b9....b15 0

            src_r0_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r0_sht_16x8b);           //a0 a1 a1 a2 a2 a3 a3 a4 a4 a5 a5 a6 a6 a7 a7 a8
            src_r1_16x8b = _mm_unpacklo_epi8(src_r1_16x8b, src_r1_sht_16x8b);           //b0 b1 b1 b2 b2 b3 b3 b4 b4 b5 b5 b6 b6 b7 b7 b8

            src_r0r1_16x8b = _mm_unpacklo_epi64(src_r0_16x8b, src_r1_16x8b);            //a0 a1 a1 a2 a2 a3 a3 a4 b0 b1 b1 b2 b2 b3 b3 b4
            res_r0r1_t1_8x16b = _mm_maddubs_epi16(src_r0r1_16x8b, coeff0_1_16x8b);      //a0*c0+a1*c1 a1*c0+a2*c1 a2*c0+a3*c1 a3*c0+a4*c1
                                                                                        //b0*c0+b1*c1 b1*c0+b2*c1 b2*c0+b3*c1 b3*c0+b4*c1

            src_r0_16x8b = _mm_srli_si128(src_r0_16x8b, 4);                             //a2 a3 a3 a4 a4 a5 a5 a6 a6 a7 a7 a8  0  0  0  0
            src_r1_16x8b = _mm_srli_si128(src_r1_16x8b, 4);                             //b2 b3 b3 b4 b4 b5 b5 b6 b6 b7 b7 b8  0  0  0  0

            src_r0r1_16x8b = _mm_unpacklo_epi64(src_r0_16x8b, src_r1_16x8b);            //a2 a3 a3 a4 a4 a5 a5 a6 b2 b3 b3 b4 b4 b5 b5 b6
            res_r0r1_t2_8x16b = _mm_maddubs_epi16(src_r0r1_16x8b, coeff2_3_16x8b);      //a2*c2+a3*c3 a3*c2+a4*c3 a4*c2+a5*c3 a5*c2+a6*c3
                                                                                        //b2*c2+b3*c3 b3*c2+b4*c3 b4*c2+b5*c3 b5*c2+b6*c3

            src_r0_16x8b = _mm_srli_si128(src_r0_16x8b, 4);                             //a4 a5 a5 a6 a6 a7 a7 a8  0  0  0  0  0  0  0  0
            src_r1_16x8b = _mm_srli_si128(src_r1_16x8b, 4);                             //b4 b5 b5 b6 b6 b7 b7 b8  0  0  0  0  0  0  0  0

            src_r0r1_16x8b = _mm_unpacklo_epi64(src_r0_16x8b, src_r1_16x8b);            //a4 a5 a5 a6 a6 a7 a7 a8 b4 b5 b5 b6 b6 b7 b7 b8
            res_r0r1_t3_8x16b = _mm_maddubs_epi16(src_r0r1_16x8b, coeff4_5_16x8b);      //a4*c4+a5*c5 a5*c4+a6*c5 a6*c4+a7*c5 a7*c4+a8*c5
                                                                                        //b4*c4+b5*c5 b5*c4+b6*c5 b4*c6+b7*c5 b7*c4+b8*c5
            src_r0_16x8b = _mm_loadl_epi64((__m128i *)pu1_pred1);
            src_r1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_pred1 + src_strd));

            res_r0r1_t1_8x16b = _mm_add_epi16(res_r0r1_t1_8x16b, res_r0r1_t2_8x16b);
            res_r0r1_t3_8x16b = _mm_add_epi16(const_val16_8x16b, res_r0r1_t3_8x16b);
            res_r0r1_t1_8x16b = _mm_add_epi16(res_r0r1_t1_8x16b, res_r0r1_t3_8x16b);    //a0*c0+a1*c1+a2*c2+a3*c3+a4*a4+a5*c5 + 16;
                                                                                        //a1*c0+a2*c1+a2*c2+a3*c3+a5*a4+a6*c5 + 16;
                                                                                        //a2*c0+a3*c1+a4*c2+a5*c3+a6*a4+a7*c5 + 16;
                                                                                        //a3*c0+a4*c1+a5*c2+a6*c3+a6*a4+a8*c5 + 16;
                                                                                        //b0*c0+b1*c1+b2*c2+b3*c3+b4*b4+b5*c5 + 16;
                                                                                        //b1*c0+b2*c1+b2*c2+b3*c3+b5*b4+b6*c5 + 16;
                                                                                        //b2*c0+b3*c1+b4*c2+b5*c3+b6*b4+b7*c5 + 16;
                                                                                        //b3*c0+b4*c1+b5*c2+b6*c3+b6*b4+b8*c5 + 16;
            src_r0r1_16x8b = _mm_unpacklo_epi32(src_r0_16x8b,src_r1_16x8b);

            res_r0r1_t1_8x16b = _mm_srai_epi16(res_r0r1_t1_8x16b, 5);                   //shifting right by 5 bits.

            res_r0r1_16x8b = _mm_packus_epi16(res_r0r1_t1_8x16b, res_r0r1_t1_8x16b);
            res_r0r1_16x8b = _mm_avg_epu8(src_r0r1_16x8b, res_r0r1_16x8b);              //computing q-pel

            _mm_maskmoveu_si128(res_r0r1_16x8b, mask_low32b, (char*)pu1_dst);
            res_r0r1_16x8b = _mm_srli_si128(res_r0r1_16x8b, 4);
            _mm_maskmoveu_si128(res_r0r1_16x8b, mask_low32b, (char*)(pu1_dst + dst_strd));

            ht -= 2;
            pu1_src += src_strd << 1;
            pu1_pred1 += src_strd << 1;
            pu1_dst += dst_strd << 1;
        }
        while(ht > 0);
    }
    else if(wd == 8)
    {
        __m128i src_r0_t1_16x8b, src_r1_t1_16x8b;

        __m128i res_r0_t1_8x16b, res_r0_t2_8x16b, res_r0_t3_8x16b;
        __m128i res_r1_t1_8x16b, res_r1_t2_8x16b, res_r1_t3_8x16b;
        __m128i res_r0_16x8b, res_r1_16x8b;

        //Row0 : a0 a1 a2 a3 a4 a5 a6 a7 a8 a9.....
        //Row1 : b0 b1 b2 b3 b4 b5 b6 b7 b8 b9.....

        do
        {
            src_r0_16x8b = _mm_loadu_si128((__m128i *)pu1_src);                      //a0 a1 a2 a3 a4 a5 a6 a7 a8 a9....a15
            src_r1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + src_strd));         //b0 b1 b2 b3 b4 b5 b6 b7 b8 b9....b15

            src_r0_sht_16x8b = _mm_srli_si128(src_r0_16x8b, 1);                      //a1 a2 a3 a4 a5 a6 a7 a8 a9....a15 0
            src_r1_sht_16x8b = _mm_srli_si128(src_r1_16x8b, 1);                      //b1 b2 b3 b4 b5 b6 b7 b8 b9....b15 0

            src_r0_t1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r0_sht_16x8b);     //a0 a1 a1 a2 a2 a3 a3 a4 a4 a5 a5 a6 a6 a7 a7 a8
            src_r1_t1_16x8b = _mm_unpacklo_epi8(src_r1_16x8b, src_r1_sht_16x8b);     //b0 b1 b1 b2 b2 b3 b3 b4 b4 b5 b5 b6 b6 b7 b7 b8

            res_r0_t1_8x16b = _mm_maddubs_epi16(src_r0_t1_16x8b, coeff0_1_16x8b);    //a0*c0+a1*c1 a1*c0+a2*c1 a2*c0+a3*c1 a3*c0+a4*c1
                                                                                     //a4*c0+a5*c1 a5*c0+a6*c1 a6*c0+a7*c1 a7*c0+a8*c1
            res_r1_t1_8x16b = _mm_maddubs_epi16(src_r1_t1_16x8b, coeff0_1_16x8b);    //b0*c0+b1*c1 b1*c0+b2*c1 b2*c0+b3*c1 b3*c0+b4*c1
                                                                                     //b4*c0+b5*c1 b5*c0+b6*c1 b6*c0+b7*c1 b7*c0+b8*c1

            src_r0_16x8b = _mm_srli_si128(src_r0_16x8b, 2);                          //a2 a3 a4 a5 a6 a7 a8 a9....a15 0 0
            src_r1_16x8b = _mm_srli_si128(src_r1_16x8b, 2);                          //b2 b3 b4 b5 b6 b7 b8 b9....b15 0 0

            src_r0_sht_16x8b = _mm_srli_si128(src_r0_sht_16x8b, 2);                  //a3 a4 a5 a6 a7 a8 a9....a15 0  0  0
            src_r1_sht_16x8b = _mm_srli_si128(src_r1_sht_16x8b, 2);                  //b3 b4 b5 b6 b7 b8 b9....b15 0  0  0

            src_r0_t1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r0_sht_16x8b);     //a2 a3 a3 a4 a4 a5 a5 a6 a6 a7 a7 a8 a8 a9 a9 a10
            src_r1_t1_16x8b = _mm_unpacklo_epi8(src_r1_16x8b, src_r1_sht_16x8b);     //b2 b3 b3 b4 b4 b5 b5 b6 b6 b7 b7 b8 a8 a9 a9 a10

            res_r0_t2_8x16b = _mm_maddubs_epi16(src_r0_t1_16x8b, coeff2_3_16x8b);    //a2*c2+a3*c3 a3*c2+a4*c3 a4*c2+a5*c3 a5*c2+a6*c3
                                                                                     //a6*c2+a7*c3 a7*c2+a8*c3 a8*c2+a9*c3 a9*c2+a10*c3
            res_r1_t2_8x16b = _mm_maddubs_epi16(src_r1_t1_16x8b, coeff2_3_16x8b);    //b2*c2+b3*c3 b3*c2+b4*c3 b2*c4+b5*c3 b5*c2+b6*c3
                                                                                     //b6*c2+b7*c3 b7*c2+b8*c3 b8*c2+b9*c3 b9*c2+b10*c3

            src_r0_16x8b = _mm_srli_si128(src_r0_16x8b, 2);                          //a4 a5 a6 a7 a8 a9....a15 0  0  0  0
            src_r1_16x8b = _mm_srli_si128(src_r1_16x8b, 2);                          //b4 b5 b6 b7 b8 b9....b15 0  0  0  0

            src_r0_sht_16x8b = _mm_srli_si128(src_r0_sht_16x8b, 2);                  //a5 a6 a7 a8 a9....a15 0  0  0  0  0
            src_r1_sht_16x8b = _mm_srli_si128(src_r1_sht_16x8b, 2);                  //b5 b6 b7 b8 b9....b15 0  0  0  0  0

            src_r0_t1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r0_sht_16x8b);     //a4 a5 a5 a6 a6 a7 a7 a8 a8 a9 a9 a10 a10 a11 a11 a12
            src_r1_t1_16x8b = _mm_unpacklo_epi8(src_r1_16x8b, src_r1_sht_16x8b);     //b4 b5 b5 b6 b6 b7 b7 b8 b8 b9 b9 b10 b10 b11 b11 b12

            res_r0_t3_8x16b = _mm_maddubs_epi16(src_r0_t1_16x8b, coeff4_5_16x8b);    //a4*c4+a5*c5 a5*c4+a6*c5 a6*c4+a7*c5 a7*c4+a8*c5
                                                                                     //a8*c4+a9*c5 a9*c4+a10*c5 a10*c4+a11*c5 a11*c4+a12*c5
            res_r1_t3_8x16b = _mm_maddubs_epi16(src_r1_t1_16x8b, coeff4_5_16x8b);    //b4*c4+b5*c5 b5*c4+b6*c5 b6*c4+b7*c5 b7*c4+b8*c5
                                                                                     //b8*c4+b9*c5 b9*c4+b10*c5 b10*c4+b11*c5 b11*c4+b12*c5
            src_r0_16x8b = _mm_loadl_epi64((__m128i *)pu1_pred1);
            src_r1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_pred1 + src_strd));

            res_r0_t1_8x16b = _mm_add_epi16(res_r0_t1_8x16b, res_r0_t2_8x16b);
            res_r1_t1_8x16b = _mm_add_epi16(res_r1_t1_8x16b, res_r1_t2_8x16b);
            res_r0_t3_8x16b = _mm_add_epi16(const_val16_8x16b, res_r0_t3_8x16b);
            res_r1_t3_8x16b = _mm_add_epi16(const_val16_8x16b, res_r1_t3_8x16b);
            res_r0_t1_8x16b = _mm_add_epi16(res_r0_t1_8x16b, res_r0_t3_8x16b);
            res_r1_t1_8x16b = _mm_add_epi16(res_r1_t1_8x16b, res_r1_t3_8x16b);

            res_r0_t1_8x16b = _mm_srai_epi16(res_r0_t1_8x16b, 5);
            res_r1_t1_8x16b = _mm_srai_epi16(res_r1_t1_8x16b, 5);                    //shifting right by 5 bits.

            res_r0_16x8b = _mm_packus_epi16(res_r0_t1_8x16b, res_r0_t1_8x16b);
            res_r1_16x8b = _mm_packus_epi16(res_r1_t1_8x16b, res_r1_t1_8x16b);

            res_r0_16x8b = _mm_avg_epu8(src_r0_16x8b, res_r0_16x8b);
            res_r1_16x8b = _mm_avg_epu8(src_r1_16x8b, res_r1_16x8b);                 //computing q-pel

            _mm_storel_epi64((__m128i *)pu1_dst, res_r0_16x8b);
            _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd), res_r1_16x8b);

            ht -= 2;
            pu1_src += src_strd << 1;
            pu1_pred1 += src_strd << 1;
            pu1_dst += dst_strd << 1;
        }
        while(ht > 0);
    }
    else // wd == 16
    {
        __m128i src_r0_t1_16x8b, src_r1_t1_16x8b;

        __m128i res_r0_t1_8x16b, res_r0_t2_8x16b, res_r0_t3_8x16b;
        __m128i res_r1_t1_8x16b, res_r1_t2_8x16b, res_r1_t3_8x16b;
        __m128i res_16x8b;

        //Row0 : a0 a1 a2 a3 a4 a5 a6 a7 a8 a9.....
        //Row0 :                         b0 b1 b2 b3 b4 b5 b6 b7 b8 b9.....
        //b0 is same a8. Similarly other bn pixels are same as a(n+8) pixels.

        do
        {
            src_r0_16x8b = _mm_loadu_si128((__m128i *)pu1_src);                      //a0 a1 a2 a3 a4 a5 a6 a7 a8 a9....a15
            src_r1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + 8));                //b0 b1 b2 b3 b4 b5 b6 b7 b8 b9....b15

            src_r0_sht_16x8b = _mm_srli_si128(src_r0_16x8b, 1);                      //a1 a2 a3 a4 a5 a6 a7 a8 a9....a15 0
            src_r1_sht_16x8b = _mm_srli_si128(src_r1_16x8b, 1);                      //b1 b2 b3 b4 b5 b6 b7 b8 b9....b15 0

            src_r0_t1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r0_sht_16x8b);     //a0 a1 a1 a2 a2 a3 a3 a4 a4 a5 a5 a6 a6 a7 a7 a8
            src_r1_t1_16x8b = _mm_unpacklo_epi8(src_r1_16x8b, src_r1_sht_16x8b);     //b0 b1 b1 b2 b2 b3 b3 b4 b4 b5 b5 b6 b6 b7 b7 b8

            res_r0_t1_8x16b = _mm_maddubs_epi16(src_r0_t1_16x8b, coeff0_1_16x8b);    //a0*c0+a1*c1 a1*c0+a2*c1 a2*c0+a3*c1 a3*c0+a4*c1
                                                                                     //a4*c0+a5*c1 a5*c0+a6*c1 a6*c0+a7*c1 a7*c0+a8*c1
            res_r1_t1_8x16b = _mm_maddubs_epi16(src_r1_t1_16x8b, coeff0_1_16x8b);    //b0*c0+b1*c1 b1*c0+b2*c1 b2*c0+b3*c1 b3*c0+b4*c1
                                                                                     //b4*c0+b5*c1 b5*c0+b6*c1 b6*c0+b7*c1 b7*c0+b8*c1

            src_r0_16x8b = _mm_srli_si128(src_r0_16x8b, 2);                          //a2 a3 a4 a5 a6 a7 a8 a9....a15 0 0
            src_r1_16x8b = _mm_srli_si128(src_r1_16x8b, 2);                          //b2 b3 b4 b5 b6 b7 b8 b9....b15 0 0

            src_r0_sht_16x8b = _mm_srli_si128(src_r0_sht_16x8b, 2);                  //a3 a4 a5 a6 a7 a8 a9....a15 0  0  0
            src_r1_sht_16x8b = _mm_srli_si128(src_r1_sht_16x8b, 2);                  //b3 b4 b5 b6 b7 b8 b9....b15 0  0  0

            src_r0_t1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r0_sht_16x8b);     //a2 a3 a3 a4 a4 a5 a5 a6 a6 a7 a7 a8 a8 a9 a9 a10
            src_r1_t1_16x8b = _mm_unpacklo_epi8(src_r1_16x8b, src_r1_sht_16x8b);     //b2 b3 b3 b4 b4 b5 b5 b6 b6 b7 b7 b8 a8 a9 a9 a10

            res_r0_t2_8x16b = _mm_maddubs_epi16(src_r0_t1_16x8b, coeff2_3_16x8b);    //a2*c2+a3*c3 a3*c2+a4*c3 a4*c2+a5*c3 a5*c2+a6*c3
                                                                                     //a6*c2+a7*c3 a7*c2+a8*c3 a8*c2+a9*c3 a9*c2+a10*c3
            res_r1_t2_8x16b = _mm_maddubs_epi16(src_r1_t1_16x8b, coeff2_3_16x8b);    //b2*c2+b3*c3 b3*c2+b4*c3 b2*c4+b5*c3 b5*c2+b6*c3
                                                                                     //b6*c2+b7*c3 b7*c2+b8*c3 b8*c2+b9*c3 b9*c2+b10*c3

            src_r0_16x8b = _mm_srli_si128(src_r0_16x8b, 2);                          //a4 a5 a6 a7 a8 a9....a15 0  0  0  0
            src_r1_16x8b = _mm_srli_si128(src_r1_16x8b, 2);                          //b4 b5 b6 b7 b8 b9....b15 0  0  0  0

            src_r0_sht_16x8b = _mm_srli_si128(src_r0_sht_16x8b, 2);                  //a5 a6 a7 a8 a9....a15 0  0  0  0  0
            src_r1_sht_16x8b = _mm_srli_si128(src_r1_sht_16x8b, 2);                  //b5 b6 b7 b8 b9....b15 0  0  0  0  0

            src_r0_t1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r0_sht_16x8b);     //a4 a5 a5 a6 a6 a7 a7 a8 a8 a9 a9 a10 a10 a11 a11 a12
            src_r1_t1_16x8b = _mm_unpacklo_epi8(src_r1_16x8b, src_r1_sht_16x8b);     //b4 b5 b5 b6 b6 b7 b7 b8 b8 b9 b9 b10 b10 b11 b11 b12

            res_r0_t3_8x16b = _mm_maddubs_epi16(src_r0_t1_16x8b, coeff4_5_16x8b);    //a4*c4+a5*c5 a5*c4+a6*c5 a6*c4+a7*c5 a7*c4+a8*c5
                                                                                     //a8*c4+a9*c5 a9*c4+a10*c5 a10*c4+a11*c5 a11*c4+a12*c5
            res_r1_t3_8x16b = _mm_maddubs_epi16(src_r1_t1_16x8b, coeff4_5_16x8b);    //b4*c4+b5*c5 b5*c4+b6*c5 b6*c4+b7*c5 b7*c4+b8*c5
                                                                                     //b8*c4+b9*c5 b9*c4+b10*c5 b10*c4+b11*c5 b11*c4+b12*c5
            src_r0_16x8b = _mm_loadu_si128((__m128i *)pu1_pred1);

            res_r0_t1_8x16b = _mm_add_epi16(res_r0_t1_8x16b, res_r0_t2_8x16b);
            res_r1_t1_8x16b = _mm_add_epi16(res_r1_t1_8x16b, res_r1_t2_8x16b);
            res_r0_t3_8x16b = _mm_add_epi16(const_val16_8x16b, res_r0_t3_8x16b);
            res_r1_t3_8x16b = _mm_add_epi16(const_val16_8x16b, res_r1_t3_8x16b);
            res_r0_t1_8x16b = _mm_add_epi16(res_r0_t1_8x16b, res_r0_t3_8x16b);
            res_r1_t1_8x16b = _mm_add_epi16(res_r1_t1_8x16b, res_r1_t3_8x16b);

            res_r0_t1_8x16b = _mm_srai_epi16(res_r0_t1_8x16b, 5);
            res_r1_t1_8x16b = _mm_srai_epi16(res_r1_t1_8x16b, 5);                    //shifting right by 5 bits

            res_16x8b = _mm_packus_epi16(res_r0_t1_8x16b, res_r1_t1_8x16b);
            res_16x8b = _mm_avg_epu8(src_r0_16x8b, res_16x8b);                       //computing q-pel

            _mm_storeu_si128((__m128i *)pu1_dst, res_16x8b);

            ht--;
            pu1_src += src_strd;
            pu1_pred1 += src_strd;
            pu1_dst += dst_strd;
        }
        while(ht > 0);
    }
}

/*
 * File-local helper for ih264_inter_pred_luma_vert_qpel_ssse3().
 *
 * Each argument holds byte-interleaved samples of two vertically adjacent
 * taps: (x[-2],x[-1]), (x[0],x[1]) and (x[2],x[3]) respectively.
 * _mm_maddubs_epi16 multiplies every unsigned sample byte by the matching
 * signed coefficient byte and sums each byte pair into one 16-bit lane, so the
 * three products added together give the six-tap sum per output pixel.
 * The rounding constant 16 is added and every lane is arithmetic-shifted right
 * by 5. The result is NOT yet clipped to 8 bits; callers do that with
 * _mm_packus_epi16.
 */
static inline __m128i ih264_vert_qpel_filt6_ssse3(__m128i taps01_16x8b,
                                                  __m128i taps23_16x8b,
                                                  __m128i taps45_16x8b)
{
    //coefficient byte pairs replicated across the register
    //c0 = c5 = 1, c1 = c4 = -5, c2 = c3 = 20
    const __m128i c0c1_16x8b = _mm_set1_epi32(0xFB01FB01); //c0 c1 c0 c1 ...
    const __m128i c2c3_16x8b = _mm_set1_epi32(0x14141414); //c2 c3 c2 c3 ...
    const __m128i c4c5_16x8b = _mm_set1_epi32(0x01FB01FB); //c4 c5 c4 c5 ...
    const __m128i round16_8x16b = _mm_set1_epi16(16);

    __m128i acc_8x16b, tail_8x16b;

    acc_8x16b = _mm_add_epi16(_mm_maddubs_epi16(taps01_16x8b, c0c1_16x8b),
                              _mm_maddubs_epi16(taps23_16x8b, c2c3_16x8b));
    tail_8x16b = _mm_add_epi16(round16_8x16b,
                               _mm_maddubs_epi16(taps45_16x8b, c4c5_16x8b));
    acc_8x16b = _mm_add_epi16(acc_8x16b, tail_8x16b);

    return _mm_srai_epi16(acc_8x16b, 5);
}

/*****************************************************************************/
/*                                                                           */
/*  Function Name : ih264_inter_pred_luma_vert_qpel_ssse3                    */
/*                                                                           */
/*  Description   : Applies the six-tap filter (1,-5,20,20,-5,1) vertically  */
/*                  on a ht x wd block, rounds (+16, >>5) and clips to 8     */
/*                  bits, then averages the result with the source row       */
/*                  picked by bit 3 of dydx (same row, or one row below)     */
/*                  to form the vertical quarter-pel sample of sec.          */
/*                  8.4.2.2.1 "Luma sample interpolation process".           */
/*                  (ht,wd) can be (4,4), (8,4), (4,8), (8,8), (16,8),       */
/*                  (8,16) or (16,16). Two rows are produced per loop        */
/*                  pass, so ht is expected to be even.                      */
/*                                                                           */
/*  Inputs        : pu1_src  - pointer to source                             */
/*                  pu1_dst  - pointer to destination                        */
/*                  src_strd - stride for source                             */
/*                  dst_strd - stride for destination                        */
/*                  ht       - height of the block                           */
/*                  wd       - width of the block                            */
/*                  pu1_tmp  - pointer to temporary buffer (unused)          */
/*                  dydx     - x and y reference offset for q-pel            */
/*                             calculations; only bit 3 is read here         */
/*                                                                           */
/*  Issues        : None                                                     */
/*                                                                           */
/*  Revision History:                                                        */
/*                                                                           */
/*         DD MM YYYY   Author(s)       Changes                              */
/*         13 02 2015   Kaushik         Initial Version                      */
/*                      Senthoor                                             */
/*                                                                           */
/*****************************************************************************/
void ih264_inter_pred_luma_vert_qpel_ssse3(UWORD8 *pu1_src,
                                           UWORD8 *pu1_dst,
                                           WORD32 src_strd,
                                           WORD32 dst_strd,
                                           WORD32 ht,
                                           WORD32 wd,
                                           UWORD8* pu1_tmp,
                                           WORD32 dydx)
{
    UWORD8 *pu1_ref;                  //integer-pel rows averaged with the filter output
    WORD32 src_strd_x2, dst_strd_x2;

    //raw input rows; r0 is filter tap x[-2] of the first output row of a pass
    __m128i r0_16x8b, r1_16x8b, r2_16x8b, r3_16x8b, r4_16x8b, r5_16x8b, r6_16x8b;
    //pNM holds row N in its lower part and row M (= N + 1) next to it (wd 4 and 8 only)
    __m128i p01_16x8b, p12_16x8b, p23_16x8b, p34_16x8b, p45_16x8b, p56_16x8b;
    __m128i filt_8x16b, filt_lo_8x16b, filt_hi_8x16b;
    __m128i ref_16x8b, out_16x8b;

    UNUSED(pu1_tmp);

    //bit 3 of dydx selects whether the averaging partner is the current row or the next one
    pu1_ref = pu1_src + ((dydx & 0xf) >> 3) * src_strd;

    src_strd_x2 = src_strd << 1;
    dst_strd_x2 = dst_strd << 1;

    pu1_src -= src_strd_x2; // the filter input starts from x[-2] (till x[3])

    if(4 == wd)
    {
        //0xff in the four lowest bytes only: byte enables for the 4-byte masked stores
        const __m128i store_mask_16x8b = _mm_srli_si128(_mm_set1_epi8(0xff), 12);

        //Prologue: fetch the five rows shared by the first two output rows.
        //Each load reads 8 bytes although only the first 4 of a row are used.
        r0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src);
        pu1_src += src_strd;
        r1_16x8b = _mm_loadl_epi64((__m128i *)pu1_src);
        pu1_src += src_strd;
        r2_16x8b = _mm_loadl_epi64((__m128i *)pu1_src);
        pu1_src += src_strd;
        r3_16x8b = _mm_loadl_epi64((__m128i *)pu1_src);
        pu1_src += src_strd;
        r4_16x8b = _mm_loadl_epi64((__m128i *)pu1_src);
        pu1_src += src_strd;

        //bytes 0..3 = row N, bytes 4..7 = row N+1
        p01_16x8b = _mm_unpacklo_epi32(r0_16x8b, r1_16x8b);
        p12_16x8b = _mm_unpacklo_epi32(r1_16x8b, r2_16x8b);
        p23_16x8b = _mm_unpacklo_epi32(r2_16x8b, r3_16x8b);
        p34_16x8b = _mm_unpacklo_epi32(r3_16x8b, r4_16x8b);

        do
        {
            r5_16x8b = _mm_loadl_epi64((__m128i *)pu1_src);
            r6_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + src_strd));

            p45_16x8b = _mm_unpacklo_epi32(r4_16x8b, r5_16x8b);
            p56_16x8b = _mm_unpacklo_epi32(r5_16x8b, r6_16x8b);

            //lanes 0..3: first output row, lanes 4..7: second output row
            filt_8x16b = ih264_vert_qpel_filt6_ssse3(
                            _mm_unpacklo_epi8(p01_16x8b, p12_16x8b),
                            _mm_unpacklo_epi8(p23_16x8b, p34_16x8b),
                            _mm_unpacklo_epi8(p45_16x8b, p56_16x8b));

            //integer-pel partners for both rows, laid out like the filter output
            ref_16x8b = _mm_unpacklo_epi32(
                            _mm_loadl_epi64((__m128i *)pu1_ref),
                            _mm_loadl_epi64((__m128i *)(pu1_ref + src_strd)));

            out_16x8b = _mm_packus_epi16(filt_8x16b, filt_8x16b);   //clip to 8 bits
            out_16x8b = _mm_avg_epu8(ref_16x8b, out_16x8b);         //computing q-pel

            _mm_maskmoveu_si128(out_16x8b, store_mask_16x8b, (char*)pu1_dst);
            out_16x8b = _mm_srli_si128(out_16x8b, 4);
            _mm_maskmoveu_si128(out_16x8b, store_mask_16x8b, (char*)(pu1_dst + dst_strd));

            //slide the window down by two rows
            p01_16x8b = p23_16x8b;
            p12_16x8b = p34_16x8b;
            p23_16x8b = p45_16x8b;
            p34_16x8b = p56_16x8b;
            r4_16x8b = r6_16x8b;

            pu1_src += src_strd_x2;
            pu1_ref += src_strd_x2;
            pu1_dst += dst_strd_x2;
            ht -= 2;
        }
        while(ht > 0);
    }
    else if(8 == wd)
    {
        //Prologue: fetch the five rows shared by the first two output rows.
        r0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src);
        pu1_src += src_strd;
        r1_16x8b = _mm_loadl_epi64((__m128i *)pu1_src);
        pu1_src += src_strd;
        r2_16x8b = _mm_loadl_epi64((__m128i *)pu1_src);
        pu1_src += src_strd;
        r3_16x8b = _mm_loadl_epi64((__m128i *)pu1_src);
        pu1_src += src_strd;
        r4_16x8b = _mm_loadl_epi64((__m128i *)pu1_src);
        pu1_src += src_strd;

        //low 8 bytes = row N, high 8 bytes = row N+1
        p01_16x8b = _mm_unpacklo_epi64(r0_16x8b, r1_16x8b);
        p12_16x8b = _mm_unpacklo_epi64(r1_16x8b, r2_16x8b);
        p23_16x8b = _mm_unpacklo_epi64(r2_16x8b, r3_16x8b);
        p34_16x8b = _mm_unpacklo_epi64(r3_16x8b, r4_16x8b);

        do
        {
            r5_16x8b = _mm_loadl_epi64((__m128i *)pu1_src);
            r6_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + src_strd));

            p45_16x8b = _mm_unpacklo_epi64(r4_16x8b, r5_16x8b);
            p56_16x8b = _mm_unpacklo_epi64(r5_16x8b, r6_16x8b);

            //first output row: the low halves interleave rows (0,1), (2,3), (4,5)
            filt_8x16b = ih264_vert_qpel_filt6_ssse3(
                            _mm_unpacklo_epi8(p01_16x8b, p12_16x8b),
                            _mm_unpacklo_epi8(p23_16x8b, p34_16x8b),
                            _mm_unpacklo_epi8(p45_16x8b, p56_16x8b));

            ref_16x8b = _mm_loadl_epi64((__m128i *)pu1_ref);
            out_16x8b = _mm_packus_epi16(filt_8x16b, filt_8x16b);   //clip to 8 bits
            out_16x8b = _mm_avg_epu8(ref_16x8b, out_16x8b);         //computing q-pel

            _mm_storel_epi64((__m128i *)pu1_dst, out_16x8b);

            //second output row: the high halves interleave rows (1,2), (3,4), (5,6)
            filt_8x16b = ih264_vert_qpel_filt6_ssse3(
                            _mm_unpackhi_epi8(p01_16x8b, p12_16x8b),
                            _mm_unpackhi_epi8(p23_16x8b, p34_16x8b),
                            _mm_unpackhi_epi8(p45_16x8b, p56_16x8b));

            ref_16x8b = _mm_loadl_epi64((__m128i *)(pu1_ref + src_strd));
            out_16x8b = _mm_packus_epi16(filt_8x16b, filt_8x16b);   //clip to 8 bits
            out_16x8b = _mm_avg_epu8(ref_16x8b, out_16x8b);         //computing q-pel

            _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd), out_16x8b);

            //slide the window down by two rows
            p01_16x8b = p23_16x8b;
            p12_16x8b = p34_16x8b;
            p23_16x8b = p45_16x8b;
            p34_16x8b = p56_16x8b;
            r4_16x8b = r6_16x8b;

            pu1_src += src_strd_x2;
            pu1_ref += src_strd_x2;
            pu1_dst += dst_strd_x2;
            ht -= 2;
        }
        while(ht > 0);
    }
    else // wd == 16
    {
        //Prologue: fetch the five rows shared by the first two output rows.
        r0_16x8b = _mm_loadu_si128((__m128i *)pu1_src);
        pu1_src += src_strd;
        r1_16x8b = _mm_loadu_si128((__m128i *)pu1_src);
        pu1_src += src_strd;
        r2_16x8b = _mm_loadu_si128((__m128i *)pu1_src);
        pu1_src += src_strd;
        r3_16x8b = _mm_loadu_si128((__m128i *)pu1_src);
        pu1_src += src_strd;
        r4_16x8b = _mm_loadu_si128((__m128i *)pu1_src);
        pu1_src += src_strd;

        do
        {
            r5_16x8b = _mm_loadu_si128((__m128i *)pu1_src);
            r6_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + src_strd));

            //first output row uses rows r0..r5; columns 0..7 then columns 8..15
            filt_lo_8x16b = ih264_vert_qpel_filt6_ssse3(
                            _mm_unpacklo_epi8(r0_16x8b, r1_16x8b),
                            _mm_unpacklo_epi8(r2_16x8b, r3_16x8b),
                            _mm_unpacklo_epi8(r4_16x8b, r5_16x8b));
            filt_hi_8x16b = ih264_vert_qpel_filt6_ssse3(
                            _mm_unpackhi_epi8(r0_16x8b, r1_16x8b),
                            _mm_unpackhi_epi8(r2_16x8b, r3_16x8b),
                            _mm_unpackhi_epi8(r4_16x8b, r5_16x8b));

            ref_16x8b = _mm_loadu_si128((__m128i *)pu1_ref);
            out_16x8b = _mm_packus_epi16(filt_lo_8x16b, filt_hi_8x16b); //clip to 8 bits
            out_16x8b = _mm_avg_epu8(ref_16x8b, out_16x8b);             //computing q-pel

            _mm_storeu_si128((__m128i *)pu1_dst, out_16x8b);

            //second output row uses rows r1..r6
            filt_lo_8x16b = ih264_vert_qpel_filt6_ssse3(
                            _mm_unpacklo_epi8(r1_16x8b, r2_16x8b),
                            _mm_unpacklo_epi8(r3_16x8b, r4_16x8b),
                            _mm_unpacklo_epi8(r5_16x8b, r6_16x8b));
            filt_hi_8x16b = ih264_vert_qpel_filt6_ssse3(
                            _mm_unpackhi_epi8(r1_16x8b, r2_16x8b),
                            _mm_unpackhi_epi8(r3_16x8b, r4_16x8b),
                            _mm_unpackhi_epi8(r5_16x8b, r6_16x8b));

            ref_16x8b = _mm_loadu_si128((__m128i *)(pu1_ref + src_strd));
            out_16x8b = _mm_packus_epi16(filt_lo_8x16b, filt_hi_8x16b); //clip to 8 bits
            out_16x8b = _mm_avg_epu8(ref_16x8b, out_16x8b);             //computing q-pel

            _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), out_16x8b);

            //slide the window down by two rows
            r0_16x8b = r2_16x8b;
            r1_16x8b = r3_16x8b;
            r2_16x8b = r4_16x8b;
            r3_16x8b = r5_16x8b;
            r4_16x8b = r6_16x8b;

            pu1_src += src_strd_x2;
            pu1_ref += src_strd_x2;
            pu1_dst += dst_strd_x2;
            ht -= 2;
        }
        while(ht > 0);
    }
}

/*****************************************************************************/
/*                                                                           */
/*  Function Name : ih264_inter_pred_luma_horz_qpel_vert_qpel_ssse3          */
/*                                                                           */
/*  Description   : This function implements a six-tap filter vertically and */
/*                  horizontally on ht x wd block separately and averages    */
/*                  the two sets of values to calculate values at (1/4,1/4), */
/*                  (1/4, 3/4), (3/4, 1/4) or (3/4, 3/4) as mentioned in     */
/*                  sec. 8.4.2.2.1 titled "Luma sample interpolation         */
/*                  process". (ht,wd) can be (4,4), (8,4), (4,8), (8,8),     */
/*                  (16,8), (8,16) or (16,16).                               */
/*                                                                           */
/*  Inputs        : pu1_src  - pointer to source                             */
/*                  pu1_dst  - pointer to destination                        */
/*                  src_strd - stride for source                             */
/*                  dst_strd - stride for destination                        */
/*                  ht       - height of the block                           */
/*                  wd       - width of the block                            */
/*                  pu1_tmp  - pointer to temporary buffer                   */
/*                  dydx     - x and y reference offset for q-pel            */
/*                             calculations                                  */
/*                                                                           */
/*  Issues        : None                                                     */
/*                                                                           */
/*  Revision History:                                                        */
/*                                                                           */
/*         DD MM YYYY   Author(s)       Changes                              */
/*         13 02 2015   Kaushik         Initial Version                      */
/*                      Senthoor                                             */
/*                                                                           */
/*****************************************************************************/
void ih264_inter_pred_luma_horz_qpel_vert_qpel_ssse3(UWORD8 *pu1_src,
                                                     UWORD8 *pu1_dst,
                                                     WORD32 src_strd,
                                                     WORD32 dst_strd,
                                                     WORD32 ht,
                                                     WORD32 wd,
                                                     UWORD8* pu1_tmp,
                                                     WORD32 dydx)
{
    /* Two-pass implementation:                                              */
    /*  1. A vertical 6-tap filter is run over the whole block and its 8-bit */
    /*     results are written to pu1_tmp, wd bytes per row with the rows    */
    /*     packed back to back (ht * wd bytes are written for even ht).      */
    /*  2. A horizontal 6-tap filter is run over the block; every result is  */
    /*     averaged (with rounding) against the matching byte of pu1_tmp and */
    /*     the average is stored to pu1_dst.                                 */
    /* Both filters use the taps (1, -5, 20, 20, -5, 1), add 16, shift right */
    /* by 5 and saturate to 8 bits.                                          */
    /* All vertical loops and the wd == 4 / wd == 8 horizontal loops produce */
    /* two rows per iteration; the wd == 16 horizontal loop produces one.    */
    WORD32 ht_temp;
    UWORD8 *pu1_pred_vert,*pu1_pred_horiz;
    UWORD8 *pu1_tmp1, *pu1_tmp2;
    WORD32 x_offset, y_offset;

    //pu1_tmp1 is the write cursor of the vertical pass
    pu1_tmp1 = pu1_tmp;

    //only the low four bits are used: bits 1:0 -> x offset, bits 3:2 -> y offset
    dydx &= 0xf;
    //ht_temp counts rows for the vertical pass, ht itself for the horizontal pass
    ht_temp = ht;
    x_offset = dydx & 0x3;
    y_offset = dydx >> 2;
    //pu1_tmp2 is the read cursor of the horizontal pass
    pu1_tmp2 = pu1_tmp1;

    //vertical filter input: starts two rows above the block and is moved one
    //pixel to the right when bit 1 of x_offset is set
    pu1_pred_vert  = pu1_src + (x_offset >> 1) - 2*src_strd;
    //horizontal filter input: starts two pixels left of the block and is moved
    //one row down when bit 1 of y_offset is set
    pu1_pred_horiz = pu1_src + (y_offset >> 1) * src_strd - 2;
    //the filter input starts from x[-2] (till x[3])

    __m128i coeff0_1_16x8b, coeff2_3_16x8b, coeff4_5_16x8b;
    __m128i const_val16_8x16b;

    coeff0_1_16x8b = _mm_set1_epi32(0xFB01FB01);  //c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1
    coeff2_3_16x8b = _mm_set1_epi32(0x14141414);  //c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3
    coeff4_5_16x8b = _mm_set1_epi32(0x01FB01FB);  //c4 c5 c4 c5 c4 c5 c4 c5 c4 c5 c4 c5 c4 c5 c4 c5
                                                  //c0 = c5 = 1, c1 = c4 = -5, c2 = c3 = 20
    const_val16_8x16b = _mm_set1_epi16(16);       //rounding offset added before the shift right by 5

    if(wd == 4)
    {
        //vertical q-pel filter
        {
            __m128i src_r0_16x8b, src_r1_16x8b, src_r2_16x8b, src_r3_16x8b, src_r4_16x8b;
            __m128i src_r5_16x8b, src_r6_16x8b;
            __m128i src_r0r1_16x8b, src_r2r3_16x8b, src_r4r5_16x8b;

            __m128i res_r0r1_16x8b, res_t1_8x16b, res_t2_8x16b, res_t3_8x16b;

            //prologue: Load all the pred rows except sixth  and seventh row for the
            //first and second row processing.
            //Each load fetches 8 bytes but only the low 4 are used. After the
            //_mm_unpacklo_epi32 below, src_rN holds row N in bytes 0-3 and
            //row N+1 in bytes 4-7, so one multiply-add serves two output rows.
            src_r0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_pred_vert));
            pu1_pred_vert = pu1_pred_vert + src_strd;

            src_r1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_pred_vert));
            pu1_pred_vert = pu1_pred_vert + src_strd;
            src_r0_16x8b = _mm_unpacklo_epi32(src_r0_16x8b, src_r1_16x8b);

            src_r2_16x8b = _mm_loadl_epi64((__m128i *)(pu1_pred_vert));
            pu1_pred_vert = pu1_pred_vert + src_strd;
            src_r1_16x8b = _mm_unpacklo_epi32(src_r1_16x8b, src_r2_16x8b);

            src_r3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_pred_vert));
            pu1_pred_vert = pu1_pred_vert + src_strd;
            src_r2_16x8b = _mm_unpacklo_epi32(src_r2_16x8b, src_r3_16x8b);

            src_r4_16x8b = _mm_loadl_epi64((__m128i *)(pu1_pred_vert));
            pu1_pred_vert = pu1_pred_vert + src_strd;
            src_r3_16x8b = _mm_unpacklo_epi32(src_r3_16x8b, src_r4_16x8b);

            //Core Loop: Process all the rows.
            do
            {
                //fetch the sixth and seventh rows of the current window
                src_r5_16x8b = _mm_loadl_epi64((__m128i *)(pu1_pred_vert));
                src_r4_16x8b = _mm_unpacklo_epi32(src_r4_16x8b, src_r5_16x8b);

                src_r6_16x8b = _mm_loadl_epi64((__m128i *)(pu1_pred_vert + src_strd));
                src_r5_16x8b = _mm_unpacklo_epi32(src_r5_16x8b, src_r6_16x8b);

                //byte-interleave vertically adjacent rows so that each 16-bit
                //lane of the multiply-add sees (row n, row n+1) of one column;
                //lanes 0-3 belong to the first output row, lanes 4-7 to the second
                src_r0r1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r1_16x8b);
                src_r2r3_16x8b = _mm_unpacklo_epi8(src_r2_16x8b, src_r3_16x8b);
                src_r4r5_16x8b = _mm_unpacklo_epi8(src_r4_16x8b, src_r5_16x8b);

                res_t1_8x16b = _mm_maddubs_epi16(src_r0r1_16x8b, coeff0_1_16x8b);
                res_t2_8x16b = _mm_maddubs_epi16(src_r2r3_16x8b, coeff2_3_16x8b);
                res_t3_8x16b = _mm_maddubs_epi16(src_r4r5_16x8b, coeff4_5_16x8b);

                res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t2_8x16b);
                res_t3_8x16b = _mm_add_epi16(const_val16_8x16b, res_t3_8x16b);
                res_t1_8x16b = _mm_add_epi16(res_t3_8x16b, res_t1_8x16b);

                res_t1_8x16b = _mm_srai_epi16(res_t1_8x16b, 5); //shifting right by 5 bits.
                //saturate to unsigned 8 bit: 4 pixels of the first row followed by 4 of the second
                res_r0r1_16x8b = _mm_packus_epi16(res_t1_8x16b, res_t1_8x16b);

                _mm_storel_epi64((__m128i *)pu1_tmp1, res_r0r1_16x8b);

                //slide the row window down by two rows; src_r4 receives the
                //raw seventh row and is paired with its successor next iteration
                src_r0_16x8b = src_r2_16x8b;
                src_r1_16x8b = src_r3_16x8b;
                src_r2_16x8b = src_r4_16x8b;
                src_r3_16x8b = src_r5_16x8b;
                src_r4_16x8b = src_r6_16x8b;

                ht_temp -= 2;
                pu1_pred_vert += src_strd << 1;
                pu1_tmp1 += 8;                                  //two rows of 4 bytes each
            }
            while(ht_temp > 0);
        }

        //horizontal q-pel filter
        {
            __m128i src_r0_16x8b, src_r1_16x8b;
            __m128i src_r0_sht_16x8b, src_r1_sht_16x8b;
            __m128i src_r0r1_vpel_16x8b, src_r0r1_t1_16x8b;

            __m128i res_r0r1_t1_8x16b, res_r0r1_t2_8x16b, res_r0r1_t3_8x16b;
            __m128i res_r0r1_16x8b;

            __m128i mask_low32b;

            //byte mask with only bytes 0-3 set: selects the 4 pixels of one
            //output row for the masked stores below
            mask_low32b = _mm_set1_epi8(0xff);
            mask_low32b = _mm_srli_si128(mask_low32b, 12);

            //Row0 : a0 a1 a2 a3 a4 a5 a6 a7 a8 a9.....
            //Row1 : b0 b1 b2 b3 b4 b5 b6 b7 b8 b9.....

            do
            {
                //NOTE(review): 16 bytes are read per row although only a0..a8 /
                //b0..b8 are used; the source must be readable that far -
                //presumably guaranteed by the caller's buffer padding, confirm.
                src_r0_16x8b = _mm_loadu_si128((__m128i *)pu1_pred_horiz);                  //a0 a1 a2 a3 a4 a5 a6 a7 a8 a9....a15
                src_r1_16x8b = _mm_loadu_si128((__m128i *)(pu1_pred_horiz + src_strd));     //b0 b1 b2 b3 b4 b5 b6 b7 b8 b9....b15

                //vertical pass results for these two rows (4 bytes + 4 bytes)
                src_r0r1_vpel_16x8b = _mm_loadl_epi64((__m128i *)pu1_tmp2);

                src_r0_sht_16x8b = _mm_srli_si128(src_r0_16x8b, 1);                          //a1 a2 a3 a4 a5 a6 a7 a8 a9....a15 0
                src_r1_sht_16x8b = _mm_srli_si128(src_r1_16x8b, 1);                          //b1 b2 b3 b4 b5 b6 b7 b8 b9....b15 0

                src_r0_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r0_sht_16x8b);            //a0 a1 a1 a2 a2 a3 a3 a4 a4 a5 a5 a6 a6 a7 a7 a8
                src_r1_16x8b = _mm_unpacklo_epi8(src_r1_16x8b, src_r1_sht_16x8b);            //b0 b1 b1 b2 b2 b3 b3 b4 b4 b5 b5 b6 b6 b7 b7 b8

                src_r0r1_t1_16x8b = _mm_unpacklo_epi64(src_r0_16x8b, src_r1_16x8b);          //a0 a1 a1 a2 a2 a3 a3 a4 b0 b1 b1 b2 b2 b3 b3 b4
                res_r0r1_t1_8x16b = _mm_maddubs_epi16(src_r0r1_t1_16x8b, coeff0_1_16x8b);    //a0*c0+a1*c1 a1*c0+a2*c1 a2*c0+a3*c1 a3*c0+a4*c1
                                                                                             //b0*c0+b1*c1 b1*c0+b2*c1 b2*c0+b3*c1 b3*c0+b4*c1

                src_r0_16x8b = _mm_srli_si128(src_r0_16x8b, 4);                              //a2 a3 a3 a4 a4 a5 a5 a6 a6 a7 a7 a8  0  0  0  0
                src_r1_16x8b = _mm_srli_si128(src_r1_16x8b, 4);                              //b2 b3 b3 b4 b4 b5 b5 b6 b6 b7 b7 b8  0  0  0  0

                src_r0r1_t1_16x8b = _mm_unpacklo_epi64(src_r0_16x8b, src_r1_16x8b);          //a2 a3 a3 a4 a4 a5 a5 a6 b2 b3 b3 b4 b4 b5 b5 b6
                res_r0r1_t2_8x16b = _mm_maddubs_epi16(src_r0r1_t1_16x8b, coeff2_3_16x8b);    //a2*c2+a3*c3 a3*c2+a4*c3 a4*c2+a5*c3 a5*c2+a6*c3
                                                                                             //b2*c2+b3*c3 b3*c2+b4*c3 b4*c2+b5*c3 b5*c2+b6*c3

                src_r0_16x8b = _mm_srli_si128(src_r0_16x8b, 4);                              //a4 a5 a5 a6 a6 a7 a7 a8  0  0  0  0  0  0  0  0
                src_r1_16x8b = _mm_srli_si128(src_r1_16x8b, 4);                              //b4 b5 b5 b6 b6 b7 b7 b8  0  0  0  0  0  0  0  0

                src_r0r1_t1_16x8b = _mm_unpacklo_epi64(src_r0_16x8b, src_r1_16x8b);          //a4 a5 a5 a6 a6 a7 a7 a8 b4 b5 b5 b6 b6 b7 b7 b8
                res_r0r1_t3_8x16b = _mm_maddubs_epi16(src_r0r1_t1_16x8b, coeff4_5_16x8b);    //a4*c4+a5*c5 a5*c4+a6*c5 a6*c4+a7*c5 a7*c4+a8*c5
                                                                                             //b4*c4+b5*c5 b5*c4+b6*c5 b6*c4+b7*c5 b7*c4+b8*c5

                res_r0r1_t1_8x16b = _mm_add_epi16(res_r0r1_t1_8x16b, res_r0r1_t2_8x16b);
                res_r0r1_t3_8x16b = _mm_add_epi16(const_val16_8x16b, res_r0r1_t3_8x16b);
                res_r0r1_t1_8x16b = _mm_add_epi16(res_r0r1_t1_8x16b, res_r0r1_t3_8x16b);     //a0*c0+a1*c1+a2*c2+a3*c3+a4*c4+a5*c5 + 16;
                                                                                             //a1*c0+a2*c1+a3*c2+a4*c3+a5*c4+a6*c5 + 16;
                                                                                             //a2*c0+a3*c1+a4*c2+a5*c3+a6*c4+a7*c5 + 16;
                                                                                             //a3*c0+a4*c1+a5*c2+a6*c3+a7*c4+a8*c5 + 16;
                                                                                             //b0*c0+b1*c1+b2*c2+b3*c3+b4*c4+b5*c5 + 16;
                                                                                             //b1*c0+b2*c1+b3*c2+b4*c3+b5*c4+b6*c5 + 16;
                                                                                             //b2*c0+b3*c1+b4*c2+b5*c3+b6*c4+b7*c5 + 16;
                                                                                             //b3*c0+b4*c1+b5*c2+b6*c3+b7*c4+b8*c5 + 16;

                res_r0r1_t1_8x16b = _mm_srai_epi16(res_r0r1_t1_8x16b, 5);                    //shifting right by 5 bits.

                //saturate to unsigned 8 bit: row0 pixels in bytes 0-3, row1 pixels in bytes 4-7
                res_r0r1_16x8b = _mm_packus_epi16(res_r0r1_t1_8x16b,res_r0r1_t1_8x16b);

                //rounded average of the horizontal and the vertical filter results
                res_r0r1_16x8b = _mm_avg_epu8(res_r0r1_16x8b,src_r0r1_vpel_16x8b);

                //masked stores write exactly 4 bytes per destination row
                _mm_maskmoveu_si128(res_r0r1_16x8b, mask_low32b, (char*)pu1_dst);
                res_r0r1_16x8b = _mm_srli_si128(res_r0r1_16x8b, 4);                          //bring the row1 pixels down to bytes 0-3
                _mm_maskmoveu_si128(res_r0r1_16x8b, mask_low32b, (char*)(pu1_dst + dst_strd));

                ht -= 2;
                pu1_pred_horiz += src_strd << 1;
                pu1_tmp2 += 8;
                pu1_dst += dst_strd << 1;
            }
            while(ht > 0);
        }
    }
    else if(wd == 8)
    {
        //vertical q-pel filter
        {
            __m128i src_r0_16x8b, src_r1_16x8b, src_r2_16x8b, src_r3_16x8b;
            __m128i src_r4_16x8b, src_r5_16x8b, src_r6_16x8b;
            __m128i src_r0r1_16x8b, src_r2r3_16x8b, src_r4r5_16x8b;

            __m128i res_16x8b, res_t1_8x16b, res_t2_8x16b, res_t3_8x16b;

            //prologue: Load all the pred rows except sixth  and seventh row for the
            //first and second row processing.
            //After the _mm_unpacklo_epi64 below, src_rN holds row N in its low
            //8 bytes and row N+1 in its high 8 bytes.
            src_r0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_pred_vert));
            pu1_pred_vert = pu1_pred_vert + src_strd;

            src_r1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_pred_vert));
            pu1_pred_vert = pu1_pred_vert + src_strd;
            src_r0_16x8b = _mm_unpacklo_epi64(src_r0_16x8b, src_r1_16x8b);

            src_r2_16x8b = _mm_loadl_epi64((__m128i *)(pu1_pred_vert));
            pu1_pred_vert = pu1_pred_vert + src_strd;
            src_r1_16x8b = _mm_unpacklo_epi64(src_r1_16x8b, src_r2_16x8b);

            src_r3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_pred_vert));
            pu1_pred_vert = pu1_pred_vert + src_strd;
            src_r2_16x8b = _mm_unpacklo_epi64(src_r2_16x8b, src_r3_16x8b);

            src_r4_16x8b = _mm_loadl_epi64((__m128i *)(pu1_pred_vert));
            pu1_pred_vert = pu1_pred_vert + src_strd;
            src_r3_16x8b = _mm_unpacklo_epi64(src_r3_16x8b, src_r4_16x8b);

            //Core Loop: Process all the rows.
            do
            {
                //fetch the sixth and seventh rows of the current window
                src_r5_16x8b = _mm_loadl_epi64((__m128i *)(pu1_pred_vert));
                src_r4_16x8b = _mm_unpacklo_epi64(src_r4_16x8b, src_r5_16x8b);

                src_r6_16x8b = _mm_loadl_epi64((__m128i *)(pu1_pred_vert + src_strd));
                src_r5_16x8b = _mm_unpacklo_epi64(src_r5_16x8b, src_r6_16x8b);

                //first output row: interleave the low halves (rows n and n+1)
                src_r0r1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r1_16x8b);
                src_r2r3_16x8b = _mm_unpacklo_epi8(src_r2_16x8b, src_r3_16x8b);
                src_r4r5_16x8b = _mm_unpacklo_epi8(src_r4_16x8b, src_r5_16x8b);

                res_t1_8x16b = _mm_maddubs_epi16(src_r0r1_16x8b, coeff0_1_16x8b);
                res_t2_8x16b = _mm_maddubs_epi16(src_r2r3_16x8b, coeff2_3_16x8b);
                res_t3_8x16b = _mm_maddubs_epi16(src_r4r5_16x8b, coeff4_5_16x8b);

                res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t2_8x16b);
                res_t3_8x16b = _mm_add_epi16(const_val16_8x16b, res_t3_8x16b);
                res_t1_8x16b = _mm_add_epi16(res_t3_8x16b, res_t1_8x16b);

                res_t1_8x16b = _mm_srai_epi16(res_t1_8x16b, 5); //shifting right by 5 bits.
                res_16x8b = _mm_packus_epi16(res_t1_8x16b, res_t1_8x16b);

                _mm_storel_epi64((__m128i *)(pu1_tmp1), res_16x8b);

                //second output row: interleave the high halves (rows n+1 and n+2)
                src_r0r1_16x8b = _mm_unpackhi_epi8(src_r0_16x8b, src_r1_16x8b);
                src_r2r3_16x8b = _mm_unpackhi_epi8(src_r2_16x8b, src_r3_16x8b);
                src_r4r5_16x8b = _mm_unpackhi_epi8(src_r4_16x8b, src_r5_16x8b);

                res_t1_8x16b = _mm_maddubs_epi16(src_r0r1_16x8b, coeff0_1_16x8b);
                res_t2_8x16b = _mm_maddubs_epi16(src_r2r3_16x8b, coeff2_3_16x8b);
                res_t3_8x16b = _mm_maddubs_epi16(src_r4r5_16x8b, coeff4_5_16x8b);

                res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t2_8x16b);
                res_t3_8x16b = _mm_add_epi16(const_val16_8x16b, res_t3_8x16b);
                res_t1_8x16b = _mm_add_epi16(res_t3_8x16b, res_t1_8x16b);

                res_t1_8x16b = _mm_srai_epi16(res_t1_8x16b, 5); //shifting right by 5 bits.
                res_16x8b = _mm_packus_epi16(res_t1_8x16b, res_t1_8x16b);

                _mm_storel_epi64((__m128i *)(pu1_tmp1 + 8), res_16x8b);

                //slide the row window down by two rows; src_r4 receives the
                //raw seventh row and is paired with its successor next iteration
                src_r0_16x8b = src_r2_16x8b;
                src_r1_16x8b = src_r3_16x8b;
                src_r2_16x8b = src_r4_16x8b;
                src_r3_16x8b = src_r5_16x8b;
                src_r4_16x8b = src_r6_16x8b;

                ht_temp -= 2;
                pu1_pred_vert += src_strd << 1;
                pu1_tmp1 += 16;                                 //two rows of 8 bytes each
            }
            while(ht_temp > 0);
        }

        //horizontal q-pel filter
        {
            __m128i src_r0_16x8b, src_r1_16x8b, src_r0_sht_16x8b, src_r1_sht_16x8b;
            __m128i src_r0_t1_16x8b, src_r1_t1_16x8b;
            __m128i src_r0_vpel_16x8b, src_r1_vpel_16x8b;

            __m128i res_r0_t1_8x16b, res_r0_t2_8x16b, res_r0_t3_8x16b;
            __m128i res_r1_t1_8x16b, res_r1_t2_8x16b, res_r1_t3_8x16b, res_16x8b;

            //Row0 : a0 a1 a2 a3 a4 a5 a6 a7 a8 a9.....
            //Row1 : b0 b1 b2 b3 b4 b5 b6 b7 b8 b9.....

            do
            {
                //NOTE(review): 16 bytes are read per row although only a0..a12 /
                //b0..b12 are used; the source must be readable that far -
                //presumably guaranteed by the caller's buffer padding, confirm.
                src_r0_16x8b = _mm_loadu_si128((__m128i *)(pu1_pred_horiz));               //a0 a1 a2 a3 a4 a5 a6 a7 a8 a9....a15
                src_r1_16x8b = _mm_loadu_si128((__m128i *)(pu1_pred_horiz + src_strd));    //b0 b1 b2 b3 b4 b5 b6 b7 b8 b9....b15

                src_r0_vpel_16x8b = _mm_loadl_epi64((__m128i *)(pu1_tmp2));                //vertical pass results for the first row,
                                                                                           //computed at the column of a2 or of a3
                src_r1_vpel_16x8b = _mm_loadl_epi64((__m128i *)(pu1_tmp2 + 8));
                                                                                           //vertical pass results for the second row,
                                                                                           //computed at the column of b2 or of b3

                src_r0_sht_16x8b = _mm_srli_si128(src_r0_16x8b, 1);                        //a1 a2 a3 a4 a5 a6 a7 a8 a9....a15 0
                src_r1_sht_16x8b = _mm_srli_si128(src_r1_16x8b, 1);                        //b1 b2 b3 b4 b5 b6 b7 b8 b9....b15 0

                src_r0_t1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r0_sht_16x8b);       //a0 a1 a1 a2 a2 a3 a3 a4 a4 a5 a5 a6 a6 a7 a7 a8
                src_r1_t1_16x8b = _mm_unpacklo_epi8(src_r1_16x8b, src_r1_sht_16x8b);       //b0 b1 b1 b2 b2 b3 b3 b4 b4 b5 b5 b6 b6 b7 b7 b8

                res_r0_t1_8x16b = _mm_maddubs_epi16(src_r0_t1_16x8b, coeff0_1_16x8b);      //a0*c0+a1*c1 a1*c0+a2*c1 a2*c0+a3*c1 a3*c0+a4*c1
                                                                                           //a4*c0+a5*c1 a5*c0+a6*c1 a6*c0+a7*c1 a7*c0+a8*c1
                res_r1_t1_8x16b = _mm_maddubs_epi16(src_r1_t1_16x8b, coeff0_1_16x8b);      //b0*c0+b1*c1 b1*c0+b2*c1 b2*c0+b3*c1 b3*c0+b4*c1
                                                                                           //b4*c0+b5*c1 b5*c0+b6*c1 b6*c0+b7*c1 b7*c0+b8*c1

                src_r0_16x8b = _mm_srli_si128(src_r0_16x8b, 2);                            //a2 a3 a4 a5 a6 a7 a8 a9....a15 0 0
                src_r1_16x8b = _mm_srli_si128(src_r1_16x8b, 2);                            //b2 b3 b4 b5 b6 b7 b8 b9....b15 0 0

                src_r0_sht_16x8b = _mm_srli_si128(src_r0_sht_16x8b, 2);                    //a3 a4 a5 a6 a7 a8 a9....a15 0  0  0
                src_r1_sht_16x8b = _mm_srli_si128(src_r1_sht_16x8b, 2);                    //b3 b4 b5 b6 b7 b8 b9....b15 0  0  0

                src_r0_t1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r0_sht_16x8b);       //a2 a3 a3 a4 a4 a5 a5 a6 a6 a7 a7 a8 a8 a9 a9 a10
                src_r1_t1_16x8b = _mm_unpacklo_epi8(src_r1_16x8b, src_r1_sht_16x8b);       //b2 b3 b3 b4 b4 b5 b5 b6 b6 b7 b7 b8 b8 b9 b9 b10

                res_r0_t2_8x16b = _mm_maddubs_epi16(src_r0_t1_16x8b, coeff2_3_16x8b);      //a2*c2+a3*c3 a3*c2+a4*c3 a4*c2+a5*c3 a5*c2+a6*c3
                                                                                           //a6*c2+a7*c3 a7*c2+a8*c3 a8*c2+a9*c3 a9*c2+a10*c3
                res_r1_t2_8x16b = _mm_maddubs_epi16(src_r1_t1_16x8b, coeff2_3_16x8b);      //b2*c2+b3*c3 b3*c2+b4*c3 b4*c2+b5*c3 b5*c2+b6*c3
                                                                                           //b6*c2+b7*c3 b7*c2+b8*c3 b8*c2+b9*c3 b9*c2+b10*c3

                src_r0_16x8b = _mm_srli_si128(src_r0_16x8b, 2);                            //a4 a5 a6 a7 a8 a9....a15 0  0  0  0
                src_r1_16x8b = _mm_srli_si128(src_r1_16x8b, 2);                            //b4 b5 b6 b7 b8 b9....b15 0  0  0  0

                src_r0_sht_16x8b = _mm_srli_si128(src_r0_sht_16x8b, 2);                    //a5 a6 a7 a8 a9....a15 0  0  0  0  0
                src_r1_sht_16x8b = _mm_srli_si128(src_r1_sht_16x8b, 2);                    //b5 b6 b7 b8 b9....b15 0  0  0  0  0

                src_r0_t1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r0_sht_16x8b);       //a4 a5 a5 a6 a6 a7 a7 a8 a8 a9 a9 a10 a10 a11 a11 a12
                src_r1_t1_16x8b = _mm_unpacklo_epi8(src_r1_16x8b, src_r1_sht_16x8b);       //b4 b5 b5 b6 b6 b7 b7 b8 b8 b9 b9 b10 b10 b11 b11 b12

                res_r0_t3_8x16b = _mm_maddubs_epi16(src_r0_t1_16x8b, coeff4_5_16x8b);      //a4*c4+a5*c5 a5*c4+a6*c5 a6*c4+a7*c5 a7*c4+a8*c5
                                                                                           //a8*c4+a9*c5 a9*c4+a10*c5 a10*c4+a11*c5 a11*c4+a12*c5
                res_r1_t3_8x16b = _mm_maddubs_epi16(src_r1_t1_16x8b, coeff4_5_16x8b);      //b4*c4+b5*c5 b5*c4+b6*c5 b6*c4+b7*c5 b7*c4+b8*c5
                                                                                           //b8*c4+b9*c5 b9*c4+b10*c5 b10*c4+b11*c5 b11*c4+b12*c5
                res_r0_t1_8x16b = _mm_add_epi16(res_r0_t1_8x16b, res_r0_t2_8x16b);
                res_r0_t3_8x16b = _mm_add_epi16(const_val16_8x16b, res_r0_t3_8x16b);
                res_r0_t1_8x16b = _mm_add_epi16(res_r0_t1_8x16b, res_r0_t3_8x16b);
                res_r0_t1_8x16b = _mm_srai_epi16(res_r0_t1_8x16b, 5);                      //shifting right by 5 bits.

                //first row: saturate to 8 bit, then take the rounded average
                //with the vertical pass result
                res_16x8b = _mm_packus_epi16(res_r0_t1_8x16b, res_r0_t1_8x16b);
                res_16x8b = _mm_avg_epu8(res_16x8b, src_r0_vpel_16x8b);

                _mm_storel_epi64((__m128i *)(pu1_dst), res_16x8b);

                res_r1_t1_8x16b = _mm_add_epi16(res_r1_t1_8x16b, res_r1_t2_8x16b);
                res_r1_t3_8x16b = _mm_add_epi16(const_val16_8x16b, res_r1_t3_8x16b);
                res_r1_t1_8x16b = _mm_add_epi16(res_r1_t1_8x16b, res_r1_t3_8x16b);
                res_r1_t1_8x16b = _mm_srai_epi16(res_r1_t1_8x16b, 5);                      //shifting right by 5 bits.

                //second row: same saturate-and-average step
                res_16x8b = _mm_packus_epi16(res_r1_t1_8x16b, res_r1_t1_8x16b);
                res_16x8b = _mm_avg_epu8(res_16x8b,src_r1_vpel_16x8b);

                _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd), res_16x8b);

                ht -= 2;
                pu1_pred_horiz += src_strd << 1;
                pu1_dst += dst_strd << 1;
                pu1_tmp2 += 16;
            }
            while(ht > 0);
        }
    }
    else // wd == 16
    {
        //Any wd other than 4 or 8 reaches this branch; the code below handles
        //exactly 16 pixels per row.

        //vertical q-pel filter
        {
            __m128i src_r0_16x8b, src_r1_16x8b, src_r2_16x8b, src_r3_16x8b;
            __m128i src_r4_16x8b, src_r5_16x8b, src_r6_16x8b;
            __m128i src_r0r1_16x8b, src_r2r3_16x8b, src_r4r5_16x8b;

            __m128i res_t0_8x16b, res_t1_8x16b, res_t2_8x16b, res_t3_8x16b;
            __m128i res_16x8b;

            //prologue: Load all the pred rows except sixth  and seventh row for the
            //first and second row processing.
            //Here every register holds one full 16-pixel row.
            src_r0_16x8b = _mm_loadu_si128((__m128i *)(pu1_pred_vert));
            pu1_pred_vert =  pu1_pred_vert + src_strd;
            src_r1_16x8b = _mm_loadu_si128((__m128i *)(pu1_pred_vert));
            pu1_pred_vert =  pu1_pred_vert + src_strd;
            src_r2_16x8b = _mm_loadu_si128((__m128i *)(pu1_pred_vert));
            pu1_pred_vert =  pu1_pred_vert + src_strd;
            src_r3_16x8b = _mm_loadu_si128((__m128i *)(pu1_pred_vert));
            pu1_pred_vert =  pu1_pred_vert + src_strd;
            src_r4_16x8b = _mm_loadu_si128((__m128i *)(pu1_pred_vert));
            pu1_pred_vert =  pu1_pred_vert + src_strd;

            //Core Loop: Process all the rows.
            do
            {
                //fetch the sixth and seventh rows of the current window
                src_r5_16x8b = _mm_loadu_si128((__m128i *)(pu1_pred_vert));
                src_r6_16x8b = _mm_loadu_si128((__m128i *)(pu1_pred_vert + src_strd));

                //first output row, pixels 0-7 (rows r0..r5)
                src_r0r1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r1_16x8b);
                src_r2r3_16x8b = _mm_unpacklo_epi8(src_r2_16x8b, src_r3_16x8b);
                src_r4r5_16x8b = _mm_unpacklo_epi8(src_r4_16x8b, src_r5_16x8b);

                res_t1_8x16b = _mm_maddubs_epi16(src_r0r1_16x8b, coeff0_1_16x8b);
                res_t2_8x16b = _mm_maddubs_epi16(src_r2r3_16x8b, coeff2_3_16x8b);
                res_t3_8x16b = _mm_maddubs_epi16(src_r4r5_16x8b, coeff4_5_16x8b);

                res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t2_8x16b);
                res_t3_8x16b = _mm_add_epi16(const_val16_8x16b, res_t3_8x16b);
                res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t3_8x16b);
                res_t0_8x16b = _mm_srai_epi16(res_t1_8x16b, 5); //shifting right by 5 bits.

                //first output row, pixels 8-15
                src_r0r1_16x8b = _mm_unpackhi_epi8(src_r0_16x8b, src_r1_16x8b);
                src_r2r3_16x8b = _mm_unpackhi_epi8(src_r2_16x8b, src_r3_16x8b);
                src_r4r5_16x8b = _mm_unpackhi_epi8(src_r4_16x8b, src_r5_16x8b);

                res_t1_8x16b = _mm_maddubs_epi16(src_r0r1_16x8b, coeff0_1_16x8b);
                res_t2_8x16b = _mm_maddubs_epi16(src_r2r3_16x8b, coeff2_3_16x8b);
                res_t3_8x16b = _mm_maddubs_epi16(src_r4r5_16x8b, coeff4_5_16x8b);

                res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t2_8x16b);
                res_t3_8x16b = _mm_add_epi16(const_val16_8x16b, res_t3_8x16b);
                res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t3_8x16b);
                res_t1_8x16b = _mm_srai_epi16(res_t1_8x16b, 5); //shifting right by 5 bits.

                //saturate both halves to 8 bit and join them into one 16-pixel row
                res_16x8b = _mm_packus_epi16(res_t0_8x16b, res_t1_8x16b);

                _mm_storeu_si128((__m128i *)(pu1_tmp1), res_16x8b);

                //second output row, pixels 0-7 (rows r1..r6)
                src_r0r1_16x8b = _mm_unpacklo_epi8(src_r1_16x8b, src_r2_16x8b);
                src_r2r3_16x8b = _mm_unpacklo_epi8(src_r3_16x8b, src_r4_16x8b);
                src_r4r5_16x8b = _mm_unpacklo_epi8(src_r5_16x8b, src_r6_16x8b);

                res_t1_8x16b = _mm_maddubs_epi16(src_r0r1_16x8b, coeff0_1_16x8b);
                res_t2_8x16b = _mm_maddubs_epi16(src_r2r3_16x8b, coeff2_3_16x8b);
                res_t3_8x16b = _mm_maddubs_epi16(src_r4r5_16x8b, coeff4_5_16x8b);

                res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t2_8x16b);
                res_t3_8x16b = _mm_add_epi16(const_val16_8x16b, res_t3_8x16b);
                res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t3_8x16b);
                res_t0_8x16b = _mm_srai_epi16(res_t1_8x16b, 5); //shifting right by 5 bits.

                //second output row, pixels 8-15
                src_r0r1_16x8b = _mm_unpackhi_epi8(src_r1_16x8b, src_r2_16x8b);
                src_r2r3_16x8b = _mm_unpackhi_epi8(src_r3_16x8b, src_r4_16x8b);
                src_r4r5_16x8b = _mm_unpackhi_epi8(src_r5_16x8b, src_r6_16x8b);

                res_t1_8x16b = _mm_maddubs_epi16(src_r0r1_16x8b, coeff0_1_16x8b);
                res_t2_8x16b = _mm_maddubs_epi16(src_r2r3_16x8b, coeff2_3_16x8b);
                res_t3_8x16b = _mm_maddubs_epi16(src_r4r5_16x8b, coeff4_5_16x8b);

                res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t2_8x16b);
                res_t3_8x16b = _mm_add_epi16(const_val16_8x16b, res_t3_8x16b);
                res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t3_8x16b);
                res_t1_8x16b = _mm_srai_epi16(res_t1_8x16b, 5); //shifting right by 5 bits.

                res_16x8b = _mm_packus_epi16(res_t0_8x16b, res_t1_8x16b);

                _mm_storeu_si128((__m128i *)(pu1_tmp1 + 16), res_16x8b);

                //slide the row window down by two rows
                src_r0_16x8b = src_r2_16x8b;
                src_r1_16x8b = src_r3_16x8b;
                src_r2_16x8b = src_r4_16x8b;
                src_r3_16x8b = src_r5_16x8b;
                src_r4_16x8b = src_r6_16x8b;

                ht_temp -= 2;
                pu1_pred_vert += src_strd << 1;
                pu1_tmp1 += 32;                                 //two rows of 16 bytes each
            }
            while(ht_temp > 0);
        }
        //horizontal q-pel filter
        {
            __m128i src_r0_16x8b, src_r1_16x8b, src_r0_sht_16x8b, src_r1_sht_16x8b;
            __m128i src_r0_t1_16x8b, src_r1_t1_16x8b;
            __m128i src_vpel_16x8b;

            __m128i res_r0_t1_8x16b, res_r0_t2_8x16b, res_r0_t3_8x16b;
            __m128i res_r1_t1_8x16b, res_r1_t2_8x16b, res_r1_t3_8x16b;
            __m128i res_16x8b;

            //Row0 : a0 a1 a2 a3 a4 a5 a6 a7 a8 a9.....
            //Row0 :                         b0 b1 b2 b3 b4 b5 b6 b7 b8 b9.....
            //b0 is same a8. Similarly other bn pixels are same as a(n+8) pixels.
            //One row is produced per iteration: the "a" register yields output
            //pixels 0-7 and the "b" register yields output pixels 8-15.

            do
            {
                //NOTE(review): the second load reads 16 bytes starting 8 bytes
                //into the row although only b0..b12 are used; the source must
                //be readable that far - presumably guaranteed by the caller's
                //buffer padding, confirm.
                src_r0_16x8b = _mm_loadu_si128((__m128i *)(pu1_pred_horiz));             //a0 a1 a2 a3 a4 a5 a6 a7 a8 a9....a15
                src_r1_16x8b = _mm_loadu_si128((__m128i *)(pu1_pred_horiz + 8));         //b0 b1 b2 b3 b4 b5 b6 b7 b8 b9....b15
                src_vpel_16x8b = _mm_loadu_si128((__m128i *)(pu1_tmp2));                 //16 vertical pass results for this row

                src_r0_sht_16x8b = _mm_srli_si128(src_r0_16x8b, 1);                      //a1 a2 a3 a4 a5 a6 a7 a8 a9....a15 0
                src_r1_sht_16x8b = _mm_srli_si128(src_r1_16x8b, 1);                      //b1 b2 b3 b4 b5 b6 b7 b8 b9....b15 0

                src_r0_t1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r0_sht_16x8b);     //a0 a1 a1 a2 a2 a3 a3 a4 a4 a5 a5 a6 a6 a7 a7 a8
                src_r1_t1_16x8b = _mm_unpacklo_epi8(src_r1_16x8b, src_r1_sht_16x8b);     //b0 b1 b1 b2 b2 b3 b3 b4 b4 b5 b5 b6 b6 b7 b7 b8

                res_r0_t1_8x16b = _mm_maddubs_epi16(src_r0_t1_16x8b, coeff0_1_16x8b);    //a0*c0+a1*c1 a1*c0+a2*c1 a2*c0+a3*c1 a3*c0+a4*c1
                                                                                         //a4*c0+a5*c1 a5*c0+a6*c1 a6*c0+a7*c1 a7*c0+a8*c1
                res_r1_t1_8x16b = _mm_maddubs_epi16(src_r1_t1_16x8b, coeff0_1_16x8b);    //b0*c0+b1*c1 b1*c0+b2*c1 b2*c0+b3*c1 b3*c0+b4*c1
                                                                                         //b4*c0+b5*c1 b5*c0+b6*c1 b6*c0+b7*c1 b7*c0+b8*c1

                src_r0_16x8b = _mm_srli_si128(src_r0_16x8b, 2);                          //a2 a3 a4 a5 a6 a7 a8 a9....a15 0 0
                src_r1_16x8b = _mm_srli_si128(src_r1_16x8b, 2);                          //b2 b3 b4 b5 b6 b7 b8 b9....b15 0 0

                src_r0_sht_16x8b = _mm_srli_si128(src_r0_sht_16x8b, 2);                  //a3 a4 a5 a6 a7 a8 a9....a15 0  0  0
                src_r1_sht_16x8b = _mm_srli_si128(src_r1_sht_16x8b, 2);                  //b3 b4 b5 b6 b7 b8 b9....b15 0  0  0

                src_r0_t1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r0_sht_16x8b);     //a2 a3 a3 a4 a4 a5 a5 a6 a6 a7 a7 a8 a8 a9 a9 a10
                src_r1_t1_16x8b = _mm_unpacklo_epi8(src_r1_16x8b, src_r1_sht_16x8b);     //b2 b3 b3 b4 b4 b5 b5 b6 b6 b7 b7 b8 b8 b9 b9 b10

                res_r0_t2_8x16b = _mm_maddubs_epi16(src_r0_t1_16x8b, coeff2_3_16x8b);    //a2*c2+a3*c3 a3*c2+a4*c3 a4*c2+a5*c3 a5*c2+a6*c3
                                                                                         //a6*c2+a7*c3 a7*c2+a8*c3 a8*c2+a9*c3 a9*c2+a10*c3
                res_r1_t2_8x16b = _mm_maddubs_epi16(src_r1_t1_16x8b, coeff2_3_16x8b);    //b2*c2+b3*c3 b3*c2+b4*c3 b4*c2+b5*c3 b5*c2+b6*c3
                                                                                         //b6*c2+b7*c3 b7*c2+b8*c3 b8*c2+b9*c3 b9*c2+b10*c3

                src_r0_16x8b = _mm_srli_si128(src_r0_16x8b, 2);                          //a4 a5 a6 a7 a8 a9....a15 0  0  0  0
                src_r1_16x8b = _mm_srli_si128(src_r1_16x8b, 2);                          //b4 b5 b6 b7 b8 b9....b15 0  0  0  0

                src_r0_sht_16x8b = _mm_srli_si128(src_r0_sht_16x8b, 2);                  //a5 a6 a7 a8 a9....a15 0  0  0  0  0
                src_r1_sht_16x8b = _mm_srli_si128(src_r1_sht_16x8b, 2);                  //b5 b6 b7 b8 b9....b15 0  0  0  0  0

                src_r0_t1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r0_sht_16x8b);     //a4 a5 a5 a6 a6 a7 a7 a8 a8 a9 a9 a10 a10 a11 a11 a12
                src_r1_t1_16x8b = _mm_unpacklo_epi8(src_r1_16x8b, src_r1_sht_16x8b);     //b4 b5 b5 b6 b6 b7 b7 b8 b8 b9 b9 b10 b10 b11 b11 b12

                res_r0_t3_8x16b = _mm_maddubs_epi16(src_r0_t1_16x8b, coeff4_5_16x8b);    //a4*c4+a5*c5 a5*c4+a6*c5 a6*c4+a7*c5 a7*c4+a8*c5
                                                                                         //a8*c4+a9*c5 a9*c4+a10*c5 a10*c4+a11*c5 a11*c4+a12*c5
                res_r1_t3_8x16b = _mm_maddubs_epi16(src_r1_t1_16x8b, coeff4_5_16x8b);    //b4*c4+b5*c5 b5*c4+b6*c5 b6*c4+b7*c5 b7*c4+b8*c5
                                                                                         //b8*c4+b9*c5 b9*c4+b10*c5 b10*c4+b11*c5 b11*c4+b12*c5
                res_r0_t1_8x16b = _mm_add_epi16(res_r0_t1_8x16b, res_r0_t2_8x16b);
                res_r0_t3_8x16b = _mm_add_epi16(const_val16_8x16b, res_r0_t3_8x16b);
                res_r0_t1_8x16b = _mm_add_epi16(res_r0_t1_8x16b, res_r0_t3_8x16b);
                res_r0_t1_8x16b = _mm_srai_epi16(res_r0_t1_8x16b, 5);                    //shifting right by 5 bits.

                res_r1_t1_8x16b = _mm_add_epi16(res_r1_t1_8x16b, res_r1_t2_8x16b);
                res_r1_t1_8x16b = _mm_add_epi16(res_r1_t1_8x16b, res_r1_t3_8x16b);
                res_r1_t1_8x16b = _mm_add_epi16(res_r1_t1_8x16b, const_val16_8x16b);
                res_r1_t1_8x16b = _mm_srai_epi16(res_r1_t1_8x16b, 5);                    //shifting right by 5 bits.

                //saturate both halves to 8 bit and join them into one 16-pixel row
                res_16x8b = _mm_packus_epi16(res_r0_t1_8x16b, res_r1_t1_8x16b);

                //rounded average of the horizontal and the vertical filter results
                res_16x8b = _mm_avg_epu8(res_16x8b, src_vpel_16x8b);
                _mm_storeu_si128((__m128i *)(pu1_dst), res_16x8b);

                ht --;
                pu1_pred_horiz  += src_strd;
                pu1_dst += dst_strd;
                pu1_tmp2 += 16;
            }
            while(ht > 0);
        }
    }
}

/*****************************************************************************/
/*                                                                           */
/*  Function Name : ih264_inter_pred_luma_horz_qpel_vert_hpel_ssse3          */
/*                                                                           */
/*  Description   : This function implements a six-tap filter vertically and */
/*                  horizontally on ht x wd block separately and averages    */
/*                  the two sets of values to calculate values at (1/4,1/2), */
/*                  or (3/4, 1/2) as mentioned in sec. 8.4.2.2.1 titled      */
/*                  "Luma sample interpolation process". (ht,wd) can be      */
/*                  (4,4), (8,4), (4,8), (8,8), (16,8), (8,16) or (16,16).   */
/*                                                                           */
/*  Inputs        : pu1_src  - pointer to source                             */
/*                  pu1_dst  - pointer to destination                        */
/*                  src_strd - stride for source                             */
/*                  dst_strd - stride for destination                        */
/*                  ht       - height of the block                           */
/*                  wd       - width of the block                            */
/*                  pu1_tmp  - pointer to temporary buffer                   */
/*                  dydx     - x and y reference offset for q-pel            */
/*                             calculations                                  */
/*                                                                           */
/*  Issues        : None                                                     */
/*                                                                           */
/*  Revision History:                                                        */
/*                                                                           */
/*         DD MM YYYY   Author(s)       Changes                              */
/*         13 02 2015   Kaushik         Initial Version                      */
/*                      Senthoor                                             */
/*                                                                           */
/*****************************************************************************/
void ih264_inter_pred_luma_horz_qpel_vert_hpel_ssse3(UWORD8 *pu1_src,
                                                     UWORD8 *pu1_dst,
                                                     WORD32 src_strd,
                                                     WORD32 dst_strd,
                                                     WORD32 ht,
                                                     WORD32 wd,
                                                     UWORD8* pu1_tmp,
                                                     WORD32 dydx)
{
    // Two-pass implementation:
    //   1. Vertical 6-tap filter (1, -5, 20, 20, -5, 1) starting two columns
    //      to the left of the block. The unrounded 16-bit sums are written to
    //      pu1_tmp (viewed as WORD16) with a row pitch of wd + 5 elements
    //      (9, 13 or 21).
    //   2. Horizontal 6-tap filter over those sums, rounded as
    //      (sum + 512) >> 10 and saturated to 8 bits, then averaged with the
    //      vertical half-pel sample ((sum + 16) >> 5, saturated to 8 bits) of
    //      the column selected by the x offset.
    // Every loop below is a do-while, so each pass runs at least once. The
    // vertical passes for wd == 4 and wd == 8 produce two rows per iteration.
    // NOTE(review): that presumably relies on ht being even for those widths
    // - confirm against callers.
    WORD32 ht_temp;
    WORD32 x_offset;
    WORD32 off0,off1, off2, off3, off4, off5;
    WORD16 *pi2_temp1,*pi2_temp2,*pi2_temp3;

    ht_temp = ht;               // row counter for the vertical pass; ht itself counts the horizontal pass
    x_offset = dydx & 0x3;      // low two bits of dydx
    pi2_temp1 = (WORD16 *)pu1_tmp;  // write cursor of the vertical pass
    pi2_temp2 = pi2_temp1;          // read cursor of the horizontal 6-tap filter
    // read cursor for the vertical half-pel samples that get averaged in:
    // (x_offset >> 1) is 0 for x_offset 0/1 and 1 for x_offset 2/3
    pi2_temp3 = pi2_temp1 + (x_offset >> 1);

    pu1_src -= 2 * src_strd;    // two rows up: first row needed by the vertical filter
    pu1_src -= 2;               // two columns left: first column needed by the horizontal filter
    pi2_temp3 += 2;             // skip the two left-margin columns held in the temp rows
    //the filter input starts from x[-2] (till x[3])

    if(wd == 4)
    {
        //vertical half-pel
        {
            __m128i src_r0_16x8b, src_r1_16x8b, src_r2_16x8b, src_r3_16x8b, src_r4_16x8b;
            __m128i src_r5_16x8b, src_r6_16x8b;
            __m128i src_r0r1_16x8b, src_r2r3_16x8b, src_r4r5_16x8b;

            __m128i res_t1_8x16b, res_t2_8x16b, res_t3_8x16b;

            __m128i coeff0_1_16x8b, coeff2_3_16x8b, coeff4_5_16x8b;

            // Signed byte pairs consumed by _mm_maddubs_epi16 (lowest byte first).
            coeff0_1_16x8b = _mm_set1_epi32(0xFB01FB01);  //c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1
            coeff2_3_16x8b = _mm_set1_epi32(0x14141414);  //c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3
            coeff4_5_16x8b = _mm_set1_epi32(0x01FB01FB);  //c4 c5 c4 c5 c4 c5 c4 c5 c4 c5 c4 c5 c4 c5 c4 c5
                                                          //c0 = c5 = 1, c1 = c4 = -5, c2 = c3 = 20
            // Byte offsets, relative to the pointer to the newest (sixth) row,
            // of column 8 in each of the six rows of the filter window. The
            // 8-byte vector loads only cover columns 0..7, so the ninth
            // column needed for a 4 + 5 wide temp row is filtered in scalar code.
            off0 = -((src_strd << 2) + src_strd) + 8;   // 5 rows above
            off1 = -(src_strd << 2) + 8;                // 4 rows above
            off2 = -((src_strd << 1) + src_strd) + 8;   // 3 rows above
            off3 = -(src_strd << 1) + 8;                // 2 rows above
            off4 = -src_strd + 8;                       // 1 row above
            off5 = 8;                                   // current row

            //prologue: Load all the pred rows except sixth  and seventh row for the
            //first and second row processing.
            src_r0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src));
            pu1_src =  pu1_src + src_strd;

            src_r1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src));
            pu1_src =  pu1_src + src_strd;

            src_r2_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src));
            pu1_src =  pu1_src + src_strd;

            src_r3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src));
            pu1_src =  pu1_src + src_strd;

            src_r4_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src));
            pu1_src =  pu1_src + src_strd;

            //Core Loop: Process all the rows.
            do
            {
                // first output row of the pair: window rows r0..r5
                src_r5_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src));

                // interleave row pairs so each 16-bit lane holds two vertically
                // adjacent samples of one column
                src_r0r1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r1_16x8b);
                src_r2r3_16x8b = _mm_unpacklo_epi8(src_r2_16x8b, src_r3_16x8b);
                src_r4r5_16x8b = _mm_unpacklo_epi8(src_r4_16x8b, src_r5_16x8b);

                res_t1_8x16b = _mm_maddubs_epi16(src_r0r1_16x8b, coeff0_1_16x8b);
                res_t2_8x16b = _mm_maddubs_epi16(src_r2r3_16x8b, coeff2_3_16x8b);
                res_t3_8x16b = _mm_maddubs_epi16(src_r4r5_16x8b, coeff4_5_16x8b);

                res_t1_8x16b = _mm_add_epi16(res_t2_8x16b, res_t1_8x16b);
                res_t1_8x16b = _mm_add_epi16(res_t3_8x16b, res_t1_8x16b);

                // columns 0..7 of this temp row
                _mm_storeu_si128((__m128i *)(pi2_temp1), res_t1_8x16b);

                // column 8 in scalar form:
                // (a0 + a5) - (a1 + a4) - 4 * (a1 + a4) + 4 * (a2 + a3) + 16 * (a2 + a3)
                //   = a0 - 5 * a1 + 20 * a2 + 20 * a3 - 5 * a4 + a5
                pi2_temp1[8] = pu1_src[off0] + pu1_src[off5]
                                   - (pu1_src[off1] + pu1_src[off4])
                                   + ((pu1_src[off2] + pu1_src[off3] - pu1_src[off1] - pu1_src[off4]) << 2)
                                   + ((pu1_src[off2] + pu1_src[off3]) << 4);

                pu1_src = pu1_src + src_strd;
                pi2_temp1 = pi2_temp1 + 9;      // temp row pitch is 4 + 5 elements

                // second output row of the pair: window rows r1..r6
                src_r6_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src));

                src_r0r1_16x8b = _mm_unpacklo_epi8(src_r1_16x8b, src_r2_16x8b);
                src_r2r3_16x8b = _mm_unpacklo_epi8(src_r3_16x8b, src_r4_16x8b);
                src_r4r5_16x8b = _mm_unpacklo_epi8(src_r5_16x8b, src_r6_16x8b);

                res_t1_8x16b = _mm_maddubs_epi16(src_r0r1_16x8b, coeff0_1_16x8b);
                res_t2_8x16b = _mm_maddubs_epi16(src_r2r3_16x8b, coeff2_3_16x8b);
                res_t3_8x16b = _mm_maddubs_epi16(src_r4r5_16x8b, coeff4_5_16x8b);

                res_t1_8x16b = _mm_add_epi16(res_t2_8x16b, res_t1_8x16b);
                res_t1_8x16b = _mm_add_epi16(res_t3_8x16b, res_t1_8x16b);

                _mm_storeu_si128((__m128i *)(pi2_temp1), res_t1_8x16b);

                // column 8 of the second row, same scalar formula as above
                pi2_temp1[8] = pu1_src[off0] + pu1_src[off5]
                                   - (pu1_src[off1] + pu1_src[off4])
                                   + ((pu1_src[off2] + pu1_src[off3] - pu1_src[off1] - pu1_src[off4]) << 2)
                                   + ((pu1_src[off2] + pu1_src[off3]) << 4);

                ht_temp -= 2;
                pu1_src = pu1_src + src_strd;
                pi2_temp1 = pi2_temp1 + 9;

                // slide the window down by two rows
                src_r0_16x8b = src_r2_16x8b;
                src_r1_16x8b = src_r3_16x8b;
                src_r2_16x8b = src_r4_16x8b;
                src_r3_16x8b = src_r5_16x8b;
                src_r4_16x8b = src_r6_16x8b;
            }
            while(ht_temp > 0);
        }

        //horizontal q-pel
        {
            __m128i src_r0_8x16b, src_r1_8x16b, src_r2_8x16b;
            __m128i src_r3_8x16b, src_r4_8x16b, src_r5_8x16b;
            __m128i src_r0r1_c0_8x16b, src_r2r3_c0_8x16b, src_r4r5_c0_8x16b;
            __m128i src_hpel_16x8b, src_hpel_8x16b;

            __m128i res_t1_4x32b, res_t2_4x32b, res_t3_4x32b;
            __m128i res_8x16b, res_16x8b;

            __m128i coeff0_1_8x16b, coeff2_3_8x16b, coeff4_5_8x16b;
            __m128i const_val512_4x32b, const_val16_8x16b;
            __m128i mask_low32b;

            mask_low32b = _mm_set1_epi8(0xff);

            // Signed 16-bit pairs consumed by _mm_madd_epi16 (low half first):
            // (1, -5), (20, 20), (-5, 1)
            coeff0_1_8x16b = _mm_set1_epi32(0xFFFB0001);
            coeff2_3_8x16b = _mm_set1_epi32(0x00140014);
            coeff4_5_8x16b = _mm_set1_epi32(0x0001FFFB);

            // keep 0xff only in the four lowest bytes: byte mask for the 4-pixel store
            mask_low32b = _mm_srli_si128(mask_low32b, 12);

            const_val512_4x32b = _mm_set1_epi32(512);   // rounding term for the >> 10 below
            const_val16_8x16b = _mm_set1_epi16(16);     // rounding term for the >> 5 below

            do
            {
                // r0 holds temp elements 0..3 and r1 elements 1..8; r2..r5 are
                // r1 shifted by one further element each, giving the six
                // shifted views the 6-tap filter needs for four outputs
                src_r0_8x16b = _mm_loadl_epi64((__m128i *)(pi2_temp2));
                src_r1_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp2 + 1));
                src_r2_8x16b = _mm_srli_si128(src_r1_8x16b, 2);
                src_r3_8x16b = _mm_srli_si128(src_r1_8x16b, 4);
                src_r4_8x16b = _mm_srli_si128(src_r1_8x16b, 6);
                src_r5_8x16b = _mm_srli_si128(src_r1_8x16b, 8);

                src_r0r1_c0_8x16b = _mm_unpacklo_epi16(src_r0_8x16b, src_r1_8x16b);
                src_r2r3_c0_8x16b = _mm_unpacklo_epi16(src_r2_8x16b, src_r3_8x16b);
                src_r4r5_c0_8x16b = _mm_unpacklo_epi16(src_r4_8x16b, src_r5_8x16b);

                res_t1_4x32b = _mm_madd_epi16(src_r0r1_c0_8x16b, coeff0_1_8x16b);
                res_t2_4x32b = _mm_madd_epi16(src_r2r3_c0_8x16b, coeff2_3_8x16b);
                res_t3_4x32b = _mm_madd_epi16(src_r4r5_c0_8x16b, coeff4_5_8x16b);

                // (sum + 512) >> 10 in 32-bit precision
                res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t2_4x32b);
                res_t3_4x32b = _mm_add_epi32(const_val512_4x32b, res_t3_4x32b);
                res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t3_4x32b);
                res_t1_4x32b = _mm_srai_epi32(res_t1_4x32b, 10);

                // narrow to 8 bits; the unsigned pack saturates to 0..255
                res_8x16b = _mm_packs_epi32(res_t1_4x32b, res_t1_4x32b);
                res_16x8b = _mm_packus_epi16(res_8x16b, res_8x16b);

                // vertical half-pel samples of the selected column: (sum + 16) >> 5
                src_hpel_8x16b = _mm_loadl_epi64((__m128i *)(pi2_temp3));
                src_hpel_8x16b = _mm_add_epi16(src_hpel_8x16b, const_val16_8x16b);
                src_hpel_8x16b = _mm_srai_epi16(src_hpel_8x16b, 5); //shifting right by 5 bits.
                src_hpel_16x8b = _mm_packus_epi16(src_hpel_8x16b, src_hpel_8x16b);

                // rounded byte-wise average of the two half-pel predictions
                res_16x8b = _mm_avg_epu8(res_16x8b, src_hpel_16x8b);

                // only the four bytes selected by the mask are written to pu1_dst
                _mm_maskmoveu_si128(res_16x8b, mask_low32b, (char*)pu1_dst);

                ht--;
                pi2_temp2 = pi2_temp2 + 4 + 5;
                pi2_temp3 = pi2_temp3 + 4 + 5;
                pu1_dst = pu1_dst + dst_strd;
            }
            while(ht > 0);
        }
    }
    else if(wd == 8)
    {
        // vertical half-pel
        {
            __m128i src_r0_16x8b, src_r1_16x8b, src_r2_16x8b, src_r3_16x8b, src_r4_16x8b;
            __m128i src_r5_16x8b, src_r6_16x8b;
            __m128i src_r0r1_16x8b, src_r2r3_16x8b, src_r4r5_16x8b;

            __m128i res_t1_8x16b, res_t2_8x16b, res_t3_8x16b;

            __m128i coeff0_1_16x8b, coeff2_3_16x8b, coeff4_5_16x8b;

            // Signed byte pairs consumed by _mm_maddubs_epi16:
            // c0 = c5 = 1, c1 = c4 = -5, c2 = c3 = 20
            coeff0_1_16x8b = _mm_set1_epi32(0xFB01FB01);  //c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1
            coeff2_3_16x8b = _mm_set1_epi32(0x14141414);  //c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3
            coeff4_5_16x8b = _mm_set1_epi32(0x01FB01FB);  //c4 c5 c4 c5 c4 c5 c4 c5 c4 c5 c4 c5 c4 c5 c4 c5

            //prologue: Load all the pred rows except sixth  and seventh row for the
            //first and second row processing.
            // Each load fetches 16 bytes per row although only 8 + 5 columns
            // end up being used by the horizontal pass.
            // NOTE(review): presumably the reference buffer is padded so the
            // extra bytes are readable - confirm.
            src_r0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src));
            pu1_src =  pu1_src + src_strd;

            src_r1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src));
            pu1_src =  pu1_src + src_strd;

            src_r2_16x8b = _mm_loadu_si128((__m128i *)(pu1_src));
            pu1_src =  pu1_src + src_strd;

            src_r3_16x8b = _mm_loadu_si128((__m128i *)(pu1_src));
            pu1_src =  pu1_src + src_strd;

            src_r4_16x8b = _mm_loadu_si128((__m128i *)(pu1_src));
            pu1_src =  pu1_src + src_strd;

            //Core Loop: Process all the rows.
            do
            {
                // both new rows of this iteration are loaded up front
                src_r5_16x8b = _mm_loadu_si128((__m128i *)(pu1_src));
                src_r6_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + src_strd));

                // first output row, columns 0..7 (window rows r0..r5)
                src_r0r1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r1_16x8b);
                src_r2r3_16x8b = _mm_unpacklo_epi8(src_r2_16x8b, src_r3_16x8b);
                src_r4r5_16x8b = _mm_unpacklo_epi8(src_r4_16x8b, src_r5_16x8b);

                res_t1_8x16b = _mm_maddubs_epi16(src_r0r1_16x8b, coeff0_1_16x8b);
                res_t2_8x16b = _mm_maddubs_epi16(src_r2r3_16x8b, coeff2_3_16x8b);
                res_t3_8x16b = _mm_maddubs_epi16(src_r4r5_16x8b, coeff4_5_16x8b);

                res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t2_8x16b);
                res_t1_8x16b = _mm_add_epi16(res_t3_8x16b, res_t1_8x16b);

                _mm_storeu_si128((__m128i *)(pi2_temp1), res_t1_8x16b);

                // first output row, columns 8..15
                src_r0r1_16x8b = _mm_unpackhi_epi8(src_r0_16x8b, src_r1_16x8b);
                src_r2r3_16x8b = _mm_unpackhi_epi8(src_r2_16x8b, src_r3_16x8b);
                src_r4r5_16x8b = _mm_unpackhi_epi8(src_r4_16x8b, src_r5_16x8b);

                res_t1_8x16b = _mm_maddubs_epi16(src_r0r1_16x8b, coeff0_1_16x8b);
                res_t2_8x16b = _mm_maddubs_epi16(src_r2r3_16x8b, coeff2_3_16x8b);
                res_t3_8x16b = _mm_maddubs_epi16(src_r4r5_16x8b, coeff4_5_16x8b);

                res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t2_8x16b);
                res_t1_8x16b = _mm_add_epi16(res_t3_8x16b, res_t1_8x16b);

                // This store covers elements 8..15 while the row pitch is 13,
                // so its last three lanes land in the next temp row; they are
                // overwritten by the store to (pi2_temp1 + 8 + 5) below.
                _mm_storeu_si128((__m128i *)(pi2_temp1 + 8), res_t1_8x16b);

                // second output row, columns 0..7 (window rows r1..r6)
                src_r0r1_16x8b = _mm_unpacklo_epi8(src_r1_16x8b, src_r2_16x8b);
                src_r2r3_16x8b = _mm_unpacklo_epi8(src_r3_16x8b, src_r4_16x8b);
                src_r4r5_16x8b = _mm_unpacklo_epi8(src_r5_16x8b, src_r6_16x8b);

                res_t1_8x16b = _mm_maddubs_epi16(src_r0r1_16x8b, coeff0_1_16x8b);
                res_t2_8x16b = _mm_maddubs_epi16(src_r2r3_16x8b, coeff2_3_16x8b);
                res_t3_8x16b = _mm_maddubs_epi16(src_r4r5_16x8b, coeff4_5_16x8b);

                res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t2_8x16b);
                res_t1_8x16b = _mm_add_epi16(res_t3_8x16b, res_t1_8x16b);

                _mm_storeu_si128((__m128i *)(pi2_temp1 + 8 + 5), res_t1_8x16b);

                // second output row, columns 8..15
                src_r0r1_16x8b = _mm_unpackhi_epi8(src_r1_16x8b, src_r2_16x8b);
                src_r2r3_16x8b = _mm_unpackhi_epi8(src_r3_16x8b, src_r4_16x8b);
                src_r4r5_16x8b = _mm_unpackhi_epi8(src_r5_16x8b, src_r6_16x8b);

                res_t1_8x16b = _mm_maddubs_epi16(src_r0r1_16x8b, coeff0_1_16x8b);
                res_t2_8x16b = _mm_maddubs_epi16(src_r2r3_16x8b, coeff2_3_16x8b);
                res_t3_8x16b = _mm_maddubs_epi16(src_r4r5_16x8b, coeff4_5_16x8b);

                res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t2_8x16b);
                res_t1_8x16b = _mm_add_epi16(res_t3_8x16b, res_t1_8x16b);

                // Also spills three lanes past this temp row (see above).
                // NOTE(review): on the last iteration the spill lies beyond
                // the final temp row, so pu1_tmp presumably has to provide at
                // least three spare WORD16 elements after it - confirm.
                _mm_storeu_si128((__m128i *)(pi2_temp1 + 8 + 5 + 8), res_t1_8x16b);

                // slide the window down by two rows
                src_r0_16x8b = src_r2_16x8b;
                src_r1_16x8b = src_r3_16x8b;
                src_r2_16x8b = src_r4_16x8b;
                src_r3_16x8b = src_r5_16x8b;
                src_r4_16x8b = src_r6_16x8b;

                ht_temp -= 2;
                pu1_src =  pu1_src + (src_strd << 1);
                pi2_temp1 = pi2_temp1 + (13 << 1);  // two temp rows of 8 + 5 elements
            }
            while(ht_temp > 0);
        }
        // horizontal q-pel
        {
            __m128i src_r0_8x16b, src_r1_8x16b, src_r2_8x16b, src_r3_8x16b;
            __m128i src_r4_8x16b, src_r5_8x16b;
            __m128i src_r0r1_c0_8x16b, src_r2r3_c0_8x16b, src_r4r5_c0_8x16b;
            __m128i src_r0r1_c1_8x16b, src_r2r3_c1_8x16b, src_r4r5_c1_8x16b;
            __m128i src_hpel_8x16b, src_hpel_16x8b;

            __m128i res_t0_4x32b, res_t1_4x32b, res_t2_4x32b, res_t3_4x32b;
            __m128i res_8x16b, res_16x8b;

            __m128i coeff0_1_8x16b, coeff2_3_8x16b, coeff4_5_8x16b;
            __m128i const_val512_4x32b, const_val16_8x16b;

            // Signed 16-bit pairs consumed by _mm_madd_epi16 (low half first):
            // (1, -5), (20, 20), (-5, 1)
            coeff0_1_8x16b = _mm_set1_epi32(0xFFFB0001);
            coeff2_3_8x16b = _mm_set1_epi32(0x00140014);
            coeff4_5_8x16b = _mm_set1_epi32(0x0001FFFB);

            const_val512_4x32b = _mm_set1_epi32(512);   // rounding term for the >> 10 below
            const_val16_8x16b = _mm_set1_epi16(16);     // rounding term for the >> 5 below

            do
            {
                // six views of the temp row, each shifted by one element
                src_r0_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp2));
                src_r1_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp2 + 1));
                src_r2_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp2 + 2));
                src_r3_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp2 + 3));
                src_r4_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp2 + 4));
                src_r5_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp2 + 5));

                // c0: taps for output pixels 0..3, c1: taps for output pixels 4..7
                src_r0r1_c0_8x16b = _mm_unpacklo_epi16(src_r0_8x16b, src_r1_8x16b);
                src_r2r3_c0_8x16b = _mm_unpacklo_epi16(src_r2_8x16b, src_r3_8x16b);
                src_r4r5_c0_8x16b = _mm_unpacklo_epi16(src_r4_8x16b, src_r5_8x16b);

                src_r0r1_c1_8x16b = _mm_unpackhi_epi16(src_r0_8x16b, src_r1_8x16b);
                src_r2r3_c1_8x16b = _mm_unpackhi_epi16(src_r2_8x16b, src_r3_8x16b);
                src_r4r5_c1_8x16b = _mm_unpackhi_epi16(src_r4_8x16b, src_r5_8x16b);

                res_t1_4x32b = _mm_madd_epi16(src_r0r1_c0_8x16b, coeff0_1_8x16b);
                res_t2_4x32b = _mm_madd_epi16(src_r2r3_c0_8x16b, coeff2_3_8x16b);
                res_t3_4x32b = _mm_madd_epi16(src_r4r5_c0_8x16b, coeff4_5_8x16b);

                // (sum + 512) >> 10 in 32-bit precision, pixels 0..3
                res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t2_4x32b);
                res_t3_4x32b = _mm_add_epi32(res_t3_4x32b, const_val512_4x32b);
                res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t3_4x32b);

                res_t0_4x32b = _mm_srai_epi32(res_t1_4x32b, 10);

                res_t1_4x32b = _mm_madd_epi16(src_r0r1_c1_8x16b, coeff0_1_8x16b);
                res_t2_4x32b = _mm_madd_epi16(src_r2r3_c1_8x16b, coeff2_3_8x16b);
                res_t3_4x32b = _mm_madd_epi16(src_r4r5_c1_8x16b, coeff4_5_8x16b);

                // same for pixels 4..7
                res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t2_4x32b);
                res_t3_4x32b = _mm_add_epi32(res_t3_4x32b, const_val512_4x32b);
                res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t3_4x32b);

                res_t1_4x32b = _mm_srai_epi32(res_t1_4x32b, 10);

                // narrow to 8 bits; the unsigned pack saturates to 0..255
                res_8x16b = _mm_packs_epi32(res_t0_4x32b, res_t1_4x32b);
                res_16x8b = _mm_packus_epi16(res_8x16b, res_8x16b);

                // vertical half-pel samples of the selected columns: (sum + 16) >> 5
                src_hpel_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp3));
                src_hpel_8x16b = _mm_add_epi16(src_hpel_8x16b, const_val16_8x16b);
                src_hpel_8x16b = _mm_srai_epi16(src_hpel_8x16b, 5); //shifting right by 5 bits.
                src_hpel_16x8b = _mm_packus_epi16(src_hpel_8x16b, src_hpel_8x16b);

                // rounded byte-wise average of the two half-pel predictions
                res_16x8b = _mm_avg_epu8(res_16x8b, src_hpel_16x8b);

                // write the eight result pixels of this row
                _mm_storel_epi64((__m128i *)(pu1_dst), res_16x8b);

                ht--;
                pi2_temp2 = pi2_temp2 + 8 + 5;
                pi2_temp3 = pi2_temp3 + 8 + 5;
                pu1_dst = pu1_dst + dst_strd;
            }
            while(ht > 0);
        }
    }
    else // wd == 16
    {
        // vertical half-pel
        {
            __m128i src_r0_16x8b, src_r1_16x8b, src_r2_16x8b, src_r3_16x8b;
            __m128i src_r4_16x8b, src_r5_16x8b;
            __m128i src_r0_c2_16x8b, src_r1_c2_16x8b, src_r2_c2_16x8b, src_r3_c2_16x8b;
            __m128i src_r4_c2_16x8b, src_r5_c2_16x8b;
            __m128i src_r0r1_16x8b, src_r2r3_16x8b, src_r4r5_16x8b;

            __m128i res_t1_8x16b, res_t2_8x16b, res_t3_8x16b;

            __m128i coeff0_1_16x8b,coeff2_3_16x8b,coeff4_5_16x8b;

            // Signed byte pairs consumed by _mm_maddubs_epi16:
            // c0 = c5 = 1, c1 = c4 = -5, c2 = c3 = 20
            coeff0_1_16x8b = _mm_set1_epi32(0xFB01FB01);  //c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1
            coeff2_3_16x8b = _mm_set1_epi32(0x14141414);  //c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3
            coeff4_5_16x8b = _mm_set1_epi32(0x01FB01FB);  //c4 c5 c4 c5 c4 c5 c4 c5 c4 c5 c4 c5 c4 c5 c4 c5

            //prologue: load the first five rows of the filter window. Each row
            //is held as two registers: source bytes 0..15 and bytes 16..31 (_c2).
            // NOTE(review): only 16 + 5 columns are consumed later; presumably
            // the reference buffer is padded so all 32 bytes are readable -
            // confirm.
            src_r0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src));
            src_r0_c2_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + 16));
            pu1_src =  pu1_src + src_strd;
            src_r1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src));
            src_r1_c2_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + 16));
            pu1_src =  pu1_src + src_strd;
            src_r2_16x8b = _mm_loadu_si128((__m128i *)(pu1_src));
            src_r2_c2_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + 16));
            pu1_src =  pu1_src + src_strd;
            src_r3_16x8b = _mm_loadu_si128((__m128i *)(pu1_src));
            src_r3_c2_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + 16));
            pu1_src =  pu1_src + src_strd;
            src_r4_16x8b = _mm_loadu_si128((__m128i *)(pu1_src));
            src_r4_c2_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + 16));
            pu1_src =  pu1_src + src_strd;

            //Core Loop: Process all the rows.
            // Unlike the narrower cases, one output row is produced per iteration.
            do
            {
                src_r5_16x8b  = _mm_loadu_si128((__m128i *)(pu1_src));
                src_r5_c2_16x8b  = _mm_loadu_si128((__m128i *)(pu1_src + 16));

                // columns 0..7
                src_r0r1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r1_16x8b);
                src_r2r3_16x8b = _mm_unpacklo_epi8(src_r2_16x8b, src_r3_16x8b);
                src_r4r5_16x8b = _mm_unpacklo_epi8(src_r4_16x8b, src_r5_16x8b);

                res_t1_8x16b = _mm_maddubs_epi16(src_r0r1_16x8b, coeff0_1_16x8b);
                res_t2_8x16b = _mm_maddubs_epi16(src_r2r3_16x8b, coeff2_3_16x8b);
                res_t3_8x16b = _mm_maddubs_epi16(src_r4r5_16x8b, coeff4_5_16x8b);

                res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t2_8x16b);
                res_t1_8x16b = _mm_add_epi16(res_t3_8x16b, res_t1_8x16b);

                _mm_storeu_si128((__m128i *)(pi2_temp1), res_t1_8x16b);

                // columns 8..15
                src_r0r1_16x8b = _mm_unpackhi_epi8(src_r0_16x8b, src_r1_16x8b);
                src_r2r3_16x8b = _mm_unpackhi_epi8(src_r2_16x8b, src_r3_16x8b);
                src_r4r5_16x8b = _mm_unpackhi_epi8(src_r4_16x8b, src_r5_16x8b);

                res_t1_8x16b = _mm_maddubs_epi16(src_r0r1_16x8b, coeff0_1_16x8b);
                res_t2_8x16b = _mm_maddubs_epi16(src_r2r3_16x8b, coeff2_3_16x8b);
                res_t3_8x16b = _mm_maddubs_epi16(src_r4r5_16x8b, coeff4_5_16x8b);

                res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t2_8x16b);
                res_t1_8x16b = _mm_add_epi16(res_t3_8x16b, res_t1_8x16b);

                _mm_storeu_si128((__m128i *)(pi2_temp1 + 8), res_t1_8x16b);

                // columns 16..23, taken from the low half of the _c2 registers
                src_r0r1_16x8b = _mm_unpacklo_epi8(src_r0_c2_16x8b, src_r1_c2_16x8b);
                src_r2r3_16x8b = _mm_unpacklo_epi8(src_r2_c2_16x8b, src_r3_c2_16x8b);
                src_r4r5_16x8b = _mm_unpacklo_epi8(src_r4_c2_16x8b, src_r5_c2_16x8b);

                res_t1_8x16b = _mm_maddubs_epi16(src_r0r1_16x8b, coeff0_1_16x8b);
                res_t2_8x16b = _mm_maddubs_epi16(src_r2r3_16x8b, coeff2_3_16x8b);
                res_t3_8x16b = _mm_maddubs_epi16(src_r4r5_16x8b, coeff4_5_16x8b);

                res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t2_8x16b);
                res_t1_8x16b = _mm_add_epi16(res_t3_8x16b, res_t1_8x16b);

                // This store covers elements 16..23 while the row pitch is 21,
                // so its last three lanes land in the next temp row and are
                // overwritten by the next iteration's first store.
                // NOTE(review): on the last iteration the spill lies beyond
                // the final temp row, so pu1_tmp presumably has to provide at
                // least three spare WORD16 elements after it - confirm.
                _mm_storeu_si128((__m128i *)(pi2_temp1 + 16), res_t1_8x16b);

                // slide the window down by one row
                src_r0_16x8b = src_r1_16x8b;
                src_r1_16x8b = src_r2_16x8b;
                src_r2_16x8b = src_r3_16x8b;
                src_r3_16x8b = src_r4_16x8b;
                src_r4_16x8b = src_r5_16x8b;

                src_r0_c2_16x8b = src_r1_c2_16x8b;
                src_r1_c2_16x8b = src_r2_c2_16x8b;
                src_r2_c2_16x8b = src_r3_c2_16x8b;
                src_r3_c2_16x8b = src_r4_c2_16x8b;
                src_r4_c2_16x8b = src_r5_c2_16x8b;

                ht_temp--;
                pu1_src =  pu1_src + src_strd;
                pi2_temp1 =  pi2_temp1 + 16 + 5;
            }
            while(ht_temp > 0);
        }
        // horizontal q-pel
        {
            __m128i src_r0_8x16b, src_r1_8x16b, src_r2_8x16b, src_r3_8x16b;
            __m128i src_r4_8x16b, src_r5_8x16b;
            __m128i src_r0r1_8x16b, src_r2r3_8x16b, src_r4r5_8x16b;
            __m128i src_hpel1_8x16b, src_hpel2_8x16b, src_hpel_16x8b;

            __m128i res_t0_4x32b, res_t1_4x32b, res_t2_4x32b, res_t3_4x32b;
            __m128i res_c0_8x16b, res_c1_8x16b, res_16x8b;

            __m128i coeff0_1_8x16b, coeff2_3_8x16b, coeff4_5_8x16b;
            __m128i const_val512_4x32b, const_val16_8x16b;

            // Signed 16-bit pairs consumed by _mm_madd_epi16 (low half first):
            // (1, -5), (20, 20), (-5, 1)
            coeff0_1_8x16b = _mm_set1_epi32(0xFFFB0001);
            coeff2_3_8x16b = _mm_set1_epi32(0x00140014);
            coeff4_5_8x16b = _mm_set1_epi32(0x0001FFFB);

            const_val512_4x32b = _mm_set1_epi32(512);   // rounding term for the >> 10 below
            const_val16_8x16b = _mm_set1_epi16(16);     // rounding term for the >> 5 below

            do
            {
                // left half of the row (output pixels 0..7): six views of the
                // temp row, each shifted by one element
                src_r0_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp2));
                src_r1_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp2 + 1));
                src_r2_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp2 + 2));
                src_r3_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp2 + 3));
                src_r4_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp2 + 4));
                src_r5_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp2 + 5));

                // pixels 0..3
                src_r0r1_8x16b = _mm_unpacklo_epi16(src_r0_8x16b, src_r1_8x16b);
                src_r2r3_8x16b = _mm_unpacklo_epi16(src_r2_8x16b, src_r3_8x16b);
                src_r4r5_8x16b = _mm_unpacklo_epi16(src_r4_8x16b, src_r5_8x16b);

                res_t1_4x32b = _mm_madd_epi16(src_r0r1_8x16b, coeff0_1_8x16b);
                res_t2_4x32b = _mm_madd_epi16(src_r2r3_8x16b, coeff2_3_8x16b);
                res_t3_4x32b = _mm_madd_epi16(src_r4r5_8x16b, coeff4_5_8x16b);

                // (sum + 512) >> 10 in 32-bit precision
                res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t2_4x32b);
                res_t3_4x32b = _mm_add_epi32(res_t3_4x32b, const_val512_4x32b);
                res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t3_4x32b);
                res_t0_4x32b = _mm_srai_epi32(res_t1_4x32b, 10);

                // pixels 4..7
                src_r0r1_8x16b = _mm_unpackhi_epi16(src_r0_8x16b, src_r1_8x16b);
                src_r2r3_8x16b = _mm_unpackhi_epi16(src_r2_8x16b, src_r3_8x16b);
                src_r4r5_8x16b = _mm_unpackhi_epi16(src_r4_8x16b, src_r5_8x16b);

                res_t1_4x32b = _mm_madd_epi16(src_r0r1_8x16b, coeff0_1_8x16b);
                res_t2_4x32b = _mm_madd_epi16(src_r2r3_8x16b, coeff2_3_8x16b);
                res_t3_4x32b = _mm_madd_epi16(src_r4r5_8x16b, coeff4_5_8x16b);

                res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t2_4x32b);
                res_t3_4x32b = _mm_add_epi32(const_val512_4x32b, res_t3_4x32b);
                res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t3_4x32b);
                res_t1_4x32b = _mm_srai_epi32(res_t1_4x32b, 10);

                // 16-bit results for pixels 0..7
                res_c0_8x16b = _mm_packs_epi32(res_t0_4x32b, res_t1_4x32b);

                // right half of the row (output pixels 8..15)
                src_r0_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp2 + 8));
                src_r1_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp2 + 8 + 1));
                src_r2_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp2 + 8 + 2));
                src_r3_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp2 + 8 + 3));
                src_r4_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp2 + 8 + 4));
                src_r5_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp2 + 8 + 5));

                // pixels 8..11
                src_r0r1_8x16b = _mm_unpacklo_epi16(src_r0_8x16b, src_r1_8x16b);
                src_r2r3_8x16b = _mm_unpacklo_epi16(src_r2_8x16b, src_r3_8x16b);
                src_r4r5_8x16b = _mm_unpacklo_epi16(src_r4_8x16b, src_r5_8x16b);

                res_t1_4x32b = _mm_madd_epi16(src_r0r1_8x16b, coeff0_1_8x16b);
                res_t2_4x32b = _mm_madd_epi16(src_r2r3_8x16b, coeff2_3_8x16b);
                res_t3_4x32b = _mm_madd_epi16(src_r4r5_8x16b, coeff4_5_8x16b);

                res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t2_4x32b);
                res_t3_4x32b = _mm_add_epi32(const_val512_4x32b, res_t3_4x32b);
                res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t3_4x32b);
                res_t0_4x32b = _mm_srai_epi32(res_t1_4x32b ,10);

                // pixels 12..15
                src_r0r1_8x16b = _mm_unpackhi_epi16(src_r0_8x16b, src_r1_8x16b);
                src_r2r3_8x16b = _mm_unpackhi_epi16(src_r2_8x16b, src_r3_8x16b);
                src_r4r5_8x16b = _mm_unpackhi_epi16(src_r4_8x16b, src_r5_8x16b);

                res_t1_4x32b = _mm_madd_epi16(src_r0r1_8x16b, coeff0_1_8x16b);
                res_t2_4x32b = _mm_madd_epi16(src_r2r3_8x16b, coeff2_3_8x16b);
                res_t3_4x32b = _mm_madd_epi16(src_r4r5_8x16b, coeff4_5_8x16b);

                res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t2_4x32b);
                res_t3_4x32b = _mm_add_epi32(const_val512_4x32b, res_t3_4x32b);
                res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t3_4x32b);
                res_t1_4x32b = _mm_srai_epi32(res_t1_4x32b, 10);

                // narrow all 16 pixels to 8 bits; the unsigned pack saturates to 0..255
                res_c1_8x16b = _mm_packs_epi32(res_t0_4x32b, res_t1_4x32b);
                res_16x8b = _mm_packus_epi16(res_c0_8x16b, res_c1_8x16b);

                // vertical half-pel samples of the selected columns: (sum + 16) >> 5
                src_hpel1_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp3));
                src_hpel1_8x16b = _mm_add_epi16(src_hpel1_8x16b, const_val16_8x16b);
                src_hpel1_8x16b = _mm_srai_epi16(src_hpel1_8x16b, 5); //shifting right by 5 bits.

                src_hpel2_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp3 + 8));
                src_hpel2_8x16b = _mm_add_epi16(src_hpel2_8x16b, const_val16_8x16b);
                src_hpel2_8x16b = _mm_srai_epi16(src_hpel2_8x16b, 5); //shifting right by 5 bits.

                // rounded byte-wise average of the two half-pel predictions
                src_hpel_16x8b = _mm_packus_epi16(src_hpel1_8x16b, src_hpel2_8x16b);
                res_16x8b = _mm_avg_epu8(res_16x8b, src_hpel_16x8b);

                // write the sixteen result pixels of this row
                _mm_storeu_si128((__m128i *)(pu1_dst), res_16x8b);

                ht--;
                pi2_temp2 = pi2_temp2 + 16 + 5;
                pi2_temp3 = pi2_temp3 + 16 + 5;
                pu1_dst = pu1_dst + dst_strd;
            }
            while(ht > 0);
        }
    }
}

/*****************************************************************************/
/*                                                                           */
/*  Function Name : ih264_inter_pred_luma_horz_hpel_vert_qpel_ssse3          */
/*                                                                           */
/*  Description   : This function implements a six-tap filter vertically and */
/*                  horizontally on ht x wd block separately and averages    */
/*                  the two sets of values to calculate values at (1/2,1/4), */
/*                  or (1/2, 3/4) as mentioned in sec. 8.4.2.2.1 titled      */
/*                  "Luma sample interpolation process". (ht,wd) can be      */
/*                  (4,4), (8,4), (4,8), (8,8), (16,8), (8,16) or (16,16).   */
/*                                                                           */
/*  Inputs        : pu1_src  - pointer to source                             */
/*                  pu1_dst  - pointer to destination                        */
/*                  src_strd - stride for source                             */
/*                  dst_strd - stride for destination                        */
/*                  ht       - height of the block                           */
/*                  wd       - width of the block                            */
/*                  pu1_tmp  - pointer to temporary buffer                   */
/*                  dydx     - x and y reference offset for q-pel            */
/*                             calculations                                  */
/*                                                                           */
/*  Issues        : None                                                     */
/*                                                                           */
/*  Revision History:                                                        */
/*                                                                           */
/*         DD MM YYYY   Author(s)       Changes                              */
/*         13 02 2015   Kaushik         Initial Version                      */
/*                      Senthoor                                             */
/*                                                                           */
/*****************************************************************************/
void ih264_inter_pred_luma_horz_hpel_vert_qpel_ssse3(UWORD8 *pu1_src,
                                                     UWORD8 *pu1_dst,
                                                     WORD32 src_strd,
                                                     WORD32 dst_strd,
                                                     WORD32 ht,
                                                     WORD32 wd,
                                                     UWORD8* pu1_tmp,
                                                     WORD32 dydx)
{
    WORD32 ht_temp;
    WORD32 y_offset;
    WORD16 *pi2_temp1,*pi2_temp2,*pi2_temp3;

    y_offset = (dydx & 0xf) >> 2;
    pi2_temp1 = (WORD16 *)pu1_tmp;
    pi2_temp2 = pi2_temp1;
    pi2_temp3 = pi2_temp1 + (y_offset >> 1) * wd;

    ht_temp = ht + 5;
    pu1_src -= src_strd << 1;
    pu1_src -= 2;
    pi2_temp3 += wd << 1;
    //the filter input starts from x[-2] (till x[3])

    if(wd == 4)
    {
        // horizontal half-pel
        {
            __m128i src_r0_16x8b, src_r1_16x8b, src_r0r1_t1_16x8b;
            __m128i src_r0_sht_16x8b, src_r1_sht_16x8b;
            __m128i res_r0r1_t1_8x16b, res_r0r1_t2_8x16b, res_r0r1_t3_8x16b;
            __m128i coeff0_1_16x8b, coeff2_3_16x8b, coeff4_5_16x8b;

            coeff0_1_16x8b = _mm_set1_epi32(0xFB01FB01);  //c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1
            coeff2_3_16x8b = _mm_set1_epi32(0x14141414);  //c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3
            coeff4_5_16x8b = _mm_set1_epi32(0x01FB01FB);  //c4 c5 c5 c5 c4 c5 c5 c5 c4 c5 c5 c5 c4 c5 c5 c5

            //Row0 : a0 a1 a2 a3 a4 a5 a6 a7 a8 a9.....
            //Row1 : b0 b1 b2 b3 b4 b5 b6 b7 b8 b9.....

            do
            {
                src_r0_16x8b = _mm_loadu_si128((__m128i *)pu1_src);                         //a0 a1 a2 a3 a4 a5 a6 a7 a8 a9....a15
                src_r1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + src_strd));            //b0 b1 b2 b3 b4 b5 b6 b7 b8 b9....b15

                src_r0_sht_16x8b = _mm_srli_si128(src_r0_16x8b, 1);                         //a1 a2 a3 a4 a5 a6 a7 a8 a9....a15 0
                src_r1_sht_16x8b = _mm_srli_si128(src_r1_16x8b, 1);                         //b1 b2 b3 b4 b5 b6 b7 b8 b9....b15 0

                src_r0_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r0_sht_16x8b);           //a0 a1 a1 a2 a2 a3 a3 a4 a4 a5 a5 a6 a6 a7 a7 a8
                src_r1_16x8b = _mm_unpacklo_epi8(src_r1_16x8b, src_r1_sht_16x8b);           //b0 b1 b1 b2 b2 b3 b3 b4 b4 b5 b5 b6 b6 b7 b7 b8

                src_r0r1_t1_16x8b = _mm_unpacklo_epi64(src_r0_16x8b, src_r1_16x8b);         //a0 a1 a1 a2 a2 a3 a3 a4 b0 b1 b1 b2 b2 b3 b3 b4
                res_r0r1_t1_8x16b = _mm_maddubs_epi16(src_r0r1_t1_16x8b, coeff0_1_16x8b);   //a0*c0+a1*c1 a1*c0+a2*c1 a2*c0+a3*c1 a3*c0+a4*c1
                                                                                            //b0*c0+b1*c1 b1*c0+b2*c1 b2*c0+b3*c1 b3*c0+b4*c1

                src_r0_16x8b = _mm_srli_si128(src_r0_16x8b, 4);                             //a2 a3 a3 a4 a4 a5 a5 a6 a6 a7 a7 a8  0  0  0  0
                src_r1_16x8b = _mm_srli_si128(src_r1_16x8b, 4);                             //b2 b3 b3 b4 b4 b5 b5 b6 b6 b7 b7 b8  0  0  0  0

                src_r0r1_t1_16x8b = _mm_unpacklo_epi64(src_r0_16x8b, src_r1_16x8b);         //a2 a3 a3 a4 a4 a5 a5 a6 b2 b3 b3 b4 b4 b5 b5 b6
                res_r0r1_t2_8x16b = _mm_maddubs_epi16(src_r0r1_t1_16x8b, coeff2_3_16x8b);   //a2*c2+a3*c3 a3*c2+a4*c3 a4*c2+a5*c3 a5*c2+a6*c3
                                                                                            //b2*c2+b3*c3 b3*c2+b4*c3 b4*c2+b5*c3 b5*c2+b6*c3

                src_r0_16x8b = _mm_srli_si128(src_r0_16x8b, 4);                             //a4 a5 a5 a6 a6 a7 a7 a8  0  0  0  0  0  0  0  0
                src_r1_16x8b = _mm_srli_si128(src_r1_16x8b, 4);                             //b4 b5 b5 b6 b6 b7 b7 b8  0  0  0  0  0  0  0  0

                src_r0r1_t1_16x8b = _mm_unpacklo_epi64(src_r0_16x8b, src_r1_16x8b);         //a4 a5 a5 a6 a6 a7 a7 a8 b4 b5 b5 b6 b6 b7 b7 b8
                res_r0r1_t3_8x16b = _mm_maddubs_epi16(src_r0r1_t1_16x8b, coeff4_5_16x8b);   //a4*c4+a5*c5 a5*c4+a6*c5 a6*c4+a7*c5 a7*c4+a8*c5
                                                                                            //b4*c4+b5*c5 b5*c4+b6*c5 b4*c6+b7*c5 b7*c4+b8*c5

                res_r0r1_t1_8x16b = _mm_add_epi16(res_r0r1_t1_8x16b, res_r0r1_t2_8x16b);
                res_r0r1_t1_8x16b = _mm_add_epi16(res_r0r1_t1_8x16b, res_r0r1_t3_8x16b);


                _mm_storeu_si128((__m128i *)(pi2_temp1), res_r0r1_t1_8x16b);

                ht_temp -= 2;
                pu1_src =  pu1_src + (src_strd << 1);
                pi2_temp1 =  pi2_temp1 + (4 << 1);
            }
            while(ht_temp > 0);
        }
        // vertical q-pel
        {
            __m128i src_r0_8x16b, src_r1_8x16b, src_r2_8x16b, src_r3_8x16b;
            __m128i src_r4_8x16b, src_r5_8x16b, src_r6_8x16b;
            __m128i src_r0r1_c0_8x16b, src_r2r3_c0_8x16b, src_r4r5_c0_8x16b;
            __m128i src_hpel_16x8b, src_hpel_8x16b;

            __m128i res_t0_4x32b, res_t1_4x32b, res_t2_4x32b, res_t3_4x32b;
            __m128i res_8x16b, res_16x8b;

            __m128i coeff0_1_8x16b, coeff2_3_8x16b, coeff4_5_8x16b;
            __m128i const_val512_4x32b, const_val16_8x16b;
            __m128i mask_low32b;

            mask_low32b = _mm_set1_epi8(0xff);
            const_val512_4x32b = _mm_set1_epi32(512);
            const_val16_8x16b = _mm_set1_epi16(16);
            mask_low32b = _mm_srli_si128(mask_low32b, 12);

            coeff0_1_8x16b = _mm_set1_epi32(0xFFFB0001);
            coeff2_3_8x16b = _mm_set1_epi32(0x00140014);
            coeff4_5_8x16b = _mm_set1_epi32(0x0001FFFB);

            src_r0_8x16b = _mm_loadl_epi64((__m128i *)(pi2_temp2));
            src_r1_8x16b = _mm_loadl_epi64((__m128i *)(pi2_temp2 + 4));
            src_r2_8x16b = _mm_loadl_epi64((__m128i *)(pi2_temp2 + 8));
            src_r3_8x16b = _mm_loadl_epi64((__m128i *)(pi2_temp2 + 12));
            src_r4_8x16b = _mm_loadl_epi64((__m128i *)(pi2_temp2 + 16));
            pi2_temp2 += 20;

            do
            {
                src_r5_8x16b = _mm_loadl_epi64((__m128i *)(pi2_temp2));
                src_r6_8x16b = _mm_loadl_epi64((__m128i *)(pi2_temp2 + 4));

                src_r0r1_c0_8x16b = _mm_unpacklo_epi16(src_r0_8x16b, src_r1_8x16b);
                src_r2r3_c0_8x16b = _mm_unpacklo_epi16(src_r2_8x16b, src_r3_8x16b);
                src_r4r5_c0_8x16b = _mm_unpacklo_epi16(src_r4_8x16b, src_r5_8x16b);

                res_t1_4x32b = _mm_madd_epi16(src_r0r1_c0_8x16b, coeff0_1_8x16b);
                res_t2_4x32b = _mm_madd_epi16(src_r2r3_c0_8x16b, coeff2_3_8x16b);
                res_t3_4x32b = _mm_madd_epi16(src_r4r5_c0_8x16b, coeff4_5_8x16b);

                res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t2_4x32b);
                res_t3_4x32b = _mm_add_epi32(res_t3_4x32b, const_val512_4x32b);
                res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t3_4x32b);
                res_t0_4x32b = _mm_srai_epi32(res_t1_4x32b, 10);

                src_r0r1_c0_8x16b = _mm_unpacklo_epi16(src_r1_8x16b, src_r2_8x16b);
                src_r2r3_c0_8x16b = _mm_unpacklo_epi16(src_r3_8x16b, src_r4_8x16b);
                src_r4r5_c0_8x16b = _mm_unpacklo_epi16(src_r5_8x16b, src_r6_8x16b);

                res_t1_4x32b = _mm_madd_epi16(src_r0r1_c0_8x16b, coeff0_1_8x16b);
                res_t2_4x32b = _mm_madd_epi16(src_r2r3_c0_8x16b, coeff2_3_8x16b);
                res_t3_4x32b = _mm_madd_epi16(src_r4r5_c0_8x16b, coeff4_5_8x16b);

                res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t2_4x32b);
                res_t3_4x32b = _mm_add_epi32(res_t3_4x32b, const_val512_4x32b);
                res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t3_4x32b);
                res_t1_4x32b = _mm_srai_epi32(res_t1_4x32b, 10);

                res_8x16b = _mm_packs_epi32(res_t0_4x32b, res_t1_4x32b);
                res_16x8b = _mm_packus_epi16(res_8x16b, res_8x16b);

                src_hpel_8x16b = _mm_loadu_si128((__m128i *)pi2_temp3);
                src_hpel_8x16b = _mm_add_epi16(src_hpel_8x16b, const_val16_8x16b);
                src_hpel_8x16b = _mm_srai_epi16(src_hpel_8x16b, 5); //shifting right by 5 bits.
                src_hpel_16x8b = _mm_packus_epi16(src_hpel_8x16b, src_hpel_8x16b);

                res_16x8b = _mm_avg_epu8(res_16x8b, src_hpel_16x8b);

                _mm_maskmoveu_si128(res_16x8b, mask_low32b, (char *)(pu1_dst));
                res_16x8b = _mm_srli_si128(res_16x8b, 4);
                _mm_maskmoveu_si128(res_16x8b, mask_low32b, (char *)(pu1_dst + dst_strd));

                src_r0_8x16b = src_r2_8x16b;
                src_r1_8x16b = src_r3_8x16b;
                src_r2_8x16b = src_r4_8x16b;
                src_r3_8x16b = src_r5_8x16b;
                src_r4_8x16b = src_r6_8x16b;

                ht -= 2;
                pi2_temp2 =  pi2_temp2 + (4 << 1);
                pi2_temp3 =  pi2_temp3 + (4 << 1);
                pu1_dst = pu1_dst + (dst_strd << 1);
            }
            while(ht > 0);
        }
    }
    else if(wd == 8)
    {
        // horizontal half-pel
        {
            __m128i src_r0_16x8b, src_r1_16x8b, src_r0_sht_16x8b, src_r1_sht_16x8b;
            __m128i src_r0_t1_16x8b, src_r1_t1_16x8b;

            __m128i res_r0_t1_8x16b, res_r0_t2_8x16b, res_r0_t3_8x16b;
            __m128i res_r1_t1_8x16b, res_r1_t2_8x16b, res_r1_t3_8x16b;

            __m128i coeff0_1_16x8b, coeff2_3_16x8b, coeff4_5_16x8b;

            coeff0_1_16x8b = _mm_set1_epi32(0xFB01FB01);  //c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1
            coeff2_3_16x8b = _mm_set1_epi32(0x14141414);  //c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3
            coeff4_5_16x8b = _mm_set1_epi32(0x01FB01FB);  //c4 c5 c5 c5 c4 c5 c5 c5 c4 c5 c5 c5 c4 c5 c5 c5

            //Row0 : a0 a1 a2 a3 a4 a5 a6 a7 a8 a9.....
            //Row1 : b0 b1 b2 b3 b4 b5 b6 b7 b8 b9.....

            do
            {
                src_r0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src));                   //a0 a1 a2 a3 a4 a5 a6 a7 a8 a9....a15
                src_r1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + src_strd));        //b0 b1 b2 b3 b4 b5 b6 b7 b8 b9....b15

                src_r0_sht_16x8b = _mm_srli_si128(src_r0_16x8b, 1);                     //a1 a2 a3 a4 a5 a6 a7 a8 a9....a15 0
                src_r1_sht_16x8b = _mm_srli_si128(src_r1_16x8b, 1);                     //b1 b2 b3 b4 b5 b6 b7 b8 b9....b15 0

                src_r0_t1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r0_sht_16x8b);    //a0 a1 a1 a2 a2 a3 a3 a4 a4 a5 a5 a6 a6 a7 a7 a8
                src_r1_t1_16x8b = _mm_unpacklo_epi8(src_r1_16x8b, src_r1_sht_16x8b);    //b0 b1 b1 b2 b2 b3 b3 b4 b4 b5 b5 b6 b6 b7 b7 b8

                res_r0_t1_8x16b = _mm_maddubs_epi16(src_r0_t1_16x8b, coeff0_1_16x8b);   //a0*c0+a1*c1 a1*c0+a2*c1 a2*c0+a3*c1 a3*c0+a4*c1
                                                                                        //a4*c0+a5*c1 a5*c0+a6*c1 a6*c0+a7*c1 a7*c0+a8*c1
                res_r1_t1_8x16b = _mm_maddubs_epi16(src_r1_t1_16x8b, coeff0_1_16x8b);   //b0*c0+b1*c1 b1*c0+b2*c1 b2*c0+b3*c1 b3*c0+b4*c1
                                                                                        //b4*c0+b5*c1 b5*c0+b6*c1 b6*c0+b7*c1 b7*c0+b8*c1

                src_r0_16x8b = _mm_srli_si128(src_r0_16x8b, 2);                         //a2 a3 a4 a5 a6 a7 a8 a9....a15 0 0
                src_r1_16x8b = _mm_srli_si128(src_r1_16x8b, 2);                         //b2 b3 b4 b5 b6 b7 b8 b9....b15 0 0

                src_r0_sht_16x8b = _mm_srli_si128(src_r0_sht_16x8b, 2);                 //a3 a4 a5 a6 a7 a8 a9....a15 0  0  0
                src_r1_sht_16x8b = _mm_srli_si128(src_r1_sht_16x8b, 2);                 //b3 b4 b5 b6 b7 b8 b9....b15 0  0  0

                src_r0_t1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r0_sht_16x8b);    //a2 a3 a3 a4 a4 a5 a5 a6 a6 a7 a7 a8 a8 a9 a9 a10
                src_r1_t1_16x8b = _mm_unpacklo_epi8(src_r1_16x8b, src_r1_sht_16x8b);    //b2 b3 b3 b4 b4 b5 b5 b6 b6 b7 b7 b8 a8 a9 a9 a10

                res_r0_t2_8x16b = _mm_maddubs_epi16(src_r0_t1_16x8b, coeff2_3_16x8b);   //a2*c2+a3*c3 a3*c2+a4*c3 a4*c2+a5*c3 a5*c2+a6*c3
                                                                                        //a6*c2+a7*c3 a7*c2+a8*c3 a8*c2+a9*c3 a9*c2+a10*c3
                res_r1_t2_8x16b = _mm_maddubs_epi16(src_r1_t1_16x8b, coeff2_3_16x8b);   //b2*c2+b3*c3 b3*c2+b4*c3 b2*c4+b5*c3 b5*c2+b6*c3
                                                                                        //b6*c2+b7*c3 b7*c2+b8*c3 b8*c2+b9*c3 b9*c2+b10*c3

                src_r0_16x8b = _mm_srli_si128(src_r0_16x8b, 2);                         //a4 a5 a6 a7 a8 a9....a15 0  0  0  0
                src_r1_16x8b = _mm_srli_si128(src_r1_16x8b, 2);                         //b4 b5 b6 b7 b8 b9....b15 0  0  0  0

                src_r0_sht_16x8b = _mm_srli_si128(src_r0_sht_16x8b, 2);                 //a5 a6 a7 a8 a9....a15 0  0  0  0  0
                src_r1_sht_16x8b = _mm_srli_si128(src_r1_sht_16x8b, 2);                 //b5 b6 b7 b8 b9....b15 0  0  0  0  0

                src_r0_t1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r0_sht_16x8b);    //a4 a5 a5 a6 a6 a7 a7 a8 a8 a9 a9 a10 a10 a11 a11 a12
                src_r1_t1_16x8b = _mm_unpacklo_epi8(src_r1_16x8b, src_r1_sht_16x8b);    //b4 b5 b5 b6 b6 b7 b7 b8 b8 b9 b9 b10 b10 b11 b11 b12

                res_r0_t3_8x16b = _mm_maddubs_epi16(src_r0_t1_16x8b, coeff4_5_16x8b);   //a4*c4+a5*c5 a5*c4+a6*c5 a6*c4+a7*c5 a7*c4+a8*c5
                                                                                        //a8*c4+a9*c5 a9*c4+a10*c5 a10*c4+a11*c5 a11*c4+a12*c5
                res_r1_t3_8x16b = _mm_maddubs_epi16(src_r1_t1_16x8b, coeff4_5_16x8b);   //b4*c4+b5*c5 b5*c4+b6*c5 b6*c4+b7*c5 b7*c4+b8*c5
                                                                                        //b8*c4+b9*c5 b9*c4+b10*c5 b10*c4+b11*c5 b11*c4+b12*c5
                res_r0_t1_8x16b = _mm_add_epi16(res_r0_t1_8x16b, res_r0_t2_8x16b);
                res_r0_t1_8x16b = _mm_add_epi16(res_r0_t1_8x16b, res_r0_t3_8x16b);

                res_r1_t1_8x16b = _mm_add_epi16(res_r1_t1_8x16b, res_r1_t2_8x16b);
                res_r1_t1_8x16b = _mm_add_epi16(res_r1_t1_8x16b, res_r1_t3_8x16b);

                _mm_storeu_si128((__m128i *)(pi2_temp1), res_r0_t1_8x16b);
                _mm_storeu_si128((__m128i *)(pi2_temp1 + 8), res_r1_t1_8x16b);

                ht_temp -= 2;
                pu1_src =  pu1_src + (src_strd << 1);
                pi2_temp1 =  pi2_temp1 + (8 << 1);
            }
            while(ht_temp > 0);
        }
        // vertical q-pel
        {
            __m128i src_r0_8x16b, src_r1_8x16b, src_r2_8x16b, src_r3_8x16b;
            __m128i src_r4_8x16b, src_r5_8x16b, src_r6_8x16b;
            __m128i src_r0r1_8x16b, src_r2r3_8x16b, src_r4r5_8x16b;
            __m128i src_hpel_8x16b, src_hpel_16x8b;

            __m128i res_t0_4x32b, res_t1_4x32b, res_t2_4x32b, res_t3_4x32b;
            __m128i res_8x16b, res_16x8b;

            __m128i coeff0_1_8x16b, coeff2_3_8x16b, coeff4_5_8x16b;
            __m128i const_val512_4x32b, const_val16_8x16b;

            coeff0_1_8x16b = _mm_set1_epi32(0xFFFB0001);
            coeff2_3_8x16b = _mm_set1_epi32(0x00140014);
            coeff4_5_8x16b = _mm_set1_epi32(0x0001FFFB);

            const_val512_4x32b = _mm_set1_epi32(512);
            const_val16_8x16b = _mm_set1_epi16(16);

            src_r0_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp2));
            src_r1_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp2 + 8));
            src_r2_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp2 + 16));
            src_r3_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp2 + 24));
            src_r4_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp2 + 32));
            pi2_temp2 += 40;

            do
            {
                src_r5_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp2));
                src_r6_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp2 + 8));

                src_r0r1_8x16b = _mm_unpacklo_epi16(src_r0_8x16b, src_r1_8x16b);
                src_r2r3_8x16b = _mm_unpacklo_epi16(src_r2_8x16b, src_r3_8x16b);
                src_r4r5_8x16b = _mm_unpacklo_epi16(src_r4_8x16b, src_r5_8x16b);

                res_t1_4x32b = _mm_madd_epi16(src_r0r1_8x16b, coeff0_1_8x16b);
                res_t2_4x32b = _mm_madd_epi16(src_r2r3_8x16b, coeff2_3_8x16b);
                res_t3_4x32b = _mm_madd_epi16(src_r4r5_8x16b, coeff4_5_8x16b);

                res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t2_4x32b);
                res_t3_4x32b = _mm_add_epi32(res_t3_4x32b, const_val512_4x32b);
                res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t3_4x32b);
                res_t0_4x32b = _mm_srai_epi32(res_t1_4x32b, 10);

                src_r0r1_8x16b = _mm_unpackhi_epi16(src_r0_8x16b, src_r1_8x16b);
                src_r2r3_8x16b = _mm_unpackhi_epi16(src_r2_8x16b, src_r3_8x16b);
                src_r4r5_8x16b = _mm_unpackhi_epi16(src_r4_8x16b, src_r5_8x16b);

                res_t1_4x32b = _mm_madd_epi16(src_r0r1_8x16b, coeff0_1_8x16b);
                res_t2_4x32b = _mm_madd_epi16(src_r2r3_8x16b, coeff2_3_8x16b);
                res_t3_4x32b = _mm_madd_epi16(src_r4r5_8x16b, coeff4_5_8x16b);

                res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t2_4x32b);
                res_t3_4x32b = _mm_add_epi32(res_t3_4x32b, const_val512_4x32b);
                res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t3_4x32b);
                res_t1_4x32b = _mm_srai_epi32(res_t1_4x32b, 10);

                res_8x16b = _mm_packs_epi32(res_t0_4x32b, res_t1_4x32b);
                res_16x8b = _mm_packus_epi16(res_8x16b, res_8x16b);

                src_hpel_8x16b = _mm_loadu_si128((__m128i *)pi2_temp3);
                src_hpel_8x16b = _mm_add_epi16(const_val16_8x16b, src_hpel_8x16b);
                src_hpel_8x16b = _mm_srai_epi16(src_hpel_8x16b, 5); //shifting right by 5 bits.
                src_hpel_16x8b = _mm_packus_epi16(src_hpel_8x16b, src_hpel_8x16b);

                res_16x8b = _mm_avg_epu8(res_16x8b, src_hpel_16x8b);

                _mm_storel_epi64((__m128i *)(pu1_dst), res_16x8b);

                src_r0r1_8x16b = _mm_unpacklo_epi16(src_r1_8x16b, src_r2_8x16b);
                src_r2r3_8x16b = _mm_unpacklo_epi16(src_r3_8x16b, src_r4_8x16b);
                src_r4r5_8x16b = _mm_unpacklo_epi16(src_r5_8x16b, src_r6_8x16b);

                res_t1_4x32b = _mm_madd_epi16(src_r0r1_8x16b, coeff0_1_8x16b);
                res_t2_4x32b = _mm_madd_epi16(src_r2r3_8x16b, coeff2_3_8x16b);
                res_t3_4x32b = _mm_madd_epi16(src_r4r5_8x16b, coeff4_5_8x16b);

                res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t2_4x32b);
                res_t3_4x32b = _mm_add_epi32(res_t3_4x32b, const_val512_4x32b);
                res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t3_4x32b);
                res_t0_4x32b = _mm_srai_epi32(res_t1_4x32b, 10);

                src_r0r1_8x16b = _mm_unpackhi_epi16(src_r1_8x16b, src_r2_8x16b);
                src_r2r3_8x16b = _mm_unpackhi_epi16(src_r3_8x16b, src_r4_8x16b);
                src_r4r5_8x16b = _mm_unpackhi_epi16(src_r5_8x16b, src_r6_8x16b);

                res_t1_4x32b = _mm_madd_epi16(src_r0r1_8x16b, coeff0_1_8x16b);
                res_t2_4x32b = _mm_madd_epi16(src_r2r3_8x16b, coeff2_3_8x16b);
                res_t3_4x32b = _mm_madd_epi16(src_r4r5_8x16b, coeff4_5_8x16b);

                res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t2_4x32b);
                res_t3_4x32b = _mm_add_epi32(res_t3_4x32b, const_val512_4x32b);
                res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t3_4x32b);
                res_t1_4x32b = _mm_srai_epi32(res_t1_4x32b, 10);

                res_8x16b = _mm_packs_epi32(res_t0_4x32b, res_t1_4x32b);
                res_16x8b = _mm_packus_epi16(res_8x16b, res_8x16b);

                src_hpel_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp3 + 8));
                src_hpel_8x16b = _mm_add_epi16(const_val16_8x16b, src_hpel_8x16b);
                src_hpel_8x16b = _mm_srai_epi16(src_hpel_8x16b, 5); //shifting right by 5 bits.
                src_hpel_16x8b = _mm_packus_epi16(src_hpel_8x16b, src_hpel_8x16b);

                res_16x8b = _mm_avg_epu8(res_16x8b, src_hpel_16x8b);

                _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd), res_16x8b);

                src_r0_8x16b = src_r2_8x16b;
                src_r1_8x16b = src_r3_8x16b;
                src_r2_8x16b = src_r4_8x16b;
                src_r3_8x16b = src_r5_8x16b;
                src_r4_8x16b = src_r6_8x16b;

                ht -= 2;
                pi2_temp2 = pi2_temp2 + (8 << 1);
                pi2_temp3 = pi2_temp3 + (8 << 1);
                pu1_dst = pu1_dst + (dst_strd << 1);
            }
            while(ht > 0);
        }
    }
    else // wd == 16
    {
        UWORD8 *pu1_dst1;
        WORD16 *pi2_temp4,*pi2_temp5;

        pu1_dst1 = pu1_dst + 8;
        pi2_temp4 = pi2_temp2 + 8;
        pi2_temp5 = pi2_temp3 + 8;

        // horizontal half-pel
        {
            __m128i src_r0_16x8b, src_r1_16x8b, src_r0_sht_16x8b, src_r1_sht_16x8b;
            __m128i src_r0_t1_16x8b, src_r1_t1_16x8b;

            __m128i res_r0_t1_8x16b, res_r0_t2_8x16b, res_r0_t3_8x16b;
            __m128i res_r1_t1_8x16b, res_r1_t2_8x16b, res_r1_t3_8x16b;

            __m128i coeff0_1_16x8b, coeff2_3_16x8b, coeff4_5_16x8b;

            coeff0_1_16x8b = _mm_set1_epi32(0xFB01FB01);  //c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1
            coeff2_3_16x8b = _mm_set1_epi32(0x14141414);  //c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3
            coeff4_5_16x8b = _mm_set1_epi32(0x01FB01FB);  //c4 c5 c5 c5 c4 c5 c5 c5 c4 c5 c5 c5 c4 c5 c5 c5

            //Row0 : a0 a1 a2 a3 a4 a5 a6 a7 a8 a9.....
            //Row0 :                         b0 b1 b2 b3 b4 b5 b6 b7 b8 b9.....
            //b0 is same a8. Similarly other bn pixels are same as a(n+8) pixels.

            do
            {
                src_r0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src));                  //a0 a1 a2 a3 a4 a5 a6 a7 a8 a9....a15
                src_r1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + 8));              //b0 b1 b2 b3 b4 b5 b6 b7 b8 b9....b15

                src_r0_sht_16x8b = _mm_srli_si128(src_r0_16x8b, 1);                    //a1 a2 a3 a4 a5 a6 a7 a8 a9....a15 0
                src_r1_sht_16x8b = _mm_srli_si128(src_r1_16x8b, 1);                    //b1 b2 b3 b4 b5 b6 b7 b8 b9....b15 0

                src_r0_t1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r0_sht_16x8b);   //a0 a1 a1 a2 a2 a3 a3 a4 a4 a5 a5 a6 a6 a7 a7 a8
                src_r1_t1_16x8b = _mm_unpacklo_epi8(src_r1_16x8b, src_r1_sht_16x8b);   //b0 b1 b1 b2 b2 b3 b3 b4 b4 b5 b5 b6 b6 b7 b7 b8

                res_r0_t1_8x16b = _mm_maddubs_epi16(src_r0_t1_16x8b, coeff0_1_16x8b);   //a0*c0+a1*c1 a1*c0+a2*c1 a2*c0+a3*c1 a3*c0+a4*c1
                                                                                        //a4*c0+a5*c1 a5*c0+a6*c1 a6*c0+a7*c1 a7*c0+a8*c1
                res_r1_t1_8x16b = _mm_maddubs_epi16(src_r1_t1_16x8b, coeff0_1_16x8b);   //b0*c0+b1*c1 b1*c0+b2*c1 b2*c0+b3*c1 b3*c0+b4*c1
                                                                                        //b4*c0+b5*c1 b5*c0+b6*c1 b6*c0+b7*c1 b7*c0+b8*c1

                src_r0_16x8b = _mm_srli_si128(src_r0_16x8b, 2);                         //a2 a3 a4 a5 a6 a7 a8 a9....a15 0 0
                src_r1_16x8b = _mm_srli_si128(src_r1_16x8b, 2);                         //b2 b3 b4 b5 b6 b7 b8 b9....b15 0 0

                src_r0_sht_16x8b = _mm_srli_si128(src_r0_sht_16x8b, 2);                 //a3 a4 a5 a6 a7 a8 a9....a15 0  0  0
                src_r1_sht_16x8b = _mm_srli_si128(src_r1_sht_16x8b, 2);                 //b3 b4 b5 b6 b7 b8 b9....b15 0  0  0

                src_r0_t1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r0_sht_16x8b);    //a2 a3 a3 a4 a4 a5 a5 a6 a6 a7 a7 a8 a8 a9 a9 a10
                src_r1_t1_16x8b = _mm_unpacklo_epi8(src_r1_16x8b, src_r1_sht_16x8b);    //b2 b3 b3 b4 b4 b5 b5 b6 b6 b7 b7 b8 a8 a9 a9 a10

                res_r0_t2_8x16b = _mm_maddubs_epi16(src_r0_t1_16x8b, coeff2_3_16x8b);   //a2*c2+a3*c3 a3*c2+a4*c3 a4*c2+a5*c3 a5*c2+a6*c3
                                                                                        //a6*c2+a7*c3 a7*c2+a8*c3 a8*c2+a9*c3 a9*c2+a10*c3
                res_r1_t2_8x16b = _mm_maddubs_epi16(src_r1_t1_16x8b, coeff2_3_16x8b);   //b2*c2+b3*c3 b3*c2+b4*c3 b2*c4+b5*c3 b5*c2+b6*c3
                                                                                        //b6*c2+b7*c3 b7*c2+b8*c3 b8*c2+b9*c3 b9*c2+b10*c3

                src_r0_16x8b = _mm_srli_si128(src_r0_16x8b, 2);                         //a4 a5 a6 a7 a8 a9....a15 0  0  0  0
                src_r1_16x8b = _mm_srli_si128(src_r1_16x8b, 2);                         //b4 b5 b6 b7 b8 b9....b15 0  0  0  0

                src_r0_sht_16x8b = _mm_srli_si128(src_r0_sht_16x8b, 2);                 //a5 a6 a7 a8 a9....a15 0  0  0  0  0
                src_r1_sht_16x8b = _mm_srli_si128(src_r1_sht_16x8b, 2);                 //b5 b6 b7 b8 b9....b15 0  0  0  0  0

                src_r0_t1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r0_sht_16x8b);    //a4 a5 a5 a6 a6 a7 a7 a8 a8 a9 a9 a10 a10 a11 a11 a12
                src_r1_t1_16x8b = _mm_unpacklo_epi8(src_r1_16x8b, src_r1_sht_16x8b);    //b4 b5 b5 b6 b6 b7 b7 b8 b8 b9 b9 b10 b10 b11 b11 b12

                res_r0_t3_8x16b = _mm_maddubs_epi16(src_r0_t1_16x8b, coeff4_5_16x8b);   //a4*c4+a5*c5 a5*c4+a6*c5 a6*c4+a7*c5 a7*c4+a8*c5
                                                                                        //a8*c4+a9*c5 a9*c4+a10*c5 a10*c4+a11*c5 a11*c4+a12*c5
                res_r1_t3_8x16b = _mm_maddubs_epi16(src_r1_t1_16x8b, coeff4_5_16x8b);   //b4*c4+b5*c5 b5*c4+b6*c5 b6*c4+b7*c5 b7*c4+b8*c5
                                                                                        //b8*c4+b9*c5 b9*c4+b10*c5 b10*c4+b11*c5 b11*c4+b12*c5
                res_r0_t1_8x16b = _mm_add_epi16(res_r0_t1_8x16b, res_r0_t2_8x16b);
                res_r0_t1_8x16b = _mm_add_epi16(res_r0_t1_8x16b, res_r0_t3_8x16b);

                res_r1_t1_8x16b = _mm_add_epi16(res_r1_t1_8x16b, res_r1_t2_8x16b);
                res_r1_t1_8x16b = _mm_add_epi16(res_r1_t1_8x16b, res_r1_t3_8x16b);

                _mm_storeu_si128((__m128i *)(pi2_temp1), res_r0_t1_8x16b);
                _mm_storeu_si128((__m128i *)(pi2_temp1 + 8), res_r1_t1_8x16b);

                ht_temp--;
                pu1_src =  pu1_src + src_strd;
                pi2_temp1 =  pi2_temp1 + 16;
            }
            while(ht_temp > 0);
        }
        // vertical q-pel
        {
            __m128i src_r0_8x16b, src_r1_8x16b, src_r2_8x16b, src_r3_8x16b, src_r4_8x16b;
            __m128i src_r5_8x16b, src_r6_8x16b;
            __m128i src_r0r1_8x16b, src_r2r3_8x16b, src_r4r5_8x16b;
            __m128i src_hpel_8x16b, src_hpel_16x8b;

            __m128i res_t0_4x32b, res_t1_4x32b, res_t2_4x32b, res_t3_4x32b;
            __m128i res_8x16b, res_16x8b;

            __m128i coeff0_1_8x16b, coeff2_3_8x16b, coeff4_5_8x16b;
            __m128i const_val512_4x32b, const_val16_8x16b;

            coeff0_1_8x16b = _mm_set1_epi32(0xFFFB0001);
            coeff2_3_8x16b = _mm_set1_epi32(0x00140014);
            coeff4_5_8x16b = _mm_set1_epi32(0x0001FFFB);

            const_val512_4x32b = _mm_set1_epi32(512);
            const_val16_8x16b = _mm_set1_epi16(16);

            /**********************************************************/
            /*     Do first height x 8 block                          */
            /**********************************************************/
            src_r0_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp2));
            src_r1_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp2 + 16));
            src_r2_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp2 + 32));
            src_r3_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp2 + 48));
            src_r4_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp2 + 64));
            pi2_temp2 += 80;

            ht_temp = ht;
            do
            {
                src_r5_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp2));
                src_r6_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp2 + 16));

                src_r0r1_8x16b = _mm_unpacklo_epi16(src_r0_8x16b, src_r1_8x16b);
                src_r2r3_8x16b = _mm_unpacklo_epi16(src_r2_8x16b, src_r3_8x16b);
                src_r4r5_8x16b = _mm_unpacklo_epi16(src_r4_8x16b, src_r5_8x16b);

                res_t1_4x32b = _mm_madd_epi16(src_r0r1_8x16b, coeff0_1_8x16b);
                res_t2_4x32b = _mm_madd_epi16(src_r2r3_8x16b, coeff2_3_8x16b);
                res_t3_4x32b = _mm_madd_epi16(src_r4r5_8x16b, coeff4_5_8x16b);

                res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t2_4x32b);
                res_t3_4x32b = _mm_add_epi32(res_t3_4x32b, const_val512_4x32b);
                res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t3_4x32b);
                res_t0_4x32b = _mm_srai_epi32(res_t1_4x32b, 10);

                src_r0r1_8x16b = _mm_unpackhi_epi16(src_r0_8x16b, src_r1_8x16b);
                src_r2r3_8x16b = _mm_unpackhi_epi16(src_r2_8x16b, src_r3_8x16b);
                src_r4r5_8x16b = _mm_unpackhi_epi16(src_r4_8x16b, src_r5_8x16b);

                res_t1_4x32b = _mm_madd_epi16(src_r0r1_8x16b, coeff0_1_8x16b);
                res_t2_4x32b = _mm_madd_epi16(src_r2r3_8x16b, coeff2_3_8x16b);
                res_t3_4x32b = _mm_madd_epi16(src_r4r5_8x16b, coeff4_5_8x16b);

                res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t2_4x32b);
                res_t3_4x32b = _mm_add_epi32(res_t3_4x32b, const_val512_4x32b);
                res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t3_4x32b);
                res_t1_4x32b = _mm_srai_epi32(res_t1_4x32b, 10);

                res_8x16b = _mm_packs_epi32(res_t0_4x32b, res_t1_4x32b);
                res_16x8b = _mm_packus_epi16(res_8x16b, res_8x16b);

                src_hpel_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp3));
                src_hpel_8x16b = _mm_add_epi16(src_hpel_8x16b, const_val16_8x16b);
                src_hpel_8x16b = _mm_srai_epi16(src_hpel_8x16b, 5); //shifting right by 5 bits.
                src_hpel_16x8b = _mm_packus_epi16(src_hpel_8x16b, src_hpel_8x16b);

                res_16x8b = _mm_avg_epu8(res_16x8b, src_hpel_16x8b);
                _mm_storel_epi64((__m128i *)(pu1_dst), res_16x8b);

                src_r0r1_8x16b = _mm_unpacklo_epi16(src_r1_8x16b, src_r2_8x16b);
                src_r2r3_8x16b = _mm_unpacklo_epi16(src_r3_8x16b, src_r4_8x16b);
                src_r4r5_8x16b = _mm_unpacklo_epi16(src_r5_8x16b, src_r6_8x16b);

                res_t1_4x32b = _mm_madd_epi16(src_r0r1_8x16b, coeff0_1_8x16b);
                res_t2_4x32b = _mm_madd_epi16(src_r2r3_8x16b, coeff2_3_8x16b);
                res_t3_4x32b = _mm_madd_epi16(src_r4r5_8x16b, coeff4_5_8x16b);

                res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t2_4x32b);
                res_t3_4x32b = _mm_add_epi32(res_t3_4x32b, const_val512_4x32b);
                res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t3_4x32b);
                res_t0_4x32b = _mm_srai_epi32(res_t1_4x32b, 10);

                src_r0r1_8x16b = _mm_unpackhi_epi16(src_r1_8x16b, src_r2_8x16b);
                src_r2r3_8x16b = _mm_unpackhi_epi16(src_r3_8x16b, src_r4_8x16b);
                src_r4r5_8x16b = _mm_unpackhi_epi16(src_r5_8x16b, src_r6_8x16b);

                res_t1_4x32b = _mm_madd_epi16(src_r0r1_8x16b, coeff0_1_8x16b);
                res_t2_4x32b = _mm_madd_epi16(src_r2r3_8x16b, coeff2_3_8x16b);
                res_t3_4x32b = _mm_madd_epi16(src_r4r5_8x16b, coeff4_5_8x16b);

                res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t2_4x32b);
                res_t3_4x32b = _mm_add_epi32(res_t3_4x32b, const_val512_4x32b);
                res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t3_4x32b);
                res_t1_4x32b = _mm_srai_epi32(res_t1_4x32b, 10);

                res_8x16b = _mm_packs_epi32(res_t0_4x32b, res_t1_4x32b);
                res_16x8b = _mm_packus_epi16(res_8x16b, res_8x16b);

                src_hpel_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp3 + 16));
                src_hpel_8x16b = _mm_add_epi16(src_hpel_8x16b, const_val16_8x16b);
                src_hpel_8x16b = _mm_srai_epi16(src_hpel_8x16b, 5); //shifting right by 5 bits.
                src_hpel_16x8b = _mm_packus_epi16(src_hpel_8x16b, src_hpel_8x16b);

                res_16x8b = _mm_avg_epu8(res_16x8b, src_hpel_16x8b);
                _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd), res_16x8b);

                src_r0_8x16b = src_r2_8x16b;
                src_r1_8x16b = src_r3_8x16b;
                src_r2_8x16b = src_r4_8x16b;
                src_r3_8x16b = src_r5_8x16b;
                src_r4_8x16b = src_r6_8x16b;

                ht_temp -= 2;
                pi2_temp3 = pi2_temp3 + (16 << 1);
                pi2_temp2 = pi2_temp2 + (16 << 1);
                pu1_dst = pu1_dst + (dst_strd << 1);
            }
            while(ht_temp > 0);

            /**********************************************************/
            /*     Do second height * 8 block                         */
            /**********************************************************/
            src_r0_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp4));
            src_r1_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp4 + 16));
            src_r2_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp4 + 32));
            src_r3_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp4 + 48));
            src_r4_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp4 + 64));
            pi2_temp4 += 80;

            do
            {
                src_r5_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp4));
                src_r6_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp4 + 16));

                src_r0r1_8x16b = _mm_unpacklo_epi16(src_r0_8x16b, src_r1_8x16b);
                src_r2r3_8x16b = _mm_unpacklo_epi16(src_r2_8x16b, src_r3_8x16b);
                src_r4r5_8x16b = _mm_unpacklo_epi16(src_r4_8x16b, src_r5_8x16b);

                res_t1_4x32b = _mm_madd_epi16(src_r0r1_8x16b, coeff0_1_8x16b);
                res_t2_4x32b = _mm_madd_epi16(src_r2r3_8x16b, coeff2_3_8x16b);
                res_t3_4x32b = _mm_madd_epi16(src_r4r5_8x16b, coeff4_5_8x16b);

                res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t2_4x32b);
                res_t3_4x32b = _mm_add_epi32(res_t3_4x32b, const_val512_4x32b);
                res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t3_4x32b);
                res_t0_4x32b = _mm_srai_epi32(res_t1_4x32b, 10);

                src_r0r1_8x16b = _mm_unpackhi_epi16(src_r0_8x16b, src_r1_8x16b);
                src_r2r3_8x16b = _mm_unpackhi_epi16(src_r2_8x16b, src_r3_8x16b);
                src_r4r5_8x16b = _mm_unpackhi_epi16(src_r4_8x16b, src_r5_8x16b);

                res_t1_4x32b = _mm_madd_epi16(src_r0r1_8x16b, coeff0_1_8x16b);
                res_t2_4x32b = _mm_madd_epi16(src_r2r3_8x16b, coeff2_3_8x16b);
                res_t3_4x32b = _mm_madd_epi16(src_r4r5_8x16b, coeff4_5_8x16b);

                res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t2_4x32b);
                res_t3_4x32b = _mm_add_epi32(res_t3_4x32b, const_val512_4x32b);
                res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t3_4x32b);
                res_t1_4x32b = _mm_srai_epi32(res_t1_4x32b, 10);

                res_8x16b = _mm_packs_epi32(res_t0_4x32b, res_t1_4x32b);
                res_16x8b = _mm_packus_epi16(res_8x16b, res_8x16b);

                src_hpel_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp5));
                src_hpel_8x16b = _mm_add_epi16(src_hpel_8x16b, const_val16_8x16b);
                src_hpel_8x16b = _mm_srai_epi16(src_hpel_8x16b, 5); //shifting right by 5 bits.
                src_hpel_16x8b = _mm_packus_epi16(src_hpel_8x16b, src_hpel_8x16b);

                res_16x8b = _mm_avg_epu8(res_16x8b, src_hpel_16x8b);
                _mm_storel_epi64((__m128i *)(pu1_dst1), res_16x8b);

                src_r0r1_8x16b = _mm_unpacklo_epi16(src_r1_8x16b, src_r2_8x16b);
                src_r2r3_8x16b = _mm_unpacklo_epi16(src_r3_8x16b, src_r4_8x16b);
                src_r4r5_8x16b = _mm_unpacklo_epi16(src_r5_8x16b, src_r6_8x16b);

                res_t1_4x32b = _mm_madd_epi16(src_r0r1_8x16b, coeff0_1_8x16b);
                res_t2_4x32b = _mm_madd_epi16(src_r2r3_8x16b, coeff2_3_8x16b);
                res_t3_4x32b = _mm_madd_epi16(src_r4r5_8x16b, coeff4_5_8x16b);

                res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t2_4x32b);
                res_t3_4x32b = _mm_add_epi32(res_t3_4x32b, const_val512_4x32b);
                res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t3_4x32b);
                res_t0_4x32b = _mm_srai_epi32(res_t1_4x32b, 10);

                src_r0r1_8x16b = _mm_unpackhi_epi16(src_r1_8x16b, src_r2_8x16b);
                src_r2r3_8x16b = _mm_unpackhi_epi16(src_r3_8x16b, src_r4_8x16b);
                src_r4r5_8x16b = _mm_unpackhi_epi16(src_r5_8x16b, src_r6_8x16b);

                res_t1_4x32b = _mm_madd_epi16(src_r0r1_8x16b, coeff0_1_8x16b);
                res_t2_4x32b = _mm_madd_epi16(src_r2r3_8x16b, coeff2_3_8x16b);
                res_t3_4x32b = _mm_madd_epi16(src_r4r5_8x16b, coeff4_5_8x16b);

                res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t2_4x32b);
                res_t3_4x32b = _mm_add_epi32(res_t3_4x32b, const_val512_4x32b);
                res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t3_4x32b);
                res_t1_4x32b = _mm_srai_epi32(res_t1_4x32b, 10);

                res_8x16b = _mm_packs_epi32(res_t0_4x32b, res_t1_4x32b);
                res_16x8b = _mm_packus_epi16(res_8x16b, res_8x16b);

                src_hpel_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp5 + 16));
                src_hpel_8x16b = _mm_add_epi16(src_hpel_8x16b, const_val16_8x16b);
                src_hpel_8x16b = _mm_srai_epi16(src_hpel_8x16b, 5); //shifting right by 5 bits.
                src_hpel_16x8b = _mm_packus_epi16(src_hpel_8x16b, src_hpel_8x16b);

                res_16x8b = _mm_avg_epu8(res_16x8b, src_hpel_16x8b);
                _mm_storel_epi64((__m128i *)(pu1_dst1 + dst_strd), res_16x8b);

                src_r0_8x16b = src_r2_8x16b;
                src_r1_8x16b = src_r3_8x16b;
                src_r2_8x16b = src_r4_8x16b;
                src_r3_8x16b = src_r5_8x16b;
                src_r4_8x16b = src_r6_8x16b;

                ht -= 2;
                pi2_temp5 = pi2_temp5 + (16 << 1);
                pi2_temp4 = pi2_temp4 + (16 << 1);
                pu1_dst1 = pu1_dst1 + (dst_strd << 1);
            }
            while(ht > 0);
        }
    }
}

/*****************************************************************************/
/*                                                                           */
/*  Function Name : ih264_inter_pred_chroma_ssse3                            */
/*                                                                           */
/*  Description   : This function implements a four-tap 2D filter as         */
/*                  mentioned in sec. 8.4.2.2.2 titled "Chroma sample        */
/*                  interpolation process". (ht,wd) can be (2,2), (4,2),     */
/*                  (2,4), (4,4), (8,4), (4,8) or (8,8).                     */
/*                                                                           */
/*  Inputs        : pu1_src  - pointer to source                             */
/*                  pu1_dst  - pointer to destination                        */
/*                  src_strd - stride for source                             */
/*                  dst_strd - stride for destination                        */
/*                  dx       - x position of destination value               */
/*                  dy       - y position of destination value               */
/*                  ht       - height of the block                           */
/*                  wd       - width of the block                            */
/*                                                                           */
/*  Issues        : None                                                     */
/*                                                                           */
/*  Revision History:                                                        */
/*                                                                           */
/*         DD MM YYYY   Author(s)       Changes                              */
/*         13 02 2015   Kaushik         Initial Version                      */
/*                      Senthoor                                             */
/*                                                                           */
/*****************************************************************************/
void ih264_inter_pred_chroma_ssse3(UWORD8 *pu1_src,
                                   UWORD8 *pu1_dst,
                                   WORD32 src_strd,
                                   WORD32 dst_strd,
                                   WORD32 dx,
                                   WORD32 dy,
                                   WORD32 ht,
                                   WORD32 wd)
{
    /* Bilinear weights of the four neighbouring samples:                    */
    /*   A: top-left, B: top-right, C: bottom-left, D: bottom-right.         */
    /* A + B + C + D == 64, hence the "+ 32, >> 6" rounding used below.      */
    const WORD32 A = (8 - dx) * (8 - dy);
    const WORD32 B = dx * (8 - dy);
    const WORD32 C = (8 - dx) * dy;
    const WORD32 D = dx * dy;

    if(wd == 2)
    {
        /* Scalar path (a SIMD variant of this branch was disabled earlier). */
        /* U and V are interleaved, so the horizontal neighbour of a sample  */
        /* is two bytes away. Each pass of the outer loop emits two rows of  */
        /* four bytes (2 U/V pairs).                                         */
        do
        {
            WORD32 row;

            for(row = 0; row < 2; row++)
            {
                UWORD8 *pu1_top = pu1_src;
                UWORD8 *pu1_bot = pu1_src + src_strd;
                WORD32 u0, u1, v0, v1;

                /* All source reads for the row happen before any store. */
                u0 = A * pu1_top[0] + B * pu1_top[2] + C * pu1_bot[0] + D * pu1_bot[2];
                u1 = A * pu1_top[2] + B * pu1_top[4] + C * pu1_bot[2] + D * pu1_bot[4];
                v0 = A * pu1_top[1] + B * pu1_top[3] + C * pu1_bot[1] + D * pu1_bot[3];
                v1 = A * pu1_top[3] + B * pu1_top[5] + C * pu1_bot[3] + D * pu1_bot[5];

                u0 = (u0 + 32) >> 6;
                u1 = (u1 + 32) >> 6;
                v0 = (v0 + 32) >> 6;
                v1 = (v1 + 32) >> 6;

                pu1_dst[0] = CLIP_U8(u0);
                pu1_dst[1] = CLIP_U8(v0);
                pu1_dst[2] = CLIP_U8(u1);
                pu1_dst[3] = CLIP_U8(v1);

                pu1_src += src_strd;
                pu1_dst += dst_strd;
            }

            ht -= 2;
        }
        while(ht > 0);
    }
    else if(wd == 4)
    {
        /* Each 16-bit lane holds a (low byte, high byte) weight pair that   */
        /* _mm_maddubs_epi16 applies to two adjacent source bytes.           */
        const __m128i wt_top_8x16b = _mm_set1_epi16((B << 8) + A);
        const __m128i wt_bot_8x16b = _mm_set1_epi16((D << 8) + C);
        const __m128i round32_8x16b = _mm_set1_epi16(32);
        /* Regroups interleaved bytes u0 v0 u1 v1 ... into                   */
        /* (u0,u1) (v0,v1) (u1,u2) (v1,v2) ... so that every byte pair is a  */
        /* sample together with its right-hand neighbour of the same plane.  */
        const __m128i pair_shuf_16x8b = _mm_setr_epi32(0x03010200, 0x05030402,
                                                       0x07050604, 0x09070806);

        __m128i row_a_16x8b, row_b_16x8b, row_c_16x8b;
        __m128i out_a_8x16b, out_b_8x16b;

        row_a_16x8b = _mm_shuffle_epi8(_mm_loadu_si128((__m128i *)pu1_src),
                                       pair_shuf_16x8b);
        pu1_src += src_strd;

        /* Two output rows (8 bytes each) per pass; rows a/b feed the first, */
        /* rows b/c the second, and row c is carried over as the next row a. */
        do
        {
            row_b_16x8b = _mm_loadu_si128((__m128i *)pu1_src);
            row_c_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + src_strd));

            row_b_16x8b = _mm_shuffle_epi8(row_b_16x8b, pair_shuf_16x8b);
            row_c_16x8b = _mm_shuffle_epi8(row_c_16x8b, pair_shuf_16x8b);

            out_a_8x16b = _mm_add_epi16(_mm_maddubs_epi16(row_a_16x8b, wt_top_8x16b),
                                        _mm_maddubs_epi16(row_b_16x8b, wt_bot_8x16b));
            out_b_8x16b = _mm_add_epi16(_mm_maddubs_epi16(row_b_16x8b, wt_top_8x16b),
                                        _mm_maddubs_epi16(row_c_16x8b, wt_bot_8x16b));

            out_a_8x16b = _mm_srai_epi16(_mm_add_epi16(out_a_8x16b, round32_8x16b), 6);
            out_b_8x16b = _mm_srai_epi16(_mm_add_epi16(out_b_8x16b, round32_8x16b), 6);

            _mm_storel_epi64((__m128i *)pu1_dst,
                             _mm_packus_epi16(out_a_8x16b, out_a_8x16b));
            _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd),
                             _mm_packus_epi16(out_b_8x16b, out_b_8x16b));

            row_a_16x8b = row_c_16x8b;

            pu1_src += src_strd << 1;
            pu1_dst += dst_strd << 1;
            ht -= 2;
        }
        while(ht > 0);
    }
    else /* wd == 8 */
    {
        /* Same weight packing and byte regrouping as the wd == 4 branch.    */
        const __m128i wt_top_8x16b = _mm_set1_epi16((B << 8) + A);
        const __m128i wt_bot_8x16b = _mm_set1_epi16((D << 8) + C);
        const __m128i round32_8x16b = _mm_set1_epi16(32);
        const __m128i pair_shuf_16x8b = _mm_setr_epi32(0x03010200, 0x05030402,
                                                       0x07050604, 0x09070806);

        /* "lo" covers source bytes 0..15, "hi" covers source bytes 8..23.   */
        __m128i prev_lo_16x8b, prev_hi_16x8b;
        __m128i next_lo_16x8b, next_hi_16x8b;
        __m128i acc_lo_8x16b, acc_hi_8x16b;

        prev_lo_16x8b = _mm_loadu_si128((__m128i *)pu1_src);
        prev_hi_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + 8));
        prev_lo_16x8b = _mm_shuffle_epi8(prev_lo_16x8b, pair_shuf_16x8b);
        prev_hi_16x8b = _mm_shuffle_epi8(prev_hi_16x8b, pair_shuf_16x8b);

        pu1_src += src_strd;

        /* Four 16-byte output rows per pass. The row loaded for the bottom  */
        /* taps of one output row is reused as the top taps of the next.     */
        do
        {
            WORD32 row;

            for(row = 0; row < 4; row++)
            {
                next_lo_16x8b = _mm_loadu_si128((__m128i *)pu1_src);
                next_hi_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + 8));
                next_lo_16x8b = _mm_shuffle_epi8(next_lo_16x8b, pair_shuf_16x8b);
                next_hi_16x8b = _mm_shuffle_epi8(next_hi_16x8b, pair_shuf_16x8b);

                acc_lo_8x16b = _mm_add_epi16(_mm_maddubs_epi16(prev_lo_16x8b, wt_top_8x16b),
                                             round32_8x16b);
                acc_hi_8x16b = _mm_add_epi16(_mm_maddubs_epi16(prev_hi_16x8b, wt_top_8x16b),
                                             round32_8x16b);
                acc_lo_8x16b = _mm_add_epi16(acc_lo_8x16b,
                                             _mm_maddubs_epi16(next_lo_16x8b, wt_bot_8x16b));
                acc_hi_8x16b = _mm_add_epi16(acc_hi_8x16b,
                                             _mm_maddubs_epi16(next_hi_16x8b, wt_bot_8x16b));

                acc_lo_8x16b = _mm_srai_epi16(acc_lo_8x16b, 6);
                acc_hi_8x16b = _mm_srai_epi16(acc_hi_8x16b, 6);

                _mm_storeu_si128((__m128i *)pu1_dst,
                                 _mm_packus_epi16(acc_lo_8x16b, acc_hi_8x16b));

                prev_lo_16x8b = next_lo_16x8b;
                prev_hi_16x8b = next_hi_16x8b;

                pu1_src += src_strd;
                pu1_dst += dst_strd;
            }

            ht -= 4;
        }
        while(ht > 0);
    }
}
