/* common/x86/ihevc_inter_pred_filters_sse42_intr.c - platform/external/libhevc - Git at Google */

 /******************************************************************************
 *
 * Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 ******************************************************************************/


 /**
 *******************************************************************************
 * @file
 *  ihevc_inter_pred_filters_sse42_intr.c
 *
 * @brief
 *  Contains function definitions for inter prediction  interpolation filters
 *  coded in x86 intrinsics
 *
 *
 * @author
 *
 *
 * @par List of Functions:
 *  - ihevc_inter_pred_luma_copy_w16out_sse42()
 *  - ihevc_inter_pred_chroma_copy_sse42()
 *  - ihevc_inter_pred_chroma_copy_w16out_sse42()
 *
 * @remarks
 *  None
 *
 *******************************************************************************
 */


 /*****************************************************************************/
 /* File Includes                                                             */
 /*****************************************************************************/
 #include <assert.h>
 #include <string.h>

 #include "ihevc_debug.h"
 #include "ihevc_typedefs.h"
 #include "ihevc_defs.h"
 #include "ihevc_inter_pred.h"
 #include "ihevc_macros.h"
 #include "ihevc_platform_macros.h"
 #include "ihevc_func_selector.h"

 #include <immintrin.h>
 #include <emmintrin.h>
 #include <smmintrin.h>
 #include <tmmintrin.h>

 /*****************************************************************************/
 /* Function Definitions                                                      */
 /*****************************************************************************/


 /**
 *******************************************************************************
 *
 * @brief
 *       Inter prediction luma copy with 16-bit output
 *
 * @par Description:
 *    Reads a block of width 'wd' and height 'ht' of 8-bit samples from
 *    'pu1_src', zero-extends every sample to 16 bits, left-shifts it by
 *    SHIFT_14_MINUS_BIT_DEPTH and writes the result to 'pi2_dst'. Four rows
 *    are processed per iteration, in groups of 8 samples when 'wd' is a
 *    multiple of 8 and in groups of 4 samples otherwise.
 *
 * @param[in] pu1_src
 *  UWORD8 pointer to the source
 *
 * @param[out] pi2_dst
 *  WORD16 pointer to the destination
 *
 * @param[in] src_strd
 *  integer source stride (in UWORD8 elements)
 *
 * @param[in] dst_strd
 *  integer destination stride (in WORD16 elements)
 *
 * @param[in] pi1_coeff
 *  WORD8 pointer to the filter coefficients (not used by this function)
 *
 * @param[in] ht
 *  integer height of the array; asserted to be a multiple of 4
 *
 * @param[in] wd
 *  integer width of the array; asserted to be a multiple of 4
 *
 * @returns
 *  None
 *
 * @remarks
 *  The 4-sample path loads exactly 4 source bytes per row, so no bytes to the
 *  right of the block are ever read.
 *
 *******************************************************************************
 */

 void ihevc_inter_pred_luma_copy_w16out_sse42(UWORD8 *pu1_src,
                                              WORD16 *pi2_dst,
                                              WORD32 src_strd,
                                              WORD32 dst_strd,
                                              WORD8 *pi1_coeff,
                                              WORD32 ht,
                                              WORD32 wd)
 {
     WORD32 row, col;
     __m128i  src0_16x8b, src1_16x8b, src2_16x8b, src3_16x8b;
     UNUSED(pi1_coeff);
     ASSERT(wd % 4 == 0); /* checking assumption*/
     ASSERT(ht % 4 == 0); /* checking assumption*/

     if(0 == (wd & 7)) /* multiple of 8 case */
     {
         for(row = 0; row < ht; row += 4)
         {
             for(col = 0; col < wd; col += 8)
             {
                 /* load 8 source bytes from each of the 4 rows */
                 src0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src));                /* row =0 */
                 src1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 1 * src_strd)); /* row =1 */
                 src2_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 2 * src_strd)); /* row =2 */
                 src3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 3 * src_strd)); /* row =3 */

                 /* zero-extend the 8 low bytes to 8 16-bit lanes */
                 src0_16x8b = _mm_cvtepu8_epi16(src0_16x8b);
                 src1_16x8b = _mm_cvtepu8_epi16(src1_16x8b);
                 src2_16x8b = _mm_cvtepu8_epi16(src2_16x8b);
                 src3_16x8b = _mm_cvtepu8_epi16(src3_16x8b);

                 /* pu1_src[col] << SHIFT_14_MINUS_BIT_DEPTH */
                 src0_16x8b = _mm_slli_epi16(src0_16x8b,  SHIFT_14_MINUS_BIT_DEPTH);
                 src1_16x8b = _mm_slli_epi16(src1_16x8b,  SHIFT_14_MINUS_BIT_DEPTH);
                 src2_16x8b = _mm_slli_epi16(src2_16x8b,  SHIFT_14_MINUS_BIT_DEPTH);
                 src3_16x8b = _mm_slli_epi16(src3_16x8b,  SHIFT_14_MINUS_BIT_DEPTH);

                 /* store 8 16-bit output values per row */
                 _mm_storeu_si128((__m128i *)(pi2_dst), src0_16x8b);                 /* row =0 */
                 _mm_storeu_si128((__m128i *)(pi2_dst + 1 * dst_strd), src1_16x8b);  /* row =1 */
                 _mm_storeu_si128((__m128i *)(pi2_dst + 2 * dst_strd), src2_16x8b);  /* row =2 */
                 _mm_storeu_si128((__m128i *)(pi2_dst + 3 * dst_strd), src3_16x8b);  /* row =3 */

                 pu1_src += 8; /* pointer update */
                 pi2_dst += 8; /* pointer update */
             } /* inner for loop ends here(8-output values in single iteration) */

             /* rewind to column 0 and step down 4 rows */
             pu1_src += 4 * src_strd - wd;
             pi2_dst += 4 * dst_strd - wd;
         }
     }
     else /* wd = multiple of 4 case */
     {
         WORD32 pix0, pix1, pix2, pix3;

         for(row = 0; row < ht; row += 4)
         {
             for(col = 0; col < wd; col += 4)
             {
                 /* load exactly 4 source bytes per row; an 8-byte load here */
                 /* would read 4 bytes past the right edge of the block      */
                 memcpy(&pix0, pu1_src, sizeof(pix0));                /* row =0 */
                 memcpy(&pix1, pu1_src + 1 * src_strd, sizeof(pix1)); /* row =1 */
                 memcpy(&pix2, pu1_src + 2 * src_strd, sizeof(pix2)); /* row =2 */
                 memcpy(&pix3, pu1_src + 3 * src_strd, sizeof(pix3)); /* row =3 */

                 /* zero-extend: the 4 low 16-bit lanes hold the samples */
                 src0_16x8b = _mm_cvtepu8_epi16(_mm_cvtsi32_si128(pix0));
                 src1_16x8b = _mm_cvtepu8_epi16(_mm_cvtsi32_si128(pix1));
                 src2_16x8b = _mm_cvtepu8_epi16(_mm_cvtsi32_si128(pix2));
                 src3_16x8b = _mm_cvtepu8_epi16(_mm_cvtsi32_si128(pix3));

                 /* pu1_src[col] << SHIFT_14_MINUS_BIT_DEPTH */
                 src0_16x8b = _mm_slli_epi16(src0_16x8b,  SHIFT_14_MINUS_BIT_DEPTH);
                 src1_16x8b = _mm_slli_epi16(src1_16x8b,  SHIFT_14_MINUS_BIT_DEPTH);
                 src2_16x8b = _mm_slli_epi16(src2_16x8b,  SHIFT_14_MINUS_BIT_DEPTH);
                 src3_16x8b = _mm_slli_epi16(src3_16x8b,  SHIFT_14_MINUS_BIT_DEPTH);

                 /* store 4 16-bit output values (low 64 bits) per row */
                 _mm_storel_epi64((__m128i *)(pi2_dst), src0_16x8b);                 /* row =0 */
                 _mm_storel_epi64((__m128i *)(pi2_dst + 1 * dst_strd), src1_16x8b);  /* row =1 */
                 _mm_storel_epi64((__m128i *)(pi2_dst + 2 * dst_strd), src2_16x8b);  /* row =2 */
                 _mm_storel_epi64((__m128i *)(pi2_dst + 3 * dst_strd), src3_16x8b);  /* row =3 */

                 pu1_src += 4; /* pointer update */
                 pi2_dst += 4; /* pointer update */
             } /* inner for loop ends here(4-output values in single iteration) */

             /* rewind to column 0 and step down 4 rows */
             pu1_src += 4 * src_strd - wd;
             pi2_dst += 4 * dst_strd - wd;
         }
     }
 }

 /**
 *******************************************************************************
 *
 * @brief
 *      Chroma inter prediction copy
 *
 * @par Description:
 *    Copies 'ht' rows of 2 * 'wd' bytes each from 'pu1_src' to 'pu1_dst'
 *    without modifying the data. Rows are handled four at a time when 'ht' is
 *    a multiple of 4 and two at a time otherwise; each row is moved in chunks
 *    of 16, 8 or 4 bytes depending on the alignment of 2 * 'wd'.
 *    NOTE(review): the width doubling presumably reflects interleaved chroma
 *    sample pairs - confirm against the caller.
 *
 * @param[in] pu1_src
 *  UWORD8 pointer to the source
 *
 * @param[out] pu1_dst
 *  UWORD8 pointer to the destination
 *
 * @param[in] src_strd
 *  integer source stride (in bytes)
 *
 * @param[in] dst_strd
 *  integer destination stride (in bytes)
 *
 * @param[in] pi1_coeff
 *  WORD8 pointer to the filter coefficients (not used by this function)
 *
 * @param[in] ht
 *  integer height of the array; asserted to be a multiple of 2
 *
 * @param[in] wd
 *  integer width of the array; asserted to be a multiple of 2
 *
 * @returns
 *  None
 *
 * @remarks
 *  The 4-byte path moves data with memcpy() so that it neither reads past the
 *  right edge of the block nor stores through a type-punned WORD32 pointer.
 *
 *******************************************************************************
 */

 void ihevc_inter_pred_chroma_copy_sse42(UWORD8 *pu1_src,
                                         UWORD8 *pu1_dst,
                                         WORD32 src_strd,
                                         WORD32 dst_strd,
                                         WORD8 *pi1_coeff,
                                         WORD32 ht,
                                         WORD32 wd)
 {
     WORD32 row, col, wdx2;
     __m128i  src0_16x8b, src1_16x8b, src2_16x8b, src3_16x8b;

     ASSERT(wd % 2 == 0); /* checking assumption*/
     ASSERT(ht % 2 == 0); /* checking assumption*/
     UNUSED(pi1_coeff);
     wdx2 = wd * 2; /* number of bytes copied per row */

     if(0 == (ht & 3)) /* ht multiple of 4 */
     {
         if(0 == (wdx2 & 15)) /* wdx2 multiple of 16 case */
         {
             for(row = 0; row < ht; row += 4)
             {
                 for(col = 0; col < wdx2; col += 16)
                 {
                     /* load 16 bytes from each of the 4 rows */
                     src0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src));                /* row =0 */
                     src1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + 1 * src_strd)); /* row =1 */
                     src2_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + 2 * src_strd)); /* row =2 */
                     src3_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + 3 * src_strd)); /* row =3 */

                     /* store 16 bytes to each of the 4 rows */
                     _mm_storeu_si128((__m128i *)(pu1_dst), src0_16x8b);                 /* row =0 */
                     _mm_storeu_si128((__m128i *)(pu1_dst + 1 * dst_strd), src1_16x8b);  /* row =1 */
                     _mm_storeu_si128((__m128i *)(pu1_dst + 2 * dst_strd), src2_16x8b);  /* row =2 */
                     _mm_storeu_si128((__m128i *)(pu1_dst + 3 * dst_strd), src3_16x8b);  /* row =3 */

                     pu1_src += 16; /* pointer update */
                     pu1_dst += 16; /* pointer update */
                 } /* inner for loop ends here(16-output values in single iteration) */

                 /* rewind to column 0 and step down 4 rows */
                 pu1_src += 4 * src_strd - wdx2;
                 pu1_dst += 4 * dst_strd - wdx2;
             }

         }
         else if(0 == (wdx2 & 7)) /* multiple of 8 case */
         {
             for(row = 0; row < ht; row += 4)
             {
                 for(col = 0; col < wdx2; col += 8)
                 {
                     /* load 8 bytes from each of the 4 rows */
                     src0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src));                /* row =0 */
                     src1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 1 * src_strd)); /* row =1 */
                     src2_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 2 * src_strd)); /* row =2 */
                     src3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 3 * src_strd)); /* row =3 */

                     /* store 8 bytes to each of the 4 rows */
                     _mm_storel_epi64((__m128i *)(pu1_dst), src0_16x8b);                 /* row =0 */
                     _mm_storel_epi64((__m128i *)(pu1_dst + 1 * dst_strd), src1_16x8b);  /* row =1 */
                     _mm_storel_epi64((__m128i *)(pu1_dst + 2 * dst_strd), src2_16x8b);  /* row =2 */
                     _mm_storel_epi64((__m128i *)(pu1_dst + 3 * dst_strd), src3_16x8b);  /* row =3 */

                     pu1_src += 8; /* pointer update */
                     pu1_dst += 8; /* pointer update */
                 } /*  inner for loop ends here(8-output values in single iteration) */

                 /* rewind to column 0 and step down 4 rows */
                 pu1_src += 4 * src_strd - wdx2;
                 pu1_dst += 4 * dst_strd - wdx2;
             }
         }
         else /* wdx2 = multiple of 4 case */
         {
             WORD32 dst0, dst1, dst2, dst3;
             for(row = 0; row < ht; row += 4)
             {
                 for(col = 0; col < wdx2; col += 4)
                 {
                     /* read exactly 4 bytes per row; all rows are read */
                     /* before any row is written                       */
                     memcpy(&dst0, pu1_src, sizeof(dst0));                /* row =0 */
                     memcpy(&dst1, pu1_src + 1 * src_strd, sizeof(dst1)); /* row =1 */
                     memcpy(&dst2, pu1_src + 2 * src_strd, sizeof(dst2)); /* row =2 */
                     memcpy(&dst3, pu1_src + 3 * src_strd, sizeof(dst3)); /* row =3 */

                     /* store 4 bytes per row; memcpy avoids a type-punned, */
                     /* possibly misaligned WORD32 store                    */
                     memcpy(pu1_dst, &dst0, sizeof(dst0));                /* row =0 */
                     memcpy(pu1_dst + 1 * dst_strd, &dst1, sizeof(dst1)); /* row =1 */
                     memcpy(pu1_dst + 2 * dst_strd, &dst2, sizeof(dst2)); /* row =2 */
                     memcpy(pu1_dst + 3 * dst_strd, &dst3, sizeof(dst3)); /* row =3 */

                     pu1_src += 4; /* pointer update */
                     pu1_dst += 4; /* pointer update */
                 } /*  inner for loop ends here(4- output values in single iteration) */

                 /* rewind to column 0 and step down 4 rows */
                 pu1_src += 4 * src_strd - wdx2;
                 pu1_dst += 4 * dst_strd - wdx2;
             }
         }
     }
     else /* ht multiple of 2 */
     {
         if(0 == (wdx2 & 15)) /* wdx2 multiple of 16 case */
         {
             for(row = 0; row < ht; row += 2)
             {
                 for(col = 0; col < wdx2; col += 16)
                 {
                     /* load 16 bytes from each of the 2 rows */
                     src0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src));                /* row =0 */
                     src1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + 1 * src_strd)); /* row =1 */

                     /* store 16 bytes to each of the 2 rows */
                     _mm_storeu_si128((__m128i *)(pu1_dst), src0_16x8b);                 /* row =0 */
                     _mm_storeu_si128((__m128i *)(pu1_dst + 1 * dst_strd), src1_16x8b);  /* row =1 */

                     pu1_src += 16; /* pointer update */
                     pu1_dst += 16; /* pointer update */
                 } /* inner for loop ends here(16-output values in single iteration) */

                 /* rewind to column 0 and step down 2 rows */
                 pu1_src += 2 * src_strd - wdx2;
                 pu1_dst += 2 * dst_strd - wdx2;
             }

         }
         else if(0 == (wdx2 & 7)) /* multiple of 8 case */
         {
             for(row = 0; row < ht; row += 2)
             {
                 for(col = 0; col < wdx2; col += 8)
                 {
                     /* load 8 bytes from each of the 2 rows */
                     src0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src));                /* row =0 */
                     src1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 1 * src_strd)); /* row =1 */

                     /* store 8 bytes to each of the 2 rows */
                     _mm_storel_epi64((__m128i *)(pu1_dst), src0_16x8b);                 /* row =0 */
                     _mm_storel_epi64((__m128i *)(pu1_dst + 1 * dst_strd), src1_16x8b);  /* row =1 */

                     pu1_src += 8; /* pointer update */
                     pu1_dst += 8; /* pointer update */
                 } /*  inner for loop ends here(8-output values in single iteration) */

                 /* rewind to column 0 and step down 2 rows */
                 pu1_src += 2 * src_strd - wdx2;
                 pu1_dst += 2 * dst_strd - wdx2;
             }
         }
         else /* wdx2 = multiple of 4 case */
         {
             WORD32 dst0, dst1;
             for(row = 0; row < ht; row += 2)
             {
                 for(col = 0; col < wdx2; col += 4)
                 {
                     /* read exactly 4 bytes per row; both rows are read */
                     /* before either row is written                     */
                     memcpy(&dst0, pu1_src, sizeof(dst0));                /* row =0 */
                     memcpy(&dst1, pu1_src + 1 * src_strd, sizeof(dst1)); /* row =1 */

                     /* store 4 bytes per row; memcpy avoids a type-punned, */
                     /* possibly misaligned WORD32 store                    */
                     memcpy(pu1_dst, &dst0, sizeof(dst0));                /* row =0 */
                     memcpy(pu1_dst + 1 * dst_strd, &dst1, sizeof(dst1)); /* row =1 */

                     pu1_src += 4; /* pointer update */
                     pu1_dst += 4; /* pointer update */
                 } /*  inner for loop ends here(4- output values in single iteration) */

                 /* rewind to column 0 and step down 2 rows */
                 pu1_src += 2 * src_strd - wdx2;
                 pu1_dst += 2 * dst_strd - wdx2;
             }
         }
     }
 }

 /**
 *******************************************************************************
 *
 * @brief
 *       Chroma inter prediction copy with 16-bit output
 *
 * @par Description:
 *    Reads 'ht' rows of 2 * 'wd' 8-bit samples from 'pu1_src', zero-extends
 *    every sample to 16 bits, left-shifts it by SHIFT_14_MINUS_BIT_DEPTH and
 *    writes the result to 'pi2_dst'. Rows are handled four at a time when
 *    'ht' is a multiple of 4 and two at a time otherwise; each row is
 *    processed in groups of 8 samples when 2 * 'wd' is a multiple of 8 and
 *    in groups of 4 samples otherwise.
 *    NOTE(review): the width doubling presumably reflects interleaved chroma
 *    sample pairs - confirm against the caller.
 *
 * @param[in] pu1_src
 *  UWORD8 pointer to the source
 *
 * @param[out] pi2_dst
 *  WORD16 pointer to the destination
 *
 * @param[in] src_strd
 *  integer source stride (in UWORD8 elements)
 *
 * @param[in] dst_strd
 *  integer destination stride (in WORD16 elements)
 *
 * @param[in] pi1_coeff
 *  WORD8 pointer to the filter coefficients (not used by this function)
 *
 * @param[in] ht
 *  integer height of the array; asserted to be a multiple of 2
 *
 * @param[in] wd
 *  integer width of the array; asserted to be a multiple of 2
 *
 * @returns
 *  None
 *
 * @remarks
 *  The 4-sample paths load exactly 4 source bytes per row, so no bytes to the
 *  right of the block are ever read.
 *
 *******************************************************************************
 */

 void ihevc_inter_pred_chroma_copy_w16out_sse42(UWORD8 *pu1_src,
                                                WORD16 *pi2_dst,
                                                WORD32 src_strd,
                                                WORD32 dst_strd,
                                                WORD8 *pi1_coeff,
                                                WORD32 ht,
                                                WORD32 wd)
 {
     WORD32 row, col, wdx2;
     __m128i  src0_16x8b, src1_16x8b, src2_16x8b, src3_16x8b;

     ASSERT(wd % 2 == 0); /* checking assumption*/
     ASSERT(ht % 2 == 0); /* checking assumption*/
     UNUSED(pi1_coeff);
     wdx2 = wd * 2; /* number of samples processed per row */

     if(0 == (ht & 3)) /* multiple of 4 case */
     {
         if(0 == (wdx2 & 7)) /* multiple of 8 case */
         {
             for(row = 0; row < ht; row += 4)
             {
                 for(col = 0; col < wdx2; col += 8)
                 {
                     /* load 8 source bytes from each of the 4 rows */
                     src0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src));                /* row =0 */
                     src1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 1 * src_strd)); /* row =1 */
                     src2_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 2 * src_strd)); /* row =2 */
                     src3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 3 * src_strd)); /* row =3 */

                     /* zero-extend the 8 low bytes to 8 16-bit lanes */
                     src0_16x8b = _mm_cvtepu8_epi16(src0_16x8b);
                     src1_16x8b = _mm_cvtepu8_epi16(src1_16x8b);
                     src2_16x8b = _mm_cvtepu8_epi16(src2_16x8b);
                     src3_16x8b = _mm_cvtepu8_epi16(src3_16x8b);

                     /* pu1_src[col] << SHIFT_14_MINUS_BIT_DEPTH */
                     src0_16x8b = _mm_slli_epi16(src0_16x8b,  SHIFT_14_MINUS_BIT_DEPTH);
                     src1_16x8b = _mm_slli_epi16(src1_16x8b,  SHIFT_14_MINUS_BIT_DEPTH);
                     src2_16x8b = _mm_slli_epi16(src2_16x8b,  SHIFT_14_MINUS_BIT_DEPTH);
                     src3_16x8b = _mm_slli_epi16(src3_16x8b,  SHIFT_14_MINUS_BIT_DEPTH);

                     /* store 8 16-bit output values per row */
                     _mm_storeu_si128((__m128i *)(pi2_dst), src0_16x8b);                 /* row =0 */
                     _mm_storeu_si128((__m128i *)(pi2_dst + 1 * dst_strd), src1_16x8b);  /* row =1 */
                     _mm_storeu_si128((__m128i *)(pi2_dst + 2 * dst_strd), src2_16x8b);  /* row =2 */
                     _mm_storeu_si128((__m128i *)(pi2_dst + 3 * dst_strd), src3_16x8b);  /* row =3 */

                     pu1_src += 8; /* pointer update */
                     pi2_dst += 8; /* pointer update */
                 } /* inner for loop ends here(8-output values in single iteration) */

                 /* rewind to column 0 and step down 4 rows */
                 pu1_src += 4 * src_strd - wdx2;
                 pi2_dst += 4 * dst_strd - wdx2;
             }
         }
         else /* wdx2 = multiple of 4 case */
         {
             WORD32 pix0, pix1, pix2, pix3;

             for(row = 0; row < ht; row += 4)
             {
                 for(col = 0; col < wdx2; col += 4)
                 {
                     /* load exactly 4 source bytes per row; an 8-byte load */
                     /* here would read 4 bytes past the right edge         */
                     memcpy(&pix0, pu1_src, sizeof(pix0));                /* row =0 */
                     memcpy(&pix1, pu1_src + 1 * src_strd, sizeof(pix1)); /* row =1 */
                     memcpy(&pix2, pu1_src + 2 * src_strd, sizeof(pix2)); /* row =2 */
                     memcpy(&pix3, pu1_src + 3 * src_strd, sizeof(pix3)); /* row =3 */

                     /* zero-extend: the 4 low 16-bit lanes hold the samples */
                     src0_16x8b = _mm_cvtepu8_epi16(_mm_cvtsi32_si128(pix0));
                     src1_16x8b = _mm_cvtepu8_epi16(_mm_cvtsi32_si128(pix1));
                     src2_16x8b = _mm_cvtepu8_epi16(_mm_cvtsi32_si128(pix2));
                     src3_16x8b = _mm_cvtepu8_epi16(_mm_cvtsi32_si128(pix3));

                     /* pu1_src[col] << SHIFT_14_MINUS_BIT_DEPTH */
                     src0_16x8b = _mm_slli_epi16(src0_16x8b,  SHIFT_14_MINUS_BIT_DEPTH);
                     src1_16x8b = _mm_slli_epi16(src1_16x8b,  SHIFT_14_MINUS_BIT_DEPTH);
                     src2_16x8b = _mm_slli_epi16(src2_16x8b,  SHIFT_14_MINUS_BIT_DEPTH);
                     src3_16x8b = _mm_slli_epi16(src3_16x8b,  SHIFT_14_MINUS_BIT_DEPTH);

                     /* store 4 16-bit output values (low 64 bits) per row */
                     _mm_storel_epi64((__m128i *)(pi2_dst), src0_16x8b);                 /* row =0 */
                     _mm_storel_epi64((__m128i *)(pi2_dst + 1 * dst_strd), src1_16x8b);  /* row =1 */
                     _mm_storel_epi64((__m128i *)(pi2_dst + 2 * dst_strd), src2_16x8b);  /* row =2 */
                     _mm_storel_epi64((__m128i *)(pi2_dst + 3 * dst_strd), src3_16x8b);  /* row =3 */

                     pu1_src += 4; /* pointer update */
                     pi2_dst += 4; /* pointer update */
                 } /* inner for loop ends here(4-output values in single iteration) */

                 /* rewind to column 0 and step down 4 rows */
                 pu1_src += 4 * src_strd - wdx2;
                 pi2_dst += 4 * dst_strd - wdx2;
             }
         }
     }
     else  /* ht multiple of 2 case */
     {
         if(0 == (wdx2 & 7)) /* multiple of 8 case */
         {
             for(row = 0; row < ht; row += 2)
             {
                 for(col = 0; col < wdx2; col += 8)
                 {
                     /* load 8 source bytes from each of the 2 rows */
                     src0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src));                /* row =0 */
                     src1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 1 * src_strd)); /* row =1 */

                     /* zero-extend the 8 low bytes to 8 16-bit lanes */
                     src0_16x8b = _mm_cvtepu8_epi16(src0_16x8b);
                     src1_16x8b = _mm_cvtepu8_epi16(src1_16x8b);

                     /* pu1_src[col] << SHIFT_14_MINUS_BIT_DEPTH */
                     src0_16x8b = _mm_slli_epi16(src0_16x8b,  SHIFT_14_MINUS_BIT_DEPTH);
                     src1_16x8b = _mm_slli_epi16(src1_16x8b,  SHIFT_14_MINUS_BIT_DEPTH);

                     /* store 8 16-bit output values per row */
                     _mm_storeu_si128((__m128i *)(pi2_dst), src0_16x8b);                 /* row =0 */
                     _mm_storeu_si128((__m128i *)(pi2_dst + 1 * dst_strd), src1_16x8b);  /* row =1 */

                     pu1_src += 8; /* pointer update */
                     pi2_dst += 8; /* pointer update */
                 } /* inner for loop ends here(8-output values in single iteration) */

                 /* rewind to column 0 and step down 2 rows */
                 pu1_src += 2 * src_strd - wdx2;
                 pi2_dst += 2 * dst_strd - wdx2;
             }
         }
         else /* wdx2 = multiple of 4 case */
         {
             WORD32 pix0, pix1;

             for(row = 0; row < ht; row += 2)
             {
                 for(col = 0; col < wdx2; col += 4)
                 {
                     /* load exactly 4 source bytes per row; an 8-byte load */
                     /* here would read 4 bytes past the right edge         */
                     memcpy(&pix0, pu1_src, sizeof(pix0));                /* row =0 */
                     memcpy(&pix1, pu1_src + 1 * src_strd, sizeof(pix1)); /* row =1 */

                     /* zero-extend: the 4 low 16-bit lanes hold the samples */
                     src0_16x8b = _mm_cvtepu8_epi16(_mm_cvtsi32_si128(pix0));
                     src1_16x8b = _mm_cvtepu8_epi16(_mm_cvtsi32_si128(pix1));

                     /* pu1_src[col] << SHIFT_14_MINUS_BIT_DEPTH */
                     src0_16x8b = _mm_slli_epi16(src0_16x8b,  SHIFT_14_MINUS_BIT_DEPTH);
                     src1_16x8b = _mm_slli_epi16(src1_16x8b,  SHIFT_14_MINUS_BIT_DEPTH);

                     /* store 4 16-bit output values (low 64 bits) per row */
                     _mm_storel_epi64((__m128i *)(pi2_dst), src0_16x8b);                 /* row =0 */
                     _mm_storel_epi64((__m128i *)(pi2_dst + 1 * dst_strd), src1_16x8b);  /* row =1 */

                     pu1_src += 4; /* pointer update */
                     pi2_dst += 4; /* pointer update */
                 } /* inner for loop ends here(4-output values in single iteration) */

                 /* rewind to column 0 and step down 2 rows */
                 pu1_src += 2 * src_strd - wdx2;
                 pi2_dst += 2 * dst_strd - wdx2;
             }
         }
     }
 }
	/******************************************************************************
	*
	* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
	*
	* Licensed under the Apache License, Version 2.0 (the "License");
	* you may not use this file except in compliance with the License.
	* You may obtain a copy of the License at:
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*
	******************************************************************************/


	/**
	*******************************************************************************
	* @file
	* ihevc_inter_pred_filters_sse42_intr.c
	*
	* @brief
	* Contains function definitions for inter prediction interpolation filters
	* coded in x86 intrinsics
	*
	*
	* @author
	*
	*
	* @par List of Functions:
	* - ihevc_inter_pred_luma_copy_w16out_sse42()
	* - ihevc_inter_pred_chroma_copy_sse42()
	* - ihevc_inter_pred_chroma_copy_w16out_sse42()
	*
	* @remarks
	* None
	*
	*******************************************************************************
	*/


	/*****************************************************************************/
	/* File Includes */
	/*****************************************************************************/
	#include <assert.h>

	#include "ihevc_debug.h"
	#include "ihevc_typedefs.h"
	#include "ihevc_defs.h"
	#include "ihevc_inter_pred.h"
	#include "ihevc_macros.h"
	#include "ihevc_platform_macros.h"
	#include "ihevc_func_selector.h"

	#include <immintrin.h>
	#include <emmintrin.h>
	#include <smmintrin.h>
	#include <tmmintrin.h>

	/*****************************************************************************/
	/* Function Definitions */
	/*****************************************************************************/



	/**
	*******************************************************************************
	*
	* @brief
	* Inter prediction luma copy with 16-bit output
	*
	* @par Description:
	* Reads a block of width 'wd' and height 'ht' of 8-bit samples from
	* 'pu1_src', zero-extends every sample to 16 bits, left-shifts it by
	* SHIFT_14_MINUS_BIT_DEPTH and writes the result to 'pi2_dst'. Four rows
	* are processed per iteration, in groups of 8 samples when 'wd' is a
	* multiple of 8 and in groups of 4 samples otherwise.
	*
	* @param[in] pu1_src
	* UWORD8 pointer to the source
	*
	* @param[out] pi2_dst
	* WORD16 pointer to the destination
	*
	* @param[in] src_strd
	* integer source stride (in UWORD8 elements)
	*
	* @param[in] dst_strd
	* integer destination stride (in WORD16 elements)
	*
	* @param[in] pi1_coeff
	* WORD8 pointer to the filter coefficients (not used by this function)
	*
	* @param[in] ht
	* integer height of the array; asserted to be a multiple of 4
	*
	* @param[in] wd
	* integer width of the array; asserted to be a multiple of 4
	*
	* @returns
	* None
	*
	* @remarks
	* The 4-sample path loads exactly 4 source bytes per row, so no bytes to the
	* right of the block are ever read.
	* NOTE(review): a function with this name is already defined earlier in
	* this file; one of the two definitions should be removed.
	*
	*******************************************************************************
	*/

	void ihevc_inter_pred_luma_copy_w16out_sse42(UWORD8 *pu1_src,
	                                             WORD16 *pi2_dst,
	                                             WORD32 src_strd,
	                                             WORD32 dst_strd,
	                                             WORD8 *pi1_coeff,
	                                             WORD32 ht,
	                                             WORD32 wd)
	{
	    WORD32 row, col;
	    __m128i src0_16x8b, src1_16x8b, src2_16x8b, src3_16x8b;
	    UNUSED(pi1_coeff);
	    ASSERT(wd % 4 == 0); /* checking assumption*/
	    ASSERT(ht % 4 == 0); /* checking assumption*/

	    if(0 == (wd & 7)) /* multiple of 8 case */
	    {
	        for(row = 0; row < ht; row += 4)
	        {
	            for(col = 0; col < wd; col += 8)
	            {
	                /* load 8 source bytes from each of the 4 rows */
	                src0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src));                /* row =0 */
	                src1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 1 * src_strd)); /* row =1 */
	                src2_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 2 * src_strd)); /* row =2 */
	                src3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 3 * src_strd)); /* row =3 */

	                /* zero-extend the 8 low bytes to 8 16-bit lanes */
	                src0_16x8b = _mm_cvtepu8_epi16(src0_16x8b);
	                src1_16x8b = _mm_cvtepu8_epi16(src1_16x8b);
	                src2_16x8b = _mm_cvtepu8_epi16(src2_16x8b);
	                src3_16x8b = _mm_cvtepu8_epi16(src3_16x8b);

	                /* pu1_src[col] << SHIFT_14_MINUS_BIT_DEPTH */
	                src0_16x8b = _mm_slli_epi16(src0_16x8b, SHIFT_14_MINUS_BIT_DEPTH);
	                src1_16x8b = _mm_slli_epi16(src1_16x8b, SHIFT_14_MINUS_BIT_DEPTH);
	                src2_16x8b = _mm_slli_epi16(src2_16x8b, SHIFT_14_MINUS_BIT_DEPTH);
	                src3_16x8b = _mm_slli_epi16(src3_16x8b, SHIFT_14_MINUS_BIT_DEPTH);

	                /* store 8 16-bit output values per row */
	                _mm_storeu_si128((__m128i *)(pi2_dst), src0_16x8b);                /* row =0 */
	                _mm_storeu_si128((__m128i *)(pi2_dst + 1 * dst_strd), src1_16x8b); /* row =1 */
	                _mm_storeu_si128((__m128i *)(pi2_dst + 2 * dst_strd), src2_16x8b); /* row =2 */
	                _mm_storeu_si128((__m128i *)(pi2_dst + 3 * dst_strd), src3_16x8b); /* row =3 */

	                pu1_src += 8; /* pointer update */
	                pi2_dst += 8; /* pointer update */
	            } /* inner for loop ends here(8-output values in single iteration) */

	            /* rewind to column 0 and step down 4 rows */
	            pu1_src += 4 * src_strd - wd;
	            pi2_dst += 4 * dst_strd - wd;
	        }
	    }
	    else /* wd = multiple of 4 case */
	    {
	        WORD32 pix0, pix1, pix2, pix3;

	        for(row = 0; row < ht; row += 4)
	        {
	            for(col = 0; col < wd; col += 4)
	            {
	                /* load exactly 4 source bytes per row; an 8-byte load here */
	                /* would read 4 bytes past the right edge of the block      */
	                memcpy(&pix0, pu1_src, sizeof(pix0));                /* row =0 */
	                memcpy(&pix1, pu1_src + 1 * src_strd, sizeof(pix1)); /* row =1 */
	                memcpy(&pix2, pu1_src + 2 * src_strd, sizeof(pix2)); /* row =2 */
	                memcpy(&pix3, pu1_src + 3 * src_strd, sizeof(pix3)); /* row =3 */

	                /* zero-extend: the 4 low 16-bit lanes hold the samples */
	                src0_16x8b = _mm_cvtepu8_epi16(_mm_cvtsi32_si128(pix0));
	                src1_16x8b = _mm_cvtepu8_epi16(_mm_cvtsi32_si128(pix1));
	                src2_16x8b = _mm_cvtepu8_epi16(_mm_cvtsi32_si128(pix2));
	                src3_16x8b = _mm_cvtepu8_epi16(_mm_cvtsi32_si128(pix3));

	                /* pu1_src[col] << SHIFT_14_MINUS_BIT_DEPTH */
	                src0_16x8b = _mm_slli_epi16(src0_16x8b, SHIFT_14_MINUS_BIT_DEPTH);
	                src1_16x8b = _mm_slli_epi16(src1_16x8b, SHIFT_14_MINUS_BIT_DEPTH);
	                src2_16x8b = _mm_slli_epi16(src2_16x8b, SHIFT_14_MINUS_BIT_DEPTH);
	                src3_16x8b = _mm_slli_epi16(src3_16x8b, SHIFT_14_MINUS_BIT_DEPTH);

	                /* store 4 16-bit output values (low 64 bits) per row */
	                _mm_storel_epi64((__m128i *)(pi2_dst), src0_16x8b);                /* row =0 */
	                _mm_storel_epi64((__m128i *)(pi2_dst + 1 * dst_strd), src1_16x8b); /* row =1 */
	                _mm_storel_epi64((__m128i *)(pi2_dst + 2 * dst_strd), src2_16x8b); /* row =2 */
	                _mm_storel_epi64((__m128i *)(pi2_dst + 3 * dst_strd), src3_16x8b); /* row =3 */

	                pu1_src += 4; /* pointer update */
	                pi2_dst += 4; /* pointer update */
	            } /* inner for loop ends here(4-output values in single iteration) */

	            /* rewind to column 0 and step down 4 rows */
	            pu1_src += 4 * src_strd - wd;
	            pi2_dst += 4 * dst_strd - wd;
	        }
	    }
	}

	/**
	*******************************************************************************
	*
	* @brief
	*  Chroma interprediction filter for copy
	*
	* @par Description:
	*  Copies 'ht' rows of (2 * 'wd') bytes each from the location pointed to by
	*  'pu1_src' to the location pointed to by 'pu1_dst'. The row length is
	*  doubled relative to 'wd' (presumably because the chroma plane holds
	*  interleaved Cb/Cr samples - confirm against the caller). Rows are handled
	*  four at a time when 'ht' is a multiple of 4, otherwise two at a time; each
	*  row is moved in chunks of 16, 8 or 4 bytes depending on the alignment of
	*  (2 * 'wd').
	*
	* @param[in] pu1_src
	*  UWORD8 pointer to the source
	*
	* @param[out] pu1_dst
	*  UWORD8 pointer to the destination
	*
	* @param[in] src_strd
	*  integer source stride (in bytes)
	*
	* @param[in] dst_strd
	*  integer destination stride (in bytes)
	*
	* @param[in] pi1_coeff
	*  WORD8 pointer to the filter coefficients (unused by this function)
	*
	* @param[in] ht
	*  integer height of the array; asserted to be a multiple of 2
	*
	* @param[in] wd
	*  integer width of the array; asserted to be a multiple of 2
	*
	* @returns
	*  None
	*
	* @remarks
	*  In the 4-byte chunk path every load fetches 8 source bytes although only
	*  the low 4 are stored, so the source must be readable for 4 bytes past the
	*  end of each copied row.
	*
	*******************************************************************************
	*/

	void ihevc_inter_pred_chroma_copy_sse42(UWORD8 *pu1_src,
	                                        UWORD8 *pu1_dst,
	                                        WORD32 src_strd,
	                                        WORD32 dst_strd,
	                                        WORD8 *pi1_coeff,
	                                        WORD32 ht,
	                                        WORD32 wd)
	{
	    WORD32 row, col, wdx2;
	    __m128i src0_16x8b, src1_16x8b, src2_16x8b, src3_16x8b;

	    ASSERT(wd % 2 == 0); /* checking assumption*/
	    ASSERT(ht % 2 == 0); /* checking assumption*/
	    UNUSED(pi1_coeff);
	    wdx2 = wd * 2; /* number of bytes copied per row */

	    if(0 == (ht & 3)) /* ht multiple of 4 */
	    {
	        if(0 == (wdx2 & 15)) /* wdx2 multiple of 16 case */
	        {
	            for(row = 0; row < ht; row += 4)
	            {
	                for(col = 0; col < wdx2; col += 16)
	                {
	                    /* load 16 pixel values from 15:0 pos. relative to cur. pos. */
	                    src0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src));                /* row =0 */
	                    src1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + 1 * src_strd)); /* row =1 */
	                    src2_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + 2 * src_strd)); /* row =2 */
	                    src3_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + 3 * src_strd)); /* row =3 */

	                    /* storing 16 8-bit output values */
	                    _mm_storeu_si128((__m128i *)(pu1_dst), src0_16x8b);                /* row =0 */
	                    _mm_storeu_si128((__m128i *)(pu1_dst + 1 * dst_strd), src1_16x8b); /* row =1 */
	                    _mm_storeu_si128((__m128i *)(pu1_dst + 2 * dst_strd), src2_16x8b); /* row =2 */
	                    _mm_storeu_si128((__m128i *)(pu1_dst + 3 * dst_strd), src3_16x8b); /* row =3 */

	                    pu1_src += 16; /* pointer update */
	                    pu1_dst += 16; /* pointer update */
	                } /* inner for loop ends here(16-output values in single iteration) */

	                /* rewind to the row start and step down four rows */
	                pu1_src += 4 * src_strd - wdx2; /* pointer update */
	                pu1_dst += 4 * dst_strd - wdx2; /* pointer update */
	            }

	        }
	        else if(0 == (wdx2 & 7)) /* multiple of 8 case */
	        {
	            for(row = 0; row < ht; row += 4)
	            {
	                for(col = 0; col < wdx2; col += 8)
	                {
	                    /* load 8 pixel values from 7:0 pos. relative to cur. pos. */
	                    src0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src));                /* row =0 */
	                    src1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 1 * src_strd)); /* row =1 */
	                    src2_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 2 * src_strd)); /* row =2 */
	                    src3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 3 * src_strd)); /* row =3 */

	                    /* storing 8 8-bit output values */
	                    _mm_storel_epi64((__m128i *)(pu1_dst), src0_16x8b);                /* row =0 */
	                    _mm_storel_epi64((__m128i *)(pu1_dst + 1 * dst_strd), src1_16x8b); /* row =1 */
	                    _mm_storel_epi64((__m128i *)(pu1_dst + 2 * dst_strd), src2_16x8b); /* row =2 */
	                    _mm_storel_epi64((__m128i *)(pu1_dst + 3 * dst_strd), src3_16x8b); /* row =3 */

	                    pu1_src += 8; /* pointer update */
	                    pu1_dst += 8; /* pointer update */
	                } /* inner for loop ends here(8-output values in single iteration) */

	                pu1_src += 4 * src_strd - wdx2; /* pointer update */
	                pu1_dst += 4 * dst_strd - wdx2; /* pointer update */
	            }
	        }
	        else /* wdx2 = multiple of 4 case */
	        {
	            WORD32 dst0, dst1, dst2, dst3;
	            for(row = 0; row < ht; row += 4)
	            {
	                for(col = 0; col < wdx2; col += 4)
	                {
	                    /* load 8 pixel values from 7:0 pos. relative to cur. pos. */
	                    /* (only the low 4 bytes of each row are used below)       */
	                    src0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src));                /* row =0 */
	                    src1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 1 * src_strd)); /* row =1 */
	                    src2_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 2 * src_strd)); /* row =2 */
	                    src3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 3 * src_strd)); /* row =3 */

	                    /* extract the low 32 bits (4 pixels) of each row */
	                    dst0 = _mm_cvtsi128_si32(src0_16x8b);
	                    dst1 = _mm_cvtsi128_si32(src1_16x8b);
	                    dst2 = _mm_cvtsi128_si32(src2_16x8b);
	                    dst3 = _mm_cvtsi128_si32(src3_16x8b);

	                    /* storing 4 8-bit output values */
	                    /* NOTE(review): 32-bit store through a cast UWORD8 pointer; */
	                    /* relies on the target tolerating unaligned access          */
	                    *(WORD32 *)(&pu1_dst[0 * dst_strd]) = dst0; /* row =0 */
	                    *(WORD32 *)(&pu1_dst[1 * dst_strd]) = dst1; /* row =1 */
	                    *(WORD32 *)(&pu1_dst[2 * dst_strd]) = dst2; /* row =2 */
	                    *(WORD32 *)(&pu1_dst[3 * dst_strd]) = dst3; /* row =3 */

	                    pu1_src += 4; /* pointer update */
	                    pu1_dst += 4; /* pointer update */
	                } /* inner for loop ends here(4- output values in single iteration) */

	                pu1_src += 4 * src_strd - wdx2; /* pointer update */
	                pu1_dst += 4 * dst_strd - wdx2; /* pointer update */
	            }
	        }
	    }
	    else /* ht multiple of 2 */
	    {
	        if(0 == (wdx2 & 15)) /* wdx2 multiple of 16 case */
	        {
	            for(row = 0; row < ht; row += 2)
	            {
	                for(col = 0; col < wdx2; col += 16)
	                {
	                    /* load 16 pixel values from 15:0 pos. relative to cur. pos. */
	                    src0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src));                /* row =0 */
	                    src1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + 1 * src_strd)); /* row =1 */

	                    /* storing 16 8-bit output values */
	                    _mm_storeu_si128((__m128i *)(pu1_dst), src0_16x8b);                /* row =0 */
	                    _mm_storeu_si128((__m128i *)(pu1_dst + 1 * dst_strd), src1_16x8b); /* row =1 */

	                    pu1_src += 16; /* pointer update */
	                    pu1_dst += 16; /* pointer update */
	                } /* inner for loop ends here(16-output values in single iteration) */

	                /* rewind to the row start and step down two rows */
	                pu1_src += 2 * src_strd - wdx2; /* pointer update */
	                pu1_dst += 2 * dst_strd - wdx2; /* pointer update */
	            }

	        }
	        else if(0 == (wdx2 & 7)) /* multiple of 8 case */
	        {
	            for(row = 0; row < ht; row += 2)
	            {
	                for(col = 0; col < wdx2; col += 8)
	                {
	                    /* load 8 pixel values from 7:0 pos. relative to cur. pos. */
	                    src0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src));                /* row =0 */
	                    src1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 1 * src_strd)); /* row =1 */

	                    /* storing 8 8-bit output values */
	                    _mm_storel_epi64((__m128i *)(pu1_dst), src0_16x8b);                /* row =0 */
	                    _mm_storel_epi64((__m128i *)(pu1_dst + 1 * dst_strd), src1_16x8b); /* row =1 */

	                    pu1_src += 8; /* pointer update */
	                    pu1_dst += 8; /* pointer update */
	                } /* inner for loop ends here(8-output values in single iteration) */

	                pu1_src += 2 * src_strd - wdx2; /* pointer update */
	                pu1_dst += 2 * dst_strd - wdx2; /* pointer update */
	            }
	        }
	        else /* wdx2 = multiple of 4 case */
	        {
	            WORD32 dst0, dst1;
	            for(row = 0; row < ht; row += 2)
	            {
	                for(col = 0; col < wdx2; col += 4)
	                {
	                    /* load 8 pixel values from 7:0 pos. relative to cur. pos. */
	                    /* (only the low 4 bytes of each row are used below)       */
	                    src0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src));                /* row =0 */
	                    src1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 1 * src_strd)); /* row =1 */

	                    /* extract the low 32 bits (4 pixels) of each row */
	                    dst0 = _mm_cvtsi128_si32(src0_16x8b);
	                    dst1 = _mm_cvtsi128_si32(src1_16x8b);

	                    /* storing 4 8-bit output values */
	                    /* NOTE(review): 32-bit store through a cast UWORD8 pointer; */
	                    /* relies on the target tolerating unaligned access          */
	                    *(WORD32 *)(&pu1_dst[0 * dst_strd]) = dst0; /* row =0 */
	                    *(WORD32 *)(&pu1_dst[1 * dst_strd]) = dst1; /* row =1 */

	                    pu1_src += 4; /* pointer update */
	                    pu1_dst += 4; /* pointer update */
	                } /* inner for loop ends here(4- output values in single iteration) */

	                pu1_src += 2 * src_strd - wdx2; /* pointer update */
	                pu1_dst += 2 * dst_strd - wdx2; /* pointer update */
	            }
	        }
	    }
	}

	/**
	*******************************************************************************
	*
	* @brief
	*  Chroma interprediction filter for copying 16bit output
	*
	* @par Description:
	*  Reads 'ht' rows of (2 * 'wd') 8-bit samples each from 'pu1_src', widens
	*  every sample to 16 bits (zero extension), left-shifts it by
	*  SHIFT_14_MINUS_BIT_DEPTH (described as an upshift by 6 bits) and writes
	*  the result to 'pi2_dst'. The output is used as input for vertical
	*  filtering or weighted prediction. Rows are handled four at a time when
	*  'ht' is a multiple of 4, otherwise two at a time; each row is processed
	*  in chunks of 8 or 4 samples depending on the alignment of (2 * 'wd').
	*
	* @param[in] pu1_src
	*  UWORD8 pointer to the source
	*
	* @param[out] pi2_dst
	*  WORD16 pointer to the destination
	*
	* @param[in] src_strd
	*  integer source stride (in bytes)
	*
	* @param[in] dst_strd
	*  integer destination stride (in WORD16 elements)
	*
	* @param[in] pi1_coeff
	*  WORD8 pointer to the filter coefficients (unused by this function)
	*
	* @param[in] ht
	*  integer height of the array; asserted to be a multiple of 2
	*
	* @param[in] wd
	*  integer width of the array; asserted to be a multiple of 2
	*
	* @returns
	*  None
	*
	* @remarks
	*  In the 4-sample chunk path every load fetches 8 source bytes although
	*  only the low 4 contribute to the stored output, so the source must be
	*  readable for 4 bytes past the end of each processed row.
	*
	*******************************************************************************
	*/

	void ihevc_inter_pred_chroma_copy_w16out_sse42(UWORD8 *pu1_src,
	                                               WORD16 *pi2_dst,
	                                               WORD32 src_strd,
	                                               WORD32 dst_strd,
	                                               WORD8 *pi1_coeff,
	                                               WORD32 ht,
	                                               WORD32 wd)
	{
	    WORD32 row, col, wdx2;
	    __m128i src0_16x8b, src1_16x8b, src2_16x8b, src3_16x8b;

	    ASSERT(wd % 2 == 0); /* checking assumption*/
	    ASSERT(ht % 2 == 0); /* checking assumption*/
	    UNUSED(pi1_coeff);
	    wdx2 = wd * 2; /* number of samples processed per row */

	    if(0 == (ht & 3)) /* multiple of 4 case */
	    {
	        if(0 == (wdx2 & 7)) /* multiple of 8 case */
	        {
	            for(row = 0; row < ht; row += 4)
	            {
	                for(col = 0; col < wdx2; col += 8)
	                {
	                    /* load 8 pixel values from 7:0 pos. relative to cur. pos. */
	                    src0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src));                /* row =0 */
	                    src1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 1 * src_strd)); /* row =1 */
	                    src2_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 2 * src_strd)); /* row =2 */
	                    src3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 3 * src_strd)); /* row =3 */

	                    /* zero-extend the 8 low bytes to 8 16-bit lanes */
	                    src0_16x8b = _mm_cvtepu8_epi16(src0_16x8b);
	                    src1_16x8b = _mm_cvtepu8_epi16(src1_16x8b);
	                    src2_16x8b = _mm_cvtepu8_epi16(src2_16x8b);
	                    src3_16x8b = _mm_cvtepu8_epi16(src3_16x8b);

	                    src0_16x8b = _mm_slli_epi16(src0_16x8b, SHIFT_14_MINUS_BIT_DEPTH); /* (pu1_src[col] << SHIFT_14_MINUS_BIT_DEPTH */
	                    src1_16x8b = _mm_slli_epi16(src1_16x8b, SHIFT_14_MINUS_BIT_DEPTH);
	                    src2_16x8b = _mm_slli_epi16(src2_16x8b, SHIFT_14_MINUS_BIT_DEPTH);
	                    src3_16x8b = _mm_slli_epi16(src3_16x8b, SHIFT_14_MINUS_BIT_DEPTH);

	                    /* storing 8 16-bit output values */
	                    _mm_storeu_si128((__m128i *)(pi2_dst), src0_16x8b);                /* row =0 */
	                    _mm_storeu_si128((__m128i *)(pi2_dst + 1 * dst_strd), src1_16x8b); /* row =1 */
	                    _mm_storeu_si128((__m128i *)(pi2_dst + 2 * dst_strd), src2_16x8b); /* row =2 */
	                    _mm_storeu_si128((__m128i *)(pi2_dst + 3 * dst_strd), src3_16x8b); /* row =3 */

	                    pu1_src += 8; /* pointer update */
	                    pi2_dst += 8; /* pointer update */
	                } /* inner for loop ends here(8-output values in single iteration) */

	                /* rewind to the row start and step down four rows */
	                pu1_src += 4 * src_strd - wdx2; /* pointer update */
	                pi2_dst += 4 * dst_strd - wdx2; /* pointer update */
	            }
	        }
	        else /* wdx2 = multiple of 4 case */
	        {
	            for(row = 0; row < ht; row += 4)
	            {
	                for(col = 0; col < wdx2; col += 4)
	                {
	                    /* load 8 pixel values from 7:0 pos. relative to cur. pos. */
	                    /* (only the low 4 of them reach the output below)         */
	                    src0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src));                /* row =0 */
	                    src1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 1 * src_strd)); /* row =1 */
	                    src2_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 2 * src_strd)); /* row =2 */
	                    src3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 3 * src_strd)); /* row =3 */

	                    /* zero-extend the 8 low bytes to 8 16-bit lanes */
	                    src0_16x8b = _mm_cvtepu8_epi16(src0_16x8b);
	                    src1_16x8b = _mm_cvtepu8_epi16(src1_16x8b);
	                    src2_16x8b = _mm_cvtepu8_epi16(src2_16x8b);
	                    src3_16x8b = _mm_cvtepu8_epi16(src3_16x8b);

	                    src0_16x8b = _mm_slli_epi16(src0_16x8b, SHIFT_14_MINUS_BIT_DEPTH); /* (pu1_src[col] << SHIFT_14_MINUS_BIT_DEPTH */
	                    src1_16x8b = _mm_slli_epi16(src1_16x8b, SHIFT_14_MINUS_BIT_DEPTH);
	                    src2_16x8b = _mm_slli_epi16(src2_16x8b, SHIFT_14_MINUS_BIT_DEPTH);
	                    src3_16x8b = _mm_slli_epi16(src3_16x8b, SHIFT_14_MINUS_BIT_DEPTH);

	                    /* storing 4 16-bit output values (low 64 bits) */
	                    _mm_storel_epi64((__m128i *)(pi2_dst), src0_16x8b);                /* row =0 */
	                    _mm_storel_epi64((__m128i *)(pi2_dst + 1 * dst_strd), src1_16x8b); /* row =1 */
	                    _mm_storel_epi64((__m128i *)(pi2_dst + 2 * dst_strd), src2_16x8b); /* row =2 */
	                    _mm_storel_epi64((__m128i *)(pi2_dst + 3 * dst_strd), src3_16x8b); /* row =3 */

	                    pu1_src += 4; /* pointer update */
	                    pi2_dst += 4; /* pointer update */
	                } /* inner for loop ends here(4-output values in single iteration) */

	                pu1_src += 4 * src_strd - wdx2; /* pointer update */
	                pi2_dst += 4 * dst_strd - wdx2; /* pointer update */
	            }
	        }
	    }
	    else /* ht multiple of 2 case */
	    {
	        if(0 == (wdx2 & 7)) /* multiple of 8 case */
	        {
	            for(row = 0; row < ht; row += 2)
	            {
	                for(col = 0; col < wdx2; col += 8)
	                {
	                    /* load 8 pixel values from 7:0 pos. relative to cur. pos. */
	                    src0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src));                /* row =0 */
	                    src1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 1 * src_strd)); /* row =1 */

	                    /* zero-extend the 8 low bytes to 8 16-bit lanes */
	                    src0_16x8b = _mm_cvtepu8_epi16(src0_16x8b);
	                    src1_16x8b = _mm_cvtepu8_epi16(src1_16x8b);

	                    src0_16x8b = _mm_slli_epi16(src0_16x8b, SHIFT_14_MINUS_BIT_DEPTH); /* (pu1_src[col] << SHIFT_14_MINUS_BIT_DEPTH */
	                    src1_16x8b = _mm_slli_epi16(src1_16x8b, SHIFT_14_MINUS_BIT_DEPTH);

	                    /* storing 8 16-bit output values */
	                    _mm_storeu_si128((__m128i *)(pi2_dst), src0_16x8b);                /* row =0 */
	                    _mm_storeu_si128((__m128i *)(pi2_dst + 1 * dst_strd), src1_16x8b); /* row =1 */

	                    pu1_src += 8; /* pointer update */
	                    pi2_dst += 8; /* pointer update */
	                } /* inner for loop ends here(8-output values in single iteration) */

	                /* rewind to the row start and step down two rows */
	                pu1_src += 2 * src_strd - wdx2; /* pointer update */
	                pi2_dst += 2 * dst_strd - wdx2; /* pointer update */
	            }
	        }
	        else /* wdx2 = multiple of 4 case */
	        {
	            for(row = 0; row < ht; row += 2)
	            {
	                for(col = 0; col < wdx2; col += 4)
	                {
	                    /* load 8 pixel values from 7:0 pos. relative to cur. pos. */
	                    /* (only the low 4 of them reach the output below)         */
	                    src0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src));                /* row =0 */
	                    src1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 1 * src_strd)); /* row =1 */

	                    /* zero-extend the 8 low bytes to 8 16-bit lanes */
	                    src0_16x8b = _mm_cvtepu8_epi16(src0_16x8b);
	                    src1_16x8b = _mm_cvtepu8_epi16(src1_16x8b);

	                    src0_16x8b = _mm_slli_epi16(src0_16x8b, SHIFT_14_MINUS_BIT_DEPTH); /* (pu1_src[col] << SHIFT_14_MINUS_BIT_DEPTH */
	                    src1_16x8b = _mm_slli_epi16(src1_16x8b, SHIFT_14_MINUS_BIT_DEPTH);

	                    /* storing 4 16-bit output values (low 64 bits) */
	                    _mm_storel_epi64((__m128i *)(pi2_dst), src0_16x8b);                /* row =0 */
	                    _mm_storel_epi64((__m128i *)(pi2_dst + 1 * dst_strd), src1_16x8b); /* row =1 */

	                    pu1_src += 4; /* pointer update */
	                    pi2_dst += 4; /* pointer update */
	                } /* inner for loop ends here(4-output values in single iteration) */

	                pu1_src += 2 * src_strd - wdx2; /* pointer update */
	                pi2_dst += 2 * dst_strd - wdx2; /* pointer update */
	            }
	        }
	    }
	}