blob: 154b613aadd82059962aa80b6f1b5df93c7e4231 [file] [log] [blame]
/******************************************************************************
*
* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at:
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
******************************************************************************/
/**
*******************************************************************************
* @file
* ihevc_inter_pred_filters_x86_intr.c
*
* @brief
* Contains function definitions for inter prediction interpolation filters
* coded in x86 intrinsics
*
*
* @author
*
*
* @par List of Functions:
* - ihevc_inter_pred_luma_copy_w16out_sse42()
* - ihevc_inter_pred_chroma_copy_sse42()
* - ihevc_inter_pred_chroma_copy_w16out_sse42()
*
* @remarks
* None
*
*******************************************************************************
*/
/*****************************************************************************/
/* File Includes */
/*****************************************************************************/
#include <assert.h>
#include "ihevc_debug.h"
#include "ihevc_typedefs.h"
#include "ihevc_defs.h"
#include "ihevc_inter_pred.h"
#include "ihevc_macros.h"
#include "ihevc_platform_macros.h"
#include "ihevc_func_selector.h"
#include <immintrin.h>
#include <emmintrin.h>
#include <smmintrin.h>
#include <tmmintrin.h>
/*****************************************************************************/
/* Function Definitions */
/*****************************************************************************/
/**
*******************************************************************************
*
* @brief
* Interprediction luma filter for copy 16bit output
*
* @par Description:
* Copies the array of width 'wd' and height 'ht' from the location pointed
* by 'src' to the location pointed by 'dst' The output is upshifted by 6
* bits and is used as input for vertical filtering or weighted prediction
*
* @param[in] pu1_src
* UWORD8 pointer to the source
*
* @param[out] pi2_dst
* WORD16 pointer to the destination
*
* @param[in] src_strd
* integer source stride
*
* @param[in] dst_strd
* integer destination stride
*
* @param[in] pi1_coeff
* WORD8 pointer to the filter coefficients
*
* @param[in] ht
* integer height of the array
*
* @param[in] wd
* integer width of the array
*
* @returns
*
* @remarks
* None
*
*******************************************************************************
*/
void ihevc_inter_pred_luma_copy_w16out_sse42(UWORD8 *pu1_src,
WORD16 *pi2_dst,
WORD32 src_strd,
WORD32 dst_strd,
WORD8 *pi1_coeff,
WORD32 ht,
WORD32 wd)
{
WORD32 row, col;
__m128i src0_16x8b, src1_16x8b, src2_16x8b, src3_16x8b;
UNUSED(pi1_coeff);
ASSERT(wd % 4 == 0); /* checking assumption*/
ASSERT(ht % 4 == 0); /* checking assumption*/
if(0 == (wd & 7)) /* multiple of 8 case */
{
for(row = 0; row < ht; row += 4)
{
for(col = 0; col < wd; col += 8)
{
/*load 8 pixel values from 7:0 pos. relative to cur. pos.*/
src0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src)); /* row =0 */
src1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 1 * src_strd)); /* row =1 */
src2_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 2 * src_strd)); /* row =2 */
src3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 3 * src_strd)); /* row =3 */
src0_16x8b = _mm_cvtepu8_epi16(src0_16x8b);
src1_16x8b = _mm_cvtepu8_epi16(src1_16x8b);
src2_16x8b = _mm_cvtepu8_epi16(src2_16x8b);
src3_16x8b = _mm_cvtepu8_epi16(src3_16x8b);
src0_16x8b = _mm_slli_epi16(src0_16x8b, SHIFT_14_MINUS_BIT_DEPTH); /* (pu1_src[col] << SHIFT_14_MINUS_BIT_DEPTH */
src1_16x8b = _mm_slli_epi16(src1_16x8b, SHIFT_14_MINUS_BIT_DEPTH);
src2_16x8b = _mm_slli_epi16(src2_16x8b, SHIFT_14_MINUS_BIT_DEPTH);
src3_16x8b = _mm_slli_epi16(src3_16x8b, SHIFT_14_MINUS_BIT_DEPTH);
/* storing 16 8-bit output values */
_mm_storeu_si128((__m128i *)(pi2_dst), src0_16x8b); /* row =0 */
_mm_storeu_si128((__m128i *)(pi2_dst + 1 * dst_strd), src1_16x8b); /* row =1 */
_mm_storeu_si128((__m128i *)(pi2_dst + 2 * dst_strd), src2_16x8b); /* row =2 */
_mm_storeu_si128((__m128i *)(pi2_dst + 3 * dst_strd), src3_16x8b); /* row =3 */
pu1_src += 8; /* pointer update */
pi2_dst += 8; /* pointer update */
} /* inner for loop ends here(8-output values in single iteration) */
pu1_src += 4 * src_strd - wd; /* pointer update */
pi2_dst += 4 * dst_strd - wd; /* pointer update */
}
}
else /* wd = multiple of 4 case */
{
for(row = 0; row < ht; row += 4)
{
for(col = 0; col < wd; col += 4)
{
/*load 8 pixel values from 7:0 pos. relative to cur. pos.*/
src0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src)); /* row =0 */
src1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 1 * src_strd)); /* row =1 */
src2_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 2 * src_strd)); /* row =2 */
src3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 3 * src_strd)); /* row =3 */
src0_16x8b = _mm_cvtepu8_epi16(src0_16x8b);
src1_16x8b = _mm_cvtepu8_epi16(src1_16x8b);
src2_16x8b = _mm_cvtepu8_epi16(src2_16x8b);
src3_16x8b = _mm_cvtepu8_epi16(src3_16x8b);
src0_16x8b = _mm_slli_epi16(src0_16x8b, SHIFT_14_MINUS_BIT_DEPTH); /* (pu1_src[col] << SHIFT_14_MINUS_BIT_DEPTH */
src1_16x8b = _mm_slli_epi16(src1_16x8b, SHIFT_14_MINUS_BIT_DEPTH);
src2_16x8b = _mm_slli_epi16(src2_16x8b, SHIFT_14_MINUS_BIT_DEPTH);
src3_16x8b = _mm_slli_epi16(src3_16x8b, SHIFT_14_MINUS_BIT_DEPTH);
/* storing 16 8-bit output values */
_mm_storel_epi64((__m128i *)(pi2_dst), src0_16x8b); /* row =0 */
_mm_storel_epi64((__m128i *)(pi2_dst + 1 * dst_strd), src1_16x8b); /* row =1 */
_mm_storel_epi64((__m128i *)(pi2_dst + 2 * dst_strd), src2_16x8b); /* row =2 */
_mm_storel_epi64((__m128i *)(pi2_dst + 3 * dst_strd), src3_16x8b); /* row =3 */
pu1_src += 4; /* pointer update */
pi2_dst += 4; /* pointer update */
} /* inner for loop ends here(4-output values in single iteration) */
pu1_src += 4 * src_strd - wd; /* pointer update */
pi2_dst += 4 * dst_strd - wd; /* pointer update */
}
}
}
/**
*******************************************************************************
*
* @brief
* Chroma interprediction filter for copy
*
* @par Description:
* Copies the array of width 'wd' and height 'ht' from the location pointed
* by 'src' to the location pointed by 'dst'
*
* @param[in] pu1_src
* UWORD8 pointer to the source
*
* @param[out] pu1_dst
* UWORD8 pointer to the destination
*
* @param[in] src_strd
* integer source stride
*
* @param[in] dst_strd
* integer destination stride
*
* @param[in] pi1_coeff
* WORD8 pointer to the filter coefficients
*
* @param[in] ht
* integer height of the array
*
* @param[in] wd
* integer width of the array
*
* @returns
*
* @remarks
* None
*
*******************************************************************************
*/
void ihevc_inter_pred_chroma_copy_sse42(UWORD8 *pu1_src,
UWORD8 *pu1_dst,
WORD32 src_strd,
WORD32 dst_strd,
WORD8 *pi1_coeff,
WORD32 ht,
WORD32 wd)
{
WORD32 row, col, wdx2;
__m128i src0_16x8b, src1_16x8b, src2_16x8b, src3_16x8b;
ASSERT(wd % 2 == 0); /* checking assumption*/
ASSERT(ht % 2 == 0); /* checking assumption*/
UNUSED(pi1_coeff);
wdx2 = wd * 2;
if(0 == (ht & 3)) /* ht multiple of 4 */
{
if(0 == (wdx2 & 15)) /* wdx2 multiple of 16 case */
{
for(row = 0; row < ht; row += 4)
{
for(col = 0; col < wdx2; col += 16)
{
/*load 16 pixel values from 15:0 pos. relative to cur. pos.*/
src0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src)); /* row =0 */
src1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + 1 * src_strd)); /* row =1 */
src2_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + 2 * src_strd)); /* row =2 */
src3_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + 3 * src_strd)); /* row =3 */
/* storing 16 8-bit output values */
_mm_storeu_si128((__m128i *)(pu1_dst), src0_16x8b); /* row =0 */
_mm_storeu_si128((__m128i *)(pu1_dst + 1 * dst_strd), src1_16x8b); /* row =1 */
_mm_storeu_si128((__m128i *)(pu1_dst + 2 * dst_strd), src2_16x8b); /* row =2 */
_mm_storeu_si128((__m128i *)(pu1_dst + 3 * dst_strd), src3_16x8b); /* row =3 */
pu1_src += 16; /* pointer update */
pu1_dst += 16; /* pointer update */
} /* inner for loop ends here(16-output values in single iteration) */
pu1_src += 4 * src_strd - wdx2; /* pointer update */
pu1_dst += 4 * dst_strd - wdx2; /* pointer update */
}
}
else if(0 == (wdx2 & 7)) /* multiple of 8 case */
{
for(row = 0; row < ht; row += 4)
{
for(col = 0; col < wdx2; col += 8)
{
/*load 16 pixel values from 15:0 pos. relative to cur. pos.*/
src0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src)); /* row =0 */
src1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 1 * src_strd)); /* row =1 */
src2_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 2 * src_strd)); /* row =2 */
src3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 3 * src_strd)); /* row =3 */
/* storing 16 8-bit output values */
_mm_storel_epi64((__m128i *)(pu1_dst), src0_16x8b); /* row =0 */
_mm_storel_epi64((__m128i *)(pu1_dst + 1 * dst_strd), src1_16x8b); /* row =1 */
_mm_storel_epi64((__m128i *)(pu1_dst + 2 * dst_strd), src2_16x8b); /* row =2 */
_mm_storel_epi64((__m128i *)(pu1_dst + 3 * dst_strd), src3_16x8b); /* row =3 */
pu1_src += 8; /* pointer update */
pu1_dst += 8; /* pointer update */
} /* inner for loop ends here(8-output values in single iteration) */
pu1_src += 4 * src_strd - wdx2; /* pointer update */
pu1_dst += 4 * dst_strd - wdx2; /* pointer update */
}
}
else /* wdx2 = multiple of 4 case */
{
WORD32 dst0, dst1, dst2, dst3;
for(row = 0; row < ht; row += 4)
{
for(col = 0; col < wdx2; col += 4)
{
/*load 16 pixel values from 15:0 pos. relative to cur. pos.*/
src0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src)); /* row =0 */
src1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 1 * src_strd)); /* row =1 */
src2_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 2 * src_strd)); /* row =2 */
src3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 3 * src_strd)); /* row =3 */
dst0 = _mm_cvtsi128_si32(src0_16x8b);
dst1 = _mm_cvtsi128_si32(src1_16x8b);
dst2 = _mm_cvtsi128_si32(src2_16x8b);
dst3 = _mm_cvtsi128_si32(src3_16x8b);
/* storing 4 8-bit output values */
*(WORD32 *)(&pu1_dst[0 * dst_strd]) = dst0; /* row =0 */
*(WORD32 *)(&pu1_dst[1 * dst_strd]) = dst1; /* row =1 */
*(WORD32 *)(&pu1_dst[2 * dst_strd]) = dst2; /* row =2 */
*(WORD32 *)(&pu1_dst[3 * dst_strd]) = dst3; /* row =3 */
pu1_src += 4; /* pointer update */
pu1_dst += 4; /* pointer update */
} /* inner for loop ends here(4- output values in single iteration) */
pu1_src += 4 * src_strd - wdx2; /* pointer update */
pu1_dst += 4 * dst_strd - wdx2; /* pointer update */
}
}
}
else /* ht multiple of 2 */
{
if(0 == (wdx2 & 15)) /* wdx2 multiple of 16 case */
{
for(row = 0; row < ht; row += 2)
{
for(col = 0; col < wdx2; col += 16)
{
/*load 16 pixel values from 15:0 pos. relative to cur. pos.*/
src0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src)); /* row =0 */
src1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + 1 * src_strd)); /* row =1 */
/* storing 16 8-bit output values */
_mm_storeu_si128((__m128i *)(pu1_dst), src0_16x8b); /* row =0 */
_mm_storeu_si128((__m128i *)(pu1_dst + 1 * dst_strd), src1_16x8b); /* row =1 */
pu1_src += 16; /* pointer update */
pu1_dst += 16; /* pointer update */
} /* inner for loop ends here(16-output values in single iteration) */
pu1_src += 2 * src_strd - wdx2; /* pointer update */
pu1_dst += 2 * dst_strd - wdx2; /* pointer update */
}
}
else if(0 == (wdx2 & 7)) /* multiple of 8 case */
{
for(row = 0; row < ht; row += 2)
{
for(col = 0; col < wdx2; col += 8)
{
/*load 16 pixel values from 15:0 pos. relative to cur. pos.*/
src0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src)); /* row =0 */
src1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 1 * src_strd)); /* row =1 */
/* storing 16 8-bit output values */
_mm_storel_epi64((__m128i *)(pu1_dst), src0_16x8b); /* row =0 */
_mm_storel_epi64((__m128i *)(pu1_dst + 1 * dst_strd), src1_16x8b); /* row =1 */
pu1_src += 8; /* pointer update */
pu1_dst += 8; /* pointer update */
} /* inner for loop ends here(8-output values in single iteration) */
pu1_src += 2 * src_strd - wdx2; /* pointer update */
pu1_dst += 2 * dst_strd - wdx2; /* pointer update */
}
}
else /* wdx2 = multiple of 4 case */
{
WORD32 dst0, dst1;
for(row = 0; row < ht; row += 2)
{
for(col = 0; col < wdx2; col += 4)
{
/*load 16 pixel values from 15:0 pos. relative to cur. pos.*/
src0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src)); /* row =0 */
src1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 1 * src_strd)); /* row =1 */
dst0 = _mm_cvtsi128_si32(src0_16x8b);
dst1 = _mm_cvtsi128_si32(src1_16x8b);
/* storing 4 8-bit output values */
*(WORD32 *)(&pu1_dst[0 * dst_strd]) = dst0; /* row =0 */
*(WORD32 *)(&pu1_dst[1 * dst_strd]) = dst1; /* row =1 */
pu1_src += 4; /* pointer update */
pu1_dst += 4; /* pointer update */
} /* inner for loop ends here(4- output values in single iteration) */
pu1_src += 2 * src_strd - wdx2; /* pointer update */
pu1_dst += 2 * dst_strd - wdx2; /* pointer update */
}
}
}
}
/**
*******************************************************************************
*
* @brief
* chroma interprediction filter for copying 16bit output
*
* @par Description:
* Copies the array of width 'wd' and height 'ht' from the location pointed
* by 'src' to the location pointed by 'dst' The output is upshifted by 6
* bits and is used as input for vertical filtering or weighted prediction
*
* @param[in] pu1_src
* UWORD8 pointer to the source
*
* @param[out] pi2_dst
* WORD16 pointer to the destination
*
* @param[in] src_strd
* integer source stride
*
* @param[in] dst_strd
* integer destination stride
*
* @param[in] pi1_coeff
* WORD8 pointer to the filter coefficients
*
* @param[in] ht
* integer height of the array
*
* @param[in] wd
* integer width of the array
*
* @returns
*
* @remarks
* None
*
*******************************************************************************
*/
void ihevc_inter_pred_chroma_copy_w16out_sse42(UWORD8 *pu1_src,
WORD16 *pi2_dst,
WORD32 src_strd,
WORD32 dst_strd,
WORD8 *pi1_coeff,
WORD32 ht,
WORD32 wd)
{
WORD32 row, col, wdx2;
__m128i src0_16x8b, src1_16x8b, src2_16x8b, src3_16x8b;
ASSERT(wd % 2 == 0); /* checking assumption*/
ASSERT(ht % 2 == 0); /* checking assumption*/
UNUSED(pi1_coeff);
wdx2 = wd * 2;
if(0 == (ht & 3)) /* multiple of 4 case */
{
if(0 == (wdx2 & 7)) /* multiple of 8 case */
{
for(row = 0; row < ht; row += 4)
{
for(col = 0; col < wdx2; col += 8)
{
/*load 8 pixel values from 7:0 pos. relative to cur. pos.*/
src0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src)); /* row =0 */
src1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 1 * src_strd)); /* row =1 */
src2_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 2 * src_strd)); /* row =2 */
src3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 3 * src_strd)); /* row =3 */
src0_16x8b = _mm_cvtepu8_epi16(src0_16x8b);
src1_16x8b = _mm_cvtepu8_epi16(src1_16x8b);
src2_16x8b = _mm_cvtepu8_epi16(src2_16x8b);
src3_16x8b = _mm_cvtepu8_epi16(src3_16x8b);
src0_16x8b = _mm_slli_epi16(src0_16x8b, SHIFT_14_MINUS_BIT_DEPTH); /* (pu1_src[col] << SHIFT_14_MINUS_BIT_DEPTH */
src1_16x8b = _mm_slli_epi16(src1_16x8b, SHIFT_14_MINUS_BIT_DEPTH);
src2_16x8b = _mm_slli_epi16(src2_16x8b, SHIFT_14_MINUS_BIT_DEPTH);
src3_16x8b = _mm_slli_epi16(src3_16x8b, SHIFT_14_MINUS_BIT_DEPTH);
/* storing 16 8-bit output values */
_mm_storeu_si128((__m128i *)(pi2_dst), src0_16x8b); /* row =0 */
_mm_storeu_si128((__m128i *)(pi2_dst + 1 * dst_strd), src1_16x8b); /* row =1 */
_mm_storeu_si128((__m128i *)(pi2_dst + 2 * dst_strd), src2_16x8b); /* row =2 */
_mm_storeu_si128((__m128i *)(pi2_dst + 3 * dst_strd), src3_16x8b); /* row =3 */
pu1_src += 8; /* pointer update */
pi2_dst += 8; /* pointer update */
} /* inner for loop ends here(8-output values in single iteration) */
pu1_src += 4 * src_strd - wdx2; /* pointer update */
pi2_dst += 4 * dst_strd - wdx2; /* pointer update */
}
}
else /* wdx2 = multiple of 4 case */
{
for(row = 0; row < ht; row += 4)
{
for(col = 0; col < wdx2; col += 4)
{
/*load 8 pixel values from 7:0 pos. relative to cur. pos.*/
src0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src)); /* row =0 */
src1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 1 * src_strd)); /* row =1 */
src2_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 2 * src_strd)); /* row =2 */
src3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 3 * src_strd)); /* row =3 */
src0_16x8b = _mm_cvtepu8_epi16(src0_16x8b);
src1_16x8b = _mm_cvtepu8_epi16(src1_16x8b);
src2_16x8b = _mm_cvtepu8_epi16(src2_16x8b);
src3_16x8b = _mm_cvtepu8_epi16(src3_16x8b);
src0_16x8b = _mm_slli_epi16(src0_16x8b, SHIFT_14_MINUS_BIT_DEPTH); /* (pu1_src[col] << SHIFT_14_MINUS_BIT_DEPTH */
src1_16x8b = _mm_slli_epi16(src1_16x8b, SHIFT_14_MINUS_BIT_DEPTH);
src2_16x8b = _mm_slli_epi16(src2_16x8b, SHIFT_14_MINUS_BIT_DEPTH);
src3_16x8b = _mm_slli_epi16(src3_16x8b, SHIFT_14_MINUS_BIT_DEPTH);
/* storing 16 8-bit output values */
_mm_storel_epi64((__m128i *)(pi2_dst), src0_16x8b); /* row =0 */
_mm_storel_epi64((__m128i *)(pi2_dst + 1 * dst_strd), src1_16x8b); /* row =1 */
_mm_storel_epi64((__m128i *)(pi2_dst + 2 * dst_strd), src2_16x8b); /* row =2 */
_mm_storel_epi64((__m128i *)(pi2_dst + 3 * dst_strd), src3_16x8b); /* row =3 */
pu1_src += 4; /* pointer update */
pi2_dst += 4; /* pointer update */
} /* inner for loop ends here(4-output values in single iteration) */
pu1_src += 4 * src_strd - wdx2; /* pointer update */
pi2_dst += 4 * dst_strd - wdx2; /* pointer update */
}
}
}
else /* ht multiple of 2 case */
{
if(0 == (wdx2 & 7)) /* multiple of 8 case */
{
for(row = 0; row < ht; row += 2)
{
for(col = 0; col < wdx2; col += 8)
{
/*load 8 pixel values from 7:0 pos. relative to cur. pos.*/
src0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src)); /* row =0 */
src1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 1 * src_strd)); /* row =1 */
src0_16x8b = _mm_cvtepu8_epi16(src0_16x8b);
src1_16x8b = _mm_cvtepu8_epi16(src1_16x8b);
src0_16x8b = _mm_slli_epi16(src0_16x8b, SHIFT_14_MINUS_BIT_DEPTH); /* (pu1_src[col] << SHIFT_14_MINUS_BIT_DEPTH */
src1_16x8b = _mm_slli_epi16(src1_16x8b, SHIFT_14_MINUS_BIT_DEPTH);
/* storing 16 8-bit output values */
_mm_storeu_si128((__m128i *)(pi2_dst), src0_16x8b); /* row =0 */
_mm_storeu_si128((__m128i *)(pi2_dst + 1 * dst_strd), src1_16x8b); /* row =1 */
pu1_src += 8; /* pointer update */
pi2_dst += 8; /* pointer update */
} /* inner for loop ends here(8-output values in single iteration) */
pu1_src += 2 * src_strd - wdx2; /* pointer update */
pi2_dst += 2 * dst_strd - wdx2; /* pointer update */
}
}
else /* wdx2 = multiple of 4 case */
{
for(row = 0; row < ht; row += 2)
{
for(col = 0; col < wdx2; col += 4)
{
/*load 8 pixel values from 7:0 pos. relative to cur. pos.*/
src0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src)); /* row =0 */
src1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 1 * src_strd)); /* row =1 */
src0_16x8b = _mm_cvtepu8_epi16(src0_16x8b);
src1_16x8b = _mm_cvtepu8_epi16(src1_16x8b);
src0_16x8b = _mm_slli_epi16(src0_16x8b, SHIFT_14_MINUS_BIT_DEPTH); /* (pu1_src[col] << SHIFT_14_MINUS_BIT_DEPTH */
src1_16x8b = _mm_slli_epi16(src1_16x8b, SHIFT_14_MINUS_BIT_DEPTH);
/* storing 16 8-bit output values */
_mm_storel_epi64((__m128i *)(pi2_dst), src0_16x8b); /* row =0 */
_mm_storel_epi64((__m128i *)(pi2_dst + 1 * dst_strd), src1_16x8b); /* row =1 */
pu1_src += 4; /* pointer update */
pi2_dst += 4; /* pointer update */
} /* inner for loop ends here(4-output values in single iteration) */
pu1_src += 2 * src_strd - wdx2; /* pointer update */
pi2_dst += 2 * dst_strd - wdx2; /* pointer update */
}
}
}
}