blob: 480a8c7c603318c5040cdeabc3e441a118545ea6 [file] [log] [blame]
/******************************************************************************
*
* Copyright (C) 2015 The Android Open Source Project
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at:
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
*****************************************************************************
* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
*/
/*****************************************************************************/
/* */
/* File Name : ih264_inter_pred_filters_intr_ssse3.c */
/* */
/* Description : Contains function definitions for luma/chroma inter */
/* prediction filters in x86 SSSE3 intrinsics */
/* */
/* List of Functions : ih264_inter_pred_luma_copy_ssse3() */
/* ih264_inter_pred_luma_horz_ssse3() */
/* ih264_inter_pred_luma_vert_ssse3() */
/* ih264_inter_pred_luma_horz_hpel_vert_hpel_ssse3() */
/* ih264_inter_pred_luma_horz_qpel_ssse3() */
/* ih264_inter_pred_luma_vert_qpel_ssse3() */
/* ih264_inter_pred_luma_horz_qpel_vert_qpel_ssse3() */
/* ih264_inter_pred_luma_horz_hpel_vert_qpel_ssse3() */
/* ih264_inter_pred_luma_horz_qpel_vert_hpel_ssse3() */
/* ih264_inter_pred_chroma_ssse3() */
/* */
/* Issues / Problems : None */
/* */
/* Revision History : */
/* */
/* DD MM YYYY Author(s) Changes */
/* 13 02 2015 Kaushik Initial version */
/* Senthoor */
/* */
/*****************************************************************************/
/*****************************************************************************/
/* File Includes */
/*****************************************************************************/
#include <immintrin.h>
#include <string.h>
#include "ih264_typedefs.h"
#include "ih264_macros.h"
#include "ih264_platform_macros.h"
#include "ih264_inter_pred_filters.h"
/*****************************************************************************/
/* Constant Data variables */
/*****************************************************************************/
/* coefficients for 6 tap filtering*/
//const WORD32 ih264_g_six_tap[3] ={1,-5,20};
/*****************************************************************************/
/* Function definitions . */
/*****************************************************************************/
/* Copies one 4-pixel row from pu1_src to pu1_dst.                           */
/* The bytes are moved with memcpy through a local WORD32 instead of         */
/* dereferencing WORD32 * casts of the byte pointers: the pointers carry no  */
/* alignment guarantee, and reading/writing UWORD8 storage through a WORD32  */
/* lvalue violates C's effective-type (aliasing) rules.                      */
static void ih264_copy_luma_row4_ssse3(UWORD8 *pu1_dst, const UWORD8 *pu1_src)
{
    WORD32 i4_row;

    memcpy(&i4_row, pu1_src, sizeof(i4_row));
    memcpy(pu1_dst, &i4_row, sizeof(i4_row));
}

/*****************************************************************************/
/*                                                                           */
/*  Function Name : ih264_inter_pred_luma_copy_ssse3                         */
/*                                                                           */
/*  Description   : Copies a ht x wd block of luma samples from source to    */
/*                  destination. (ht,wd) can be (4,4), (8,4), (4,8), (8,8),  */
/*                  (16,8), (8,16) or (16,16).                               */
/*                  wd == 4 and wd == 8 copy four rows per iteration; any    */
/*                  other wd is treated as 16 and copies eight rows per      */
/*                  iteration. Every path is a do-while loop, so at least    */
/*                  one group of rows is always copied.                      */
/*                                                                           */
/*  Inputs        : pu1_src  - pointer to source                             */
/*                  pu1_dst  - pointer to destination                        */
/*                  src_strd - stride for source                             */
/*                  dst_strd - stride for destination                        */
/*                  ht       - height of the block                           */
/*                  wd       - width of the block                            */
/*                  pu1_tmp  - unused                                        */
/*                  dydx     - unused                                        */
/*                                                                           */
/*  Issues        : None                                                     */
/*                                                                           */
/*  Revision History:                                                        */
/*                                                                           */
/*          DD MM YYYY   Author(s)       Changes                             */
/*          13 02 2015   Kaushik         Initial Version                     */
/*                       Senthoor                                            */
/*                                                                           */
/*****************************************************************************/
void ih264_inter_pred_luma_copy_ssse3(UWORD8 *pu1_src,
                                      UWORD8 *pu1_dst,
                                      WORD32 src_strd,
                                      WORD32 dst_strd,
                                      WORD32 ht,
                                      WORD32 wd,
                                      UWORD8* pu1_tmp,
                                      WORD32 dydx)
{
    __m128i y_0_16x8b, y_1_16x8b, y_2_16x8b, y_3_16x8b;

    WORD32 src_strd2, src_strd3, src_strd4, dst_strd2, dst_strd3, dst_strd4;
    UNUSED(pu1_tmp);
    UNUSED(dydx);

    /* Byte offsets of rows 2, 3 and 4 relative to the current row. */
    src_strd2 = src_strd << 1;
    dst_strd2 = dst_strd << 1;
    src_strd4 = src_strd << 2;
    dst_strd4 = dst_strd << 2;
    src_strd3 = src_strd2 + src_strd;
    dst_strd3 = dst_strd2 + dst_strd;

    if(wd == 4)
    {
        /* Four rows of 4 bytes per iteration. */
        do
        {
            ih264_copy_luma_row4_ssse3(pu1_dst, pu1_src);
            ih264_copy_luma_row4_ssse3(pu1_dst + dst_strd, pu1_src + src_strd);
            ih264_copy_luma_row4_ssse3(pu1_dst + dst_strd2, pu1_src + src_strd2);
            ih264_copy_luma_row4_ssse3(pu1_dst + dst_strd3, pu1_src + src_strd3);

            ht -= 4;
            pu1_src += src_strd4;
            pu1_dst += dst_strd4;
        }
        while(ht > 0);
    }
    else if(wd == 8)
    {
        /* Four rows of 8 bytes per iteration (64-bit loads/stores). */
        do
        {
            y_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src);
            y_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + src_strd));
            y_2_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + src_strd2));
            y_3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + src_strd3));

            _mm_storel_epi64((__m128i *)pu1_dst, y_0_16x8b);
            _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd), y_1_16x8b);
            _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd2), y_2_16x8b);
            _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd3), y_3_16x8b);

            ht -= 4;
            pu1_src += src_strd4;
            pu1_dst += dst_strd4;
        }
        while(ht > 0);
    }
    else // wd == 16
    {
        WORD32 src_strd5, src_strd6, src_strd7, src_strd8;
        WORD32 dst_strd5, dst_strd6, dst_strd7, dst_strd8;

        __m128i y_4_16x8b, y_5_16x8b, y_6_16x8b, y_7_16x8b;

        /* Byte offsets of rows 5..8 relative to the current row. */
        src_strd5 = src_strd2 + src_strd3;
        dst_strd5 = dst_strd2 + dst_strd3;
        src_strd6 = src_strd3 << 1;
        dst_strd6 = dst_strd3 << 1;
        src_strd7 = src_strd3 + src_strd4;
        dst_strd7 = dst_strd3 + dst_strd4;
        src_strd8 = src_strd << 3;
        dst_strd8 = dst_strd << 3;

        /* Eight rows of 16 bytes per iteration (unaligned 128-bit         */
        /* loads/stores), so ht is consumed in steps of 8 on this path.    */
        do
        {
            y_0_16x8b = _mm_loadu_si128((__m128i *)pu1_src);
            y_1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + src_strd));
            y_2_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + src_strd2));
            y_3_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + src_strd3));
            y_4_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + src_strd4));
            y_5_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + src_strd5));
            y_6_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + src_strd6));
            y_7_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + src_strd7));

            _mm_storeu_si128((__m128i *)pu1_dst, y_0_16x8b);
            _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), y_1_16x8b);
            _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd2), y_2_16x8b);
            _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd3), y_3_16x8b);
            _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd4), y_4_16x8b);
            _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd5), y_5_16x8b);
            _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd6), y_6_16x8b);
            _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd7), y_7_16x8b);

            ht -= 8;
            pu1_src += src_strd8;
            pu1_dst += dst_strd8;
        }
        while(ht > 0);
    }
}
/*****************************************************************************/
/*                                                                           */
/*  Function Name : ih264_inter_pred_luma_horz_ssse3                         */
/*                                                                           */
/*  Description   : This function applies a horizontal 6-tap filter on       */
/*                  ht x wd block as mentioned in sec. 8.4.2.2.1 titled      */
/*                  "Luma sample interpolation process". (ht,wd) can be      */
/*                  (4,4), (8,4), (4,8), (8,8), (16,8), (8,16) or (16,16).   */
/*                  Each output is                                           */
/*                  (x[-2] - 5*x[-1] + 20*x[0] + 20*x[1] - 5*x[2] + x[3]     */
/*                   + 16) >> 5, saturated to 0..255 by the final pack.      */
/*                  wd == 4 and wd == 8 produce two rows per iteration;      */
/*                  any other wd is treated as 16, one row per iteration.    */
/*                                                                           */
/*  Inputs        : pu1_src  - pointer to source                             */
/*                  pu1_dst  - pointer to destination                        */
/*                  src_strd - stride for source                             */
/*                  dst_strd - stride for destination                        */
/*                  ht       - height of the block                           */
/*                  wd       - width of the block                            */
/*                  pu1_tmp  - unused                                        */
/*                  dydx     - unused                                        */
/*                                                                           */
/*  Issues        : Every row is fetched with 16-byte loads starting at      */
/*                  pu1_src - 2, which is more than the filter consumes.     */
/*                  NOTE(review): presumably the reference buffer is padded  */
/*                  so that this over-read is safe - confirm with callers.   */
/*                                                                           */
/*  Revision History:                                                        */
/*                                                                           */
/*          DD MM YYYY   Author(s)       Changes                             */
/*          13 02 2015   Kaushik         Initial Version                     */
/*                       Senthoor                                            */
/*                                                                           */
/*****************************************************************************/
void ih264_inter_pred_luma_horz_ssse3(UWORD8 *pu1_src,
                                      UWORD8 *pu1_dst,
                                      WORD32 src_strd,
                                      WORD32 dst_strd,
                                      WORD32 ht,
                                      WORD32 wd,
                                      UWORD8* pu1_tmp,
                                      WORD32 dydx)
{
    __m128i coeff0_1_16x8b, coeff2_3_16x8b, coeff4_5_16x8b;
    __m128i const_val16_8x16b;

    UNUSED(pu1_tmp);
    UNUSED(dydx);

    pu1_src -= 2; // the filter input starts from x[-2] (till x[3])

    // Byte pairs consumed by _mm_maddubs_epi16 (unsigned pixel * signed coeff,
    // adjacent products summed into one 16-bit lane).
    coeff0_1_16x8b = _mm_set1_epi32(0xFB01FB01); //c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1
    coeff2_3_16x8b = _mm_set1_epi32(0x14141414); //c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3
    coeff4_5_16x8b = _mm_set1_epi32(0x01FB01FB); //c4 c5 c4 c5 c4 c5 c4 c5 c4 c5 c4 c5 c4 c5 c4 c5
                                                 //c0 = c5 = 1, c1 = c4 = -5, c2 = c3 = 20
    const_val16_8x16b = _mm_set1_epi16(16);      //rounding offset added before >> 5

    if(wd == 4)
    {
        __m128i src_r0_16x8b, src_r1_16x8b, src_r0r1_16x8b;
        __m128i src_r0_sht_16x8b, src_r1_sht_16x8b;

        __m128i res_r0r1_t1_8x16b, res_r0r1_t2_8x16b, res_r0r1_t3_8x16b;
        __m128i res_r0r1_16x8b;

        WORD32 i4_row; // 4 packed output pixels of one row

        //Row0 : a0 a1 a2 a3 a4 a5 a6 a7 a8 a9.....
        //Row1 : b0 b1 b2 b3 b4 b5 b6 b7 b8 b9.....
        //Two rows are filtered together: row0 in the low 64 bits, row1 in the
        //high 64 bits of each register.

        do
        {
            src_r0_16x8b = _mm_loadu_si128((__m128i *)pu1_src);                     //a0 a1 a2 a3 a4 a5 a6 a7 a8 a9....a15
            src_r1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + src_strd));        //b0 b1 b2 b3 b4 b5 b6 b7 b8 b9....b15

            src_r0_sht_16x8b = _mm_srli_si128(src_r0_16x8b, 1);                      //a1 a2 a3 a4 a5 a6 a7 a8 a9....a15 0
            src_r1_sht_16x8b = _mm_srli_si128(src_r1_16x8b, 1);                      //b1 b2 b3 b4 b5 b6 b7 b8 b9....b15 0

            src_r0_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r0_sht_16x8b);       //a0 a1 a1 a2 a2 a3 a3 a4 a4 a5 a5 a6 a6 a7 a7 a8
            src_r1_16x8b = _mm_unpacklo_epi8(src_r1_16x8b, src_r1_sht_16x8b);       //b0 b1 b1 b2 b2 b3 b3 b4 b4 b5 b5 b6 b6 b7 b7 b8

            src_r0r1_16x8b = _mm_unpacklo_epi64(src_r0_16x8b, src_r1_16x8b);        //a0 a1 a1 a2 a2 a3 a3 a4 b0 b1 b1 b2 b2 b3 b3 b4
            res_r0r1_t1_8x16b = _mm_maddubs_epi16(src_r0r1_16x8b, coeff0_1_16x8b);  //a0*c0+a1*c1 a1*c0+a2*c1 a2*c0+a3*c1 a3*c0+a4*c1
                                                                                    //b0*c0+b1*c1 b1*c0+b2*c1 b2*c0+b3*c1 b3*c0+b4*c1

            src_r0_16x8b = _mm_srli_si128(src_r0_16x8b, 4);                          //a2 a3 a3 a4 a4 a5 a5 a6 a6 a7 a7 a8  0  0  0  0
            src_r1_16x8b = _mm_srli_si128(src_r1_16x8b, 4);                          //b2 b3 b3 b4 b4 b5 b5 b6 b6 b7 b7 b8  0  0  0  0

            src_r0r1_16x8b = _mm_unpacklo_epi64(src_r0_16x8b, src_r1_16x8b);        //a2 a3 a3 a4 a4 a5 a5 a6 b2 b3 b3 b4 b4 b5 b5 b6
            res_r0r1_t2_8x16b = _mm_maddubs_epi16(src_r0r1_16x8b, coeff2_3_16x8b);  //a2*c2+a3*c3 a3*c2+a4*c3 a4*c2+a5*c3 a5*c2+a6*c3
                                                                                    //b2*c2+b3*c3 b3*c2+b4*c3 b4*c2+b5*c3 b5*c2+b6*c3

            src_r0_16x8b = _mm_srli_si128(src_r0_16x8b, 4);                          //a4 a5 a5 a6 a6 a7 a7 a8  0  0  0  0  0  0  0  0
            src_r1_16x8b = _mm_srli_si128(src_r1_16x8b, 4);                          //b4 b5 b5 b6 b6 b7 b7 b8  0  0  0  0  0  0  0  0

            src_r0r1_16x8b = _mm_unpacklo_epi64(src_r0_16x8b, src_r1_16x8b);        //a4 a5 a5 a6 a6 a7 a7 a8 b4 b5 b5 b6 b6 b7 b7 b8
            res_r0r1_t3_8x16b = _mm_maddubs_epi16(src_r0r1_16x8b, coeff4_5_16x8b);  //a4*c4+a5*c5 a5*c4+a6*c5 a6*c4+a7*c5 a7*c4+a8*c5
                                                                                    //b4*c4+b5*c5 b5*c4+b6*c5 b6*c4+b7*c5 b7*c4+b8*c5

            res_r0r1_t1_8x16b = _mm_add_epi16(res_r0r1_t1_8x16b, res_r0r1_t2_8x16b);
            res_r0r1_t3_8x16b = _mm_add_epi16(const_val16_8x16b, res_r0r1_t3_8x16b);
            res_r0r1_t1_8x16b = _mm_add_epi16(res_r0r1_t1_8x16b, res_r0r1_t3_8x16b); //a0*c0+a1*c1+a2*c2+a3*c3+a4*c4+a5*c5 + 16;
                                                                                     //a1*c0+a2*c1+a3*c2+a4*c3+a5*c4+a6*c5 + 16;
                                                                                     //a2*c0+a3*c1+a4*c2+a5*c3+a6*c4+a7*c5 + 16;
                                                                                     //a3*c0+a4*c1+a5*c2+a6*c3+a7*c4+a8*c5 + 16;
                                                                                     //b0*c0+b1*c1+b2*c2+b3*c3+b4*c4+b5*c5 + 16;
                                                                                     //b1*c0+b2*c1+b3*c2+b4*c3+b5*c4+b6*c5 + 16;
                                                                                     //b2*c0+b3*c1+b4*c2+b5*c3+b6*c4+b7*c5 + 16;
                                                                                     //b3*c0+b4*c1+b5*c2+b6*c3+b7*c4+b8*c5 + 16;

            res_r0r1_t1_8x16b = _mm_srai_epi16(res_r0r1_t1_8x16b, 5);                //shifting right by 5 bits.

            //Saturate to unsigned 8 bit: bytes 0..3 = row0, bytes 4..7 = row1.
            res_r0r1_16x8b = _mm_packus_epi16(res_r0r1_t1_8x16b, res_r0r1_t1_8x16b);

            //Store through memcpy rather than a WORD32 * cast of pu1_dst: the
            //cast would be an unaligned, type-punned write to byte storage.
            i4_row = _mm_cvtsi128_si32(res_r0r1_16x8b);
            memcpy(pu1_dst, &i4_row, sizeof(i4_row));

            res_r0r1_16x8b = _mm_srli_si128(res_r0r1_16x8b, 4);
            i4_row = _mm_cvtsi128_si32(res_r0r1_16x8b);
            memcpy(pu1_dst + dst_strd, &i4_row, sizeof(i4_row));

            ht -= 2;
            pu1_src += src_strd << 1;
            pu1_dst += dst_strd << 1;
        }
        while(ht > 0);
    }
    else if(wd == 8)
    {
        __m128i src_r0_16x8b, src_r1_16x8b, src_r0_sht_16x8b, src_r1_sht_16x8b;
        __m128i src_r0_t1_16x8b, src_r1_t1_16x8b;

        __m128i res_r0_t1_8x16b, res_r0_t2_8x16b, res_r0_t3_8x16b;
        __m128i res_r1_t1_8x16b, res_r1_t2_8x16b, res_r1_t3_8x16b;

        //Row0 : a0 a1 a2 a3 a4 a5 a6 a7 a8 a9.....
        //Row1 : b0 b1 b2 b3 b4 b5 b6 b7 b8 b9.....
        //Two rows per iteration, each row in its own set of registers.

        do
        {
            src_r0_16x8b = _mm_loadu_si128((__m128i *)pu1_src);                   //a0 a1 a2 a3 a4 a5 a6 a7 a8 a9....a15
            src_r1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + src_strd));      //b0 b1 b2 b3 b4 b5 b6 b7 b8 b9....b15

            src_r0_sht_16x8b = _mm_srli_si128(src_r0_16x8b, 1);                   //a1 a2 a3 a4 a5 a6 a7 a8 a9....a15 0
            src_r1_sht_16x8b = _mm_srli_si128(src_r1_16x8b, 1);                   //b1 b2 b3 b4 b5 b6 b7 b8 b9....b15 0

            src_r0_t1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r0_sht_16x8b);  //a0 a1 a1 a2 a2 a3 a3 a4 a4 a5 a5 a6 a6 a7 a7 a8
            src_r1_t1_16x8b = _mm_unpacklo_epi8(src_r1_16x8b, src_r1_sht_16x8b);  //b0 b1 b1 b2 b2 b3 b3 b4 b4 b5 b5 b6 b6 b7 b7 b8

            res_r0_t1_8x16b = _mm_maddubs_epi16(src_r0_t1_16x8b, coeff0_1_16x8b); //a0*c0+a1*c1 a1*c0+a2*c1 a2*c0+a3*c1 a3*c0+a4*c1
                                                                                  //a4*c0+a5*c1 a5*c0+a6*c1 a6*c0+a7*c1 a7*c0+a8*c1
            res_r1_t1_8x16b = _mm_maddubs_epi16(src_r1_t1_16x8b, coeff0_1_16x8b); //b0*c0+b1*c1 b1*c0+b2*c1 b2*c0+b3*c1 b3*c0+b4*c1
                                                                                  //b4*c0+b5*c1 b5*c0+b6*c1 b6*c0+b7*c1 b7*c0+b8*c1

            src_r0_16x8b = _mm_srli_si128(src_r0_16x8b, 2);                       //a2 a3 a4 a5 a6 a7 a8 a9....a15 0 0
            src_r1_16x8b = _mm_srli_si128(src_r1_16x8b, 2);                       //b2 b3 b4 b5 b6 b7 b8 b9....b15 0 0

            src_r0_sht_16x8b = _mm_srli_si128(src_r0_sht_16x8b, 2);               //a3 a4 a5 a6 a7 a8 a9....a15 0  0  0
            src_r1_sht_16x8b = _mm_srli_si128(src_r1_sht_16x8b, 2);               //b3 b4 b5 b6 b7 b8 b9....b15 0  0  0

            src_r0_t1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r0_sht_16x8b);  //a2 a3 a3 a4 a4 a5 a5 a6 a6 a7 a7 a8 a8 a9 a9 a10
            src_r1_t1_16x8b = _mm_unpacklo_epi8(src_r1_16x8b, src_r1_sht_16x8b);  //b2 b3 b3 b4 b4 b5 b5 b6 b6 b7 b7 b8 b8 b9 b9 b10

            res_r0_t2_8x16b = _mm_maddubs_epi16(src_r0_t1_16x8b, coeff2_3_16x8b); //a2*c2+a3*c3 a3*c2+a4*c3 a4*c2+a5*c3 a5*c2+a6*c3
                                                                                  //a6*c2+a7*c3 a7*c2+a8*c3 a8*c2+a9*c3 a9*c2+a10*c3
            res_r1_t2_8x16b = _mm_maddubs_epi16(src_r1_t1_16x8b, coeff2_3_16x8b); //b2*c2+b3*c3 b3*c2+b4*c3 b4*c2+b5*c3 b5*c2+b6*c3
                                                                                  //b6*c2+b7*c3 b7*c2+b8*c3 b8*c2+b9*c3 b9*c2+b10*c3

            src_r0_16x8b = _mm_srli_si128(src_r0_16x8b, 2);                       //a4 a5 a6 a7 a8 a9....a15 0  0  0  0
            src_r1_16x8b = _mm_srli_si128(src_r1_16x8b, 2);                       //b4 b5 b6 b7 b8 b9....b15 0  0  0  0

            src_r0_sht_16x8b = _mm_srli_si128(src_r0_sht_16x8b, 2);               //a5 a6 a7 a8 a9....a15 0  0  0  0  0
            src_r1_sht_16x8b = _mm_srli_si128(src_r1_sht_16x8b, 2);               //b5 b6 b7 b8 b9....b15 0  0  0  0  0

            src_r0_t1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r0_sht_16x8b);  //a4 a5 a5 a6 a6 a7 a7 a8 a8 a9 a9 a10 a10 a11 a11 a12
            src_r1_t1_16x8b = _mm_unpacklo_epi8(src_r1_16x8b, src_r1_sht_16x8b);  //b4 b5 b5 b6 b6 b7 b7 b8 b8 b9 b9 b10 b10 b11 b11 b12

            res_r0_t3_8x16b = _mm_maddubs_epi16(src_r0_t1_16x8b, coeff4_5_16x8b); //a4*c4+a5*c5 a5*c4+a6*c5  a6*c4+a7*c5   a7*c4+a8*c5
                                                                                  //a8*c4+a9*c5 a9*c4+a10*c5 a10*c4+a11*c5 a11*c4+a12*c5
            res_r1_t3_8x16b = _mm_maddubs_epi16(src_r1_t1_16x8b, coeff4_5_16x8b); //b4*c4+b5*c5 b5*c4+b6*c5  b6*c4+b7*c5   b7*c4+b8*c5
                                                                                  //b8*c4+b9*c5 b9*c4+b10*c5 b10*c4+b11*c5 b11*c4+b12*c5

            res_r0_t1_8x16b = _mm_add_epi16(res_r0_t1_8x16b, res_r0_t2_8x16b);
            res_r1_t1_8x16b = _mm_add_epi16(res_r1_t1_8x16b, res_r1_t2_8x16b);
            res_r0_t3_8x16b = _mm_add_epi16(res_r0_t3_8x16b, const_val16_8x16b);
            res_r1_t3_8x16b = _mm_add_epi16(res_r1_t3_8x16b, const_val16_8x16b);
            res_r0_t1_8x16b = _mm_add_epi16(res_r0_t1_8x16b, res_r0_t3_8x16b);
            res_r1_t1_8x16b = _mm_add_epi16(res_r1_t1_8x16b, res_r1_t3_8x16b);

            res_r0_t1_8x16b = _mm_srai_epi16(res_r0_t1_8x16b, 5);                 //shifting right by 5 bits.
            res_r1_t1_8x16b = _mm_srai_epi16(res_r1_t1_8x16b, 5);

            //Saturate to unsigned 8 bit; only the low 8 bytes are stored.
            src_r0_16x8b = _mm_packus_epi16(res_r0_t1_8x16b, res_r0_t1_8x16b);
            src_r1_16x8b = _mm_packus_epi16(res_r1_t1_8x16b, res_r1_t1_8x16b);

            _mm_storel_epi64((__m128i *)pu1_dst, src_r0_16x8b);
            _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd), src_r1_16x8b);

            ht -= 2;
            pu1_src += src_strd << 1;
            pu1_dst += dst_strd << 1;
        }
        while(ht > 0);
    }
    else // wd == 16
    {
        __m128i src_r0_16x8b, src_r1_16x8b, src_r0_sht_16x8b, src_r1_sht_16x8b;
        __m128i src_r0_t1_16x8b, src_r1_t1_16x8b;

        __m128i res_r0_t1_8x16b, res_r0_t2_8x16b, res_r0_t3_8x16b;
        __m128i res_r1_t1_8x16b, res_r1_t2_8x16b, res_r1_t3_8x16b;

        //One row per iteration, split into two 8-pixel halves:
        //Left half  : a0 a1 a2 a3 a4 a5 a6 a7 a8 a9.....
        //Right half : b0 b1 b2 b3 b4 b5 b6 b7 b8 b9.....
        //b0 is same a8. Similarly other bn pixels are same as a(n+8) pixels.

        do
        {
            src_r0_16x8b = _mm_loadu_si128((__m128i *)pu1_src);                   //a0 a1 a2 a3 a4 a5 a6 a7 a8 a9....a15
            src_r1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + 8));             //b0 b1 b2 b3 b4 b5 b6 b7 b8 b9....b15

            src_r0_sht_16x8b = _mm_srli_si128(src_r0_16x8b, 1);                   //a1 a2 a3 a4 a5 a6 a7 a8 a9....a15 0
            src_r1_sht_16x8b = _mm_srli_si128(src_r1_16x8b, 1);                   //b1 b2 b3 b4 b5 b6 b7 b8 b9....b15 0

            src_r0_t1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r0_sht_16x8b);  //a0 a1 a1 a2 a2 a3 a3 a4 a4 a5 a5 a6 a6 a7 a7 a8
            src_r1_t1_16x8b = _mm_unpacklo_epi8(src_r1_16x8b, src_r1_sht_16x8b);  //b0 b1 b1 b2 b2 b3 b3 b4 b4 b5 b5 b6 b6 b7 b7 b8

            res_r0_t1_8x16b = _mm_maddubs_epi16(src_r0_t1_16x8b, coeff0_1_16x8b); //a0*c0+a1*c1 a1*c0+a2*c1 a2*c0+a3*c1 a3*c0+a4*c1
                                                                                  //a4*c0+a5*c1 a5*c0+a6*c1 a6*c0+a7*c1 a7*c0+a8*c1
            res_r1_t1_8x16b = _mm_maddubs_epi16(src_r1_t1_16x8b, coeff0_1_16x8b); //b0*c0+b1*c1 b1*c0+b2*c1 b2*c0+b3*c1 b3*c0+b4*c1
                                                                                  //b4*c0+b5*c1 b5*c0+b6*c1 b6*c0+b7*c1 b7*c0+b8*c1

            src_r0_16x8b = _mm_srli_si128(src_r0_16x8b, 2);                       //a2 a3 a4 a5 a6 a7 a8 a9....a15 0 0
            src_r1_16x8b = _mm_srli_si128(src_r1_16x8b, 2);                       //b2 b3 b4 b5 b6 b7 b8 b9....b15 0 0

            src_r0_sht_16x8b = _mm_srli_si128(src_r0_sht_16x8b, 2);               //a3 a4 a5 a6 a7 a8 a9....a15 0  0  0
            src_r1_sht_16x8b = _mm_srli_si128(src_r1_sht_16x8b, 2);               //b3 b4 b5 b6 b7 b8 b9....b15 0  0  0

            src_r0_t1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r0_sht_16x8b);  //a2 a3 a3 a4 a4 a5 a5 a6 a6 a7 a7 a8 a8 a9 a9 a10
            src_r1_t1_16x8b = _mm_unpacklo_epi8(src_r1_16x8b, src_r1_sht_16x8b);  //b2 b3 b3 b4 b4 b5 b5 b6 b6 b7 b7 b8 b8 b9 b9 b10

            res_r0_t2_8x16b = _mm_maddubs_epi16(src_r0_t1_16x8b, coeff2_3_16x8b); //a2*c2+a3*c3 a3*c2+a4*c3 a4*c2+a5*c3 a5*c2+a6*c3
                                                                                  //a6*c2+a7*c3 a7*c2+a8*c3 a8*c2+a9*c3 a9*c2+a10*c3
            res_r1_t2_8x16b = _mm_maddubs_epi16(src_r1_t1_16x8b, coeff2_3_16x8b); //b2*c2+b3*c3 b3*c2+b4*c3 b4*c2+b5*c3 b5*c2+b6*c3
                                                                                  //b6*c2+b7*c3 b7*c2+b8*c3 b8*c2+b9*c3 b9*c2+b10*c3

            src_r0_16x8b = _mm_srli_si128(src_r0_16x8b, 2);                       //a4 a5 a6 a7 a8 a9....a15 0  0  0  0
            src_r1_16x8b = _mm_srli_si128(src_r1_16x8b, 2);                       //b4 b5 b6 b7 b8 b9....b15 0  0  0  0

            src_r0_sht_16x8b = _mm_srli_si128(src_r0_sht_16x8b, 2);               //a5 a6 a7 a8 a9....a15 0  0  0  0  0
            src_r1_sht_16x8b = _mm_srli_si128(src_r1_sht_16x8b, 2);               //b5 b6 b7 b8 b9....b15 0  0  0  0  0

            src_r0_t1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r0_sht_16x8b);  //a4 a5 a5 a6 a6 a7 a7 a8 a8 a9 a9 a10 a10 a11 a11 a12
            src_r1_t1_16x8b = _mm_unpacklo_epi8(src_r1_16x8b, src_r1_sht_16x8b);  //b4 b5 b5 b6 b6 b7 b7 b8 b8 b9 b9 b10 b10 b11 b11 b12

            res_r0_t3_8x16b = _mm_maddubs_epi16(src_r0_t1_16x8b, coeff4_5_16x8b); //a4*c4+a5*c5 a5*c4+a6*c5  a6*c4+a7*c5   a7*c4+a8*c5
                                                                                  //a8*c4+a9*c5 a9*c4+a10*c5 a10*c4+a11*c5 a11*c4+a12*c5
            res_r1_t3_8x16b = _mm_maddubs_epi16(src_r1_t1_16x8b, coeff4_5_16x8b); //b4*c4+b5*c5 b5*c4+b6*c5  b6*c4+b7*c5   b7*c4+b8*c5
                                                                                  //b8*c4+b9*c5 b9*c4+b10*c5 b10*c4+b11*c5 b11*c4+b12*c5

            res_r0_t1_8x16b = _mm_add_epi16(res_r0_t1_8x16b, res_r0_t2_8x16b);
            res_r1_t1_8x16b = _mm_add_epi16(res_r1_t1_8x16b, res_r1_t2_8x16b);
            res_r0_t3_8x16b = _mm_add_epi16(res_r0_t3_8x16b, const_val16_8x16b);
            res_r1_t3_8x16b = _mm_add_epi16(res_r1_t3_8x16b, const_val16_8x16b);
            res_r0_t1_8x16b = _mm_add_epi16(res_r0_t1_8x16b, res_r0_t3_8x16b);
            res_r1_t1_8x16b = _mm_add_epi16(res_r1_t1_8x16b, res_r1_t3_8x16b);

            res_r0_t1_8x16b = _mm_srai_epi16(res_r0_t1_8x16b, 5);                 //shifting right by 5 bits.
            res_r1_t1_8x16b = _mm_srai_epi16(res_r1_t1_8x16b, 5);

            //Saturate both halves to unsigned 8 bit and store all 16 pixels.
            src_r0_16x8b = _mm_packus_epi16(res_r0_t1_8x16b, res_r1_t1_8x16b);
            _mm_storeu_si128((__m128i *)pu1_dst, src_r0_16x8b);

            ht--;
            pu1_src += src_strd;
            pu1_dst += dst_strd;
        }
        while(ht > 0);
    }
}
/*****************************************************************************/
/*                                                                           */
/*  Function Name : ih264_inter_pred_luma_vert_ssse3                         */
/*                                                                           */
/*  Description   : This function applies a vertical 6-tap filter on         */
/*                  ht x wd block as mentioned in sec. 8.4.2.2.1 titled      */
/*                  "Luma sample interpolation process". (ht,wd) can be      */
/*                  (4,4), (8,4), (4,8), (8,8), (16,8), (8,16) or (16,16).   */
/*                  Each output is                                           */
/*                  (r[-2] - 5*r[-1] + 20*r[0] + 20*r[1] - 5*r[2] + r[3]     */
/*                   + 16) >> 5, saturated to 0..255 by the final pack,      */
/*                  where r[k] is the pixel k rows below the output row.     */
/*                  Two output rows are produced per iteration; a wd other   */
/*                  than 4 or 8 is treated as 16.                            */
/*                                                                           */
/*  Inputs        : pu1_src  - pointer to source                             */
/*                  pu1_dst  - pointer to destination                        */
/*                  src_strd - stride for source                             */
/*                  dst_strd - stride for destination                        */
/*                  ht       - height of the block                           */
/*                  wd       - width of the block                            */
/*                  pu1_tmp  - unused                                        */
/*                  dydx     - unused                                        */
/*                                                                           */
/*  Issues        : The wd == 4 path fetches 8 bytes per source row although */
/*                  only the first 4 are used.                               */
/*                  NOTE(review): presumably the reference buffer is padded  */
/*                  so that this over-read is safe - confirm with callers.   */
/*                                                                           */
/*  Revision History:                                                        */
/*                                                                           */
/*          DD MM YYYY   Author(s)       Changes                             */
/*          13 02 2015   Kaushik         Initial Version                     */
/*                       Senthoor                                            */
/*                                                                           */
/*****************************************************************************/
void ih264_inter_pred_luma_vert_ssse3(UWORD8 *pu1_src,
                                      UWORD8 *pu1_dst,
                                      WORD32 src_strd,
                                      WORD32 dst_strd,
                                      WORD32 ht,
                                      WORD32 wd,
                                      UWORD8* pu1_tmp,
                                      WORD32 dydx)
{
    __m128i src_r0_16x8b, src_r1_16x8b, src_r2_16x8b, src_r3_16x8b, src_r4_16x8b;
    __m128i src_r5_16x8b, src_r6_16x8b;
    __m128i src_r0r1_16x8b, src_r2r3_16x8b, src_r4r5_16x8b;

    __m128i res_16x8b, res_t1_8x16b, res_t2_8x16b, res_t3_8x16b;

    __m128i coeff0_1_16x8b, coeff2_3_16x8b, coeff4_5_16x8b;
    __m128i const_val16_8x16b;

    UNUSED(pu1_tmp);
    UNUSED(dydx);

    // Byte pairs consumed by _mm_maddubs_epi16 (unsigned pixel * signed coeff,
    // adjacent products summed into one 16-bit lane).
    coeff0_1_16x8b = _mm_set1_epi32(0xFB01FB01); //c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1
    coeff2_3_16x8b = _mm_set1_epi32(0x14141414); //c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3
    coeff4_5_16x8b = _mm_set1_epi32(0x01FB01FB); //c4 c5 c4 c5 c4 c5 c4 c5 c4 c5 c4 c5 c4 c5 c4 c5
                                                 //c0 = c5 = 1, c1 = c4 = -5, c2 = c3 = 20
    const_val16_8x16b = _mm_set1_epi16(16);      //rounding offset added before >> 5

    pu1_src -= src_strd << 1; // the filter input starts from x[-2] (till x[3])

    if(wd == 4)
    {
        WORD32 i4_row; // 4 packed output pixels of one row

        //Prologue: Load all the pred rows except sixth and seventh row
        //          for the first and second row processing.
        src_r0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src);
        pu1_src += src_strd;
        src_r1_16x8b = _mm_loadl_epi64((__m128i *)pu1_src);
        pu1_src += src_strd;
        src_r2_16x8b = _mm_loadl_epi64((__m128i *)pu1_src);
        pu1_src += src_strd;
        src_r3_16x8b = _mm_loadl_epi64((__m128i *)pu1_src);
        pu1_src += src_strd;
        src_r4_16x8b = _mm_loadl_epi64((__m128i *)pu1_src);
        pu1_src += src_strd;

        //Pack consecutive rows into the low 64 bits of one register:
        //src_rN = row N (4 bytes) | row N+1 (4 bytes), so one multiply-add
        //serves both output rows of an iteration.
        src_r0_16x8b = _mm_unpacklo_epi32(src_r0_16x8b, src_r1_16x8b);
        src_r1_16x8b = _mm_unpacklo_epi32(src_r1_16x8b, src_r2_16x8b);
        src_r2_16x8b = _mm_unpacklo_epi32(src_r2_16x8b, src_r3_16x8b);
        src_r3_16x8b = _mm_unpacklo_epi32(src_r3_16x8b, src_r4_16x8b);

        do
        {
            src_r5_16x8b = _mm_loadl_epi64((__m128i *)pu1_src);
            src_r6_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + src_strd));

            src_r4_16x8b = _mm_unpacklo_epi32(src_r4_16x8b, src_r5_16x8b);   //row4 | row5
            src_r5_16x8b = _mm_unpacklo_epi32(src_r5_16x8b, src_r6_16x8b);   //row5 | row6

            //Interleave tap pairs byte-wise: lanes 0..3 feed the first output
            //row, lanes 4..7 feed the second output row.
            src_r0r1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r1_16x8b);
            src_r2r3_16x8b = _mm_unpacklo_epi8(src_r2_16x8b, src_r3_16x8b);
            src_r4r5_16x8b = _mm_unpacklo_epi8(src_r4_16x8b, src_r5_16x8b);

            res_t1_8x16b = _mm_maddubs_epi16(src_r0r1_16x8b, coeff0_1_16x8b);
            res_t2_8x16b = _mm_maddubs_epi16(src_r2r3_16x8b, coeff2_3_16x8b);
            res_t3_8x16b = _mm_maddubs_epi16(src_r4r5_16x8b, coeff4_5_16x8b);

            res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t2_8x16b);
            res_t3_8x16b = _mm_add_epi16(res_t3_8x16b, const_val16_8x16b);
            res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t3_8x16b);

            res_t1_8x16b = _mm_srai_epi16(res_t1_8x16b, 5); //shifting right by 5 bits.

            //Saturate to unsigned 8 bit: bytes 0..3 = first row, 4..7 = second.
            res_16x8b = _mm_packus_epi16(res_t1_8x16b, res_t1_8x16b);

            //Store through memcpy rather than a WORD32 * cast of pu1_dst: the
            //cast would be an unaligned, type-punned write to byte storage.
            i4_row = _mm_cvtsi128_si32(res_16x8b);
            memcpy(pu1_dst, &i4_row, sizeof(i4_row));

            res_16x8b = _mm_srli_si128(res_16x8b, 4);
            i4_row = _mm_cvtsi128_si32(res_16x8b);
            memcpy(pu1_dst + dst_strd, &i4_row, sizeof(i4_row));

            //Slide the row window down by two rows; src_r4 becomes the
            //not-yet-paired row6.
            src_r0_16x8b = src_r2_16x8b;
            src_r1_16x8b = src_r3_16x8b;
            src_r2_16x8b = src_r4_16x8b;
            src_r3_16x8b = src_r5_16x8b;
            src_r4_16x8b = src_r6_16x8b;

            ht -= 2;
            pu1_src += src_strd << 1;
            pu1_dst += dst_strd << 1;
        }
        while(ht > 0);
    }
    else if(wd == 8)
    {
        //Prologue: Load all the pred rows except sixth and seventh row
        //          for the first and second row processing.
        src_r0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src);
        pu1_src += src_strd;
        src_r1_16x8b = _mm_loadl_epi64((__m128i *)pu1_src);
        pu1_src += src_strd;
        src_r2_16x8b = _mm_loadl_epi64((__m128i *)pu1_src);
        pu1_src += src_strd;
        src_r3_16x8b = _mm_loadl_epi64((__m128i *)pu1_src);
        pu1_src += src_strd;
        src_r4_16x8b = _mm_loadl_epi64((__m128i *)pu1_src);
        pu1_src += src_strd;

        //Pack consecutive rows into one register:
        //src_rN = row N (low 8 bytes) | row N+1 (high 8 bytes).
        src_r0_16x8b = _mm_unpacklo_epi64(src_r0_16x8b, src_r1_16x8b);
        src_r1_16x8b = _mm_unpacklo_epi64(src_r1_16x8b, src_r2_16x8b);
        src_r2_16x8b = _mm_unpacklo_epi64(src_r2_16x8b, src_r3_16x8b);
        src_r3_16x8b = _mm_unpacklo_epi64(src_r3_16x8b, src_r4_16x8b);

        do
        {
            src_r5_16x8b = _mm_loadl_epi64((__m128i *)pu1_src);
            src_r6_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + src_strd));

            src_r4_16x8b = _mm_unpacklo_epi64(src_r4_16x8b, src_r5_16x8b);   //row4 | row5
            src_r5_16x8b = _mm_unpacklo_epi64(src_r5_16x8b, src_r6_16x8b);   //row5 | row6

            //First output row: the low halves hold rows 0..5.
            src_r0r1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r1_16x8b);
            src_r2r3_16x8b = _mm_unpacklo_epi8(src_r2_16x8b, src_r3_16x8b);
            src_r4r5_16x8b = _mm_unpacklo_epi8(src_r4_16x8b, src_r5_16x8b);

            res_t1_8x16b = _mm_maddubs_epi16(src_r0r1_16x8b, coeff0_1_16x8b);
            res_t2_8x16b = _mm_maddubs_epi16(src_r2r3_16x8b, coeff2_3_16x8b);
            res_t3_8x16b = _mm_maddubs_epi16(src_r4r5_16x8b, coeff4_5_16x8b);

            res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t2_8x16b);
            res_t3_8x16b = _mm_add_epi16(res_t3_8x16b, const_val16_8x16b);
            res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t3_8x16b);

            res_t1_8x16b = _mm_srai_epi16(res_t1_8x16b, 5); //shifting right by 5 bits.
            res_16x8b = _mm_packus_epi16(res_t1_8x16b, res_t1_8x16b);

            _mm_storel_epi64((__m128i *)pu1_dst, res_16x8b);

            //Second output row: the high halves hold rows 1..6.
            src_r0r1_16x8b = _mm_unpackhi_epi8(src_r0_16x8b, src_r1_16x8b);
            src_r2r3_16x8b = _mm_unpackhi_epi8(src_r2_16x8b, src_r3_16x8b);
            src_r4r5_16x8b = _mm_unpackhi_epi8(src_r4_16x8b, src_r5_16x8b);

            res_t1_8x16b = _mm_maddubs_epi16(src_r0r1_16x8b, coeff0_1_16x8b);
            res_t2_8x16b = _mm_maddubs_epi16(src_r2r3_16x8b, coeff2_3_16x8b);
            res_t3_8x16b = _mm_maddubs_epi16(src_r4r5_16x8b, coeff4_5_16x8b);

            res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t2_8x16b);
            res_t3_8x16b = _mm_add_epi16(res_t3_8x16b, const_val16_8x16b);
            res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t3_8x16b);

            res_t1_8x16b = _mm_srai_epi16(res_t1_8x16b, 5); //shifting right by 5 bits.
            res_16x8b = _mm_packus_epi16(res_t1_8x16b, res_t1_8x16b);

            _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd), res_16x8b);

            //Slide the row window down by two rows; src_r4 becomes the
            //not-yet-paired row6.
            src_r0_16x8b = src_r2_16x8b;
            src_r1_16x8b = src_r3_16x8b;
            src_r2_16x8b = src_r4_16x8b;
            src_r3_16x8b = src_r5_16x8b;
            src_r4_16x8b = src_r6_16x8b;

            ht -= 2;
            pu1_src += src_strd << 1;
            pu1_dst += dst_strd << 1;
        }
        while(ht > 0);
    }
    else // wd == 16
    {
        __m128i res_t0_8x16b;

        //Prologue: Load all the pred rows except sixth and seventh row
        //          for the first and second row processing.
        //Here every register holds one full 16-pixel row.
        src_r0_16x8b = _mm_loadu_si128((__m128i *)pu1_src);
        pu1_src += src_strd;
        src_r1_16x8b = _mm_loadu_si128((__m128i *)pu1_src);
        pu1_src += src_strd;
        src_r2_16x8b = _mm_loadu_si128((__m128i *)pu1_src);
        pu1_src += src_strd;
        src_r3_16x8b = _mm_loadu_si128((__m128i *)pu1_src);
        pu1_src += src_strd;
        src_r4_16x8b = _mm_loadu_si128((__m128i *)pu1_src);
        pu1_src += src_strd;

        do
        {
            src_r5_16x8b = _mm_loadu_si128((__m128i *)pu1_src);
            src_r6_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + src_strd));

            //First output row (rows 0..5), left 8 pixels.
            src_r0r1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r1_16x8b);
            src_r2r3_16x8b = _mm_unpacklo_epi8(src_r2_16x8b, src_r3_16x8b);
            src_r4r5_16x8b = _mm_unpacklo_epi8(src_r4_16x8b, src_r5_16x8b);

            res_t1_8x16b = _mm_maddubs_epi16(src_r0r1_16x8b, coeff0_1_16x8b);
            res_t2_8x16b = _mm_maddubs_epi16(src_r2r3_16x8b, coeff2_3_16x8b);
            res_t3_8x16b = _mm_maddubs_epi16(src_r4r5_16x8b, coeff4_5_16x8b);

            res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t2_8x16b);
            res_t3_8x16b = _mm_add_epi16(res_t3_8x16b, const_val16_8x16b);
            res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t3_8x16b);
            res_t0_8x16b = _mm_srai_epi16(res_t1_8x16b, 5); //shifting right by 5 bits.

            //First output row, right 8 pixels.
            src_r0r1_16x8b = _mm_unpackhi_epi8(src_r0_16x8b, src_r1_16x8b);
            src_r2r3_16x8b = _mm_unpackhi_epi8(src_r2_16x8b, src_r3_16x8b);
            src_r4r5_16x8b = _mm_unpackhi_epi8(src_r4_16x8b, src_r5_16x8b);

            res_t1_8x16b = _mm_maddubs_epi16(src_r0r1_16x8b, coeff0_1_16x8b);
            res_t2_8x16b = _mm_maddubs_epi16(src_r2r3_16x8b, coeff2_3_16x8b);
            res_t3_8x16b = _mm_maddubs_epi16(src_r4r5_16x8b, coeff4_5_16x8b);

            res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t2_8x16b);
            res_t3_8x16b = _mm_add_epi16(res_t3_8x16b, const_val16_8x16b);
            res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t3_8x16b);
            res_t1_8x16b = _mm_srai_epi16(res_t1_8x16b, 5); //shifting right by 5 bits.

            //Saturate both halves to unsigned 8 bit and store the row.
            res_16x8b = _mm_packus_epi16(res_t0_8x16b, res_t1_8x16b);

            _mm_storeu_si128((__m128i *)pu1_dst, res_16x8b);

            //Second output row (rows 1..6), left 8 pixels.
            src_r0r1_16x8b = _mm_unpacklo_epi8(src_r1_16x8b, src_r2_16x8b);
            src_r2r3_16x8b = _mm_unpacklo_epi8(src_r3_16x8b, src_r4_16x8b);
            src_r4r5_16x8b = _mm_unpacklo_epi8(src_r5_16x8b, src_r6_16x8b);

            res_t1_8x16b = _mm_maddubs_epi16(src_r0r1_16x8b, coeff0_1_16x8b);
            res_t2_8x16b = _mm_maddubs_epi16(src_r2r3_16x8b, coeff2_3_16x8b);
            res_t3_8x16b = _mm_maddubs_epi16(src_r4r5_16x8b, coeff4_5_16x8b);

            res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t2_8x16b);
            res_t3_8x16b = _mm_add_epi16(res_t3_8x16b, const_val16_8x16b);
            res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t3_8x16b);
            res_t0_8x16b = _mm_srai_epi16(res_t1_8x16b, 5); //shifting right by 5 bits.

            //Second output row, right 8 pixels.
            src_r0r1_16x8b = _mm_unpackhi_epi8(src_r1_16x8b, src_r2_16x8b);
            src_r2r3_16x8b = _mm_unpackhi_epi8(src_r3_16x8b, src_r4_16x8b);
            src_r4r5_16x8b = _mm_unpackhi_epi8(src_r5_16x8b, src_r6_16x8b);

            res_t1_8x16b = _mm_maddubs_epi16(src_r0r1_16x8b, coeff0_1_16x8b);
            res_t2_8x16b = _mm_maddubs_epi16(src_r2r3_16x8b, coeff2_3_16x8b);
            res_t3_8x16b = _mm_maddubs_epi16(src_r4r5_16x8b, coeff4_5_16x8b);

            res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t2_8x16b);
            res_t3_8x16b = _mm_add_epi16(res_t3_8x16b, const_val16_8x16b);
            res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t3_8x16b);
            res_t1_8x16b = _mm_srai_epi16(res_t1_8x16b, 5); //shifting right by 5 bits.

            res_16x8b = _mm_packus_epi16(res_t0_8x16b, res_t1_8x16b);

            _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), res_16x8b);

            //Slide the row window down by two rows.
            src_r0_16x8b = src_r2_16x8b;
            src_r1_16x8b = src_r3_16x8b;
            src_r2_16x8b = src_r4_16x8b;
            src_r3_16x8b = src_r5_16x8b;
            src_r4_16x8b = src_r6_16x8b;

            ht -= 2;
            pu1_src += src_strd << 1;
            pu1_dst += dst_strd << 1;
        }
        while(ht > 0);
    }
}
/*****************************************************************************/
/* */
/* Function Name : ih264_inter_pred_luma_horz_hpel_vert_hpel_ssse3 */
/* */
/* Description : This function implements a two stage cascaded six tap */
/* filter, horizontally and then vertically on ht x wd */
/* block as mentioned in sec. 8.4.2.2.1 titled "Luma sample */
/* interpolation process". (ht,wd) can be (4,4), (8,4), */
/* (4,8), (8,8), (16,8), (8,16) or (16,16). */
/* */
/* Inputs : pu1_src - pointer to source */
/* pu1_dst - pointer to destination */
/* src_strd - stride for source */
/* dst_strd - stride for destination */
/* ht - height of the block */
/* wd - width of the block */
/* pu1_tmp - pointer to temporary buffer */
/* */
/* Issues : None */
/* */
/* Revision History: */
/* */
/* DD MM YYYY Author(s) Changes */
/* 13 02 2015 Kaushik Initial Version */
/* Senthoor */
/* */
/*****************************************************************************/
void ih264_inter_pred_luma_horz_hpel_vert_hpel_ssse3(UWORD8 *pu1_src,
UWORD8 *pu1_dst,
WORD32 src_strd,
WORD32 dst_strd,
WORD32 ht,
WORD32 wd,
UWORD8* pu1_tmp,
WORD32 dydx)
{
/* Overview                                                                  */
/* Applies the 6-tap filter (1, -5, 20, 20, -5, 1) horizontally and then     */
/* vertically to produce a ht x wd block of 8-bit samples in pu1_dst.        */
/*                                                                           */
/* Pass 1 (horizontal): filters ht + 5 source rows, starting two rows above  */
/* and two columns left of pu1_src, and stores the unrounded 16-bit sums in  */
/* the scratch buffer at pu1_tmp + 4 (wd WORD16 values per row).             */
/* Pass 2 (vertical): filters the 16-bit rows with 32-bit accumulation,      */
/* computes (sum + 512) >> 10, saturates to 8 bits and writes two output     */
/* rows per iteration.                                                       */
/*                                                                           */
/* pu1_src  - source block; rows [-2, ht + 2] are read, each row from        */
/*            column -2 using 16-byte loads                                  */
/* pu1_dst  - destination block                                              */
/* src_strd - source stride, added to a UWORD8 pointer                       */
/* dst_strd - destination stride, added to a UWORD8 pointer                  */
/* ht       - block height; every loop that consumes it steps by 2, so an    */
/*            even value is presumably expected - confirm against callers    */
/* wd       - 4 and 8 have dedicated paths; any other value takes the        */
/*            16-wide path                                                   */
/* pu1_tmp  - scratch buffer for the intermediate 16-bit rows                */
/* dydx     - not used by this function                                      */
UNUSED(dydx);
if(wd == 4)
{
WORD16 *pi2_temp;
pu1_tmp += 4;
pu1_src -= src_strd << 1; // start two rows above the block (vertical taps need rows -2..ht+2)
pi2_temp = (WORD16 *)pu1_tmp;
pu1_src -= 2; // the filter input starts from x[-2] (till x[3])
// Horizontal 6-tap filtering
{
WORD32 ht_tmp = ht + 4; // rows handled in pairs by the loop; one more row is filtered after it
__m128i src_r0_16x8b, src_r1_16x8b;
__m128i src_r0_sht_16x8b, src_r1_sht_16x8b;
__m128i src_r0r1_t1_16x8b;
__m128i res_r0r1_t1_8x16b, res_r0r1_t2_8x16b, res_r0r1_t3_8x16b;
__m128i coeff0_1_16x8b, coeff2_3_16x8b, coeff4_5_16x8b;
coeff0_1_16x8b = _mm_set1_epi32(0xFB01FB01); //c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1
coeff2_3_16x8b = _mm_set1_epi32(0x14141414); //c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3
coeff4_5_16x8b = _mm_set1_epi32(0x01FB01FB); //c4 c5 c4 c5 c4 c5 c4 c5 c4 c5 c4 c5 c4 c5 c4 c5
//c0 = c5 = 1, c1 = c4 = -5, c2 = c3 = 20
//Row0 : a0 a1 a2 a3 a4 a5 a6 a7 a8 a9.....
//Row1 : b0 b1 b2 b3 b4 b5 b6 b7 b8 b9.....
// Each iteration filters two rows; only a0..a8 / b0..b8 of each 16-byte load are used.
do
{
src_r0_16x8b = _mm_loadu_si128((__m128i *)pu1_src); //a0 a1 a2 a3 a4 a5 a6 a7 a8 a9....a15
src_r1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + src_strd)); //b0 b1 b2 b3 b4 b5 b6 b7 b8 b9....b15
src_r0_sht_16x8b = _mm_srli_si128(src_r0_16x8b, 1); //a1 a2 a3 a4 a5 a6 a7 a8 a9....a15 0
src_r1_sht_16x8b = _mm_srli_si128(src_r1_16x8b, 1); //b1 b2 b3 b4 b5 b6 b7 b8 b9....b15 0
src_r0_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r0_sht_16x8b); //a0 a1 a1 a2 a2 a3 a3 a4 a4 a5 a5 a6 a6 a7 a7 a8
src_r1_16x8b = _mm_unpacklo_epi8(src_r1_16x8b, src_r1_sht_16x8b); //b0 b1 b1 b2 b2 b3 b3 b4 b4 b5 b5 b6 b6 b7 b7 b8
src_r0r1_t1_16x8b = _mm_unpacklo_epi64(src_r0_16x8b, src_r1_16x8b); //a0 a1 a1 a2 a2 a3 a3 a4 b0 b1 b1 b2 b2 b3 b3 b4
res_r0r1_t1_8x16b = _mm_maddubs_epi16(src_r0r1_t1_16x8b, coeff0_1_16x8b); //a0*c0+a1*c1 a1*c0+a2*c1 a2*c0+a3*c1 a3*c0+a4*c1
//b0*c0+b1*c1 b1*c0+b2*c1 b2*c0+b3*c1 b3*c0+b4*c1
src_r0_16x8b = _mm_srli_si128(src_r0_16x8b, 4); //a2 a3 a3 a4 a4 a5 a5 a6 a6 a7 a7 a8 0 0 0 0
src_r1_16x8b = _mm_srli_si128(src_r1_16x8b, 4); //b2 b3 b3 b4 b4 b5 b5 b6 b6 b7 b7 b8 0 0 0 0
src_r0r1_t1_16x8b = _mm_unpacklo_epi64(src_r0_16x8b, src_r1_16x8b); //a2 a3 a3 a4 a4 a5 a5 a6 b2 b3 b3 b4 b4 b5 b5 b6
res_r0r1_t2_8x16b = _mm_maddubs_epi16(src_r0r1_t1_16x8b, coeff2_3_16x8b); //a2*c2+a3*c3 a3*c2+a4*c3 a4*c2+a5*c3 a5*c2+a6*c3
//b2*c2+b3*c3 b3*c2+b4*c3 b4*c2+b5*c3 b5*c2+b6*c3
src_r0_16x8b = _mm_srli_si128(src_r0_16x8b, 4); //a4 a5 a5 a6 a6 a7 a7 a8 0 0 0 0 0 0 0 0
src_r1_16x8b = _mm_srli_si128(src_r1_16x8b, 4); //b4 b5 b5 b6 b6 b7 b7 b8 0 0 0 0 0 0 0 0
src_r0r1_t1_16x8b = _mm_unpacklo_epi64(src_r0_16x8b, src_r1_16x8b); //a4 a5 a5 a6 a6 a7 a7 a8 b4 b5 b5 b6 b6 b7 b7 b8
res_r0r1_t3_8x16b = _mm_maddubs_epi16(src_r0r1_t1_16x8b, coeff4_5_16x8b); //a4*c4+a5*c5 a5*c4+a6*c5 a6*c4+a7*c5 a7*c4+a8*c5
//b4*c4+b5*c5 b5*c4+b6*c5 b6*c4+b7*c5 b7*c4+b8*c5
res_r0r1_t1_8x16b = _mm_add_epi16(res_r0r1_t1_8x16b, res_r0r1_t2_8x16b);
res_r0r1_t1_8x16b = _mm_add_epi16(res_r0r1_t3_8x16b, res_r0r1_t1_8x16b);
// Unrounded 16-bit sums: 4 values of row a followed by 4 values of row b.
_mm_storeu_si128((__m128i *)pi2_temp, res_r0r1_t1_8x16b);
ht_tmp -= 2;
pu1_src += src_strd << 1;
pi2_temp += 8;
}
while(ht_tmp > 0);
// Remaining single row (the loop above covers rows in pairs only).
src_r0_16x8b = _mm_loadu_si128((__m128i *)pu1_src); //a0 a1 a2 a3 a4 a5 a6 a7 a8 a9....a15
src_r0_sht_16x8b = _mm_srli_si128(src_r0_16x8b, 1); //a1 a2 a3 a4 a5 a6 a7 a8 a9....a15 0
src_r0_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r0_sht_16x8b); //a0 a1 a1 a2 a2 a3 a3 a4 a4 a5 a5 a6 a6 a7 a7 a8
res_r0r1_t1_8x16b = _mm_maddubs_epi16(src_r0_16x8b, coeff0_1_16x8b); //a0*c0+a1*c1 a1*c0+a2*c1 a2*c0+a3*c1 a3*c0+a4*c1
src_r0_16x8b = _mm_srli_si128(src_r0_16x8b,4); //a2 a3 a3 a4 a4 a5 a5 a6 a6 a7 a7 a8 0 0 0 0
res_r0r1_t2_8x16b = _mm_maddubs_epi16(src_r0_16x8b, coeff2_3_16x8b); //a2*c2+a3*c3 a3*c2+a4*c3 a4*c2+a5*c3 a5*c2+a6*c3
src_r0_16x8b = _mm_srli_si128(src_r0_16x8b,4); //a4 a5 a5 a6 a6 a7 a7 a8 0 0 0 0 0 0 0 0
res_r0r1_t3_8x16b = _mm_maddubs_epi16(src_r0_16x8b, coeff4_5_16x8b); //a4*c4+a5*c5 a5*c4+a6*c5 a6*c4+a7*c5 a7*c4+a8*c5
res_r0r1_t1_8x16b = _mm_add_epi16(res_r0r1_t1_8x16b, res_r0r1_t2_8x16b);
res_r0r1_t1_8x16b = _mm_add_epi16(res_r0r1_t3_8x16b, res_r0r1_t1_8x16b);
// Only the low 4 results belong to this row, so store 8 bytes.
_mm_storel_epi64((__m128i *)pi2_temp, res_r0r1_t1_8x16b);
}
pi2_temp = (WORD16 *)pu1_tmp; // rewind to the first intermediate row
// Vertical 6-tap filtering
{
__m128i src_r0_8x16b, src_r1_8x16b, src_r2_8x16b, src_r3_8x16b,
src_r4_8x16b;
__m128i src_r5_8x16b, src_r6_8x16b;
__m128i src_t1_8x16b, src_t2_8x16b;
__m128i res_t0_4x32b, res_t1_4x32b, res_t2_4x32b, res_t3_4x32b;
__m128i res_8x16b, res_16x8b;
__m128i coeff0_1_8x16b, coeff2_3_8x16b, coeff4_5_8x16b;
__m128i const_val512_4x32b;
coeff0_1_8x16b = _mm_set1_epi32(0xFFFB0001); //16-bit pairs c0 c1 = 1, -5
coeff2_3_8x16b = _mm_set1_epi32(0x00140014); //16-bit pairs c2 c3 = 20, 20
coeff4_5_8x16b = _mm_set1_epi32(0x0001FFFB); //16-bit pairs c4 c5 = -5, 1
const_val512_4x32b = _mm_set1_epi32(512); //rounding term for the >> 10 below
// Prime the window with the first five intermediate rows (4 WORD16 each).
src_r0_8x16b = _mm_loadl_epi64((__m128i *)(pi2_temp));
src_r1_8x16b = _mm_loadl_epi64((__m128i *)(pi2_temp + 4));
src_r2_8x16b = _mm_loadl_epi64((__m128i *)(pi2_temp + 8));
src_r3_8x16b = _mm_loadl_epi64((__m128i *)(pi2_temp + 12));
src_r4_8x16b = _mm_loadl_epi64((__m128i *)(pi2_temp + 16));
pi2_temp += 20;
do
{
// Two new rows per iteration: r0..r5 give output row 0, r1..r6 give output row 1.
src_r5_8x16b = _mm_loadl_epi64((__m128i *)pi2_temp);
src_r6_8x16b = _mm_loadl_epi64((__m128i *)(pi2_temp + 4));
// Interleave row pairs so each madd lane computes rX*cX + rY*cY for one column.
src_r0_8x16b = _mm_unpacklo_epi16(src_r0_8x16b, src_r1_8x16b);
src_t1_8x16b = _mm_unpacklo_epi16(src_r2_8x16b, src_r3_8x16b);
src_t2_8x16b = _mm_unpacklo_epi16(src_r4_8x16b, src_r5_8x16b);
res_t1_4x32b = _mm_madd_epi16(src_r0_8x16b, coeff0_1_8x16b);
res_t2_4x32b = _mm_madd_epi16(src_t1_8x16b, coeff2_3_8x16b);
res_t3_4x32b = _mm_madd_epi16(src_t2_8x16b, coeff4_5_8x16b);
res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t2_4x32b);
res_t3_4x32b = _mm_add_epi32(res_t3_4x32b, const_val512_4x32b);
res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t3_4x32b);
res_t0_4x32b = _mm_srai_epi32(res_t1_4x32b, 10); //output row 0: (sum + 512) >> 10
src_r1_8x16b = _mm_unpacklo_epi16(src_r1_8x16b, src_r2_8x16b);
src_t1_8x16b = _mm_unpacklo_epi16(src_r3_8x16b, src_r4_8x16b);
src_t2_8x16b = _mm_unpacklo_epi16(src_r5_8x16b, src_r6_8x16b);
res_t1_4x32b = _mm_madd_epi16(src_r1_8x16b, coeff0_1_8x16b);
res_t2_4x32b = _mm_madd_epi16(src_t1_8x16b, coeff2_3_8x16b);
res_t3_4x32b = _mm_madd_epi16(src_t2_8x16b, coeff4_5_8x16b);
res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t2_4x32b);
res_t3_4x32b = _mm_add_epi32(res_t3_4x32b, const_val512_4x32b);
res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t3_4x32b);
res_t1_4x32b = _mm_srai_epi32(res_t1_4x32b, 10); //output row 1: (sum + 512) >> 10
res_8x16b = _mm_packs_epi32(res_t0_4x32b, res_t1_4x32b); //row 0 in low 4 words, row 1 in high 4 words
res_16x8b = _mm_packus_epi16(res_8x16b, res_8x16b); //saturate to 0..255
// NOTE(review): the 4-byte stores below go through a WORD32 * cast of a
// UWORD8 pointer; nothing here establishes that pu1_dst is 4-byte aligned - confirm.
*((WORD32 *)(pu1_dst)) = _mm_cvtsi128_si32(res_16x8b);
res_16x8b = _mm_srli_si128(res_16x8b, 4);
*((WORD32 *)(pu1_dst + dst_strd)) = _mm_cvtsi128_si32(res_16x8b);
// Slide the row window down by two rows. r0 and r1 were overwritten by the
// unpacks above, but they are reloaded from r2 and r3 here.
src_r0_8x16b = src_r2_8x16b;
src_r1_8x16b = src_r3_8x16b;
src_r2_8x16b = src_r4_8x16b;
src_r3_8x16b = src_r5_8x16b;
src_r4_8x16b = src_r6_8x16b;
ht -= 2;
pi2_temp += 8;
pu1_dst += dst_strd << 1;
}
while(ht > 0);
}
}
else if(wd == 8)
{
WORD16 *pi2_temp;
pu1_tmp += 4;
pu1_src -= src_strd << 1; // start two rows above the block (vertical taps need rows -2..ht+2)
pi2_temp = (WORD16 *)pu1_tmp;
pu1_src -= 2; // the filter input starts from x[-2] (till x[3])
// Horizontal 6-tap filtering
{
WORD32 ht_tmp = ht + 4; // rows handled in pairs by the loop; one more row is filtered after it
__m128i src_r0_16x8b, src_r1_16x8b;
__m128i src_r0_sht_16x8b, src_r1_sht_16x8b;
__m128i src_r0_t1_16x8b, src_r1_t1_16x8b;
__m128i res_r0_t1_8x16b, res_r0_t2_8x16b, res_r0_t3_8x16b;
__m128i res_r1_t1_8x16b, res_r1_t2_8x16b, res_r1_t3_8x16b;
__m128i coeff0_1_16x8b, coeff2_3_16x8b, coeff4_5_16x8b;
coeff0_1_16x8b = _mm_set1_epi32(0xFB01FB01); //c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1
coeff2_3_16x8b = _mm_set1_epi32(0x14141414); //c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3
coeff4_5_16x8b = _mm_set1_epi32(0x01FB01FB); //c4 c5 c4 c5 c4 c5 c4 c5 c4 c5 c4 c5 c4 c5 c4 c5
//c0 = c5 = 1, c1 = c4 = -5, c2 = c3 = 20
//Row0 : a0 a1 a2 a3 a4 a5 a6 a7 a8 a9.....
//Row1 : b0 b1 b2 b3 b4 b5 b6 b7 b8 b9.....
// Each iteration filters two rows, producing 8 results per row (inputs a0..a12 / b0..b12).
do
{
src_r0_16x8b = _mm_loadu_si128((__m128i *)pu1_src); //a0 a1 a2 a3 a4 a5 a6 a7 a8 a9 a10 a11 a12 a13 a14 a15
src_r1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + src_strd)); //b0 b1 b2 b3 b4 b5 b6 b7 b8 b9 b10 b11 b12 b13 b14 b15
src_r0_sht_16x8b = _mm_srli_si128(src_r0_16x8b, 1); //a1 a2 a3 a4 a5 a6 a7 a8 a9....a15 0
src_r1_sht_16x8b = _mm_srli_si128(src_r1_16x8b, 1); //b1 b2 b3 b4 b5 b6 b7 b8 b9....b15 0
src_r0_t1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r0_sht_16x8b); //a0 a1 a1 a2 a2 a3 a3 a4 a4 a5 a5 a6 a6 a7 a7 a8
src_r1_t1_16x8b = _mm_unpacklo_epi8(src_r1_16x8b, src_r1_sht_16x8b); //b0 b1 b1 b2 b2 b3 b3 b4 b4 b5 b5 b6 b6 b7 b7 b8
res_r0_t1_8x16b = _mm_maddubs_epi16(src_r0_t1_16x8b, coeff0_1_16x8b); //a0*c0+a1*c1 a1*c0+a2*c1 a2*c0+a3*c1 a3*c0+a4*c1
//a4*c0+a5*c1 a5*c0+a6*c1 a6*c0+a7*c1 a7*c0+a8*c1
res_r1_t1_8x16b = _mm_maddubs_epi16(src_r1_t1_16x8b, coeff0_1_16x8b); //b0*c0+b1*c1 b1*c0+b2*c1 b2*c0+b3*c1 b3*c0+b4*c1
//b4*c0+b5*c1 b5*c0+b6*c1 b6*c0+b7*c1 b7*c0+b8*c1
src_r0_16x8b = _mm_srli_si128(src_r0_16x8b, 2); //a2 a3 a4 a5 a6 a7 a8 a9....a15 0 0
src_r1_16x8b = _mm_srli_si128(src_r1_16x8b, 2); //b2 b3 b4 b5 b6 b7 b8 b9....b15 0 0
src_r0_sht_16x8b = _mm_srli_si128(src_r0_sht_16x8b, 2); //a3 a4 a5 a6 a7 a8 a9....a15 0 0 0
src_r1_sht_16x8b = _mm_srli_si128(src_r1_sht_16x8b, 2); //b3 b4 b5 b6 b7 b8 b9....b15 0 0 0
src_r0_t1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r0_sht_16x8b); //a2 a3 a3 a4 a4 a5 a5 a6 a6 a7 a7 a8 a8 a9 a9 a10
src_r1_t1_16x8b = _mm_unpacklo_epi8(src_r1_16x8b, src_r1_sht_16x8b); //b2 b3 b3 b4 b4 b5 b5 b6 b6 b7 b7 b8 b8 b9 b9 b10
res_r0_t2_8x16b = _mm_maddubs_epi16(src_r0_t1_16x8b, coeff2_3_16x8b); //a2*c2+a3*c3 a3*c2+a4*c3 a4*c2+a5*c3 a5*c2+a6*c3
//a6*c2+a7*c3 a7*c2+a8*c3 a8*c2+a9*c3 a9*c2+a10*c3
res_r1_t2_8x16b = _mm_maddubs_epi16(src_r1_t1_16x8b, coeff2_3_16x8b); //b2*c2+b3*c3 b3*c2+b4*c3 b4*c2+b5*c3 b5*c2+b6*c3
//b6*c2+b7*c3 b7*c2+b8*c3 b8*c2+b9*c3 b9*c2+b10*c3
src_r0_16x8b = _mm_srli_si128(src_r0_16x8b, 2); //a4 a5 a6 a7 a8 a9....a15 0 0 0 0
src_r1_16x8b = _mm_srli_si128(src_r1_16x8b, 2); //b4 b5 b6 b7 b8 b9....b15 0 0 0 0
src_r0_sht_16x8b = _mm_srli_si128(src_r0_sht_16x8b, 2); //a5 a6 a7 a8 a9....a15 0 0 0 0 0
src_r1_sht_16x8b = _mm_srli_si128(src_r1_sht_16x8b, 2); //b5 b6 b7 b8 b9....b15 0 0 0 0 0
src_r0_t1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r0_sht_16x8b); //a4 a5 a5 a6 a6 a7 a7 a8 a8 a9 a9 a10 a10 a11 a11 a12
src_r1_t1_16x8b = _mm_unpacklo_epi8(src_r1_16x8b, src_r1_sht_16x8b); //b4 b5 b5 b6 b6 b7 b7 b8 b8 b9 b9 b10 b10 b11 b11 b12
res_r0_t3_8x16b = _mm_maddubs_epi16(src_r0_t1_16x8b, coeff4_5_16x8b); //a4*c4+a5*c5 a5*c4+a6*c5 a6*c4+a7*c5 a7*c4+a8*c5
//a8*c4+a9*c5 a9*c4+a10*c5 a10*c4+a11*c5 a11*c4+a12*c5
res_r1_t3_8x16b = _mm_maddubs_epi16(src_r1_t1_16x8b, coeff4_5_16x8b); //b4*c4+b5*c5 b5*c4+b6*c5 b6*c4+b7*c5 b7*c4+b8*c5
//b8*c4+b9*c5 b9*c4+b10*c5 b10*c4+b11*c5 b11*c4+b12*c5
res_r0_t1_8x16b = _mm_add_epi16(res_r0_t1_8x16b, res_r0_t2_8x16b);
res_r0_t1_8x16b = _mm_add_epi16(res_r0_t1_8x16b, res_r0_t3_8x16b);
res_r1_t1_8x16b = _mm_add_epi16(res_r1_t1_8x16b, res_r1_t2_8x16b);
res_r1_t1_8x16b = _mm_add_epi16(res_r1_t1_8x16b, res_r1_t3_8x16b);
// Unrounded 16-bit sums, 8 WORD16 per row, rows stored back to back.
_mm_storeu_si128((__m128i *)pi2_temp, res_r0_t1_8x16b);
_mm_storeu_si128((__m128i *)(pi2_temp + 8), res_r1_t1_8x16b);
ht_tmp -= 2;
pu1_src += src_strd << 1;
pi2_temp += 16;
}
while(ht_tmp > 0);
// Remaining single row (the loop above covers rows in pairs only).
src_r0_16x8b = _mm_loadu_si128((__m128i *)pu1_src); //a0 a1 a2 a3 a4 a5 a6 a7 a8 a9 a10 a11 a12 a13 a14 a15
src_r0_sht_16x8b = _mm_srli_si128(src_r0_16x8b, 1); //a1 a2 a3 a4 a5 a6 a7 a8 a9....a15 0
src_r0_t1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b,src_r0_sht_16x8b); //a0 a1 a1 a2 a2 a3 a3 a4 a4 a5 a5 a6 a6 a7 a7 a8
res_r0_t1_8x16b = _mm_maddubs_epi16(src_r0_t1_16x8b,coeff0_1_16x8b); //a0*c0+a1*c1 a1*c0+a2*c1 a2*c0+a3*c1 a3*c0+a4*c1
//a4*c0+a5*c1 a5*c0+a6*c1 a6*c0+a7*c1 a7*c0+a8*c1
src_r0_16x8b = _mm_srli_si128(src_r0_16x8b, 2); //a2 a3 a4 a5 a6 a7 a8 a9....a15 0 0
src_r0_sht_16x8b = _mm_srli_si128(src_r0_sht_16x8b, 2); //a3 a4 a5 a6 a7 a8 a9....a15 0 0 0
src_r0_t1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r0_sht_16x8b); //a2 a3 a3 a4 a4 a5 a5 a6 a6 a7 a7 a8 a8 a9 a9 a10
res_r0_t2_8x16b = _mm_maddubs_epi16(src_r0_t1_16x8b, coeff2_3_16x8b); //a2*c2+a3*c3 a3*c2+a4*c3 a4*c2+a5*c3 a5*c2+a6*c3
//a6*c2+a7*c3 a7*c2+a8*c3 a8*c2+a9*c3 a9*c2+a10*c3
src_r0_16x8b = _mm_srli_si128(src_r0_16x8b, 2); //a4 a5 a6 a7 a8 a9....a15 0 0 0 0
src_r0_sht_16x8b = _mm_srli_si128(src_r0_sht_16x8b, 2); //a5 a6 a7 a8 a9....a15 0 0 0 0 0
src_r0_t1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r0_sht_16x8b); //a4 a5 a5 a6 a6 a7 a7 a8 a8 a9 a9 a10 a10 a11 a11 a12
res_r0_t3_8x16b = _mm_maddubs_epi16(src_r0_t1_16x8b, coeff4_5_16x8b); //a4*c4+a5*c5 a5*c4+a6*c5 a6*c4+a7*c5 a7*c4+a8*c5
//a8*c4+a9*c5 a9*c4+a10*c5 a10*c4+a11*c5 a11*c4+a12*c5
res_r0_t1_8x16b = _mm_add_epi16(res_r0_t1_8x16b, res_r0_t2_8x16b);
res_r0_t1_8x16b = _mm_add_epi16(res_r0_t1_8x16b, res_r0_t3_8x16b);
_mm_storeu_si128((__m128i *)pi2_temp, res_r0_t1_8x16b);
}
pi2_temp = (WORD16 *)pu1_tmp; // rewind to the first intermediate row
// Vertical 6-tap filtering
{
__m128i src_r0_8x16b, src_r1_8x16b, src_r2_8x16b, src_r3_8x16b,
src_r4_8x16b;
__m128i src_r5_8x16b, src_r6_8x16b;
__m128i src_r0r1_8x16b, src_r2r3_8x16b, src_r4r5_8x16b;
__m128i res_t1_4x32b, res_t2_4x32b, res_t3_4x32b;
__m128i res_c0_4x32b, res_c1_4x32b;
__m128i res_8x16b, res_16x8b;
__m128i coeff0_1_8x16b, coeff2_3_8x16b, coeff4_5_8x16b;
__m128i const_val512_4x32b;
coeff0_1_8x16b = _mm_set1_epi32(0xFFFB0001); //16-bit pairs c0 c1 = 1, -5
coeff2_3_8x16b = _mm_set1_epi32(0x00140014); //16-bit pairs c2 c3 = 20, 20
coeff4_5_8x16b = _mm_set1_epi32(0x0001FFFB); //16-bit pairs c4 c5 = -5, 1
const_val512_4x32b = _mm_set1_epi32(512); //rounding term for the >> 10 below
// Prime the window with the first five intermediate rows (8 WORD16 each).
src_r0_8x16b = _mm_loadu_si128((__m128i *)pi2_temp);
src_r1_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp + 8));
src_r2_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp + 16));
src_r3_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp + 24));
src_r4_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp + 32));
pi2_temp += 40;
do
{
// Two new rows per iteration: r0..r5 give output row 0, r1..r6 give output row 1.
src_r5_8x16b = _mm_loadu_si128((__m128i *)pi2_temp);
src_r6_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp + 8));
// Output row 0, columns 0..3 (low halves of the rows).
src_r0r1_8x16b = _mm_unpacklo_epi16(src_r0_8x16b, src_r1_8x16b);
src_r2r3_8x16b = _mm_unpacklo_epi16(src_r2_8x16b, src_r3_8x16b);
src_r4r5_8x16b = _mm_unpacklo_epi16(src_r4_8x16b, src_r5_8x16b);
res_t1_4x32b = _mm_madd_epi16(src_r0r1_8x16b, coeff0_1_8x16b);
res_t2_4x32b = _mm_madd_epi16(src_r2r3_8x16b, coeff2_3_8x16b);
res_t3_4x32b = _mm_madd_epi16(src_r4r5_8x16b, coeff4_5_8x16b);
res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t2_4x32b);
res_t3_4x32b = _mm_add_epi32(res_t3_4x32b, const_val512_4x32b);
res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t3_4x32b);
res_c0_4x32b = _mm_srai_epi32(res_t1_4x32b, 10);
// Output row 0, columns 4..7 (high halves of the rows).
src_r0r1_8x16b = _mm_unpackhi_epi16(src_r0_8x16b, src_r1_8x16b);
src_r2r3_8x16b = _mm_unpackhi_epi16(src_r2_8x16b, src_r3_8x16b);
src_r4r5_8x16b = _mm_unpackhi_epi16(src_r4_8x16b, src_r5_8x16b);
res_t1_4x32b = _mm_madd_epi16(src_r0r1_8x16b, coeff0_1_8x16b);
res_t2_4x32b = _mm_madd_epi16(src_r2r3_8x16b, coeff2_3_8x16b);
res_t3_4x32b = _mm_madd_epi16(src_r4r5_8x16b, coeff4_5_8x16b);
res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t2_4x32b);
res_t3_4x32b = _mm_add_epi32(res_t3_4x32b, const_val512_4x32b);
res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t3_4x32b);
res_c1_4x32b = _mm_srai_epi32(res_t1_4x32b, 10);
res_8x16b = _mm_packs_epi32(res_c0_4x32b, res_c1_4x32b);
res_16x8b = _mm_packus_epi16(res_8x16b, res_8x16b); //saturate to 0..255
_mm_storel_epi64((__m128i *)pu1_dst, res_16x8b);
// Output row 1, columns 0..3.
src_r0r1_8x16b = _mm_unpacklo_epi16(src_r1_8x16b, src_r2_8x16b);
src_r2r3_8x16b = _mm_unpacklo_epi16(src_r3_8x16b, src_r4_8x16b);
src_r4r5_8x16b = _mm_unpacklo_epi16(src_r5_8x16b, src_r6_8x16b);
res_t1_4x32b = _mm_madd_epi16(src_r0r1_8x16b, coeff0_1_8x16b);
res_t2_4x32b = _mm_madd_epi16(src_r2r3_8x16b, coeff2_3_8x16b);
res_t3_4x32b = _mm_madd_epi16(src_r4r5_8x16b, coeff4_5_8x16b);
res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t2_4x32b);
res_t3_4x32b = _mm_add_epi32(res_t3_4x32b, const_val512_4x32b);
res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t3_4x32b);
res_c0_4x32b = _mm_srai_epi32(res_t1_4x32b, 10);
// Output row 1, columns 4..7.
src_r0r1_8x16b = _mm_unpackhi_epi16(src_r1_8x16b, src_r2_8x16b);
src_r2r3_8x16b = _mm_unpackhi_epi16(src_r3_8x16b, src_r4_8x16b);
src_r4r5_8x16b = _mm_unpackhi_epi16(src_r5_8x16b, src_r6_8x16b);
res_t1_4x32b = _mm_madd_epi16(src_r0r1_8x16b, coeff0_1_8x16b);
res_t2_4x32b = _mm_madd_epi16(src_r2r3_8x16b, coeff2_3_8x16b);
res_t3_4x32b = _mm_madd_epi16(src_r4r5_8x16b, coeff4_5_8x16b);
res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t2_4x32b);
res_t3_4x32b = _mm_add_epi32(res_t3_4x32b, const_val512_4x32b);
res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t3_4x32b);
res_c1_4x32b = _mm_srai_epi32(res_t1_4x32b, 10);
res_8x16b = _mm_packs_epi32(res_c0_4x32b, res_c1_4x32b);
res_16x8b = _mm_packus_epi16(res_8x16b, res_8x16b); //saturate to 0..255
_mm_storel_epi64((__m128i *)(pu1_dst + dst_strd), res_16x8b);
// Slide the row window down by two rows.
src_r0_8x16b = src_r2_8x16b;
src_r1_8x16b = src_r3_8x16b;
src_r2_8x16b = src_r4_8x16b;
src_r3_8x16b = src_r5_8x16b;
src_r4_8x16b = src_r6_8x16b;
ht -= 2;
pi2_temp += 16;
pu1_dst += dst_strd << 1;
}
while(ht > 0);
}
}
else // wd == 16
{
WORD16 *pi2_temp;
WORD32 ht_tmp;
pu1_tmp += 4;
pu1_src -= src_strd << 1; // start two rows above the block (vertical taps need rows -2..ht+2)
pi2_temp = (WORD16 *)pu1_tmp;
pu1_src -= 2; // the filter input starts from x[-2] (till x[3])
// Horizontal 6-tap filtering
{
__m128i src_r0_16x8b, src_r1_16x8b, src_r0_sht_16x8b, src_r1_sht_16x8b;
__m128i src_r0_t1_16x8b, src_r1_t1_16x8b;
__m128i res_r0_t1_8x16b, res_r0_t2_8x16b, res_r0_t3_8x16b;
__m128i res_r1_t1_8x16b, res_r1_t2_8x16b, res_r1_t3_8x16b;
__m128i coeff0_1_16x8b, coeff2_3_16x8b, coeff4_5_16x8b;
ht_tmp = ht + 5; // one row per iteration here, so all ht + 5 rows are done inside the loop
coeff0_1_16x8b = _mm_set1_epi32(0xFB01FB01); //c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1
coeff2_3_16x8b = _mm_set1_epi32(0x14141414); //c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3
coeff4_5_16x8b = _mm_set1_epi32(0x01FB01FB); //c4 c5 c4 c5 c4 c5 c4 c5 c4 c5 c4 c5 c4 c5 c4 c5
//c0 = c5 = 1, c1 = c4 = -5, c2 = c3 = 20
//Row0 : a0 a1 a2 a3 a4 a5 a6 a7 a8 a9.....
//Row0 : b0 b1 b2 b3 b4 b5 b6 b7 b8 b9.....
//b0 is same a8. Similarly other bn pixels are same as a(n+8) pixels.
do
{
src_r0_16x8b = _mm_loadu_si128((__m128i *)pu1_src); //a0 a1 a2 a3 a4 a5 a6 a7 a8 a9....a15
src_r1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + 8)); //b0 b1 b2 b3 b4 b5 b6 b7 b8 b9....b15
src_r0_sht_16x8b = _mm_srli_si128(src_r0_16x8b, 1); //a1 a2 a3 a4 a5 a6 a7 a8 a9....a15 0
src_r1_sht_16x8b = _mm_srli_si128(src_r1_16x8b, 1); //b1 b2 b3 b4 b5 b6 b7 b8 b9....b15 0
src_r0_t1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r0_sht_16x8b); //a0 a1 a1 a2 a2 a3 a3 a4 a4 a5 a5 a6 a6 a7 a7 a8
src_r1_t1_16x8b = _mm_unpacklo_epi8(src_r1_16x8b, src_r1_sht_16x8b); //b0 b1 b1 b2 b2 b3 b3 b4 b4 b5 b5 b6 b6 b7 b7 b8
res_r0_t1_8x16b = _mm_maddubs_epi16(src_r0_t1_16x8b, coeff0_1_16x8b); //a0*c0+a1*c1 a1*c0+a2*c1 a2*c0+a3*c1 a3*c0+a4*c1
//a4*c0+a5*c1 a5*c0+a6*c1 a6*c0+a7*c1 a7*c0+a8*c1
res_r1_t1_8x16b = _mm_maddubs_epi16(src_r1_t1_16x8b, coeff0_1_16x8b); //b0*c0+b1*c1 b1*c0+b2*c1 b2*c0+b3*c1 b3*c0+b4*c1
//b4*c0+b5*c1 b5*c0+b6*c1 b6*c0+b7*c1 b7*c0+b8*c1
src_r0_16x8b = _mm_srli_si128(src_r0_16x8b, 2); //a2 a3 a4 a5 a6 a7 a8 a9....a15 0 0
src_r1_16x8b = _mm_srli_si128(src_r1_16x8b, 2); //b2 b3 b4 b5 b6 b7 b8 b9....b15 0 0
src_r0_sht_16x8b = _mm_srli_si128(src_r0_sht_16x8b, 2); //a3 a4 a5 a6 a7 a8 a9....a15 0 0 0
src_r1_sht_16x8b = _mm_srli_si128(src_r1_sht_16x8b, 2); //b3 b4 b5 b6 b7 b8 b9....b15 0 0 0
src_r0_t1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r0_sht_16x8b); //a2 a3 a3 a4 a4 a5 a5 a6 a6 a7 a7 a8 a8 a9 a9 a10
src_r1_t1_16x8b = _mm_unpacklo_epi8(src_r1_16x8b, src_r1_sht_16x8b); //b2 b3 b3 b4 b4 b5 b5 b6 b6 b7 b7 b8 b8 b9 b9 b10
res_r0_t2_8x16b = _mm_maddubs_epi16(src_r0_t1_16x8b, coeff2_3_16x8b); //a2*c2+a3*c3 a3*c2+a4*c3 a4*c2+a5*c3 a5*c2+a6*c3
//a6*c2+a7*c3 a7*c2+a8*c3 a8*c2+a9*c3 a9*c2+a10*c3
res_r1_t2_8x16b = _mm_maddubs_epi16(src_r1_t1_16x8b, coeff2_3_16x8b); //b2*c2+b3*c3 b3*c2+b4*c3 b4*c2+b5*c3 b5*c2+b6*c3
//b6*c2+b7*c3 b7*c2+b8*c3 b8*c2+b9*c3 b9*c2+b10*c3
src_r0_16x8b = _mm_srli_si128(src_r0_16x8b, 2); //a4 a5 a6 a7 a8 a9....a15 0 0 0 0
src_r1_16x8b = _mm_srli_si128(src_r1_16x8b, 2); //b4 b5 b6 b7 b8 b9....b15 0 0 0 0
src_r0_sht_16x8b = _mm_srli_si128(src_r0_sht_16x8b, 2); //a5 a6 a7 a8 a9....a15 0 0 0 0 0
src_r1_sht_16x8b = _mm_srli_si128(src_r1_sht_16x8b, 2); //b5 b6 b7 b8 b9....b15 0 0 0 0 0
src_r0_t1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r0_sht_16x8b); //a4 a5 a5 a6 a6 a7 a7 a8 a8 a9 a9 a10 a10 a11 a11 a12
src_r1_t1_16x8b = _mm_unpacklo_epi8(src_r1_16x8b, src_r1_sht_16x8b); //b4 b5 b5 b6 b6 b7 b7 b8 b8 b9 b9 b10 b10 b11 b11 b12
res_r0_t3_8x16b = _mm_maddubs_epi16(src_r0_t1_16x8b, coeff4_5_16x8b); //a4*c4+a5*c5 a5*c4+a6*c5 a6*c4+a7*c5 a7*c4+a8*c5
//a8*c4+a9*c5 a9*c4+a10*c5 a10*c4+a11*c5 a11*c4+a12*c5
res_r1_t3_8x16b = _mm_maddubs_epi16(src_r1_t1_16x8b, coeff4_5_16x8b); //b4*c4+b5*c5 b5*c4+b6*c5 b6*c4+b7*c5 b7*c4+b8*c5
//b8*c4+b9*c5 b9*c4+b10*c5 b10*c4+b11*c5 b11*c4+b12*c5
res_r0_t1_8x16b = _mm_add_epi16(res_r0_t1_8x16b, res_r0_t2_8x16b);
res_r0_t1_8x16b = _mm_add_epi16(res_r0_t1_8x16b, res_r0_t3_8x16b);
res_r1_t1_8x16b = _mm_add_epi16(res_r1_t1_8x16b, res_r1_t2_8x16b);
res_r1_t1_8x16b = _mm_add_epi16(res_r1_t1_8x16b, res_r1_t3_8x16b);
// One intermediate row = 16 WORD16: columns 0..7 then columns 8..15.
_mm_storeu_si128((__m128i *)pi2_temp, res_r0_t1_8x16b);
_mm_storeu_si128((__m128i *)(pi2_temp + 8), res_r1_t1_8x16b);
ht_tmp--;
pu1_src += src_strd;
pi2_temp += 16;
}
while(ht_tmp > 0);
}
pi2_temp = (WORD16 *)pu1_tmp; // rewind to the first intermediate row
// Vertical 6-tap filtering
{
WORD16 *pi2_temp2;
UWORD8 *pu1_dst2;
WORD32 ht_tmp; // shadows the ht_tmp declared at the top of this branch
__m128i src_r0_8x16b, src_r1_8x16b, src_r2_8x16b, src_r3_8x16b, src_r4_8x16b;
__m128i src_r5_8x16b, src_r6_8x16b;
__m128i src_r0r1_8x16b, src_r2r3_8x16b, src_r4r5_8x16b;
__m128i res_t1_4x32b, res_t2_4x32b, res_t3_4x32b;
__m128i res_c0_4x32b, res_c1_4x32b;
__m128i res_8x16b, res_16x8b;
__m128i coeff0_1_8x16b, coeff2_3_8x16b, coeff4_5_8x16b;
__m128i const_val512_4x32b;
coeff0_1_8x16b = _mm_set1_epi32(0xFFFB0001); //16-bit pairs c0 c1 = 1, -5
coeff2_3_8x16b = _mm_set1_epi32(0x00140014); //16-bit pairs c2 c3 = 20, 20
coeff4_5_8x16b = _mm_set1_epi32(0x0001FFFB); //16-bit pairs c4 c5 = -5, 1
const_val512_4x32b = _mm_set1_epi32(512); //rounding term for the >> 10 below
// The 16-wide block is filtered as two 8-column halves; these point at the right half.
pi2_temp2 = pi2_temp + 8;
pu1_dst2 = pu1_dst + 8;
ht_tmp = ht; // the first half counts down a copy so ht is still intact for the second half
/**********************************************************/
/* Do first height x 8 block */
/**********************************************************/
// Intermediate rows are 16 WORD16 apart, hence the +16 row steps.
src_r0_8x16b = _mm_loadu_si128((__m128i *)pi2_temp);
src_r1_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp + 16));
src_r2_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp + 32));
src_r3_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp + 48));
src_r4_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp + 64));
pi2_temp += 80;
do
{
// Two new rows per iteration: r0..r5 give output row 0, r1..r6 give output row 1.
src_r5_8x16b = _mm_loadu_si128((__m128i *)pi2_temp);
src_r6_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp + 16));
// Output row 0, columns 0..3.
src_r0r1_8x16b = _mm_unpacklo_epi16(src_r0_8x16b, src_r1_8x16b);
src_r2r3_8x16b = _mm_unpacklo_epi16(src_r2_8x16b, src_r3_8x16b);
src_r4r5_8x16b = _mm_unpacklo_epi16(src_r4_8x16b, src_r5_8x16b);
res_t1_4x32b = _mm_madd_epi16(src_r0r1_8x16b, coeff0_1_8x16b);
res_t2_4x32b = _mm_madd_epi16(src_r2r3_8x16b, coeff2_3_8x16b);
res_t3_4x32b = _mm_madd_epi16(src_r4r5_8x16b, coeff4_5_8x16b);
res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t2_4x32b);
res_t3_4x32b = _mm_add_epi32(res_t3_4x32b, const_val512_4x32b);
res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t3_4x32b);
res_c0_4x32b = _mm_srai_epi32(res_t1_4x32b, 10);
// Output row 0, columns 4..7.
src_r0r1_8x16b = _mm_unpackhi_epi16(src_r0_8x16b, src_r1_8x16b);
src_r2r3_8x16b = _mm_unpackhi_epi16(src_r2_8x16b, src_r3_8x16b);
src_r4r5_8x16b = _mm_unpackhi_epi16(src_r4_8x16b, src_r5_8x16b);
res_t1_4x32b = _mm_madd_epi16(src_r0r1_8x16b, coeff0_1_8x16b);
res_t2_4x32b = _mm_madd_epi16(src_r2r3_8x16b, coeff2_3_8x16b);
res_t3_4x32b = _mm_madd_epi16(src_r4r5_8x16b, coeff4_5_8x16b);
res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t2_4x32b);
res_t3_4x32b = _mm_add_epi32(res_t3_4x32b, const_val512_4x32b);
res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t3_4x32b);
res_c1_4x32b = _mm_srai_epi32(res_t1_4x32b, 10);
res_8x16b = _mm_packs_epi32(res_c0_4x32b, res_c1_4x32b);
res_16x8b = _mm_packus_epi16(res_8x16b, res_8x16b); //saturate to 0..255
_mm_storel_epi64((__m128i *)pu1_dst, res_16x8b);
// Output row 1, columns 0..3.
src_r0r1_8x16b = _mm_unpacklo_epi16(src_r1_8x16b, src_r2_8x16b);
src_r2r3_8x16b = _mm_unpacklo_epi16(src_r3_8x16b, src_r4_8x16b);
src_r4r5_8x16b = _mm_unpacklo_epi16(src_r5_8x16b, src_r6_8x16b);
res_t1_4x32b = _mm_madd_epi16(src_r0r1_8x16b, coeff0_1_8x16b);
res_t2_4x32b = _mm_madd_epi16(src_r2r3_8x16b, coeff2_3_8x16b);
res_t3_4x32b = _mm_madd_epi16(src_r4r5_8x16b, coeff4_5_8x16b);
res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t2_4x32b);
res_t3_4x32b = _mm_add_epi32(res_t3_4x32b, const_val512_4x32b);
res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t3_4x32b);
res_c0_4x32b = _mm_srai_epi32(res_t1_4x32b, 10);
// Output row 1, columns 4..7.
src_r0r1_8x16b = _mm_unpackhi_epi16(src_r1_8x16b, src_r2_8x16b);
src_r2r3_8x16b = _mm_unpackhi_epi16(src_r3_8x16b, src_r4_8x16b);
src_r4r5_8x16b = _mm_unpackhi_epi16(src_r5_8x16b, src_r6_8x16b);
res_t1_4x32b = _mm_madd_epi16(src_r0r1_8x16b, coeff0_1_8x16b);
res_t2_4x32b = _mm_madd_epi16(src_r2r3_8x16b, coeff2_3_8x16b);
res_t3_4x32b = _mm_madd_epi16(src_r4r5_8x16b, coeff4_5_8x16b);
res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t2_4x32b);
res_t3_4x32b = _mm_add_epi32(res_t3_4x32b, const_val512_4x32b);
res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t3_4x32b);
res_c1_4x32b = _mm_srai_epi32(res_t1_4x32b, 10);
res_8x16b = _mm_packs_epi32(res_c0_4x32b, res_c1_4x32b);
res_16x8b = _mm_packus_epi16(res_8x16b, res_8x16b); //saturate to 0..255
_mm_storel_epi64((__m128i *)(pu1_dst + dst_strd), res_16x8b);
// Slide the row window down by two rows.
src_r0_8x16b = src_r2_8x16b;
src_r1_8x16b = src_r3_8x16b;
src_r2_8x16b = src_r4_8x16b;
src_r3_8x16b = src_r5_8x16b;
src_r4_8x16b = src_r6_8x16b;
ht_tmp -= 2;
pi2_temp += 32;
pu1_dst += dst_strd << 1;
}
while(ht_tmp > 0);
/**********************************************************/
/* Do second ht x 8 block */
/**********************************************************/
// Same procedure on columns 8..15, reading pi2_temp2 and writing pu1_dst2.
src_r0_8x16b = _mm_loadu_si128((__m128i *)pi2_temp2);
src_r1_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp2 + 16));
src_r2_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp2 + 32));
src_r3_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp2 + 48));
src_r4_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp2 + 64));
pi2_temp2 += 80;
do
{
src_r5_8x16b = _mm_loadu_si128((__m128i *)pi2_temp2);
src_r6_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp2 + 16));
// Output row 0, columns 8..11.
src_r0r1_8x16b = _mm_unpacklo_epi16(src_r0_8x16b, src_r1_8x16b);
src_r2r3_8x16b = _mm_unpacklo_epi16(src_r2_8x16b, src_r3_8x16b);
src_r4r5_8x16b = _mm_unpacklo_epi16(src_r4_8x16b, src_r5_8x16b);
res_t1_4x32b = _mm_madd_epi16(src_r0r1_8x16b, coeff0_1_8x16b);
res_t2_4x32b = _mm_madd_epi16(src_r2r3_8x16b, coeff2_3_8x16b);
res_t3_4x32b = _mm_madd_epi16(src_r4r5_8x16b, coeff4_5_8x16b);
res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t2_4x32b);
res_t3_4x32b = _mm_add_epi32(res_t3_4x32b, const_val512_4x32b);
res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t3_4x32b);
res_c0_4x32b = _mm_srai_epi32(res_t1_4x32b, 10);
// Output row 0, columns 12..15.
src_r0r1_8x16b = _mm_unpackhi_epi16(src_r0_8x16b, src_r1_8x16b);
src_r2r3_8x16b = _mm_unpackhi_epi16(src_r2_8x16b, src_r3_8x16b);
src_r4r5_8x16b = _mm_unpackhi_epi16(src_r4_8x16b, src_r5_8x16b);
res_t1_4x32b = _mm_madd_epi16(src_r0r1_8x16b, coeff0_1_8x16b);
res_t2_4x32b = _mm_madd_epi16(src_r2r3_8x16b, coeff2_3_8x16b);
res_t3_4x32b = _mm_madd_epi16(src_r4r5_8x16b, coeff4_5_8x16b);
res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t2_4x32b);
res_t3_4x32b = _mm_add_epi32(res_t3_4x32b, const_val512_4x32b);
res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t3_4x32b);
res_c1_4x32b = _mm_srai_epi32(res_t1_4x32b, 10);
res_8x16b = _mm_packs_epi32(res_c0_4x32b, res_c1_4x32b);
res_16x8b = _mm_packus_epi16(res_8x16b, res_8x16b); //saturate to 0..255
_mm_storel_epi64((__m128i *)pu1_dst2, res_16x8b);
// Output row 1, columns 8..11.
src_r0r1_8x16b = _mm_unpacklo_epi16(src_r1_8x16b, src_r2_8x16b);
src_r2r3_8x16b = _mm_unpacklo_epi16(src_r3_8x16b, src_r4_8x16b);
src_r4r5_8x16b = _mm_unpacklo_epi16(src_r5_8x16b, src_r6_8x16b);
res_t1_4x32b = _mm_madd_epi16(src_r0r1_8x16b, coeff0_1_8x16b);
res_t2_4x32b = _mm_madd_epi16(src_r2r3_8x16b, coeff2_3_8x16b);
res_t3_4x32b = _mm_madd_epi16(src_r4r5_8x16b, coeff4_5_8x16b);
res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t2_4x32b);
res_t3_4x32b = _mm_add_epi32(res_t3_4x32b, const_val512_4x32b);
res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t3_4x32b);
res_c0_4x32b = _mm_srai_epi32(res_t1_4x32b, 10);
// Output row 1, columns 12..15.
src_r0r1_8x16b = _mm_unpackhi_epi16(src_r1_8x16b, src_r2_8x16b);
src_r2r3_8x16b = _mm_unpackhi_epi16(src_r3_8x16b, src_r4_8x16b);
src_r4r5_8x16b = _mm_unpackhi_epi16(src_r5_8x16b, src_r6_8x16b);
res_t1_4x32b = _mm_madd_epi16(src_r0r1_8x16b, coeff0_1_8x16b);
res_t2_4x32b = _mm_madd_epi16(src_r2r3_8x16b, coeff2_3_8x16b);
res_t3_4x32b = _mm_madd_epi16(src_r4r5_8x16b, coeff4_5_8x16b);
res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t2_4x32b);
res_t3_4x32b = _mm_add_epi32(res_t3_4x32b, const_val512_4x32b);
res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t3_4x32b);
res_c1_4x32b = _mm_srai_epi32(res_t1_4x32b, 10);
res_8x16b = _mm_packs_epi32(res_c0_4x32b, res_c1_4x32b);
res_16x8b = _mm_packus_epi16(res_8x16b, res_8x16b); //saturate to 0..255
_mm_storel_epi64((__m128i *)(pu1_dst2 + dst_strd), res_16x8b);
// Slide the row window down by two rows.
src_r0_8x16b = src_r2_8x16b;
src_r1_8x16b = src_r3_8x16b;
src_r2_8x16b = src_r4_8x16b;
src_r3_8x16b = src_r5_8x16b;
src_r4_8x16b = src_r6_8x16b;
ht -= 2;
pi2_temp2 += 32;
pu1_dst2 += dst_strd << 1;
}
while(ht > 0);
}
}
}
/*****************************************************************************/
/* */
/* Function Name : ih264_inter_pred_luma_horz_qpel_ssse3 */
/* */
/* Description : This function implements a six-tap filter horizontally */
/* on ht x wd block and averages the values with the source */
/* pixels to calculate horizontal quarter-pel as mentioned */
/* in sec. 8.4.2.2.1 titled "Luma sample interpolation */
/* process". (ht,wd) can be (4,4), (8,4), (4,8), (8,8), */
/* (16,8), (8,16) or (16,16). */
/* */
/* Inputs : pu1_src - pointer to source */
/* pu1_dst - pointer to destination */
/* src_strd - stride for source */
/* dst_strd - stride for destination */
/* ht - height of the block */
/* wd - width of the block */
/* pu1_tmp - pointer to temporary buffer */
/* dydx - x and y reference offset for q-pel */
/* calculations */
/* */
/* Issues : None */
/* */
/* Revision History: */
/* */
/* DD MM YYYY Author(s) Changes */
/* 13 02 2015 Kaushik Initial Version */
/* Senthoor */
/* */
/*****************************************************************************/
void ih264_inter_pred_luma_horz_qpel_ssse3(UWORD8 *pu1_src,
UWORD8 *pu1_dst,
WORD32 src_strd,
WORD32 dst_strd,
WORD32 ht,
WORD32 wd,
UWORD8* pu1_tmp,
WORD32 dydx)
{
WORD32 x_offset;
UWORD8 *pu1_pred1;
__m128i src_r0_16x8b, src_r1_16x8b;
__m128i src_r0_sht_16x8b, src_r1_sht_16x8b;
__m128i coeff0_1_16x8b, coeff2_3_16x8b, coeff4_5_16x8b;
__m128i const_val16_8x16b;
UNUSED(pu1_tmp);
x_offset = dydx & 3;
coeff0_1_16x8b = _mm_set1_epi32(0xFB01FB01); //c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1
coeff2_3_16x8b = _mm_set1_epi32(0x14141414); //c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3
coeff4_5_16x8b = _mm_set1_epi32(0x01FB01FB); //c4 c5 c4 c5 c4 c5 c4 c5 c4 c5 c4 c5 c4 c5 c4 c5
//c0 = c5 = 1, c1 = c4 = -5, c2 = c3 = 20
pu1_pred1 = pu1_src + (x_offset >> 1);
const_val16_8x16b = _mm_set1_epi16(16);
pu1_src -= 2; // the filter input starts from x[-2] (till x[3])
if(wd == 4)
{
__m128i src_r0r1_16x8b;
__m128i res_r0r1_t1_8x16b, res_r0r1_t2_8x16b, res_r0r1_t3_8x16b;
__m128i res_r0r1_16x8b;
//Row0 : a0 a1 a2 a3 a4 a5 a6 a7 a8 a9.....
//Row1 : b0 b1 b2 b3 b4 b5 b6 b7 b8 b9.....
do
{
src_r0_16x8b = _mm_loadu_si128((__m128i *)pu1_src); //a0 a1 a2 a3 a4 a5 a6 a7 a8 a9....a15
src_r1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + src_strd)); //b0 b1 b2 b3 b4 b5 b6 b7 b8 b9....b15
src_r0_sht_16x8b = _mm_srli_si128(src_r0_16x8b, 1); //a1 a2 a3 a4 a5 a6 a7 a8 a9....a15 0
src_r1_sht_16x8b = _mm_srli_si128(src_r1_16x8b, 1); //b1 b2 b3 b4 b5 b6 b7 b8 b9....b15 0
src_r0_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r0_sht_16x8b); //a0 a1 a1 a2 a2 a3 a3 a4 a4 a5 a5 a6 a6 a7 a7 a8
src_r1_16x8b = _mm_unpacklo_epi8(src_r1_16x8b, src_r1_sht_16x8b); //b0 b1 b1 b2 b2 b3 b3 b4 b4 b5 b5 b6 b6 b7 b7 b8
src_r0r1_16x8b = _mm_unpacklo_epi64(src_r0_16x8b, src_r1_16x8b); //a0 a1 a1 a2 a2 a3 a3 a4 b0 b1 b1 b2 b2 b3 b3 b4
res_r0r1_t1_8x16b = _mm_maddubs_epi16(src_r0r1_16x8b, coeff0_1_16x8b); //a0*c0+a1*c1 a1*c0+a2*c1 a2*c0+a3*c1 a3*c0+a4*c1
//b0*c0+b1*c1 b1*c0+b2*c1 b2*c0+b3*c1 b3*c0+b4*c1
src_r0_16x8b = _mm_srli_si128(src_r0_16x8b, 4); //a2 a3 a3 a4 a4 a5 a5 a6 a6 a7 a7 a8 0 0 0 0
src_r1_16x8b = _mm_srli_si128(src_r1_16x8b, 4); //b2 b3 b3 b4 b4 b5 b5 b6 b6 b7 b7 b8 0 0 0 0
src_r0r1_16x8b = _mm_unpacklo_epi64(src_r0_16x8b, src_r1_16x8b); //a2 a3 a3 a4 a4 a5 a5 a6 b2 b3 b3 b4 b4 b5 b5 b6
res_r0r1_t2_8x16b = _mm_maddubs_epi16(src_r0r1_16x8b, coeff2_3_16x8b); //a2*c2+a3*c3 a3*c2+a4*c3 a4*c2+a5*c3 a5*c2+a6*c3
//b2*c2+b3*c3 b3*c2+b4*c3 b4*c2+b5*c3 b5*c2+b6*c3
src_r0_16x8b = _mm_srli_si128(src_r0_16x8b, 4); //a4 a5 a5 a6 a6 a7 a7 a8 0 0 0 0 0 0 0 0
src_r1_16x8b = _mm_srli_si128(src_r1_16x8b, 4); //b4 b5 b5 b6 b6 b7 b7 b8 0 0 0 0 0 0 0 0
src_r0r1_16x8b = _mm_unpacklo_epi64(src_r0_16x8b, src_r1_16x8b); //a4 a5 a5 a6 a6 a7 a7 a8 b4 b5 b5 b6 b6 b7 b7 b8
res_r0r1_t3_8x16b = _mm_maddubs_epi16(src_r0r1_16x8b, coeff4_5_16x8b); //a4*c4+a5*c5 a5*c4+a6*c5 a6*c4+a7*c5 a7*c4+a8*c5
//b4*c4+b5*c5 b5*c4+b6*c5 b4*c6+b7*c5 b7*c4+b8*c5
src_r0_16x8b = _mm_loadl_epi64((__m128i *)pu1_pred1);
src_r1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_pred1 + src_strd));
res_r0r1_t1_8x16b = _mm_add_epi16(res_r0r1_t1_8x16b, res_r0r1_t2_8x16b);
res_r0r1_t3_8x16b = _mm_add_epi16(const_val16_8x16b, res_r0r1_t3_8x16b);
res_r0r1_t1_8x16b = _mm_add_epi16(res_r0r1_t1_8x16b, res_r0r1_t3_8x16b); //a0*c0+a1*c1+a2*c2+a3*c3+a4*a4+a5*c5 + 16;
//a1*c0+a2*c1+a2*c2+a3*c3+a5*a4+a6*c5 + 16;
//a2*c0+a3*c1+a4*c2+a5*c3+a6*a4+a7*c5 + 16;
//a3*c0+a4*c1+a5*c2+a6*c3+a6*a4+a8*c5 + 16;
//b0*c0+b1*c1+b2*c2+b3*c3+b4*b4+b5*c5 + 16;
//b1*c0+b2*c1+b2*c2+b3*c3+b5*b4+b6*c5 + 16;
//b2*c0+b3*c1+b4*c2+b5*c3+b6*b4+b7*c5 + 16;
//b3*c0+b4*c1+b5*c2+b6*c3+b6*b4+b8*c5 + 16;
src_r0r1_16x8b = _mm_unpacklo_epi32(src_r0_16x8b,src_r1_16x8b);
res_r0r1_t1_8x16b = _mm_srai_epi16(res_r0r1_t1_8x16b, 5); //shifting right by 5 bits.
res_r0r1_16x8b = _mm_packus_epi16(res_r0r1_t1_8x16b, res_r0r1_t1_8x16b);
res_r0r1_16x8b = _mm_avg_epu8(src_r0r1_16x8b, res_r0r1_16x8b); //computing q-pel
*((WORD32 *)(pu1_dst)) = _mm_cvtsi128_si32(res_r0r1_16x8b);
res_r0r1_16x8b = _mm_srli_si128(res_r0r1_16x8b, 4);
*((WORD32 *)(pu1_dst + dst_strd)) = _mm_cvtsi128_si32(res_r0r1_16x8b);
ht -= 2;
pu1_src += src_strd << 1;
pu1_pred1 += src_strd << 1;
pu1_dst += dst_strd << 1;
}
while(ht > 0);
}
else if(wd == 8)
{
__m128i src_r0_t1_16x8b, src_r1_t1_16x8b;
__m128i res_r0_t1_8x16b, res_r0_t2_8x16b, res_r0_t3_8x16b;
__m128i res_r1_t1_8x16b, res_r1_t2_8x16b, res_r1_t3_8x16b;
__m128i res_r0_16x8b, res_r1_16x8b;
//Row0 : a0 a1 a2 a3 a4 a5 a6 a7 a8 a9.....
//Row1 : b0 b1 b2 b3 b4 b5 b6 b7 b8 b9.....
do
{
src_r0_16x8b = _mm_loadu_si128((__m128i *)pu1_src); //a0 a1 a2 a3 a4 a5 a6 a7 a8 a9....a15
src_r1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + src_strd)); //b0 b1 b2 b3 b4 b5 b6 b7 b8 b9....b15
src_r0_sht_16x8b = _mm_srli_si128(src_r0_16x8b, 1); //a1 a2 a3 a4 a5 a6 a7 a8 a9....a15 0
src_r1_sht_16x8b = _mm_srli_si128(src_r1_16x8b, 1); //b1 b2 b3 b4 b5 b6 b7 b8 b9....b15 0
src_r0_t1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r0_sht_16x8b); //a0 a1 a1 a2 a2 a3 a3 a4 a4 a5 a5 a6 a6 a7 a7 a8
src_r1_t1_16x8b = _mm_unpacklo_epi8(src_r1_16x8b, src_r1_sht_16x8b); //b0 b1 b1 b2 b2 b3 b3 b4 b4 b5 b5 b6 b6 b7 b7 b8
res_r0_t1_8x16b = _mm_maddubs_epi16(src_r0_t1_16x8b, coeff0_1_16x8b); //a0*c0+a1*c1 a1*c0+a2*c1 a2*c0+a3*c1 a3*c0+a4*c1
//a4*c0+a5*c1 a5*c0+a6*c1 a6*c0+a7*c1 a7*c0+a8*c1
res_r1_t1_8x16b = _mm_maddubs_epi16(src_r1_t1_16x8b, coeff0_1_16x8b); //b0*c0+b1*c1 b1*c0+b2*c1 b2*c0+b3*c1 b3*c0+b4*c1
//b4*c0+b5*c1 b5*c0+b6*c1 b6*c0+b7*c1 b7*c0+b8*c1
src_r0_16x8b = _mm_srli_si128(src_r0_16x8b, 2); //a2 a3 a4 a5 a6 a7 a8 a9....a15 0 0
src_r1_16x8b = _mm_srli_si128(src_r1_16x8b, 2); //b2 b3 b4 b5 b6 b7 b8 b9....b15 0 0
src_r0_sht_16x8b = _mm_srli_si128(src_r0_sht_16x8b, 2); //a3 a4 a5 a6 a7 a8 a9....a15 0 0 0
src_r1_sht_16x8b = _mm_srli_si128(src_r1_sht_16x8b, 2); //b3 b4 b5 b6 b7 b8 b9....b15 0 0 0
src_r0_t1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r0_sht_16x8b); //a2 a3 a3 a4 a4 a5 a5 a6 a6 a7 a7 a8 a8 a9 a9 a10
src_r1_t1_16x8b = _mm_unpacklo_epi8(src_r1_16x8b, src_r1_sht_16x8b); //b2 b3 b3 b4 b4 b5 b5 b6 b6 b7 b7 b8 a8 a9 a9 a10
res_r0_t2_8x16b = _mm_maddubs_epi16(src_r0_t1_16x8b, coeff2_3_16x8b); //a2*c2+a3*c3 a3*c2+a4*c3 a4*c2+a5*c3 a5*c2+a6*c3
//a6*c2+a7*c3 a7*c2+a8*c3 a8*c2+a9*c3 a9*c2+a10*c3
res_r1_t2_8x16b = _mm_maddubs_epi16(src_r1_t1_16x8b, coeff2_3_16x8b); //b2*c2+b3*c3 b3*c2+b4*c3 b2*c4+b5*c3 b5*c2+b6*c3
//b6*c2+b7*c3 b7*c2+b8*c3 b8*c2+b9*c3 b9*c2+b10*c3
src_r0_16x8b = _mm_srli_si128(src_r0_16x8b, 2); //a4 a5 a6 a7 a8 a9....a15 0 0 0 0
src_r1_16x8b = _mm_srli_si128(src_r1_16x8b, 2); //b4 b5 b6 b7 b8 b9....b15 0 0 0 0
src_r0_sht_16x8b = _mm_srli_si128(src_r0_sht_16x8b, 2); //a5 a6 a7 a8 a9....a15 0 0 0 0 0
src_r1_sht_16x8b = _mm_srli_si128(src_r1_sht_16x8b, 2); //b5 b6 b7 b8 b9....b15 0 0 0 0 0
src_r0_t1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r0_sht_16x8b); //a4 a5 a5 a6 a6 a7 a7 a8 a8 a9 a9 a10 a10 a11 a11 a12
src_r1_t1_16x8b = _mm_unpacklo_epi8(src_r1_16x8b, src_r1_sht_16x8b); //b4 b5 b5 b6 b6 b7 b7 b8 b8 b9 b9 b10 b10 b11 b11 b12
res_r0_t3_8x16b = _mm_maddubs_epi16(src_r0_t1_16x8b, coeff4_5_16x8b); //a4*c4+a5*c5 a5*c4+a6*c5 a6*c4+a7*c5 a7*c4+a8*c5
//a8*c4+a9*c5 a9*c4+a10*c5 a10*c4+a11*c5 a11*c4+a12*c5
res_r1_t3_8x16b = _mm_maddubs_epi16(src_r1_t1_16x8b, coeff4_5_16x8b); //b4*c4+b5*c5 b5*c4+b6*c5 b6*c4+b7*c5 b7*c4+b8*c5
//b8*c4+b9*c5 b9*c4+b10*c5 b10*c4+b11*c5 b11*c4+b12*c5
src_r0_16x8b = _mm_loadl_epi64((__m128i *)pu1_pred1);
src_r1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_pred1 + src_strd));
res_r0_t1_8x16b = _mm_add_epi16(res_r0_t1_8x16b, res_r0_t2_8x16b);
res_r1_t1_8x16b = _mm_add_epi16(res_r1_t1_8x16b, res_r1_t2_8x16b);
res_r0_t3_8x16b = _mm_add_epi16(const_val16_8x16b, res_r0_t3_8x16b);
res_r1_t3_8x16b = _mm_add_epi16(const_val16_8x16b, res_r1_t3_8x16b);
res_r0_t1_8x16b = _mm_add_epi16(res_r0_t1_8x16b, res_r0_t3_8x16b);
res_r1_t1_8x16b = _mm_add_epi16(res_r1_t1_8x16b, res_r1_t3_8x16b);
res_r0_t1_8x16b = _mm_srai_epi16(res_r0_t1_8x16b, 5);
res_r1_t1_8x16b = _mm_srai_epi16(res_r1_t1_8x16b, 5); //shifting right by 5 bits.
res_r0_16x8b = _mm_packus_epi16(res_r0_t1_8x16b, res_r0_t1_8x16b);
res_r1_16x8b = _mm_packus_epi16(res_r1_t1_8x16b, res_r1_t1_8x16b);
res_r0_16x8b = _mm_avg_epu8(src_r0_16x8b, res_r0_16x8b);
res_r1_16x8b = _mm_avg_epu8(src_r1_16x8b, res_r1_16x8b); //computing q-pel
_mm_storel_epi64((__m128i *)pu1_dst, res_r0_16x8b);
_mm_storel_epi64((__m128i *)(pu1_dst + dst_strd), res_r1_16x8b);
ht -= 2;
pu1_src += src_strd << 1;
pu1_pred1 += src_strd << 1;
pu1_dst += dst_strd << 1;
}
while(ht > 0);
}
else // wd == 16
{
__m128i src_r0_t1_16x8b, src_r1_t1_16x8b;
__m128i res_r0_t1_8x16b, res_r0_t2_8x16b, res_r0_t3_8x16b;
__m128i res_r1_t1_8x16b, res_r1_t2_8x16b, res_r1_t3_8x16b;
__m128i res_16x8b;
//Row0 : a0 a1 a2 a3 a4 a5 a6 a7 a8 a9.....
//Row0 : b0 b1 b2 b3 b4 b5 b6 b7 b8 b9.....
//b0 is same a8. Similarly other bn pixels are same as a(n+8) pixels.
do
{
src_r0_16x8b = _mm_loadu_si128((__m128i *)pu1_src); //a0 a1 a2 a3 a4 a5 a6 a7 a8 a9....a15
src_r1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + 8)); //b0 b1 b2 b3 b4 b5 b6 b7 b8 b9....b15
src_r0_sht_16x8b = _mm_srli_si128(src_r0_16x8b, 1); //a1 a2 a3 a4 a5 a6 a7 a8 a9....a15 0
src_r1_sht_16x8b = _mm_srli_si128(src_r1_16x8b, 1); //b1 b2 b3 b4 b5 b6 b7 b8 b9....b15 0
src_r0_t1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r0_sht_16x8b); //a0 a1 a1 a2 a2 a3 a3 a4 a4 a5 a5 a6 a6 a7 a7 a8
src_r1_t1_16x8b = _mm_unpacklo_epi8(src_r1_16x8b, src_r1_sht_16x8b); //b0 b1 b1 b2 b2 b3 b3 b4 b4 b5 b5 b6 b6 b7 b7 b8
res_r0_t1_8x16b = _mm_maddubs_epi16(src_r0_t1_16x8b, coeff0_1_16x8b); //a0*c0+a1*c1 a1*c0+a2*c1 a2*c0+a3*c1 a3*c0+a4*c1
//a4*c0+a5*c1 a5*c0+a6*c1 a6*c0+a7*c1 a7*c0+a8*c1
res_r1_t1_8x16b = _mm_maddubs_epi16(src_r1_t1_16x8b, coeff0_1_16x8b); //b0*c0+b1*c1 b1*c0+b2*c1 b2*c0+b3*c1 b3*c0+b4*c1
//b4*c0+b5*c1 b5*c0+b6*c1 b6*c0+b7*c1 b7*c0+b8*c1
src_r0_16x8b = _mm_srli_si128(src_r0_16x8b, 2); //a2 a3 a4 a5 a6 a7 a8 a9....a15 0 0
src_r1_16x8b = _mm_srli_si128(src_r1_16x8b, 2); //b2 b3 b4 b5 b6 b7 b8 b9....b15 0 0
src_r0_sht_16x8b = _mm_srli_si128(src_r0_sht_16x8b, 2); //a3 a4 a5 a6 a7 a8 a9....a15 0 0 0
src_r1_sht_16x8b = _mm_srli_si128(src_r1_sht_16x8b, 2); //b3 b4 b5 b6 b7 b8 b9....b15 0 0 0
src_r0_t1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r0_sht_16x8b); //a2 a3 a3 a4 a4 a5 a5 a6 a6 a7 a7 a8 a8 a9 a9 a10
src_r1_t1_16x8b = _mm_unpacklo_epi8(src_r1_16x8b, src_r1_sht_16x8b); //b2 b3 b3 b4 b4 b5 b5 b6 b6 b7 b7 b8 a8 a9 a9 a10
res_r0_t2_8x16b = _mm_maddubs_epi16(src_r0_t1_16x8b, coeff2_3_16x8b); //a2*c2+a3*c3 a3*c2+a4*c3 a4*c2+a5*c3 a5*c2+a6*c3
//a6*c2+a7*c3 a7*c2+a8*c3 a8*c2+a9*c3 a9*c2+a10*c3
res_r1_t2_8x16b = _mm_maddubs_epi16(src_r1_t1_16x8b, coeff2_3_16x8b); //b2*c2+b3*c3 b3*c2+b4*c3 b2*c4+b5*c3 b5*c2+b6*c3
//b6*c2+b7*c3 b7*c2+b8*c3 b8*c2+b9*c3 b9*c2+b10*c3
src_r0_16x8b = _mm_srli_si128(src_r0_16x8b, 2); //a4 a5 a6 a7 a8 a9....a15 0 0 0 0
src_r1_16x8b = _mm_srli_si128(src_r1_16x8b, 2); //b4 b5 b6 b7 b8 b9....b15 0 0 0 0
src_r0_sht_16x8b = _mm_srli_si128(src_r0_sht_16x8b, 2); //a5 a6 a7 a8 a9....a15 0 0 0 0 0
src_r1_sht_16x8b = _mm_srli_si128(src_r1_sht_16x8b, 2); //b5 b6 b7 b8 b9....b15 0 0 0 0 0
src_r0_t1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r0_sht_16x8b); //a4 a5 a5 a6 a6 a7 a7 a8 a8 a9 a9 a10 a10 a11 a11 a12
src_r1_t1_16x8b = _mm_unpacklo_epi8(src_r1_16x8b, src_r1_sht_16x8b); //b4 b5 b5 b6 b6 b7 b7 b8 b8 b9 b9 b10 b10 b11 b11 b12
res_r0_t3_8x16b = _mm_maddubs_epi16(src_r0_t1_16x8b, coeff4_5_16x8b); //a4*c4+a5*c5 a5*c4+a6*c5 a6*c4+a7*c5 a7*c4+a8*c5
//a8*c4+a9*c5 a9*c4+a10*c5 a10*c4+a11*c5 a11*c4+a12*c5
res_r1_t3_8x16b = _mm_maddubs_epi16(src_r1_t1_16x8b, coeff4_5_16x8b); //b4*c4+b5*c5 b5*c4+b6*c5 b6*c4+b7*c5 b7*c4+b8*c5
//b8*c4+b9*c5 b9*c4+b10*c5 b10*c4+b11*c5 b11*c4+b12*c5
src_r0_16x8b = _mm_loadu_si128((__m128i *)pu1_pred1);
res_r0_t1_8x16b = _mm_add_epi16(res_r0_t1_8x16b, res_r0_t2_8x16b);
res_r1_t1_8x16b = _mm_add_epi16(res_r1_t1_8x16b, res_r1_t2_8x16b);
res_r0_t3_8x16b = _mm_add_epi16(const_val16_8x16b, res_r0_t3_8x16b);
res_r1_t3_8x16b = _mm_add_epi16(const_val16_8x16b, res_r1_t3_8x16b);
res_r0_t1_8x16b = _mm_add_epi16(res_r0_t1_8x16b, res_r0_t3_8x16b);
res_r1_t1_8x16b = _mm_add_epi16(res_r1_t1_8x16b, res_r1_t3_8x16b);
res_r0_t1_8x16b = _mm_srai_epi16(res_r0_t1_8x16b, 5);
res_r1_t1_8x16b = _mm_srai_epi16(res_r1_t1_8x16b, 5); //shifting right by 5 bits
res_16x8b = _mm_packus_epi16(res_r0_t1_8x16b, res_r1_t1_8x16b);
res_16x8b = _mm_avg_epu8(src_r0_16x8b, res_16x8b); //computing q-pel
_mm_storeu_si128((__m128i *)pu1_dst, res_16x8b);
ht--;
pu1_src += src_strd;
pu1_pred1 += src_strd;
pu1_dst += dst_strd;
}
while(ht > 0);
}
}
/*****************************************************************************/
/* */
/* Function Name : ih264_inter_pred_luma_vert_qpel_ssse3 */
/* */
/* Description : This function implements a six-tap filter vertically on */
/* ht x wd block and averages the values with the source */
/* pixels to calculate vertical quarter-pel as mentioned in */
/* sec. 8.4.2.2.1 titled "Luma sample interpolation */
/* process". (ht,wd) can be (4,4), (8,4), (4,8), (8,8), */
/* (16,8), (8,16) or (16,16). */
/* */
/* Inputs : pu1_src - pointer to source */
/* pu1_dst - pointer to destination */
/* src_strd - stride for source */
/* dst_strd - stride for destination */
/* ht - height of the block */
/* wd - width of the block */
/* pu1_tmp - pointer to temporary buffer (not used by this function) */
/* dydx - x and y reference offset for q-pel */
/* calculations */
/* */
/* Issues : None */
/* */
/* Revision History: */
/* */
/* DD MM YYYY Author(s) Changes */
/* 13 02 2015 Kaushik Initial Version */
/* Senthoor */
/* */
/*****************************************************************************/
/* Six-tap accumulation (taps 1, -5, 20, 20, -5, 1) for eight pixels.        */
/* Every argument carries two vertically adjacent rows interleaved byte by   */
/* byte. The return value holds (filtered sum + 16) >> 5 in eight signed     */
/* 16-bit lanes; clipping to 8 bits is left to the caller.                   */
static __m128i ih264_vert_qpel_sixtap_ssse3(__m128i rows01_16x8b,
                                            __m128i rows23_16x8b,
                                            __m128i rows45_16x8b)
{
    __m128i sum_8x16b, tail_8x16b;

    /* taps (c0, c1) = (1, -5) and (c2, c3) = (20, 20) */
    sum_8x16b = _mm_add_epi16(
                    _mm_maddubs_epi16(rows01_16x8b, _mm_set1_epi32(0xFB01FB01)),
                    _mm_maddubs_epi16(rows23_16x8b, _mm_set1_epi32(0x14141414)));

    /* taps (c4, c5) = (-5, 1) together with the rounding term 16 */
    tail_8x16b = _mm_add_epi16(
                    _mm_set1_epi16(16),
                    _mm_maddubs_epi16(rows45_16x8b, _mm_set1_epi32(0x01FB01FB)));

    return _mm_srai_epi16(_mm_add_epi16(sum_8x16b, tail_8x16b), 5);
}

void ih264_inter_pred_luma_vert_qpel_ssse3(UWORD8 *pu1_src,
                                           UWORD8 *pu1_dst,
                                           WORD32 src_strd,
                                           WORD32 dst_strd,
                                           WORD32 ht,
                                           WORD32 wd,
                                           UWORD8* pu1_tmp,
                                           WORD32 dydx)
{
    UWORD8 *pu1_ipel;
    __m128i row0_16x8b, row1_16x8b, row2_16x8b, row3_16x8b;
    __m128i row4_16x8b, row5_16x8b, row6_16x8b;
    __m128i ipel_16x8b, out_16x8b;

    UNUSED(pu1_tmp);

    /* Rows averaged with the filter output: bit 3 of the low nibble of dydx */
    /* picks either the current source row or the one below it.              */
    pu1_ipel = pu1_src + ((dydx & 0xf) >> 3) * src_strd;

    /* The filter window covers rows x[-2] .. x[3]. */
    pu1_src -= src_strd << 1;

    if(wd == 4)
    {
        __m128i filt_8x16b;

        /* Prologue: fetch the first five rows of the window; the sixth and */
        /* seventh rows are fetched inside the loop.                        */
        row0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src);
        row1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + src_strd));
        row2_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 2 * src_strd));
        row3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 3 * src_strd));
        row4_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 4 * src_strd));
        pu1_src += 5 * src_strd;

        /* Pair every row with its successor: 4 pixels of row n followed by */
        /* 4 pixels of row n+1, so one pass yields two output rows.         */
        row0_16x8b = _mm_unpacklo_epi32(row0_16x8b, row1_16x8b);
        row1_16x8b = _mm_unpacklo_epi32(row1_16x8b, row2_16x8b);
        row2_16x8b = _mm_unpacklo_epi32(row2_16x8b, row3_16x8b);
        row3_16x8b = _mm_unpacklo_epi32(row3_16x8b, row4_16x8b);

        do
        {
            row5_16x8b = _mm_loadl_epi64((__m128i *)pu1_src);
            row6_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + src_strd));
            row4_16x8b = _mm_unpacklo_epi32(row4_16x8b, row5_16x8b);
            row5_16x8b = _mm_unpacklo_epi32(row5_16x8b, row6_16x8b);

            filt_8x16b = ih264_vert_qpel_sixtap_ssse3(
                            _mm_unpacklo_epi8(row0_16x8b, row1_16x8b),
                            _mm_unpacklo_epi8(row2_16x8b, row3_16x8b),
                            _mm_unpacklo_epi8(row4_16x8b, row5_16x8b));

            /* two 4-pixel rows to average against, packed like the result */
            ipel_16x8b = _mm_unpacklo_epi32(
                            _mm_loadl_epi64((__m128i *)pu1_ipel),
                            _mm_loadl_epi64((__m128i *)(pu1_ipel + src_strd)));

            /* saturate to 8 bits, then average to obtain the q-pel samples */
            out_16x8b = _mm_avg_epu8(ipel_16x8b,
                                     _mm_packus_epi16(filt_8x16b, filt_8x16b));

            *((WORD32 *)(pu1_dst)) = _mm_cvtsi128_si32(out_16x8b);
            *((WORD32 *)(pu1_dst + dst_strd)) =
                            _mm_cvtsi128_si32(_mm_srli_si128(out_16x8b, 4));

            /* slide the window down by two rows */
            row0_16x8b = row2_16x8b;
            row1_16x8b = row3_16x8b;
            row2_16x8b = row4_16x8b;
            row3_16x8b = row5_16x8b;
            row4_16x8b = row6_16x8b;

            pu1_src += src_strd << 1;
            pu1_ipel += src_strd << 1;
            pu1_dst += dst_strd << 1;
            ht -= 2;
        }
        while(ht > 0);
    }
    else if(wd == 8)
    {
        __m128i filt_8x16b;

        /* Prologue: fetch the first five rows of the window; the sixth and */
        /* seventh rows are fetched inside the loop.                        */
        row0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src);
        row1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + src_strd));
        row2_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 2 * src_strd));
        row3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 3 * src_strd));
        row4_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 4 * src_strd));
        pu1_src += 5 * src_strd;

        /* Low half holds row n, high half holds row n+1. */
        row0_16x8b = _mm_unpacklo_epi64(row0_16x8b, row1_16x8b);
        row1_16x8b = _mm_unpacklo_epi64(row1_16x8b, row2_16x8b);
        row2_16x8b = _mm_unpacklo_epi64(row2_16x8b, row3_16x8b);
        row3_16x8b = _mm_unpacklo_epi64(row3_16x8b, row4_16x8b);

        do
        {
            row5_16x8b = _mm_loadl_epi64((__m128i *)pu1_src);
            row6_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + src_strd));
            row4_16x8b = _mm_unpacklo_epi64(row4_16x8b, row5_16x8b);
            row5_16x8b = _mm_unpacklo_epi64(row5_16x8b, row6_16x8b);

            /* first output row: low halves of the paired registers */
            filt_8x16b = ih264_vert_qpel_sixtap_ssse3(
                            _mm_unpacklo_epi8(row0_16x8b, row1_16x8b),
                            _mm_unpacklo_epi8(row2_16x8b, row3_16x8b),
                            _mm_unpacklo_epi8(row4_16x8b, row5_16x8b));
            ipel_16x8b = _mm_loadl_epi64((__m128i *)pu1_ipel);
            out_16x8b = _mm_avg_epu8(ipel_16x8b,
                                     _mm_packus_epi16(filt_8x16b, filt_8x16b));
            _mm_storel_epi64((__m128i *)pu1_dst, out_16x8b);

            /* second output row: high halves of the paired registers */
            filt_8x16b = ih264_vert_qpel_sixtap_ssse3(
                            _mm_unpackhi_epi8(row0_16x8b, row1_16x8b),
                            _mm_unpackhi_epi8(row2_16x8b, row3_16x8b),
                            _mm_unpackhi_epi8(row4_16x8b, row5_16x8b));
            ipel_16x8b = _mm_loadl_epi64((__m128i *)(pu1_ipel + src_strd));
            out_16x8b = _mm_avg_epu8(ipel_16x8b,
                                     _mm_packus_epi16(filt_8x16b, filt_8x16b));
            _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd), out_16x8b);

            /* slide the window down by two rows */
            row0_16x8b = row2_16x8b;
            row1_16x8b = row3_16x8b;
            row2_16x8b = row4_16x8b;
            row3_16x8b = row5_16x8b;
            row4_16x8b = row6_16x8b;

            pu1_src += src_strd << 1;
            pu1_ipel += src_strd << 1;
            pu1_dst += dst_strd << 1;
            ht -= 2;
        }
        while(ht > 0);
    }
    else // wd == 16
    {
        __m128i filt_lo_8x16b, filt_hi_8x16b;

        /* Prologue: fetch the first five rows of the window; the sixth and */
        /* seventh rows are fetched inside the loop.                        */
        row0_16x8b = _mm_loadu_si128((__m128i *)pu1_src);
        row1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + src_strd));
        row2_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + 2 * src_strd));
        row3_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + 3 * src_strd));
        row4_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + 4 * src_strd));
        pu1_src += 5 * src_strd;

        do
        {
            row5_16x8b = _mm_loadu_si128((__m128i *)pu1_src);
            row6_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + src_strd));

            /* first output row, filtered from rows 0..5 of the window */
            filt_lo_8x16b = ih264_vert_qpel_sixtap_ssse3(
                            _mm_unpacklo_epi8(row0_16x8b, row1_16x8b),
                            _mm_unpacklo_epi8(row2_16x8b, row3_16x8b),
                            _mm_unpacklo_epi8(row4_16x8b, row5_16x8b));
            filt_hi_8x16b = ih264_vert_qpel_sixtap_ssse3(
                            _mm_unpackhi_epi8(row0_16x8b, row1_16x8b),
                            _mm_unpackhi_epi8(row2_16x8b, row3_16x8b),
                            _mm_unpackhi_epi8(row4_16x8b, row5_16x8b));
            ipel_16x8b = _mm_loadu_si128((__m128i *)pu1_ipel);
            out_16x8b = _mm_avg_epu8(ipel_16x8b,
                            _mm_packus_epi16(filt_lo_8x16b, filt_hi_8x16b));
            _mm_storeu_si128((__m128i *)pu1_dst, out_16x8b);

            /* second output row, filtered from rows 1..6 of the window */
            filt_lo_8x16b = ih264_vert_qpel_sixtap_ssse3(
                            _mm_unpacklo_epi8(row1_16x8b, row2_16x8b),
                            _mm_unpacklo_epi8(row3_16x8b, row4_16x8b),
                            _mm_unpacklo_epi8(row5_16x8b, row6_16x8b));
            filt_hi_8x16b = ih264_vert_qpel_sixtap_ssse3(
                            _mm_unpackhi_epi8(row1_16x8b, row2_16x8b),
                            _mm_unpackhi_epi8(row3_16x8b, row4_16x8b),
                            _mm_unpackhi_epi8(row5_16x8b, row6_16x8b));
            ipel_16x8b = _mm_loadu_si128((__m128i *)(pu1_ipel + src_strd));
            out_16x8b = _mm_avg_epu8(ipel_16x8b,
                            _mm_packus_epi16(filt_lo_8x16b, filt_hi_8x16b));
            _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), out_16x8b);

            /* slide the window down by two rows */
            row0_16x8b = row2_16x8b;
            row1_16x8b = row3_16x8b;
            row2_16x8b = row4_16x8b;
            row3_16x8b = row5_16x8b;
            row4_16x8b = row6_16x8b;

            pu1_src += src_strd << 1;
            pu1_ipel += src_strd << 1;
            pu1_dst += dst_strd << 1;
            ht -= 2;
        }
        while(ht > 0);
    }
}
/*****************************************************************************/
/* */
/* Function Name : ih264_inter_pred_luma_horz_qpel_vert_qpel_ssse3 */
/* */
/* Description : This function implements a six-tap filter vertically and */
/* horizontally on ht x wd block separately and averages */
/* the two sets of values to calculate values at (1/4,1/4), */
/* (1/4, 3/4), (3/4, 1/4) or (3/4, 3/4) as mentioned in */
/* sec. 8.4.2.2.1 titled "Luma sample interpolation */
/* process". (ht,wd) can be (4,4), (8,4), (4,8), (8,8), */
/* (16,8), (8,16) or (16,16). */
/* */
/* Inputs : pu1_src - pointer to source */
/* pu1_dst - pointer to destination */
/* src_strd - stride for source */
/* dst_strd - stride for destination */
/* ht - height of the block */
/* wd - width of the block */
/* pu1_tmp - pointer to temporary buffer */
/* dydx - x and y reference offset for q-pel */
/* calculations */
/* */
/* Issues : None */
/* */
/* Revision History: */
/* */
/* DD MM YYYY Author(s) Changes */
/* 13 02 2015 Kaushik Initial Version */
/* Senthoor */
/* */
/*****************************************************************************/
void ih264_inter_pred_luma_horz_qpel_vert_qpel_ssse3(UWORD8 *pu1_src,
UWORD8 *pu1_dst,
WORD32 src_strd,
WORD32 dst_strd,
WORD32 ht,
WORD32 wd,
UWORD8* pu1_tmp,
WORD32 dydx)
{
WORD32 ht_temp;
UWORD8 *pu1_pred_vert,*pu1_pred_horiz;
UWORD8 *pu1_tmp1, *pu1_tmp2;
WORD32 x_offset, y_offset;
__m128i coeff0_1_16x8b, coeff2_3_16x8b, coeff4_5_16x8b;
__m128i const_val16_8x16b;
pu1_tmp1 = pu1_tmp;
dydx &= 0xf;
ht_temp = ht;
x_offset = dydx & 0x3;
y_offset = dydx >> 2;
pu1_tmp2 = pu1_tmp1;
pu1_pred_vert = pu1_src + (x_offset >> 1) - 2*src_strd;
pu1_pred_horiz = pu1_src + (y_offset >> 1) * src_strd - 2;
//the filter input starts from x[-2] (till x[3])
coeff0_1_16x8b = _mm_set1_epi32(0xFB01FB01); //c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1
coeff2_3_16x8b = _mm_set1_epi32(0x14141414); //c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3
coeff4_5_16x8b = _mm_set1_epi32(0x01FB01FB); //c4 c5 c5 c5 c4 c5 c5 c5 c4 c5 c5 c5 c4 c5 c5 c5
//c0 = c5 = 1, c1 = c4 = -5, c2 = c3 = 20
const_val16_8x16b = _mm_set1_epi16(16);
if(wd == 4)
{
//vertical q-pel filter
{
__m128i src_r0_16x8b, src_r1_16x8b, src_r2_16x8b, src_r3_16x8b, src_r4_16x8b;
__m128i src_r5_16x8b, src_r6_16x8b;
__m128i src_r0r1_16x8b, src_r2r3_16x8b, src_r4r5_16x8b;
__m128i res_r0r1_16x8b, res_t1_8x16b, res_t2_8x16b, res_t3_8x16b;
//epilogue: Load all the pred rows except sixth and seventh row for the
//first and second row processing.
src_r0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_pred_vert));
pu1_pred_vert = pu1_pred_vert + src_strd;
src_r1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_pred_vert));
pu1_pred_vert = pu1_pred_vert + src_strd;
src_r0_16x8b = _mm_unpacklo_epi32(src_r0_16x8b, src_r1_16x8b);
src_r2_16x8b = _mm_loadl_epi64((__m128i *)(pu1_pred_vert));
pu1_pred_vert = pu1_pred_vert + src_strd;
src_r1_16x8b = _mm_unpacklo_epi32(src_r1_16x8b, src_r2_16x8b);
src_r3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_pred_vert));
pu1_pred_vert = pu1_pred_vert + src_strd;
src_r2_16x8b = _mm_unpacklo_epi32(src_r2_16x8b, src_r3_16x8b);
src_r4_16x8b = _mm_loadl_epi64((__m128i *)(pu1_pred_vert));
pu1_pred_vert = pu1_pred_vert + src_strd;
src_r3_16x8b = _mm_unpacklo_epi32(src_r3_16x8b, src_r4_16x8b);
//Core Loop: Process all the rows.
do
{
src_r5_16x8b = _mm_loadl_epi64((__m128i *)(pu1_pred_vert));
src_r4_16x8b = _mm_unpacklo_epi32(src_r4_16x8b, src_r5_16x8b);
src_r6_16x8b = _mm_loadl_epi64((__m128i *)(pu1_pred_vert + src_strd));
src_r5_16x8b = _mm_unpacklo_epi32(src_r5_16x8b, src_r6_16x8b);
src_r0r1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r1_16x8b);
src_r2r3_16x8b = _mm_unpacklo_epi8(src_r2_16x8b, src_r3_16x8b);
src_r4r5_16x8b = _mm_unpacklo_epi8(src_r4_16x8b, src_r5_16x8b);
res_t1_8x16b = _mm_maddubs_epi16(src_r0r1_16x8b, coeff0_1_16x8b);
res_t2_8x16b = _mm_maddubs_epi16(src_r2r3_16x8b, coeff2_3_16x8b);
res_t3_8x16b = _mm_maddubs_epi16(src_r4r5_16x8b, coeff4_5_16x8b);
res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t2_8x16b);
res_t3_8x16b = _mm_add_epi16(const_val16_8x16b, res_t3_8x16b);
res_t1_8x16b = _mm_add_epi16(res_t3_8x16b, res_t1_8x16b);
res_t1_8x16b = _mm_srai_epi16(res_t1_8x16b, 5); //shifting right by 5 bits.
res_r0r1_16x8b = _mm_packus_epi16(res_t1_8x16b, res_t1_8x16b);
_mm_storel_epi64((__m128i *)pu1_tmp1, res_r0r1_16x8b);
src_r0_16x8b = src_r2_16x8b;
src_r1_16x8b = src_r3_16x8b;
src_r2_16x8b = src_r4_16x8b;
src_r3_16x8b = src_r5_16x8b;
src_r4_16x8b = src_r6_16x8b;
ht_temp -= 2;
pu1_pred_vert += src_strd << 1;
pu1_tmp1 += 8;
}
while(ht_temp > 0);
}
//horizontal q-pel filter
{
__m128i src_r0_16x8b, src_r1_16x8b;
__m128i src_r0_sht_16x8b, src_r1_sht_16x8b;
__m128i src_r0r1_vpel_16x8b, src_r0r1_t1_16x8b;
__m128i res_r0r1_t1_8x16b, res_r0r1_t2_8x16b, res_r0r1_t3_8x16b;
__m128i res_r0r1_16x8b;
//Row0 : a0 a1 a2 a3 a4 a5 a6 a7 a8 a9.....
//Row1 : b0 b1 b2 b3 b4 b5 b6 b7 b8 b9.....
do
{
src_r0_16x8b = _mm_loadu_si128((__m128i *)pu1_pred_horiz); //a0 a1 a2 a3 a4 a5 a6 a7 a8 a9....a15
src_r1_16x8b = _mm_loadu_si128((__m128i *)(pu1_pred_horiz + src_strd)); //b0 b1 b2 b3 b4 b5 b6 b7 b8 b9....b15
src_r0r1_vpel_16x8b = _mm_loadl_epi64((__m128i *)pu1_tmp2);
src_r0_sht_16x8b = _mm_srli_si128(src_r0_16x8b, 1); //a1 a2 a3 a4 a5 a6 a7 a8 a9....a15 0
src_r1_sht_16x8b = _mm_srli_si128(src_r1_16x8b, 1); //b1 b2 b3 b4 b5 b6 b7 b8 b9....b15 0
src_r0_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r0_sht_16x8b); //a0 a1 a1 a2 a2 a3 a3 a4 a4 a5 a5 a6 a6 a7 a7 a8
src_r1_16x8b = _mm_unpacklo_epi8(src_r1_16x8b, src_r1_sht_16x8b); //b0 b1 b1 b2 b2 b3 b3 b4 b4 b5 b5 b6 b6 b7 b7 b8
src_r0r1_t1_16x8b = _mm_unpacklo_epi64(src_r0_16x8b, src_r1_16x8b); //a0 a1 a1 a2 a2 a3 a3 a4 b0 b1 b1 b2 b2 b3 b3 b4
res_r0r1_t1_8x16b = _mm_maddubs_epi16(src_r0r1_t1_16x8b, coeff0_1_16x8b); //a0*c0+a1*c1 a1*c0+a2*c1 a2*c0+a3*c1 a3*c0+a4*c1
//b0*c0+b1*c1 b1*c0+b2*c1 b2*c0+b3*c1 b3*c0+b4*c1
src_r0_16x8b = _mm_srli_si128(src_r0_16x8b, 4); //a2 a3 a3 a4 a4 a5 a5 a6 a6 a7 a7 a8 0 0 0 0
src_r1_16x8b = _mm_srli_si128(src_r1_16x8b, 4); //b2 b3 b3 b4 b4 b5 b5 b6 b6 b7 b7 b8 0 0 0 0
src_r0r1_t1_16x8b = _mm_unpacklo_epi64(src_r0_16x8b, src_r1_16x8b); //a2 a3 a3 a4 a4 a5 a5 a6 b2 b3 b3 b4 b4 b5 b5 b6
res_r0r1_t2_8x16b = _mm_maddubs_epi16(src_r0r1_t1_16x8b, coeff2_3_16x8b); //a2*c2+a3*c3 a3*c2+a4*c3 a4*c2+a5*c3 a5*c2+a6*c3
//b2*c2+b3*c3 b3*c2+b4*c3 b4*c2+b5*c3 b5*c2+b6*c3
src_r0_16x8b = _mm_srli_si128(src_r0_16x8b, 4); //a4 a5 a5 a6 a6 a7 a7 a8 0 0 0 0 0 0 0 0
src_r1_16x8b = _mm_srli_si128(src_r1_16x8b, 4); //b4 b5 b5 b6 b6 b7 b7 b8 0 0 0 0 0 0 0 0
src_r0r1_t1_16x8b = _mm_unpacklo_epi64(src_r0_16x8b, src_r1_16x8b); //a4 a5 a5 a6 a6 a7 a7 a8 b4 b5 b5 b6 b6 b7 b7 b8
res_r0r1_t3_8x16b = _mm_maddubs_epi16(src_r0r1_t1_16x8b, coeff4_5_16x8b); //a4*c4+a5*c5 a5*c4+a6*c5 a6*c4+a7*c5 a7*c4+a8*c5
//b4*c4+b5*c5 b5*c4+b6*c5 b4*c6+b7*c5 b7*c4+b8*c5
res_r0r1_t1_8x16b = _mm_add_epi16(res_r0r1_t1_8x16b, res_r0r1_t2_8x16b);
res_r0r1_t3_8x16b = _mm_add_epi16(const_val16_8x16b, res_r0r1_t3_8x16b);
res_r0r1_t1_8x16b = _mm_add_epi16(res_r0r1_t1_8x16b, res_r0r1_t3_8x16b); //a0*c0+a1*c1+a2*c2+a3*c3+a4*a4+a5*c5 + 15;
//a1*c0+a2*c1+a2*c2+a3*c3+a5*a4+a6*c5 + 15;
//a2*c0+a3*c1+a4*c2+a5*c3+a6*a4+a7*c5 + 15;
//a3*c0+a4*c1+a5*c2+a6*c3+a6*a4+a8*c5 + 15;
//b0*c0+b1*c1+b2*c2+b3*c3+b4*b4+b5*c5 + 15;
//b1*c0+b2*c1+b2*c2+b3*c3+b5*b4+b6*c5 + 15;
//b2*c0+b3*c1+b4*c2+b5*c3+b6*b4+b7*c5 + 15;
//b3*c0+b4*c1+b5*c2+b6*c3+b6*b4+b8*c5 + 15;
res_r0r1_t1_8x16b = _mm_srai_epi16(res_r0r1_t1_8x16b, 5); //shifting right by 5 bits.
res_r0r1_16x8b = _mm_packus_epi16(res_r0r1_t1_8x16b,res_r0r1_t1_8x16b);
res_r0r1_16x8b = _mm_avg_epu8(res_r0r1_16x8b,src_r0r1_vpel_16x8b);
*((WORD32 *)(pu1_dst)) = _mm_cvtsi128_si32(res_r0r1_16x8b);
res_r0r1_16x8b = _mm_srli_si128(res_r0r1_16x8b, 4);
*((WORD32 *)(pu1_dst + dst_strd)) = _mm_cvtsi128_si32(res_r0r1_16x8b);
ht -= 2;
pu1_pred_horiz += src_strd << 1;
pu1_tmp2 += 8;
pu1_dst += dst_strd << 1;
}
while(ht > 0);
}
}
else if(wd == 8)
{
//vertical q-pel filter
{
__m128i src_r0_16x8b, src_r1_16x8b, src_r2_16x8b, src_r3_16x8b;
__m128i src_r4_16x8b, src_r5_16x8b, src_r6_16x8b;
__m128i src_r0r1_16x8b, src_r2r3_16x8b, src_r4r5_16x8b;
__m128i res_16x8b, res_t1_8x16b, res_t2_8x16b, res_t3_8x16b;
//epilogue: Load all the pred rows except sixth and seventh row for the
//first and second row processing.
src_r0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_pred_vert));
pu1_pred_vert = pu1_pred_vert + src_strd;
src_r1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_pred_vert));
pu1_pred_vert = pu1_pred_vert + src_strd;
src_r0_16x8b = _mm_unpacklo_epi64(src_r0_16x8b, src_r1_16x8b);
src_r2_16x8b = _mm_loadl_epi64((__m128i *)(pu1_pred_vert));
pu1_pred_vert = pu1_pred_vert + src_strd;
src_r1_16x8b = _mm_unpacklo_epi64(src_r1_16x8b, src_r2_16x8b);
src_r3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_pred_vert));
pu1_pred_vert = pu1_pred_vert + src_strd;
src_r2_16x8b = _mm_unpacklo_epi64(src_r2_16x8b, src_r3_16x8b);
src_r4_16x8b = _mm_loadl_epi64((__m128i *)(pu1_pred_vert));
pu1_pred_vert = pu1_pred_vert + src_strd;
src_r3_16x8b = _mm_unpacklo_epi64(src_r3_16x8b, src_r4_16x8b);
//Core Loop: Process all the rows.
do
{
src_r5_16x8b = _mm_loadl_epi64((__m128i *)(pu1_pred_vert));
src_r4_16x8b = _mm_unpacklo_epi64(src_r4_16x8b, src_r5_16x8b);
src_r6_16x8b = _mm_loadl_epi64((__m128i *)(pu1_pred_vert + src_strd));
src_r5_16x8b = _mm_unpacklo_epi64(src_r5_16x8b, src_r6_16x8b);
src_r0r1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r1_16x8b);
src_r2r3_16x8b = _mm_unpacklo_epi8(src_r2_16x8b, src_r3_16x8b);
src_r4r5_16x8b = _mm_unpacklo_epi8(src_r4_16x8b, src_r5_16x8b);
res_t1_8x16b = _mm_maddubs_epi16(src_r0r1_16x8b, coeff0_1_16x8b);
res_t2_8x16b = _mm_maddubs_epi16(src_r2r3_16x8b, coeff2_3_16x8b);
res_t3_8x16b = _mm_maddubs_epi16(src_r4r5_16x8b, coeff4_5_16x8b);
res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t2_8x16b);
res_t3_8x16b = _mm_add_epi16(const_val16_8x16b, res_t3_8x16b);
res_t1_8x16b = _mm_add_epi16(res_t3_8x16b, res_t1_8x16b);
res_t1_8x16b = _mm_srai_epi16(res_t1_8x16b, 5); //shifting right by 5 bits.
res_16x8b = _mm_packus_epi16(res_t1_8x16b, res_t1_8x16b);
_mm_storel_epi64((__m128i *)(pu1_tmp1), res_16x8b);
src_r0r1_16x8b = _mm_unpackhi_epi8(src_r0_16x8b, src_r1_16x8b);
src_r2r3_16x8b = _mm_unpackhi_epi8(src_r2_16x8b, src_r3_16x8b);
src_r4r5_16x8b = _mm_unpackhi_epi8(src_r4_16x8b, src_r5_16x8b);
res_t1_8x16b = _mm_maddubs_epi16(src_r0r1_16x8b, coeff0_1_16x8b);
res_t2_8x16b = _mm_maddubs_epi16(src_r2r3_16x8b, coeff2_3_16x8b);
res_t3_8x16b = _mm_maddubs_epi16(src_r4r5_16x8b, coeff4_5_16x8b);
res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t2_8x16b);
res_t3_8x16b = _mm_add_epi16(const_val16_8x16b, res_t3_8x16b);
res_t1_8x16b = _mm_add_epi16(res_t3_8x16b, res_t1_8x16b);
res_t1_8x16b = _mm_srai_epi16(res_t1_8x16b, 5); //shifting right by 5 bits.
res_16x8b = _mm_packus_epi16(res_t1_8x16b, res_t1_8x16b);
_mm_storel_epi64((__m128i *)(pu1_tmp1 + 8), res_16x8b);
src_r0_16x8b = src_r2_16x8b;
src_r1_16x8b = src_r3_16x8b;
src_r2_16x8b = src_r4_16x8b;
src_r3_16x8b = src_r5_16x8b;
src_r4_16x8b = src_r6_16x8b;
ht_temp -= 2;
pu1_pred_vert += src_strd << 1;
pu1_tmp1 += 16;
}
while(ht_temp > 0);
}
//horizontal q-pel filter
{
__m128i src_r0_16x8b, src_r1_16x8b, src_r0_sht_16x8b, src_r1_sht_16x8b;
__m128i src_r0_t1_16x8b, src_r1_t1_16x8b;
__m128i src_r0_vpel_16x8b, src_r1_vpel_16x8b;
__m128i res_r0_t1_8x16b, res_r0_t2_8x16b, res_r0_t3_8x16b;
__m128i res_r1_t1_8x16b, res_r1_t2_8x16b, res_r1_t3_8x16b, res_16x8b;
//Row0 : a0 a1 a2 a3 a4 a5 a6 a7 a8 a9.....
//Row1 : b0 b1 b2 b3 b4 b5 b6 b7 b8 b9.....
do
{
src_r0_16x8b = _mm_loadu_si128((__m128i *)(pu1_pred_horiz)); //a0 a1 a2 a3 a4 a5 a6 a7 a8 a9....a15
src_r1_16x8b = _mm_loadu_si128((__m128i *)(pu1_pred_horiz + src_strd)); //b0 b1 b2 b3 b4 b5 b6 b7 b8 b9....b15
src_r0_vpel_16x8b = _mm_loadl_epi64((__m128i *)(pu1_tmp2)); //a2 a3 a4 a5 a6 a7 a8....a15 0 or
//a3 a4 a5 a6 a7 a8 a9....a15 0
src_r1_vpel_16x8b = _mm_loadl_epi64((__m128i *)(pu1_tmp2 + 8));
//b2 b3 b4 b5 b6 b7 b8....b15 0 or
//b3 b4 b5 b6 b7 b8 b9....b15 0
src_r0_sht_16x8b = _mm_srli_si128(src_r0_16x8b, 1); //a1 a2 a3 a4 a5 a6 a7 a8 a9....a15 0
src_r1_sht_16x8b = _mm_srli_si128(src_r1_16x8b, 1); //b1 b2 b3 b4 b5 b6 b7 b8 b9....b15 0
src_r0_t1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r0_sht_16x8b); //a0 a1 a1 a2 a2 a3 a3 a4 a4 a5 a5 a6 a6 a7 a7 a8
src_r1_t1_16x8b = _mm_unpacklo_epi8(src_r1_16x8b, src_r1_sht_16x8b); //b0 b1 b1 b2 b2 b3 b3 b4 b4 b5 b5 b6 b6 b7 b7 b8
res_r0_t1_8x16b = _mm_maddubs_epi16(src_r0_t1_16x8b, coeff0_1_16x8b); //a0*c0+a1*c1 a1*c0+a2*c1 a2*c0+a3*c1 a3*c0+a4*c1
//a4*c0+a5*c1 a5*c0+a6*c1 a6*c0+a7*c1 a7*c0+a8*c1
res_r1_t1_8x16b = _mm_maddubs_epi16(src_r1_t1_16x8b, coeff0_1_16x8b); //b0*c0+b1*c1 b1*c0+b2*c1 b2*c0+b3*c1 b3*c0+b4*c1
//b4*c0+b5*c1 b5*c0+b6*c1 b6*c0+b7*c1 b7*c0+b8*c1
src_r0_16x8b = _mm_srli_si128(src_r0_16x8b, 2); //a2 a3 a4 a5 a6 a7 a8 a9....a15 0 0
src_r1_16x8b = _mm_srli_si128(src_r1_16x8b, 2); //b2 b3 b4 b5 b6 b7 b8 b9....b15 0 0
src_r0_sht_16x8b = _mm_srli_si128(src_r0_sht_16x8b, 2); //a3 a4 a5 a6 a7 a8 a9....a15 0 0 0
src_r1_sht_16x8b = _mm_srli_si128(src_r1_sht_16x8b, 2); //b3 b4 b5 b6 b7 b8 b9....b15 0 0 0
src_r0_t1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r0_sht_16x8b); //a2 a3 a3 a4 a4 a5 a5 a6 a6 a7 a7 a8 a8 a9 a9 a10
src_r1_t1_16x8b = _mm_unpacklo_epi8(src_r1_16x8b, src_r1_sht_16x8b); //b2 b3 b3 b4 b4 b5 b5 b6 b6 b7 b7 b8 b8 b9 b9 b10
res_r0_t2_8x16b = _mm_maddubs_epi16(src_r0_t1_16x8b, coeff2_3_16x8b); //a2*c2+a3*c3 a3*c2+a4*c3 a4*c2+a5*c3 a5*c2+a6*c3
//a6*c2+a7*c3 a7*c2+a8*c3 a8*c2+a9*c3 a9*c2+a10*c3
res_r1_t2_8x16b = _mm_maddubs_epi16(src_r1_t1_16x8b, coeff2_3_16x8b); //b2*c2+b3*c3 b3*c2+b4*c3 b4*c2+b5*c3 b5*c2+b6*c3
//b6*c2+b7*c3 b7*c2+b8*c3 b8*c2+b9*c3 b9*c2+b10*c3
src_r0_16x8b = _mm_srli_si128(src_r0_16x8b, 2); //a4 a5 a6 a7 a8 a9....a15 0 0 0 0
src_r1_16x8b = _mm_srli_si128(src_r1_16x8b, 2); //b4 b5 b6 b7 b8 b9....b15 0 0 0 0
src_r0_sht_16x8b = _mm_srli_si128(src_r0_sht_16x8b, 2); //a5 a6 a7 a8 a9....a15 0 0 0 0 0
src_r1_sht_16x8b = _mm_srli_si128(src_r1_sht_16x8b, 2); //b5 b6 b7 b8 b9....b15 0 0 0 0 0
src_r0_t1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r0_sht_16x8b); //a4 a5 a5 a6 a6 a7 a7 a8 a8 a9 a9 a10 a10 a11 a11 a12
src_r1_t1_16x8b = _mm_unpacklo_epi8(src_r1_16x8b, src_r1_sht_16x8b); //b4 b5 b5 b6 b6 b7 b7 b8 b8 b9 b9 b10 b10 b11 b11 b12
res_r0_t3_8x16b = _mm_maddubs_epi16(src_r0_t1_16x8b, coeff4_5_16x8b); //a4*c4+a5*c5 a5*c4+a6*c5 a6*c4+a7*c5 a7*c4+a8*c5
//a8*c4+a9*c5 a9*c4+a10*c5 a10*c4+a11*c5 a11*c4+a12*c5
res_r1_t3_8x16b = _mm_maddubs_epi16(src_r1_t1_16x8b, coeff4_5_16x8b); //b4*c4+b5*c5 b5*c4+b6*c5 b6*c4+b7*c5 b7*c4+b8*c5
//b8*c4+b9*c5 b9*c4+b10*c5 b10*c4+b11*c5 b11*c4+b12*c5
res_r0_t1_8x16b = _mm_add_epi16(res_r0_t1_8x16b, res_r0_t2_8x16b);
res_r0_t3_8x16b = _mm_add_epi16(const_val16_8x16b, res_r0_t3_8x16b);
res_r0_t1_8x16b = _mm_add_epi16(res_r0_t1_8x16b, res_r0_t3_8x16b);
res_r0_t1_8x16b = _mm_srai_epi16(res_r0_t1_8x16b, 5); //shifting right by 5 bits.
res_16x8b = _mm_packus_epi16(res_r0_t1_8x16b, res_r0_t1_8x16b);
res_16x8b = _mm_avg_epu8(res_16x8b, src_r0_vpel_16x8b);
_mm_storel_epi64((__m128i *)(pu1_dst), res_16x8b);
res_r1_t1_8x16b = _mm_add_epi16(res_r1_t1_8x16b, res_r1_t2_8x16b);
res_r1_t3_8x16b = _mm_add_epi16(const_val16_8x16b, res_r1_t3_8x16b);
res_r1_t1_8x16b = _mm_add_epi16(res_r1_t1_8x16b, res_r1_t3_8x16b);
res_r1_t1_8x16b = _mm_srai_epi16(res_r1_t1_8x16b, 5); //shifting right by 5 bits.
res_16x8b = _mm_packus_epi16(res_r1_t1_8x16b, res_r1_t1_8x16b);
res_16x8b = _mm_avg_epu8(res_16x8b,src_r1_vpel_16x8b);
_mm_storel_epi64((__m128i *)(pu1_dst + dst_strd), res_16x8b);
ht -= 2;
pu1_pred_horiz += src_strd << 1;
pu1_dst += dst_strd << 1;
pu1_tmp2 += 16;
}
while(ht > 0);
}
}
else // wd == 16
{
//vertical q-pel filter
{
__m128i src_r0_16x8b, src_r1_16x8b, src_r2_16x8b, src_r3_16x8b;
__m128i src_r4_16x8b, src_r5_16x8b, src_r6_16x8b;
__m128i src_r0r1_16x8b, src_r2r3_16x8b, src_r4r5_16x8b;
__m128i res_t0_8x16b, res_t1_8x16b, res_t2_8x16b, res_t3_8x16b;
__m128i res_16x8b;
//prologue: Load the first five pred rows; the sixth and seventh rows are
//loaded inside the loop for the first and second row processing.
src_r0_16x8b = _mm_loadu_si128((__m128i *)(pu1_pred_vert));
pu1_pred_vert = pu1_pred_vert + src_strd;
src_r1_16x8b = _mm_loadu_si128((__m128i *)(pu1_pred_vert));
pu1_pred_vert = pu1_pred_vert + src_strd;
src_r2_16x8b = _mm_loadu_si128((__m128i *)(pu1_pred_vert));
pu1_pred_vert = pu1_pred_vert + src_strd;
src_r3_16x8b = _mm_loadu_si128((__m128i *)(pu1_pred_vert));
pu1_pred_vert = pu1_pred_vert + src_strd;
src_r4_16x8b = _mm_loadu_si128((__m128i *)(pu1_pred_vert));
pu1_pred_vert = pu1_pred_vert + src_strd;
//Core Loop: Process all the rows.
do
{
src_r5_16x8b = _mm_loadu_si128((__m128i *)(pu1_pred_vert));
src_r6_16x8b = _mm_loadu_si128((__m128i *)(pu1_pred_vert + src_strd));
src_r0r1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r1_16x8b);
src_r2r3_16x8b = _mm_unpacklo_epi8(src_r2_16x8b, src_r3_16x8b);
src_r4r5_16x8b = _mm_unpacklo_epi8(src_r4_16x8b, src_r5_16x8b);
res_t1_8x16b = _mm_maddubs_epi16(src_r0r1_16x8b, coeff0_1_16x8b);
res_t2_8x16b = _mm_maddubs_epi16(src_r2r3_16x8b, coeff2_3_16x8b);
res_t3_8x16b = _mm_maddubs_epi16(src_r4r5_16x8b, coeff4_5_16x8b);
res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t2_8x16b);
res_t3_8x16b = _mm_add_epi16(const_val16_8x16b, res_t3_8x16b);
res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t3_8x16b);
res_t0_8x16b = _mm_srai_epi16(res_t1_8x16b, 5); //shifting right by 5 bits.
src_r0r1_16x8b = _mm_unpackhi_epi8(src_r0_16x8b, src_r1_16x8b);
src_r2r3_16x8b = _mm_unpackhi_epi8(src_r2_16x8b, src_r3_16x8b);
src_r4r5_16x8b = _mm_unpackhi_epi8(src_r4_16x8b, src_r5_16x8b);
res_t1_8x16b = _mm_maddubs_epi16(src_r0r1_16x8b, coeff0_1_16x8b);
res_t2_8x16b = _mm_maddubs_epi16(src_r2r3_16x8b, coeff2_3_16x8b);
res_t3_8x16b = _mm_maddubs_epi16(src_r4r5_16x8b, coeff4_5_16x8b);
res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t2_8x16b);
res_t3_8x16b = _mm_add_epi16(const_val16_8x16b, res_t3_8x16b);
res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t3_8x16b);
res_t1_8x16b = _mm_srai_epi16(res_t1_8x16b, 5); //shifting right by 5 bits.
res_16x8b = _mm_packus_epi16(res_t0_8x16b, res_t1_8x16b);
_mm_storeu_si128((__m128i *)(pu1_tmp1), res_16x8b);
src_r0r1_16x8b = _mm_unpacklo_epi8(src_r1_16x8b, src_r2_16x8b);
src_r2r3_16x8b = _mm_unpacklo_epi8(src_r3_16x8b, src_r4_16x8b);
src_r4r5_16x8b = _mm_unpacklo_epi8(src_r5_16x8b, src_r6_16x8b);
res_t1_8x16b = _mm_maddubs_epi16(src_r0r1_16x8b, coeff0_1_16x8b);
res_t2_8x16b = _mm_maddubs_epi16(src_r2r3_16x8b, coeff2_3_16x8b);
res_t3_8x16b = _mm_maddubs_epi16(src_r4r5_16x8b, coeff4_5_16x8b);
res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t2_8x16b);
res_t3_8x16b = _mm_add_epi16(const_val16_8x16b, res_t3_8x16b);
res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t3_8x16b);
res_t0_8x16b = _mm_srai_epi16(res_t1_8x16b, 5); //shifting right by 5 bits.
src_r0r1_16x8b = _mm_unpackhi_epi8(src_r1_16x8b, src_r2_16x8b);
src_r2r3_16x8b = _mm_unpackhi_epi8(src_r3_16x8b, src_r4_16x8b);
src_r4r5_16x8b = _mm_unpackhi_epi8(src_r5_16x8b, src_r6_16x8b);
res_t1_8x16b = _mm_maddubs_epi16(src_r0r1_16x8b, coeff0_1_16x8b);
res_t2_8x16b = _mm_maddubs_epi16(src_r2r3_16x8b, coeff2_3_16x8b);
res_t3_8x16b = _mm_maddubs_epi16(src_r4r5_16x8b, coeff4_5_16x8b);
res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t2_8x16b);
res_t3_8x16b = _mm_add_epi16(const_val16_8x16b, res_t3_8x16b);
res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t3_8x16b);
res_t1_8x16b = _mm_srai_epi16(res_t1_8x16b, 5); //shifting right by 5 bits.
res_16x8b = _mm_packus_epi16(res_t0_8x16b, res_t1_8x16b);
_mm_storeu_si128((__m128i *)(pu1_tmp1 + 16), res_16x8b);
src_r0_16x8b = src_r2_16x8b;
src_r1_16x8b = src_r3_16x8b;
src_r2_16x8b = src_r4_16x8b;
src_r3_16x8b = src_r5_16x8b;
src_r4_16x8b = src_r6_16x8b;
ht_temp -= 2;
pu1_pred_vert += src_strd << 1;
pu1_tmp1 += 32;
}
while(ht_temp > 0);
}
//horizontal q-pel filter
{
__m128i src_r0_16x8b, src_r1_16x8b, src_r0_sht_16x8b, src_r1_sht_16x8b;
__m128i src_r0_t1_16x8b, src_r1_t1_16x8b;
__m128i src_vpel_16x8b;
__m128i res_r0_t1_8x16b, res_r0_t2_8x16b, res_r0_t3_8x16b;
__m128i res_r1_t1_8x16b, res_r1_t2_8x16b, res_r1_t3_8x16b;
__m128i res_16x8b;
//Row0 : a0 a1 a2 a3 a4 a5 a6 a7 a8 a9.....
//Row0 : b0 b1 b2 b3 b4 b5 b6 b7 b8 b9..... (same row, loaded 8 pixels to the right)
//b0 is the same pixel as a8. Similarly, every other bn pixel is the same as the a(n+8) pixel.
do
{
src_r0_16x8b = _mm_loadu_si128((__m128i *)(pu1_pred_horiz)); //a0 a1 a2 a3 a4 a5 a6 a7 a8 a9....a15
src_r1_16x8b = _mm_loadu_si128((__m128i *)(pu1_pred_horiz + 8)); //b0 b1 b2 b3 b4 b5 b6 b7 b8 b9....b15
src_vpel_16x8b = _mm_loadu_si128((__m128i *)(pu1_tmp2));
src_r0_sht_16x8b = _mm_srli_si128(src_r0_16x8b, 1); //a1 a2 a3 a4 a5 a6 a7 a8 a9....a15 0
src_r1_sht_16x8b = _mm_srli_si128(src_r1_16x8b, 1); //b1 b2 b3 b4 b5 b6 b7 b8 b9....b15 0
src_r0_t1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r0_sht_16x8b); //a0 a1 a1 a2 a2 a3 a3 a4 a4 a5 a5 a6 a6 a7 a7 a8
src_r1_t1_16x8b = _mm_unpacklo_epi8(src_r1_16x8b, src_r1_sht_16x8b); //b0 b1 b1 b2 b2 b3 b3 b4 b4 b5 b5 b6 b6 b7 b7 b8
res_r0_t1_8x16b = _mm_maddubs_epi16(src_r0_t1_16x8b, coeff0_1_16x8b); //a0*c0+a1*c1 a1*c0+a2*c1 a2*c0+a3*c1 a3*c0+a4*c1