/* blob: 6a3883ea771d00e00c6ada0a550f8d3e760b8cf5 [file] [log] [blame] */
/******************************************************************************
*
* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at:
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
******************************************************************************/
/**
*******************************************************************************
* @file
* ihevc_chroma_intra_pred_filters_atom_intr.c
*
* @brief
* Contains function definitions for the chroma intra prediction filters
*
*
* @author
* Ittiam
*
* @par List of Functions:
* ihevc_intra_pred_chroma_planar_ssse3()
*
* ihevc_intra_pred_chroma_dc_ssse3()
*
* ihevc_intra_pred_chroma_horz_ssse3()
*
* ihevc_intra_pred_chroma_ver_ssse3()
*
* ihevc_intra_pred_chroma_mode2_ssse3()
*
* ihevc_intra_pred_chroma_mode_18_34_ssse3()
*
* ihevc_intra_pred_chroma_mode_3_to_9_ssse3()
*
* ihevc_intra_pred_chroma_mode_11_to_17_ssse3()
*
* ihevc_intra_pred_chroma_mode_19_to_25_ssse3()
*
* ihevc_intra_pred_chroma_mode_27_to_33_ssse3()
*
*
*
* @remarks
* None
*
*******************************************************************************
*/
/*****************************************************************************/
/* File Includes */
/*****************************************************************************/
#include "ihevc_typedefs.h"
#include "ihevc_platform_macros.h"
#include "ihevc_macros.h"
#include "ihevc_func_selector.h"
#include "ihevc_intra_pred.h"
#include "ihevc_chroma_intra_pred.h"
#include "ihevc_common_tables.h"
#include "ihevc_tables_x86_intr.h"
#include <mmintrin.h>
#include <xmmintrin.h>
#include <emmintrin.h>
#include <immintrin.h>
/****************************************************************************/
/* Constant Macros */
/****************************************************************************/
/* NOTE(review): none of the constants below are referenced in the part of  */
/* the file reviewed here; the meanings given are inferred from the names   */
/* and values only and should be confirmed against their users.             */
/* Presumably the largest coding unit dimension - TODO confirm */
#define MAX_CU_SIZE 64
/* Presumably the number of bits per sample - TODO confirm */
#define BIT_DEPTH 8
/* Presumably 4 * nt for nt = 32 - TODO confirm */
#define T32_4NT 128
/* Presumably 4 * nt for nt = 16 - TODO confirm */
#define T16_4NT 64
/* Same value as T16_4NT; the 'C' presumably stands for chroma - TODO confirm */
#define T16C_4NT 64
/* Presumably 4 * nt for nt = 8 (chroma) - TODO confirm */
#define T8C_4NT 32
/****************************************************************************/
/* Function Macros */
/****************************************************************************/
/* Evaluates to 1 when bit 'x' of 'y' is set and to 0 otherwise.            */
/* Both arguments and the whole expansion are parenthesized so the macro    */
/* can be used safely inside larger expressions, e.g. GET_BIT(v, n) == 1.   */
/* Note that 'x' is evaluated twice, so it must not have side effects.      */
#define GET_BIT(y,x) (((y) & (1 << (x))) && (1 << (x)))
/* tables to shuffle 8-bit values */
/*****************************************************************************/
/* Function Definition */
/*****************************************************************************/
/**
*******************************************************************************
*
* @brief
*  Planar intra prediction for a chroma block whose two components are stored
*  interleaved. Refer to section 8.4.4.2.4 in the standard
*
* @par Description:
*  Sample k of the two components is read from pu1_ref[2 * k] and
*  pu1_ref[2 * k + 1]. For every row and col in [0, nt) and for each component
*  the predicted value is
*
*   ((nt - 1 - col) * ref[2 * nt - 1 - row] + (col + 1) * ref[3 * nt + 1] +
*    (nt - 1 - row) * ref[2 * nt + 1 + col] + (row + 1) * ref[nt - 1] + nt)
*       >> shift
*
*  where shift is 5, 4 and 3 for nt equal to 16, 8 and 4, and 6 for any other
*  nt. Four sample pairs (8 bytes) are produced per inner iteration. Nothing is
*  written when nt is not a multiple of 4.
*
* @param[in] pu1_ref
*  UWORD8 pointer to the interleaved reference samples
*
* @param[in] src_strd
*  integer source stride (unused)
*
* @param[out] pu1_dst
*  UWORD8 pointer to the destination; 2 * nt bytes are written per row
*
* @param[in] dst_strd
*  integer destination stride, in UWORD8 elements
*
* @param[in] nt
*  integer Transform Block size
*
* @param[in] mode
*  integer intraprediction mode (unused)
*
* @returns
*  None
*
* @remarks
*  Each inner iteration loads 16 bytes from the reference even though only
*  the lower 8 are used.
*
*******************************************************************************
*/
void ihevc_intra_pred_chroma_planar_ssse3(UWORD8 *pu1_ref,
                                          WORD32 src_strd,
                                          UWORD8 *pu1_dst,
                                          WORD32 dst_strd,
                                          WORD32 nt,
                                          WORD32 mode)
{
    WORD32 row, col;
    WORD32 shift;
    WORD32 ref_idx;
    WORD32 even_val, odd_val;
    WORD32 two_nt, three_nt;
    __m128i ref_3nt_p1_8x16b, ref_nt_m1_8x16b;
    __m128i nt_m1_8x16b, nt_8x16b, step_8x16b, zero_8x16b;

    UNUSED(src_strd);
    UNUSED(mode);

    /* Final right shift: log2(nt) + 1, with every size other than 16, 8 */
    /* and 4 treated like 32                                             */
    if(nt == 16)
    {
        shift = 5;
    }
    else if(nt == 8)
    {
        shift = 4;
    }
    else if(nt == 4)
    {
        shift = 3;
    }
    else
    {
        shift = 6;
    }

    two_nt = 2 * nt;
    three_nt = 3 * nt;

    /* Broadcast the sample pair at index (3 * nt + 1) into 16 bit lanes */
    ref_idx = 2 * (three_nt + 1);
    even_val = pu1_ref[ref_idx];
    odd_val = pu1_ref[ref_idx + 1];
    ref_3nt_p1_8x16b = _mm_set_epi16((short)odd_val, (short)even_val, (short)odd_val, (short)even_val,
                                     (short)odd_val, (short)even_val, (short)odd_val, (short)even_val);

    /* Broadcast the sample pair at index (nt - 1) into 16 bit lanes */
    ref_idx = 2 * (nt - 1);
    even_val = pu1_ref[ref_idx];
    odd_val = pu1_ref[ref_idx + 1];
    ref_nt_m1_8x16b = _mm_set_epi16((short)odd_val, (short)even_val, (short)odd_val, (short)even_val,
                                    (short)odd_val, (short)even_val, (short)odd_val, (short)even_val);

    nt_m1_8x16b = _mm_set1_epi16(nt - 1);
    nt_8x16b = _mm_set1_epi16(nt);
    /* Every inner iteration advances by 4 sample pairs */
    step_8x16b = _mm_set1_epi16(4);
    zero_8x16b = _mm_set1_epi32(0);

    if(nt % 4 != 0)
    {
        return;
    }

    for(row = 0; row < nt; row++)
    {
        __m128i ref_row_8x16b, row_wt_8x16b, col_wt_8x16b, inv_col_wt_8x16b;
        __m128i row_base_8x16b;

        /* Sample pair at index (2 * nt - 1 - row), constant along the row */
        ref_idx = 2 * (two_nt - 1 - row);
        even_val = pu1_ref[ref_idx];
        odd_val = pu1_ref[ref_idx + 1];
        ref_row_8x16b = _mm_set_epi16((short)odd_val, (short)even_val, (short)odd_val, (short)even_val,
                                      (short)odd_val, (short)even_val, (short)odd_val, (short)even_val);

        /* (nt - 1 - row) */
        row_wt_8x16b = _mm_set1_epi16((nt - 1 - row));

        /* (col + 1) and (nt - 1 - col) for the first 4 pairs; each weight */
        /* is duplicated because both components share the same column     */
        col_wt_8x16b = _mm_set_epi16(4, 4, 3, 3, 2, 2, 1, 1);
        inv_col_wt_8x16b = _mm_sub_epi16(nt_m1_8x16b, _mm_set_epi16(3, 3, 2, 2, 1, 1, 0, 0));

        /* (row + 1) * ref[nt - 1] + nt */
        row_base_8x16b = _mm_mullo_epi16(_mm_set1_epi16((row + 1)), ref_nt_m1_8x16b);
        row_base_8x16b = _mm_add_epi16(row_base_8x16b, nt_8x16b);

        for(col = 0; col < 2 * nt; col += 8)
        {
            __m128i ref_col_8x16b, sum_8x16b;

            /* Widen the 8 reference bytes starting at pair (2 * nt + 1) + col / 2 */
            ref_col_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + 2 * (two_nt + 1) + col));
            ref_col_8x16b = _mm_unpacklo_epi8(ref_col_8x16b, zero_8x16b);

            /* row_base + (nt - 1 - row) * ref[2 * nt + 1 + col] */
            sum_8x16b = _mm_add_epi16(row_base_8x16b, _mm_mullo_epi16(ref_col_8x16b, row_wt_8x16b));
            /* + (col + 1) * ref[3 * nt + 1] */
            sum_8x16b = _mm_add_epi16(sum_8x16b, _mm_mullo_epi16(ref_3nt_p1_8x16b, col_wt_8x16b));
            /* + (nt - 1 - col) * ref[2 * nt - 1 - row] */
            sum_8x16b = _mm_add_epi16(sum_8x16b, _mm_mullo_epi16(ref_row_8x16b, inv_col_wt_8x16b));

            sum_8x16b = _mm_srli_epi16(sum_8x16b, shift);
            sum_8x16b = _mm_packus_epi16(sum_8x16b, zero_8x16b);
            _mm_storel_epi64((__m128i *)(pu1_dst + (row * dst_strd) + col), sum_8x16b);

            /* Move the column weights on to the next 4 pairs */
            inv_col_wt_8x16b = _mm_sub_epi16(inv_col_wt_8x16b, step_8x16b);
            col_wt_8x16b = _mm_add_epi16(col_wt_8x16b, step_8x16b);
        }
    }
}
/**
*******************************************************************************
*
* @brief
*  DC intra prediction for a chroma block whose two components are stored
*  interleaved. Refer to section 8.4.4.2.5 in the standard
*
* @par Description:
*  For nt equal to 4, 8 or 16 the 4 * nt reference bytes starting at
*  pu1_ref + 2 * nt are accumulated per component. The pair at byte offset
*  6 * nt is then added and the pair at byte offset 4 * nt is subtracted, so
*  the pair at 4 * nt does not contribute to the sum. Each DC value is
*  (sum + nt) >> shift, where shift is 5, 4 and 3 for nt equal to 16, 8 and 4,
*  and 6 for any other nt. The resulting pair is replicated over the block.
*
*  For any other nt the vector accumulation is skipped, so only the two
*  correction terms contribute, and the block is written in groups of 8 rows
*  of 32 bytes.
*
* @param[in] pu1_ref
*  UWORD8 pointer to the interleaved reference samples
*
* @param[in] src_strd
*  integer source stride (unused)
*
* @param[out] pu1_dst
*  UWORD8 pointer to the destination
*
* @param[in] dst_strd
*  integer destination stride, in UWORD8 elements
*
* @param[in] nt
*  integer Transform Block size (Chroma)
*
* @param[in] mode
*  integer intraprediction mode (unused)
*
* @returns
*  None
*
* @remarks
*  IHEVCE_SHUFFLEMASKY9 is read with an aligned load.
*
*******************************************************************************
*/
void ihevc_intra_pred_chroma_dc_ssse3(UWORD8 *pu1_ref,
                                      WORD32 src_strd,
                                      UWORD8 *pu1_dst,
                                      WORD32 dst_strd,
                                      WORD32 nt,
                                      WORD32 mode)
{
    WORD32 sum_u = 0;
    WORD32 sum_v = 0;
    WORD32 dc_u, dc_v, dc_uv;
    WORD32 row, sub_row, offset;
    WORD32 shift;
    UWORD8 *pu1_out = pu1_dst;
    __m128i shuf_mask, in_16x8b, acc_8x16b, sign_8x16b, dc_8x16b;
    __m128i zero_8x16b = _mm_set1_epi32(0);

    UNUSED(src_strd);
    UNUSED(mode);

    /* Final right shift: log2(nt) + 1, with every size other than 16, 8 */
    /* and 4 treated like 32                                             */
    if(nt == 16)
    {
        shift = 5;
    }
    else if(nt == 8)
    {
        shift = 4;
    }
    else if(nt == 4)
    {
        shift = 3;
    }
    else
    {
        shift = 6;
    }

    /* NOTE(review): the table is defined elsewhere; presumably it groups */
    /* the 16 bit lanes of each component together so that the two        */
    /* horizontal adds below leave one total per component - confirm      */
    shuf_mask = _mm_load_si128((__m128i *)&IHEVCE_SHUFFLEMASKY9[0]);

    if((nt == 16) || (nt == 8) || (nt == 4))
    {
        /* Accumulate the 4 * nt bytes starting at pu1_ref + 2 * nt in    */
        /* eight 16 bit lanes; even lanes collect even bytes and odd      */
        /* lanes collect odd bytes                                        */
        acc_8x16b = zero_8x16b;
        for(offset = 0; offset < 4 * nt; offset += 16)
        {
            in_16x8b = _mm_loadu_si128((__m128i *)(pu1_ref + (2 * nt) + offset));
            acc_8x16b = _mm_add_epi16(acc_8x16b, _mm_unpacklo_epi8(in_16x8b, zero_8x16b));
            acc_8x16b = _mm_add_epi16(acc_8x16b, _mm_unpackhi_epi8(in_16x8b, zero_8x16b));
        }

        /* Reduce the eight lanes to two totals held in lanes 0 and 1 */
        acc_8x16b = _mm_shuffle_epi8(acc_8x16b, shuf_mask);
        acc_8x16b = _mm_hadd_epi16(acc_8x16b, zero_8x16b);
        acc_8x16b = _mm_hadd_epi16(acc_8x16b, zero_8x16b);

        /* Sign extend the two totals to 32 bits and extract them */
        sign_8x16b = _mm_cmpgt_epi16(zero_8x16b, acc_8x16b);
        acc_8x16b = _mm_unpacklo_epi16(acc_8x16b, sign_8x16b);
        sum_u = _mm_cvtsi128_si32(acc_8x16b);
        sum_v = _mm_cvtsi128_si32(_mm_srli_si128(acc_8x16b, 4));
    }

    /* Bring in the pair at byte offset 6 * nt and drop the pair at 4 * nt */
    sum_u += pu1_ref[6 * nt];
    sum_v += pu1_ref[6 * nt + 1];
    sum_u -= pu1_ref[4 * nt];
    sum_v -= pu1_ref[4 * nt + 1];

    dc_u = (sum_u + nt) >> shift;
    dc_v = (sum_v + nt) >> shift;

    /* Pack both DC values into one 16 bit pattern and replicate it */
    dc_uv = dc_u | (dc_v << 8);
    dc_8x16b = _mm_set1_epi16(dc_uv);

    if(nt == 4)
    {
        /* 4 rows of 8 bytes */
        for(row = 0; row < 4; row++)
        {
            _mm_storel_epi64((__m128i *)(pu1_out + (row * dst_strd)), dc_8x16b);
        }
    }
    else if(nt == 8)
    {
        /* 8 rows of 16 bytes */
        for(row = 0; row < 8; row++)
        {
            _mm_storeu_si128((__m128i *)(pu1_out + (row * dst_strd)), dc_8x16b);
        }
    }
    else
    {
        /* Groups of 8 rows, 32 bytes per row */
        for(row = 0; row < nt; row += 8)
        {
            for(sub_row = 0; sub_row < 8; sub_row++)
            {
                _mm_storeu_si128((__m128i *)(pu1_out + (sub_row * dst_strd)), dc_8x16b);
                _mm_storeu_si128((__m128i *)(pu1_out + 16 + (sub_row * dst_strd)), dc_8x16b);
            }
            pu1_out += 8 * dst_strd;
        }
    }
}
/**
*******************************************************************************
*
* @brief
*  Horizontal intra prediction (mode 10) for a chroma block whose two
*  components are stored interleaved. Refer to section 8.4.4.2.6 in the
*  standard (Special case)
*
* @par Description:
*  Row r of the destination is filled with repetitions of the byte pair
*  pu1_ref[4 * nt - 2 - 2 * r], pu1_ref[4 * nt - 1 - 2 * r].
*  nt equal to 8 writes 8 rows of 16 bytes, nt equal to 16 writes 16 rows of
*  32 bytes, and any other nt writes 4 rows of 8 bytes.
*
* @param[in] pu1_ref
*  UWORD8 pointer to the interleaved reference samples
*
* @param[in] src_strd
*  integer source stride (unused)
*
* @param[out] pu1_dst
*  UWORD8 pointer to the destination
*
* @param[in] dst_strd
*  integer destination stride, in UWORD8 elements
*
* @param[in] nt
*  integer Transform Block size
*
* @param[in] mode
*  integer intraprediction mode (unused)
*
* @returns
*  None
*
* @remarks
*  None
*
*******************************************************************************
*/
void ihevc_intra_pred_chroma_horz_ssse3(UWORD8 *pu1_ref,
                                        WORD32 src_strd,
                                        UWORD8 *pu1_dst,
                                        WORD32 dst_strd,
                                        WORD32 nt,
                                        WORD32 mode)
{
    WORD32 row, sub_row;
    WORD32 num_rows;
    UWORD8 *pu1_pair;
    UWORD8 *pu1_out;
    __m128i even_16x8b, odd_16x8b;
    __m128i pair_16x8b[4];

    UNUSED(src_strd);
    UNUSED(mode);

    /* Sizes other than 8 and 16 are handled as a single group of 4 rows */
    num_rows = ((nt == 8) || (nt == 16)) ? nt : 4;

    for(row = 0; row < num_rows; row += 4)
    {
        /* Build the replicated pair pattern for 4 consecutive rows */
        for(sub_row = 0; sub_row < 4; sub_row++)
        {
            pu1_pair = pu1_ref + ((4 * nt) - 2 - 2 * (row + sub_row));
            even_16x8b = _mm_set1_epi8(pu1_pair[0]);
            odd_16x8b = _mm_set1_epi8(pu1_pair[1]);
            /* Interleave to get even, odd, even, odd, ... */
            pair_16x8b[sub_row] = _mm_unpacklo_epi8(even_16x8b, odd_16x8b);
        }

        /* Write the 4 rows with the width that matches the block size */
        for(sub_row = 0; sub_row < 4; sub_row++)
        {
            pu1_out = pu1_dst + ((row + sub_row) * dst_strd);
            if(nt == 8)
            {
                _mm_storeu_si128((__m128i *)(pu1_out), pair_16x8b[sub_row]);
            }
            else if(nt == 16)
            {
                _mm_storeu_si128((__m128i *)(pu1_out + 0), pair_16x8b[sub_row]);
                _mm_storeu_si128((__m128i *)(pu1_out + 16), pair_16x8b[sub_row]);
            }
            else
            {
                _mm_storel_epi64((__m128i *)(pu1_out), pair_16x8b[sub_row]);
            }
        }
    }
}
/**
*******************************************************************************
*
* @brief
*  Vertical intra prediction for a chroma block whose two components are
*  stored interleaved. Refer to section 8.4.4.2.6 in the standard (Special
*  case)
*
* @par Description:
*  Every destination row is a copy of the reference bytes that start at
*  pu1_ref + 4 * nt + 2.
*  nt equal to 8 writes 8 rows of 16 bytes, nt equal to 16 writes 16 rows of
*  32 bytes, and any other nt writes 4 rows of 8 bytes.
*
* @param[in] pu1_ref
*  UWORD8 pointer to the interleaved reference samples
*
* @param[in] src_strd
*  integer source stride (unused)
*
* @param[out] pu1_dst
*  UWORD8 pointer to the destination
*
* @param[in] dst_strd
*  integer destination stride, in UWORD8 elements
*
* @param[in] nt
*  integer Transform Block size
*
* @param[in] mode
*  integer intraprediction mode (unused)
*
* @returns
*  None
*
* @remarks
*  The 4 row case loads 16 reference bytes although only the lower 8 are
*  stored.
*
*******************************************************************************
*/
void ihevc_intra_pred_chroma_ver_ssse3(UWORD8 *pu1_ref,
                                       WORD32 src_strd,
                                       UWORD8 *pu1_dst,
                                       WORD32 dst_strd,
                                       WORD32 nt,
                                       WORD32 mode)
{
    WORD32 row, row_base;
    __m128i ref_lo_16x8b, ref_hi_16x8b;

    UNUSED(src_strd);
    UNUSED(mode);

    /* Replication to next rows. The three size cases are mutually       */
    /* exclusive: without the 'else if' a block of size 8 would also run */
    /* the 4 row fallback and rewrite its first four rows                */
    if(nt == 8)
    {
        ref_lo_16x8b = _mm_loadu_si128((__m128i *)(pu1_ref + (4 * nt) + 2 + 0));
        for(row = 0; row < 8; row++)
        {
            _mm_storeu_si128((__m128i *)(pu1_dst + (row * dst_strd)), ref_lo_16x8b);
        }
    }
    else if(nt == 16)
    {
        ref_lo_16x8b = _mm_loadu_si128((__m128i *)(pu1_ref + (4 * nt) + 2 + 0));
        ref_hi_16x8b = _mm_loadu_si128((__m128i *)(pu1_ref + (4 * nt) + 2 + 16));

        /* Two groups of 8 rows; in each group the first 16 bytes of all */
        /* rows are written before the second 16 bytes                   */
        for(row_base = 0; row_base < 16; row_base += 8)
        {
            for(row = row_base; row < row_base + 8; row++)
            {
                _mm_storeu_si128((__m128i *)(pu1_dst + (row * dst_strd)), ref_lo_16x8b);
            }
            for(row = row_base; row < row_base + 8; row++)
            {
                _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (row * dst_strd)), ref_hi_16x8b);
            }
        }
    }
    else
    {
        ref_lo_16x8b = _mm_loadu_si128((__m128i *)(pu1_ref + (4 * nt) + 2 + 0));
        for(row = 0; row < 4; row++)
        {
            _mm_storel_epi64((__m128i *)(pu1_dst + (row * dst_strd)), ref_lo_16x8b);
        }
    }
}
/**
*******************************************************************************
*
* @brief
*  Intraprediction for mode 2 (sw angle) for a chroma block whose two
*  components are stored interleaved. Refer to section 8.4.4.2.6 in the
*  standard
*
* @par Description:
*  Per component the intended result is
*  dst[row][col] = ref[2 * nt - row - (col + 1) - 1].
*  Each store takes 16 reference bytes, reorders them with a byte shuffle and
*  writes them to the destination. The shuffle tables IHEVCE_SHUFFLEMASKY7
*  (nt equal to 4, 8 bytes stored per row) and IHEVCE_SHUFFLEMASKY8 (all other
*  sizes, 16 bytes per store) are defined elsewhere; presumably they reverse
*  the order of the sample pairs - to be confirmed against the tables.
*  Sizes other than 4 are processed in groups of 8 rows.
*
* @param[in] pu1_ref
*  UWORD8 pointer to the interleaved reference samples
*
* @param[in] src_strd
*  integer source stride (unused)
*
* @param[out] pu1_dst
*  UWORD8 pointer to the destination
*
* @param[in] dst_strd
*  integer destination stride, in UWORD8 elements
*
* @param[in] nt
*  integer Transform Block size
*
* @param[in] mode
*  integer intraprediction mode (unused)
*
* @returns
*  None
*
* @remarks
*  Both shuffle tables are read with aligned loads.
*
*******************************************************************************
*/
void ihevc_intra_pred_chroma_mode2_ssse3(UWORD8 *pu1_ref,
                                         WORD32 src_strd,
                                         UWORD8 *pu1_dst,
                                         WORD32 dst_strd,
                                         WORD32 nt,
                                         WORD32 mode)
{
    WORD32 row, col, sub_row;
    __m128i shuf_4_16x8b, shuf_8_16x8b;
    __m128i ref_16x8b[8];

    UNUSED(src_strd);
    UNUSED(mode);

    shuf_4_16x8b = _mm_load_si128((__m128i *)&IHEVCE_SHUFFLEMASKY7[0]);
    shuf_8_16x8b = _mm_load_si128((__m128i *)&IHEVCE_SHUFFLEMASKY8[0]);

    /* For the angle 45, replication is done from the corresponding angle */
    if(nt == 4)
    {
        /* Each row starts one sample pair (2 bytes) earlier in the reference */
        for(sub_row = 0; sub_row < 4; sub_row++)
        {
            ref_16x8b[sub_row] = _mm_loadu_si128((__m128i *)(pu1_ref + (4 * nt) - 2 * sub_row - 8 - 2));
        }
        for(sub_row = 0; sub_row < 4; sub_row++)
        {
            _mm_storel_epi64((__m128i *)(pu1_dst + (sub_row * dst_strd)),
                             _mm_shuffle_epi8(ref_16x8b[sub_row], shuf_4_16x8b));
        }
    }
    else
    {
        /* A block of size 8 is exactly one pass of this loop nest */
        for(row = 0; row < nt; row += 8)
        {
            for(col = 0; col < 2 * nt; col += 16)
            {
                /* Fetch the 16 reference bytes for 8 consecutive rows */
                for(sub_row = 0; sub_row < 8; sub_row++)
                {
                    ref_16x8b[sub_row] = _mm_loadu_si128(
                                    (__m128i *)(pu1_ref + (4 * nt) - 2 * (row + sub_row) - (col + 16) - 2));
                }
                /* Reorder and write them out */
                for(sub_row = 0; sub_row < 8; sub_row++)
                {
                    _mm_storeu_si128((__m128i *)(pu1_dst + col + ((row + sub_row) * dst_strd)),
                                     _mm_shuffle_epi8(ref_16x8b[sub_row], shuf_8_16x8b));
                }
            }
        }
    }
}
/**
*******************************************************************************
*
* @brief
*  Intraprediction for mode 34 (ne angle) and mode 18 (nw angle) with
* reference neighboring samples location pointed by 'pu1_ref' to the TU
* block location pointed by 'pu1_dst'
*
* @par Description:
*  No interpolation is performed: every destination row is a plain copy of
* consecutive bytes of 'pu1_ref'. Destination row 'row' is copied from byte
* offset (4 * nt) + (2 * row) + 4 when mode is 34 and from byte offset
* (4 * nt) - (2 * row) for any other mode value (treated as mode 18).
* The number of bytes copied per row is 8 for nt == 4, 16 for nt == 8 and
* 32 for every other nt; in the last case rows are produced in groups of 8,
* so the row count is nt rounded up to a multiple of 8.
*
* @param[in] pu1_ref
*  UWORD8 pointer to the reference neighboring samples (two bytes per sample
*  position, presumably interleaved Cb/Cr - confirm against the caller)
*
* @param[in] src_strd
*  integer source stride (unused)
*
* @param[out] pu1_dst
*  UWORD8 pointer to the destination
*
* @param[in] dst_strd
*  integer destination stride, in UWORD8 elements
*
* @param[in] nt
*  integer Transform Block size
*
* @param[in] mode
*  integer intraprediction mode; 34 selects the mode 34 path, every other
*  value selects the mode 18 path
*
* @returns
*  None
*
* @remarks
*  None
*
*******************************************************************************
*/
void ihevc_intra_pred_chroma_mode_18_34_ssse3(UWORD8 *pu1_ref,
                                              WORD32 src_strd,
                                              UWORD8 *pu1_dst,
                                              WORD32 dst_strd,
                                              WORD32 nt,
                                              WORD32 mode)
{
    WORD32 row, sub_row;
    WORD32 ref_step;        /* signed byte step of the source from row to row */
    UWORD8 *pu1_ref_base;   /* source address of destination row 0            */
    __m128i src_lo, src_hi;

    UNUSED(src_strd);

    if(mode == 34)
    {
        /* row r reads from pu1_ref + (4 * nt) + 2 * (r + 1) + 2 */
        pu1_ref_base = pu1_ref + (4 * nt) + 4;
        ref_step = 2;
    }
    else
    {
        /* row r reads from pu1_ref + (4 * nt) - 2 * (r + 1) + 2 */
        pu1_ref_base = pu1_ref + (4 * nt);
        ref_step = -2;
    }

    if(nt == 4)
    {
        /* Only 8 bytes are stored per row, so only 8 bytes are loaded. A    */
        /* 16-byte load would fetch 8 bytes that are never used and, for the */
        /* last row of mode 34, would reach 8 bytes beyond the last byte any */
        /* row of this block needs from the reference buffer.                */
        for(row = 0; row < 4; row++)
        {
            src_lo = _mm_loadl_epi64((__m128i *)(pu1_ref_base + (row * ref_step)));
            _mm_storel_epi64((__m128i *)(pu1_dst + (row * dst_strd)), src_lo);
        }
    }
    else if(nt == 8)
    {
        for(row = 0; row < 8; row++)
        {
            src_lo = _mm_loadu_si128((__m128i *)(pu1_ref_base + (row * ref_step)));
            _mm_storeu_si128((__m128i *)(pu1_dst + (row * dst_strd)), src_lo);
        }
    }
    else
    {
        /* 32 bytes per row; rows are emitted in groups of 8 exactly as the */
        /* unrolled version did, so nt is effectively rounded up to a       */
        /* multiple of 8.                                                   */
        for(row = 0; row < nt; row += 8)
        {
            for(sub_row = row; sub_row < row + 8; sub_row++)
            {
                UWORD8 *pu1_src_row = pu1_ref_base + (sub_row * ref_step);
                UWORD8 *pu1_dst_row = pu1_dst + (sub_row * dst_strd);

                src_lo = _mm_loadu_si128((__m128i *)(pu1_src_row));
                src_hi = _mm_loadu_si128((__m128i *)(pu1_src_row + 16));
                _mm_storeu_si128((__m128i *)(pu1_dst_row), src_lo);
                _mm_storeu_si128((__m128i *)(pu1_dst_row + 16), src_hi);
            }
        }
    }
}
/**
*******************************************************************************
*
* @brief
* Intraprediction for mode 3 to 9 (positive angle, horizontal mode ) with
* reference neighboring samples location pointed by 'pu1_ref' to the TU
* block location pointed by 'pu1_dst'
*
* @par Description:
*
*
* @param[in] pu1_src
* UWORD8 pointer to the source
*
* @param[in] pu1_dst
* UWORD8 pointer to the destination
*
* @param[in] src_strd
* integer source stride
*
* @param[in] dst_strd
* integer destination stride
*
* @param[in] nt
* integer Transform Block size
*
* @param[in] mode
* integer intraprediction mode
*
* @returns
*
* @remarks
* None
*
*******************************************************************************
*/
void ihevc_intra_pred_chroma_mode_3_to_9_ssse3(UWORD8 *pu1_ref,
WORD32 src_strd,
UWORD8 *pu1_dst,
WORD32 dst_strd,
WORD32 nt,
WORD32 mode)
{
WORD32 row, col;
WORD32 intra_pred_ang;
__m128i const_temp_4x32b, const_temp2_4x32b, const_temp3_4x32b, const_temp4_4x32b;
__m128i fract_4x32b, zero_8x16b, intra_pred_ang_4x32b;
__m128i row_4x32b, two_nt_4x32b, ref_main_idx_4x32b, res_temp5_4x32b, sm1;
UNUSED(src_strd);
/* Intra Pred Angle according to the mode */
intra_pred_ang = gai4_ihevc_ang_table[mode];
/* For the angles other then 45 degree, interpolation btw 2 neighboring */
/* samples dependent on distance to obtain destination sample */
sm1 = _mm_load_si128((__m128i *)&IHEVCE_SHUFFLEMASKY7[0]);
const_temp_4x32b = _mm_set1_epi16(16);
const_temp2_4x32b = _mm_set1_epi32(31);
const_temp3_4x32b = _mm_set1_epi16(32);
const_temp4_4x32b = _mm_set1_epi32(4);
two_nt_4x32b = _mm_set1_epi32(1);
zero_8x16b = _mm_set1_epi16(0);
/* intra_pred_ang = gai4_ihevc_ang_table[mode]; */
intra_pred_ang_4x32b = _mm_set1_epi32(intra_pred_ang);
row_4x32b = _mm_set_epi32(4, 3, 2, 1);
if(nt == 4)
{
intra_pred_ang_4x32b = _mm_set1_epi16(intra_pred_ang);
row_4x32b = _mm_set_epi16(8, 7, 6, 5, 4, 3, 2, 1);
const_temp2_4x32b = _mm_set1_epi16(31);
const_temp4_4x32b = _mm_set1_epi16(4);
two_nt_4x32b = _mm_set1_epi16((4 * nt) - 2);
{
WORD16 pi2_ref_main_idx1, pi2_ref_main_idx2, pi2_ref_main_idx3, pi2_ref_main_idx4;
WORD8 ai1_fract_temp_val[16], ai1_src_temp_val[16];
__m128i fract1_8x16b, fract2_8x16b, fract3_8x16b, fract4_8x16b;
__m128i src_values10;
__m128i temp1_8x16b, temp2_8x16b, temp3_8x16b, temp4_8x16b;
/* pos = ((row + 1) * intra_pred_ang); */
res_temp5_4x32b = _mm_mullo_epi16(row_4x32b, intra_pred_ang_4x32b);
/* fract = pos & (31); */
fract_4x32b = _mm_and_si128(res_temp5_4x32b, const_temp2_4x32b);
ref_main_idx_4x32b = _mm_srai_epi16(res_temp5_4x32b, 5);
ref_main_idx_4x32b = _mm_add_epi16(ref_main_idx_4x32b, ref_main_idx_4x32b);
ref_main_idx_4x32b = _mm_sub_epi16(two_nt_4x32b, ref_main_idx_4x32b);
row_4x32b = _mm_add_epi16(row_4x32b, const_temp4_4x32b);
/*(32 - fract) */
src_values10 = _mm_sub_epi16(const_temp3_4x32b, fract_4x32b);
_mm_storel_epi64((__m128i *)(ai1_fract_temp_val), fract_4x32b);
_mm_storel_epi64((__m128i *)(ai1_src_temp_val), src_values10);
fract1_8x16b = _mm_set1_epi8(ai1_fract_temp_val[0]); /* col=0*/
fract2_8x16b = _mm_set1_epi8(ai1_fract_temp_val[2]); /* col=1*/
fract3_8x16b = _mm_set1_epi8(ai1_fract_temp_val[4]); /* col=2*/
fract4_8x16b = _mm_set1_epi8(ai1_fract_temp_val[6]); /* col=3*/
temp1_8x16b = _mm_set1_epi8(ai1_src_temp_val[0]); /* col=0*/
temp2_8x16b = _mm_set1_epi8(ai1_src_temp_val[2]); /* col=1*/
temp3_8x16b = _mm_set1_epi8(ai1_src_temp_val[4]); /* col=2*/
temp4_8x16b = _mm_set1_epi8(ai1_src_temp_val[6]); /* col=3*/
temp1_8x16b = _mm_unpacklo_epi8(temp1_8x16b, fract1_8x16b);
temp2_8x16b = _mm_unpacklo_epi8(temp2_8x16b, fract2_8x16b);
temp3_8x16b = _mm_unpacklo_epi8(temp3_8x16b, fract3_8x16b);
temp4_8x16b = _mm_unpacklo_epi8(temp4_8x16b, fract4_8x16b);
pi2_ref_main_idx1 = _mm_extract_epi16(ref_main_idx_4x32b, 0); /* col=0*/
pi2_ref_main_idx2 = _mm_extract_epi16(ref_main_idx_4x32b, 1); /* col=1*/
pi2_ref_main_idx3 = _mm_extract_epi16(ref_main_idx_4x32b, 2); /* col=2*/
pi2_ref_main_idx4 = _mm_extract_epi16(ref_main_idx_4x32b, 3); /* col=3*/
{
__m128i src_temp1_8x16b, src_temp2_8x16b, src_temp3_8x16b, src_temp4_8x16b;
__m128i src_temp5_8x16b, src_temp6_8x16b, src_temp7_8x16b, src_temp8_8x16b;
/* loding 8-bit 16 pixels */
src_temp5_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + pi2_ref_main_idx1 - 8)); /* col=0*/
src_temp6_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + pi2_ref_main_idx2 - 8)); /* col=1*/
src_temp7_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + pi2_ref_main_idx3 - 8)); /* col=2*/
src_temp8_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + pi2_ref_main_idx4 - 8)); /* col=3*/
src_temp1_8x16b = _mm_srli_si128(src_temp5_8x16b, 2); /* col=0*/
src_temp2_8x16b = _mm_srli_si128(src_temp6_8x16b, 2); /* col=1*/
src_temp3_8x16b = _mm_srli_si128(src_temp7_8x16b, 2); /* col=2*/
src_temp4_8x16b = _mm_srli_si128(src_temp8_8x16b, 2); /* col=3*/
src_temp1_8x16b = _mm_unpacklo_epi8(src_temp1_8x16b, src_temp5_8x16b); /* col=0*/
src_temp2_8x16b = _mm_unpacklo_epi8(src_temp2_8x16b, src_temp6_8x16b); /* col=1*/
src_temp3_8x16b = _mm_unpacklo_epi8(src_temp3_8x16b, src_temp7_8x16b); /* col=2*/
src_temp4_8x16b = _mm_unpacklo_epi8(src_temp4_8x16b, src_temp8_8x16b); /* col=3*/
/* fract*(pu1_ref[ref_main_idx + 1]- pu1_ref[ref_main_idx]) */
src_temp1_8x16b = _mm_maddubs_epi16(src_temp1_8x16b, temp1_8x16b);
src_temp2_8x16b = _mm_maddubs_epi16(src_temp2_8x16b, temp2_8x16b);
src_temp3_8x16b = _mm_maddubs_epi16(src_temp3_8x16b, temp3_8x16b);
src_temp4_8x16b = _mm_maddubs_epi16(src_temp4_8x16b, temp4_8x16b);
/*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
src_temp1_8x16b = _mm_add_epi16(src_temp1_8x16b, const_temp_4x32b);
src_temp2_8x16b = _mm_add_epi16(src_temp2_8x16b, const_temp_4x32b);
src_temp3_8x16b = _mm_add_epi16(src_temp3_8x16b, const_temp_4x32b);
src_temp4_8x16b = _mm_add_epi16(src_temp4_8x16b, const_temp_4x32b);
/*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
src_temp1_8x16b = _mm_srai_epi16(src_temp1_8x16b, 5); /* col=0*/
src_temp2_8x16b = _mm_srai_epi16(src_temp2_8x16b, 5); /* col=1*/
src_temp3_8x16b = _mm_srai_epi16(src_temp3_8x16b, 5); /* col=2*/
src_temp4_8x16b = _mm_srai_epi16(src_temp4_8x16b, 5); /* col=3*/
/* converting 16 bit to 8 bit */
src_temp1_8x16b = _mm_packus_epi16(src_temp1_8x16b, zero_8x16b); /* col=0*/
src_temp2_8x16b = _mm_packus_epi16(src_temp2_8x16b, zero_8x16b); /* col=1*/
src_temp3_8x16b = _mm_packus_epi16(src_temp3_8x16b, zero_8x16b); /* col=2*/
src_temp4_8x16b = _mm_packus_epi16(src_temp4_8x16b, zero_8x16b); /* col=3*/
src_temp1_8x16b = _mm_shuffle_epi8(src_temp1_8x16b, sm1);
src_temp2_8x16b = _mm_shuffle_epi8(src_temp2_8x16b, sm1);
src_temp3_8x16b = _mm_shuffle_epi8(src_temp3_8x16b, sm1);
src_temp4_8x16b = _mm_shuffle_epi8(src_temp4_8x16b, sm1);
src_temp5_8x16b = _mm_unpacklo_epi16(src_temp1_8x16b, src_temp2_8x16b);
src_temp6_8x16b = _mm_unpacklo_epi16(src_temp3_8x16b, src_temp4_8x16b);
src_temp8_8x16b = _mm_unpacklo_epi32(src_temp5_8x16b, src_temp6_8x16b);
src_temp7_8x16b = _mm_unpackhi_epi32(src_temp5_8x16b, src_temp6_8x16b);
_mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd * 0)), src_temp8_8x16b); /* row=0*/
src_temp2_8x16b = _mm_shuffle_epi32(src_temp8_8x16b, _MM_SHUFFLE(3, 2, 3, 2));
_mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd * (1))), src_temp2_8x16b); /* row=1*/
_mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd * (2))), src_temp7_8x16b); /* row=2*/
src_temp4_8x16b = _mm_shuffle_epi32(src_temp7_8x16b, _MM_SHUFFLE(3, 2, 3, 2));
_mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd * (3))), src_temp4_8x16b); /* row=4*/
}
}
}
else
{
intra_pred_ang_4x32b = _mm_set1_epi16(intra_pred_ang);
row_4x32b = _mm_set_epi16(8, 7, 6, 5, 4, 3, 2, 1);
const_temp2_4x32b = _mm_set1_epi16(31);
const_temp4_4x32b = _mm_set1_epi16(8);
two_nt_4x32b = _mm_set1_epi16((4 * nt) - 2);
for(col = 0; col < 2 * nt; col += 16)
{
WORD16 pi2_ref_main_idx1, pi2_ref_main_idx2, pi2_ref_main_idx3, pi2_ref_main_idx4;
WORD16 pi2_ref_main_idx5, pi2_ref_main_idx6, pi2_ref_main_idx7, pi2_ref_main_idx8;
WORD8 ai1_fract_temp_val[16], ai1_src_temp_val[16];
__m128i fract1_8x16b, fract2_8x16b, fract3_8x16b, fract4_8x16b;
__m128i fract5_8x16b, fract6_8x16b, fract7_8x16b, fract8_8x16b, src_values10;
__m128i temp1_8x16b, temp2_8x16b, temp3_8x16b, temp4_8x16b;
__m128i temp11_8x16b, temp12_8x16b, temp13_8x16b, temp14_8x16b;
/* pos = ((row + 1) * intra_pred_ang); */
res_temp5_4x32b = _mm_mullo_epi16(row_4x32b, intra_pred_ang_4x32b);
/* fract = pos & (31); */
fract_4x32b = _mm_and_si128(res_temp5_4x32b, const_temp2_4x32b);
ref_main_idx_4x32b = _mm_srai_epi16(res_temp5_4x32b, 5);
ref_main_idx_4x32b = _mm_add_epi16(ref_main_idx_4x32b, ref_main_idx_4x32b);
ref_main_idx_4x32b = _mm_sub_epi16(two_nt_4x32b, ref_main_idx_4x32b);
row_4x32b = _mm_add_epi16(row_4x32b, const_temp4_4x32b);
/*(32 - fract) */
src_values10 = _mm_sub_epi16(const_temp3_4x32b, fract_4x32b);
_mm_storeu_si128((__m128i *)(ai1_fract_temp_val), fract_4x32b);
_mm_storeu_si128((__m128i *)(ai1_src_temp_val), src_values10);
fract1_8x16b = _mm_set1_epi8(ai1_fract_temp_val[0]); /* col=0*/
fract2_8x16b = _mm_set1_epi8(ai1_fract_temp_val[2]); /* col=1*/
fract3_8x16b = _mm_set1_epi8(ai1_fract_temp_val[4]); /* col=2*/
fract4_8x16b = _mm_set1_epi8(ai1_fract_temp_val[6]); /* col=3*/
temp1_8x16b = _mm_set1_epi8(ai1_src_temp_val[0]); /* col=0*/
temp2_8x16b = _mm_set1_epi8(ai1_src_temp_val[2]); /* col=1*/
temp3_8x16b = _mm_set1_epi8(ai1_src_temp_val[4]); /* col=2*/
temp4_8x16b = _mm_set1_epi8(ai1_src_temp_val[6]); /* col=3*/
temp1_8x16b = _mm_unpacklo_epi8(temp1_8x16b, fract1_8x16b);
temp2_8x16b = _mm_unpacklo_epi8(temp2_8x16b, fract2_8x16b);
temp3_8x16b = _mm_unpacklo_epi8(temp3_8x16b, fract3_8x16b);
temp4_8x16b = _mm_unpacklo_epi8(temp4_8x16b, fract4_8x16b);
pi2_ref_main_idx1 = _mm_extract_epi16(ref_main_idx_4x32b, 0); /* col=0*/
pi2_ref_main_idx2 = _mm_extract_epi16(ref_main_idx_4x32b, 1); /* col=1*/
pi2_ref_main_idx3 = _mm_extract_epi16(ref_main_idx_4x32b, 2); /* col=2*/
pi2_ref_main_idx4 = _mm_extract_epi16(ref_main_idx_4x32b, 3); /* col=3*/
fract5_8x16b = _mm_set1_epi8(ai1_fract_temp_val[8]); /* col=5*/
fract6_8x16b = _mm_set1_epi8(ai1_fract_temp_val[10]); /* col=6*/
fract7_8x16b = _mm_set1_epi8(ai1_fract_temp_val[12]); /* col=7*/
fract8_8x16b = _mm_set1_epi8(ai1_fract_temp_val[14]); /* col=8*/
temp11_8x16b = _mm_set1_epi8(ai1_src_temp_val[8]); /* col=0*/
temp12_8x16b = _mm_set1_epi8(ai1_src_temp_val[10]); /* col=1*/
temp13_8x16b = _mm_set1_epi8(ai1_src_temp_val[12]); /* col=2*/
temp14_8x16b = _mm_set1_epi8(ai1_src_temp_val[14]); /* col=3*/
temp11_8x16b = _mm_unpacklo_epi8(temp11_8x16b, fract5_8x16b);
temp12_8x16b = _mm_unpacklo_epi8(temp12_8x16b, fract6_8x16b);
temp13_8x16b = _mm_unpacklo_epi8(temp13_8x16b, fract7_8x16b);
temp14_8x16b = _mm_unpacklo_epi8(temp14_8x16b, fract8_8x16b);
pi2_ref_main_idx5 = _mm_extract_epi16(ref_main_idx_4x32b, 4); /* col=5*/
pi2_ref_main_idx6 = _mm_extract_epi16(ref_main_idx_4x32b, 5); /* col=6*/
pi2_ref_main_idx7 = _mm_extract_epi16(ref_main_idx_4x32b, 6); /* col=7*/
pi2_ref_main_idx8 = _mm_extract_epi16(ref_main_idx_4x32b, 7); /* col=8*/
for(row = 0; row < nt; row += 4)
{
__m128i src_temp1_8x16b, src_temp2_8x16b, src_temp3_8x16b, src_temp4_8x16b;
__m128i src_temp5_8x16b, src_temp6_8x16b, src_temp7_8x16b, src_temp8_8x16b;
__m128i src_temp11_8x16b, src_temp12_8x16b, src_temp13_8x16b, src_temp14_8x16b;
__m128i src_temp15_8x16b, src_temp16_8x16b, src_temp17_8x16b, src_temp18_8x16b;
/* loding 8-bit 16 pixels */
src_temp5_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + pi2_ref_main_idx1 - row - (8 + row))); /* col=0*/
src_temp6_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + pi2_ref_main_idx2 - row - (8 + row))); /* col=1*/
src_temp7_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + pi2_ref_main_idx3 - row - (8 + row))); /* col=2*/
src_temp8_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + pi2_ref_main_idx4 - row - (8 + row))); /* col=3*/
src_temp1_8x16b = _mm_srli_si128(src_temp5_8x16b, 2); /* col=0*/
src_temp2_8x16b = _mm_srli_si128(src_temp6_8x16b, 2); /* col=1*/
src_temp3_8x16b = _mm_srli_si128(src_temp7_8x16b, 2); /* col=2*/
src_temp4_8x16b = _mm_srli_si128(src_temp8_8x16b, 2); /* col=3*/
src_temp1_8x16b = _mm_unpacklo_epi8(src_temp1_8x16b, src_temp5_8x16b); /* col=0*/
src_temp2_8x16b = _mm_unpacklo_epi8(src_temp2_8x16b, src_temp6_8x16b); /* col=1*/
src_temp3_8x16b = _mm_unpacklo_epi8(src_temp3_8x16b, src_temp7_8x16b); /* col=2*/
src_temp4_8x16b = _mm_unpacklo_epi8(src_temp4_8x16b, src_temp8_8x16b); /* col=3*/
/* loding 8-bit 16 pixels */
src_temp15_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + pi2_ref_main_idx5 - row - row - 8)); /* col=5*/
src_temp16_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + pi2_ref_main_idx6 - row - row - 8)); /* col=6*/
src_temp17_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + pi2_ref_main_idx7 - row - row - 8)); /* col=7*/
src_temp18_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + pi2_ref_main_idx8 - row - row - 8)); /* col=8*/
src_temp11_8x16b = _mm_srli_si128(src_temp15_8x16b, 2); /* col=5*/
src_temp12_8x16b = _mm_srli_si128(src_temp16_8x16b, 2); /* col=6*/
src_temp13_8x16b = _mm_srli_si128(src_temp17_8x16b, 2); /* col=7*/
src_temp14_8x16b = _mm_srli_si128(src_temp18_8x16b, 2); /* col=8*/
src_temp11_8x16b = _mm_unpacklo_epi8(src_temp11_8x16b, src_temp15_8x16b); /* col=0*/
src_temp12_8x16b = _mm_unpacklo_epi8(src_temp12_8x16b, src_temp16_8x16b); /* col=1*/
src_temp13_8x16b = _mm_unpacklo_epi8(src_temp13_8x16b, src_temp17_8x16b); /* col=2*/
src_temp14_8x16b = _mm_unpacklo_epi8(src_temp14_8x16b, src_temp18_8x16b); /* col=3*/
/* fract*(pu1_ref[ref_main_idx + 1]- pu1_ref[ref_main_idx]) */
src_temp1_8x16b = _mm_maddubs_epi16(src_temp1_8x16b, temp1_8x16b);
src_temp2_8x16b = _mm_maddubs_epi16(src_temp2_8x16b, temp2_8x16b);
src_temp3_8x16b = _mm_maddubs_epi16(src_temp3_8x16b, temp3_8x16b);
src_temp4_8x16b = _mm_maddubs_epi16(src_temp4_8x16b, temp4_8x16b);
/* fract*(pu1_ref[ref_main_idx + 1]- pu1_ref[ref_main_idx]) */
src_temp11_8x16b = _mm_maddubs_epi16(src_temp11_8x16b, temp11_8x16b);
src_temp12_8x16b = _mm_maddubs_epi16(src_temp12_8x16b, temp12_8x16b);
src_temp13_8x16b = _mm_maddubs_epi16(src_temp13_8x16b, temp13_8x16b);
src_temp14_8x16b = _mm_maddubs_epi16(src_temp14_8x16b, temp14_8x16b);
/*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
src_temp1_8x16b = _mm_add_epi16(src_temp1_8x16b, const_temp_4x32b);
src_temp2_8x16b = _mm_add_epi16(src_temp2_8x16b, const_temp_4x32b);
src_temp3_8x16b = _mm_add_epi16(src_temp3_8x16b, const_temp_4x32b);
src_temp4_8x16b = _mm_add_epi16(src_temp4_8x16b, const_temp_4x32b);
/*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
src_temp1_8x16b = _mm_srai_epi16(src_temp1_8x16b, 5); /* col=0*/
src_temp2_8x16b = _mm_srai_epi16(src_temp2_8x16b, 5); /* col=1*/
src_temp3_8x16b = _mm_srai_epi16(src_temp3_8x16b, 5); /* col=2*/
src_temp4_8x16b = _mm_srai_epi16(src_temp4_8x16b, 5); /* col=3*/
/*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
src_temp11_8x16b = _mm_add_epi16(src_temp11_8x16b, const_temp_4x32b);
src_temp12_8x16b = _mm_add_epi16(src_temp12_8x16b, const_temp_4x32b);
src_temp13_8x16b = _mm_add_epi16(src_temp13_8x16b, const_temp_4x32b);
src_temp14_8x16b = _mm_add_epi16(src_temp14_8x16b, const_temp_4x32b);
/*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
src_temp11_8x16b = _mm_srai_epi16(src_temp11_8x16b, 5); /* col=5*/
src_temp12_8x16b = _mm_srai_epi16(src_temp12_8x16b, 5); /* col=6*/
src_temp13_8x16b = _mm_srai_epi16(src_temp13_8x16b, 5); /* col=7*/
src_temp14_8x16b = _mm_srai_epi16(src_temp14_8x16b, 5); /* col=8*/
/* converting 16 bit to 8 bit */
src_temp1_8x16b = _mm_packus_epi16(src_temp1_8x16b, zero_8x16b); /* col=0*/
src_temp2_8x16b = _mm_packus_epi16(src_temp2_8x16b, zero_8x16b); /* col=1*/
src_temp3_8x16b = _mm_packus_epi16(src_temp3_8x16b, zero_8x16b); /* col=2*/
src_temp4_8x16b = _mm_packus_epi16(src_temp4_8x16b, zero_8x16b); /* col=3*/
src_temp1_8x16b = _mm_shuffle_epi8(src_temp1_8x16b, sm1);
src_temp2_8x16b = _mm_shuffle_epi8(src_temp2_8x16b, sm1);
src_temp3_8x16b = _mm_shuffle_epi8(src_temp3_8x16b, sm1);
src_temp4_8x16b = _mm_shuffle_epi8(src_temp4_8x16b, sm1);
/* converting 16 bit to 8 bit */
src_temp11_8x16b = _mm_packus_epi16(src_temp11_8x16b, zero_8x16b); /* col=5*/
src_temp12_8x16b = _mm_packus_epi16(src_temp12_8x16b, zero_8x16b); /* col=6*/
src_temp13_8x16b = _mm_packus_epi16(src_temp13_8x16b, zero_8x16b); /* col=7*/
src_temp14_8x16b = _mm_packus_epi16(src_temp14_8x16b, zero_8x16b); /* col=8*/
src_temp11_8x16b = _mm_shuffle_epi8(src_temp11_8x16b, sm1);
src_temp12_8x16b = _mm_shuffle_epi8(src_temp12_8x16b, sm1);
src_temp13_8x16b = _mm_shuffle_epi8(src_temp13_8x16b, sm1);
src_temp14_8x16b = _mm_shuffle_epi8(src_temp14_8x16b, sm1);
src_temp5_8x16b = _mm_unpacklo_epi16(src_temp1_8x16b, src_temp2_8x16b);
src_temp6_8x16b = _mm_unpacklo_epi16(src_temp3_8x16b, src_temp4_8x16b);
src_temp8_8x16b = _mm_unpacklo_epi32(src_temp5_8x16b, src_temp6_8x16b);
src_temp7_8x16b = _mm_unpackhi_epi32(src_temp5_8x16b, src_temp6_8x16b);
src_temp15_8x16b = _mm_unpacklo_epi16(src_temp11_8x16b, src_temp12_8x16b);
src_temp16_8x16b = _mm_unpacklo_epi16(src_temp13_8x16b, src_temp14_8x16b);
src_temp18_8x16b = _mm_unpacklo_epi32(src_temp15_8x16b, src_temp16_8x16b);
src_temp17_8x16b = _mm_unpackhi_epi32(src_temp15_8x16b, src_temp16_8x16b);
src_temp11_8x16b = _mm_unpacklo_epi64(src_temp8_8x16b, src_temp18_8x16b);
src_temp12_8x16b = _mm_unpackhi_epi64(src_temp8_8x16b, src_temp18_8x16b);
src_temp13_8x16b = _mm_unpacklo_epi64(src_temp7_8x16b, src_temp17_8x16b);
src_temp14_8x16b = _mm_unpackhi_epi64(src_temp7_8x16b, src_temp17_8x16b);
_mm_storeu_si128((__m128i *)(pu1_dst + col + (dst_strd * row)), src_temp11_8x16b); /* row=0*/
_mm_storeu_si128((__m128i *)(pu1_dst + col + (dst_strd * (row + 1))), src_temp12_8x16b); /* row=1*/
_mm_storeu_si128((__m128i *)(pu1_dst + col + (dst_strd * (row + 2))), src_temp13_8x16b); /* row=2*/
_mm_storeu_si128((__m128i *)(pu1_dst + col + (dst_strd * (row + 3))), src_temp14_8x16b); /* row=4*/
}
}
}
}
/**
*******************************************************************************
*
* @brief
* Intraprediction for mode 11 to 17 (negative angle, horizontal mode )
* with reference neighboring samples location pointed by 'pu1_ref' to the
* TU block location pointed by 'pu1_dst'
*
* @par Description:
*
*
* @param[in] pu1_ref
* UWORD8 pointer to the reference neighboring samples
*
* @param[in] pu1_dst
* UWORD8 pointer to the destination
*
* @param[in] src_strd
* integer source stride
*
* @param[in] dst_strd
* integer destination stride
*
* @param[in] nt
* integer Transform Block size
*
* @param[in] mode
* integer intraprediction mode
*
* @returns
*
* @remarks
* None
*
*******************************************************************************
*/
void ihevc_intra_pred_chroma_mode_11_to_17_ssse3(UWORD8 *pu1_ref,
WORD32 src_strd,
UWORD8 *pu1_dst,
WORD32 dst_strd,
WORD32 nt,
WORD32 mode)
{
/* Overview (as implemented below):                                          */
/* 1. Copy nt + 1 two-byte pairs from pu1_ref, in reverse order, into the    */
/*    temporary array ref_temp.                                              */
/* 2. Fill the entries at negative ref_main offsets from pu1_ref using the   */
/*    inverse-angle table.                                                   */
/* 3. For each 16-bit lane (labelled "col" in the comments) compute          */
/*    pos = (lane + 1) * intra_pred_ang, fract = pos & 31 and a byte offset  */
/*    into ref_main, blend two reference bytes with the weights              */
/*    (32 - fract, fract), then transpose the per-lane results into          */
/*    destination rows.                                                      */
/* Samples are addressed as 2-byte pairs throughout; presumably these are    */
/* interleaved Cb/Cr chroma samples (TODO confirm against the caller).       */
/*                                                                           */
/* Original author's note: this function and                                 */
/* ihevc_intra_pred_chroma_mode_19_to_25 are the same except for the ref     */
/* main & side samples assignment, and can be combined for optimization.     */
WORD32 row, col, k;
WORD32 intra_pred_ang, inv_ang, inv_ang_sum;
WORD32 ref_idx;
/* NOTE: despite the _4x32b suffix, these vectors hold 16-bit lanes in both */
/* the nt == 4 branch and the else branch below. */
__m128i const_temp_4x32b, const_temp2_4x32b, const_temp3_4x32b, const_temp4_4x32b;
__m128i fract_4x32b, zero_8x16b, intra_pred_ang_4x32b;
__m128i row_4x32b, two_nt_4x32b, ref_main_idx_4x32b, res_temp5_4x32b;
/* Scratch copy of the reference samples; ref_main points into it so that */
/* negative offsets are addressable. */
UWORD8 ref_temp[2 * MAX_CU_SIZE + 2];
UWORD8 *ref_main;
UNUSED(src_strd);
/* Rounding offset for the (inv_ang_sum >> 8) used in the projection loop */
inv_ang_sum = 128;
intra_pred_ang = gai4_ihevc_ang_table[mode];
inv_ang = gai4_ihevc_inv_ang_table[mode - 11];
/* Intermediate reference samples for negative angle modes */
/* This has to be removed during optimization */
/* For horizontal modes, (ref main = ref left) (ref side = ref above) */
/* NOTE(review): the assignment below is overwritten before ref_main is */
/* read (the copy loop indexes ref_temp directly). */
ref_main = ref_temp + 2 * nt;
/* Copy nt + 1 byte pairs, walking pu1_ref downwards from offset 4 * nt, */
/* into ref_temp starting at offset 2 * (nt - 1). */
for(k = 0; k < (2 * (nt + 1)); k += 2)
{
ref_temp[k + (2 * (nt - 1))] = pu1_ref[(4 * nt) - k];
ref_temp[k + 1 + (2 * (nt - 1))] = pu1_ref[(4 * nt) - k + 1];
}
/* ref_main[0] now aliases the first pair copied above (pu1_ref[4 * nt]) */
ref_main = ref_temp + (2 * (nt - 1));
ref_idx = (nt * intra_pred_ang) >> 5;
/* SIMD Optimization can be done using look-up table for the loop */
/* For negative angles, derive the main reference samples from the side */
/* reference samples; refer to section 8.4.4.2.6 */
/* k is a byte offset (2 bytes per pair); the loop only executes when */
/* 2 * ref_idx is below -2. */
for(k = -2; k > (2 * ref_idx); k -= 2)
{
inv_ang_sum += inv_ang;
ref_main[k] = pu1_ref[(4 * nt) + ((inv_ang_sum >> 8) << 1)];
ref_main[k + 1] = pu1_ref[((4 * nt) + 1) + ((inv_ang_sum >> 8) << 1)];
}
/* For the angles other than 45 degree, interpolation between 2 neighboring */
/* samples dependent on distance to obtain destination sample */
/* NOTE(review): const_temp2_4x32b, const_temp4_4x32b, two_nt_4x32b, */
/* intra_pred_ang_4x32b and row_4x32b are re-initialised with 16-bit lanes */
/* in both branches below, so the 32-bit initialisations here are never read. */
const_temp_4x32b = _mm_set1_epi16(16); /* rounding term added before >> 5 */
const_temp2_4x32b = _mm_set1_epi32(31);
const_temp3_4x32b = _mm_set1_epi16(32); /* used to form (32 - fract) */
const_temp4_4x32b = _mm_set1_epi32(4);
two_nt_4x32b = _mm_set1_epi32(1);
zero_8x16b = _mm_set1_epi16(0);
intra_pred_ang_4x32b = _mm_set1_epi32(intra_pred_ang);
row_4x32b = _mm_set_epi32(4, 3, 2, 1);
if(nt == 4)
{
/* 4x4 block: a single pass producing four rows of 8 bytes each */
intra_pred_ang_4x32b = _mm_set1_epi16(intra_pred_ang);
row_4x32b = _mm_set_epi16(8, 7, 6, 5, 4, 3, 2, 1); /* lane i holds i + 1 */
const_temp2_4x32b = _mm_set1_epi16(31);
const_temp4_4x32b = _mm_set1_epi16(4);
two_nt_4x32b = _mm_set1_epi16(1);
{
WORD16 pi2_ref_main_idx1, pi2_ref_main_idx2, pi2_ref_main_idx3, pi2_ref_main_idx4;
WORD8 ai1_fract_temp_val[16], ai1_src_temp_val[16];
__m128i fract1_8x16b, fract2_8x16b, fract3_8x16b, fract4_8x16b;
__m128i src_values10;
__m128i temp1_8x16b, temp2_8x16b, temp3_8x16b, temp4_8x16b;
/* pos = ((lane + 1) * intra_pred_ang), one value per 16-bit lane */
res_temp5_4x32b = _mm_mullo_epi16(row_4x32b, intra_pred_ang_4x32b);
/* fract = pos & (31); */
fract_4x32b = _mm_and_si128(res_temp5_4x32b, const_temp2_4x32b);
/* byte offset into ref_main = 2 * (1 + (pos >> 5)) */
ref_main_idx_4x32b = _mm_add_epi16(two_nt_4x32b, _mm_srai_epi16(res_temp5_4x32b, 5));
ref_main_idx_4x32b = _mm_add_epi16(ref_main_idx_4x32b, ref_main_idx_4x32b);
/* NOTE(review): row_4x32b is not read again in this branch */
row_4x32b = _mm_add_epi16(row_4x32b, const_temp4_4x32b);
/*(32 - fract) */
src_values10 = _mm_sub_epi16(const_temp3_4x32b, fract_4x32b);
/* Spill the low four 16-bit lanes; bytes 0, 2, 4, 6 are their low bytes */
_mm_storel_epi64((__m128i *)(ai1_fract_temp_val), fract_4x32b);
_mm_storel_epi64((__m128i *)(ai1_src_temp_val), src_values10);
fract1_8x16b = _mm_set1_epi8(ai1_fract_temp_val[0]); /* col=0*/
fract2_8x16b = _mm_set1_epi8(ai1_fract_temp_val[2]); /* col=1*/
fract3_8x16b = _mm_set1_epi8(ai1_fract_temp_val[4]); /* col=2*/
fract4_8x16b = _mm_set1_epi8(ai1_fract_temp_val[6]); /* col=3*/
temp1_8x16b = _mm_set1_epi8(ai1_src_temp_val[0]); /* col=0*/
temp2_8x16b = _mm_set1_epi8(ai1_src_temp_val[2]); /* col=1*/
temp3_8x16b = _mm_set1_epi8(ai1_src_temp_val[4]); /* col=2*/
temp4_8x16b = _mm_set1_epi8(ai1_src_temp_val[6]); /* col=3*/
/* Interleave into byte pairs (32 - fract, fract) for _mm_maddubs_epi16 */
temp1_8x16b = _mm_unpacklo_epi8(temp1_8x16b, fract1_8x16b);
temp2_8x16b = _mm_unpacklo_epi8(temp2_8x16b, fract2_8x16b);
temp3_8x16b = _mm_unpacklo_epi8(temp3_8x16b, fract3_8x16b);
temp4_8x16b = _mm_unpacklo_epi8(temp4_8x16b, fract4_8x16b);
/* Extract the per-lane byte offsets into WORD16 scalars */
pi2_ref_main_idx1 = _mm_extract_epi16(ref_main_idx_4x32b, 0); /* col=0*/
pi2_ref_main_idx2 = _mm_extract_epi16(ref_main_idx_4x32b, 1); /* col=1*/
pi2_ref_main_idx3 = _mm_extract_epi16(ref_main_idx_4x32b, 2); /* col=2*/
pi2_ref_main_idx4 = _mm_extract_epi16(ref_main_idx_4x32b, 3); /* col=3*/
{
__m128i src_temp1_8x16b, src_temp2_8x16b, src_temp3_8x16b, src_temp4_8x16b;
__m128i src_temp5_8x16b, src_temp6_8x16b, src_temp7_8x16b, src_temp8_8x16b;
/* load 16 bytes per lane; only bytes 0..9 are consumed below */
src_temp5_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx1)); /* col=0*/
src_temp6_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx2)); /* col=1*/
src_temp7_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx3)); /* col=2*/
src_temp8_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx4)); /* col=3*/
/* shift by one 2-byte pair so byte i lines up with byte i + 2 */
src_temp1_8x16b = _mm_srli_si128(src_temp5_8x16b, 2); /* col=0*/
src_temp2_8x16b = _mm_srli_si128(src_temp6_8x16b, 2); /* col=1*/
src_temp3_8x16b = _mm_srli_si128(src_temp7_8x16b, 2); /* col=2*/
src_temp4_8x16b = _mm_srli_si128(src_temp8_8x16b, 2); /* col=3*/
/* form byte pairs (ref[i], ref[i + 2]) */
src_temp1_8x16b = _mm_unpacklo_epi8(src_temp5_8x16b, src_temp1_8x16b); /* col=0*/
src_temp2_8x16b = _mm_unpacklo_epi8(src_temp6_8x16b, src_temp2_8x16b); /* col=1*/
src_temp3_8x16b = _mm_unpacklo_epi8(src_temp7_8x16b, src_temp3_8x16b); /* col=2*/
src_temp4_8x16b = _mm_unpacklo_epi8(src_temp8_8x16b, src_temp4_8x16b); /* col=3*/
/* (32 - fract) * ref[i] + fract * ref[i + 2], as 16-bit sums */
src_temp1_8x16b = _mm_maddubs_epi16(src_temp1_8x16b, temp1_8x16b);
src_temp2_8x16b = _mm_maddubs_epi16(src_temp2_8x16b, temp2_8x16b);
src_temp3_8x16b = _mm_maddubs_epi16(src_temp3_8x16b, temp3_8x16b);
src_temp4_8x16b = _mm_maddubs_epi16(src_temp4_8x16b, temp4_8x16b);
/* add the rounding term: (... + 16) */
src_temp1_8x16b = _mm_add_epi16(src_temp1_8x16b, const_temp_4x32b);
src_temp2_8x16b = _mm_add_epi16(src_temp2_8x16b, const_temp_4x32b);
src_temp3_8x16b = _mm_add_epi16(src_temp3_8x16b, const_temp_4x32b);
src_temp4_8x16b = _mm_add_epi16(src_temp4_8x16b, const_temp_4x32b);
/* (... + 16) >> 5 */
src_temp1_8x16b = _mm_srai_epi16(src_temp1_8x16b, 5); /* col=0*/
src_temp2_8x16b = _mm_srai_epi16(src_temp2_8x16b, 5); /* col=1*/
src_temp3_8x16b = _mm_srai_epi16(src_temp3_8x16b, 5); /* col=2*/
src_temp4_8x16b = _mm_srai_epi16(src_temp4_8x16b, 5); /* col=3*/
/* converting 16 bit to 8 bit */
src_temp1_8x16b = _mm_packus_epi16(src_temp1_8x16b, zero_8x16b); /* col=0*/
src_temp2_8x16b = _mm_packus_epi16(src_temp2_8x16b, zero_8x16b); /* col=1*/
src_temp3_8x16b = _mm_packus_epi16(src_temp3_8x16b, zero_8x16b); /* col=2*/
src_temp4_8x16b = _mm_packus_epi16(src_temp4_8x16b, zero_8x16b); /* col=3*/
/* 4x4 transpose of 2-byte pairs: per-lane vectors -> destination rows */
src_temp5_8x16b = _mm_unpacklo_epi16(src_temp1_8x16b, src_temp2_8x16b);
src_temp6_8x16b = _mm_unpacklo_epi16(src_temp3_8x16b, src_temp4_8x16b);
src_temp8_8x16b = _mm_unpacklo_epi32(src_temp5_8x16b, src_temp6_8x16b); /* rows 0 and 1 */
src_temp7_8x16b = _mm_unpackhi_epi32(src_temp5_8x16b, src_temp6_8x16b); /* rows 2 and 3 */
_mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd * 0)), src_temp8_8x16b); /* row=0*/
/* move the upper 64 bits down so they can be stored as row 1 */
src_temp2_8x16b = _mm_shuffle_epi32(src_temp8_8x16b, _MM_SHUFFLE(3, 2, 3, 2));
_mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd * (1))), src_temp2_8x16b); /* row=1*/
_mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd * (2))), src_temp7_8x16b); /* row=2*/
/* move the upper 64 bits down so they can be stored as row 3 */
src_temp4_8x16b = _mm_shuffle_epi32(src_temp7_8x16b, _MM_SHUFFLE(3, 2, 3, 2));
_mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd * (3))), src_temp4_8x16b); /* row=3*/
}
}
}
else
{
/* nt != 4: each outer iteration covers 8 lanes (16 destination bytes per */
/* row); the inner loop walks the rows four at a time. */
intra_pred_ang_4x32b = _mm_set1_epi16(intra_pred_ang);
row_4x32b = _mm_set_epi16(8, 7, 6, 5, 4, 3, 2, 1); /* lane i holds i + 1 */
const_temp2_4x32b = _mm_set1_epi16(31);
const_temp4_4x32b = _mm_set1_epi16(8);
two_nt_4x32b = _mm_set1_epi16(1);
/* col is a byte offset into the destination row */
for(col = 0; col < 2 * nt; col += 16)
{
WORD16 pi2_ref_main_idx1, pi2_ref_main_idx2, pi2_ref_main_idx3, pi2_ref_main_idx4;
WORD16 pi2_ref_main_idx5, pi2_ref_main_idx6, pi2_ref_main_idx7, pi2_ref_main_idx8;
WORD8 ai1_fract_temp_val[16], ai1_src_temp_val[16];
__m128i fract1_8x16b, fract2_8x16b, fract3_8x16b, fract4_8x16b;
__m128i fract5_8x16b, fract6_8x16b, fract7_8x16b, fract8_8x16b, src_values10;
__m128i temp1_8x16b, temp2_8x16b, temp3_8x16b, temp4_8x16b;
__m128i temp11_8x16b, temp12_8x16b, temp13_8x16b, temp14_8x16b;
/* pos = ((lane + 1) * intra_pred_ang), one value per 16-bit lane */
res_temp5_4x32b = _mm_mullo_epi16(row_4x32b, intra_pred_ang_4x32b);
/* fract = pos & (31); */
fract_4x32b = _mm_and_si128(res_temp5_4x32b, const_temp2_4x32b);
/* byte offset into ref_main = 2 * (1 + (pos >> 5)) */
ref_main_idx_4x32b = _mm_add_epi16(two_nt_4x32b, _mm_srai_epi16(res_temp5_4x32b, 5));
ref_main_idx_4x32b = _mm_add_epi16(ref_main_idx_4x32b, ref_main_idx_4x32b);
/* advance the lane numbers by 8 for the next outer iteration */
row_4x32b = _mm_add_epi16(row_4x32b, const_temp4_4x32b);
/*(32 - fract) */
src_values10 = _mm_sub_epi16(const_temp3_4x32b, fract_4x32b);
/* Spill all eight 16-bit lanes; even byte indices are their low bytes */
_mm_storeu_si128((__m128i *)(ai1_fract_temp_val), fract_4x32b);
_mm_storeu_si128((__m128i *)(ai1_src_temp_val), src_values10);
fract1_8x16b = _mm_set1_epi8(ai1_fract_temp_val[0]); /* col=0*/
fract2_8x16b = _mm_set1_epi8(ai1_fract_temp_val[2]); /* col=1*/
fract3_8x16b = _mm_set1_epi8(ai1_fract_temp_val[4]); /* col=2*/
fract4_8x16b = _mm_set1_epi8(ai1_fract_temp_val[6]); /* col=3*/
temp1_8x16b = _mm_set1_epi8(ai1_src_temp_val[0]); /* col=0*/
temp2_8x16b = _mm_set1_epi8(ai1_src_temp_val[2]); /* col=1*/
temp3_8x16b = _mm_set1_epi8(ai1_src_temp_val[4]); /* col=2*/
temp4_8x16b = _mm_set1_epi8(ai1_src_temp_val[6]); /* col=3*/
/* Interleave into byte pairs (32 - fract, fract) for _mm_maddubs_epi16 */
temp1_8x16b = _mm_unpacklo_epi8(temp1_8x16b, fract1_8x16b);
temp2_8x16b = _mm_unpacklo_epi8(temp2_8x16b, fract2_8x16b);
temp3_8x16b = _mm_unpacklo_epi8(temp3_8x16b, fract3_8x16b);
temp4_8x16b = _mm_unpacklo_epi8(temp4_8x16b, fract4_8x16b);
pi2_ref_main_idx1 = _mm_extract_epi16(ref_main_idx_4x32b, 0); /* col=0*/
pi2_ref_main_idx2 = _mm_extract_epi16(ref_main_idx_4x32b, 1); /* col=1*/
pi2_ref_main_idx3 = _mm_extract_epi16(ref_main_idx_4x32b, 2); /* col=2*/
pi2_ref_main_idx4 = _mm_extract_epi16(ref_main_idx_4x32b, 3); /* col=3*/
fract5_8x16b = _mm_set1_epi8(ai1_fract_temp_val[8]); /* col=4*/
fract6_8x16b = _mm_set1_epi8(ai1_fract_temp_val[10]); /* col=5*/
fract7_8x16b = _mm_set1_epi8(ai1_fract_temp_val[12]); /* col=6*/
fract8_8x16b = _mm_set1_epi8(ai1_fract_temp_val[14]); /* col=7*/
temp11_8x16b = _mm_set1_epi8(ai1_src_temp_val[8]); /* col=4*/
temp12_8x16b = _mm_set1_epi8(ai1_src_temp_val[10]); /* col=5*/
temp13_8x16b = _mm_set1_epi8(ai1_src_temp_val[12]); /* col=6*/
temp14_8x16b = _mm_set1_epi8(ai1_src_temp_val[14]); /* col=7*/
temp11_8x16b = _mm_unpacklo_epi8(temp11_8x16b, fract5_8x16b);
temp12_8x16b = _mm_unpacklo_epi8(temp12_8x16b, fract6_8x16b);
temp13_8x16b = _mm_unpacklo_epi8(temp13_8x16b, fract7_8x16b);
temp14_8x16b = _mm_unpacklo_epi8(temp14_8x16b, fract8_8x16b);
pi2_ref_main_idx5 = _mm_extract_epi16(ref_main_idx_4x32b, 4); /* col=4*/
pi2_ref_main_idx6 = _mm_extract_epi16(ref_main_idx_4x32b, 5); /* col=5*/
pi2_ref_main_idx7 = _mm_extract_epi16(ref_main_idx_4x32b, 6); /* col=6*/
pi2_ref_main_idx8 = _mm_extract_epi16(ref_main_idx_4x32b, 7); /* col=7*/
for(row = 0; row < nt; row += 4)
{
__m128i src_temp1_8x16b, src_temp2_8x16b, src_temp3_8x16b, src_temp4_8x16b;
__m128i src_temp5_8x16b, src_temp6_8x16b, src_temp7_8x16b, src_temp8_8x16b;
__m128i src_temp11_8x16b, src_temp12_8x16b, src_temp13_8x16b, src_temp14_8x16b;
__m128i src_temp15_8x16b, src_temp16_8x16b, src_temp17_8x16b, src_temp18_8x16b;
/* load 16 bytes per lane at a further offset of 2 * row bytes; only */
/* bytes 0..9 of each load are consumed below */
src_temp5_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx1 + row + row)); /* col=0*/
src_temp6_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx2 + row + row)); /* col=1*/
src_temp7_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx3 + row + row)); /* col=2*/
src_temp8_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx4 + row + row)); /* col=3*/
/* shift by one 2-byte pair so byte i lines up with byte i + 2 */
src_temp1_8x16b = _mm_srli_si128(src_temp5_8x16b, 2); /* col=0*/
src_temp2_8x16b = _mm_srli_si128(src_temp6_8x16b, 2); /* col=1*/
src_temp3_8x16b = _mm_srli_si128(src_temp7_8x16b, 2); /* col=2*/
src_temp4_8x16b = _mm_srli_si128(src_temp8_8x16b, 2); /* col=3*/
/* form byte pairs (ref[i], ref[i + 2]) */
src_temp1_8x16b = _mm_unpacklo_epi8(src_temp5_8x16b, src_temp1_8x16b); /* col=0*/
src_temp2_8x16b = _mm_unpacklo_epi8(src_temp6_8x16b, src_temp2_8x16b); /* col=1*/
src_temp3_8x16b = _mm_unpacklo_epi8(src_temp7_8x16b, src_temp3_8x16b); /* col=2*/
src_temp4_8x16b = _mm_unpacklo_epi8(src_temp8_8x16b, src_temp4_8x16b); /* col=3*/
/* same load / shift / interleave sequence for lanes 4..7 */
src_temp15_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx5 + row + row)); /* col=4*/
src_temp16_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx6 + row + row)); /* col=5*/
src_temp17_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx7 + row + row)); /* col=6*/
src_temp18_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx8 + row + row)); /* col=7*/
src_temp11_8x16b = _mm_srli_si128(src_temp15_8x16b, 2); /* col=4*/
src_temp12_8x16b = _mm_srli_si128(src_temp16_8x16b, 2); /* col=5*/
src_temp13_8x16b = _mm_srli_si128(src_temp17_8x16b, 2); /* col=6*/
src_temp14_8x16b = _mm_srli_si128(src_temp18_8x16b, 2); /* col=7*/
src_temp11_8x16b = _mm_unpacklo_epi8(src_temp15_8x16b, src_temp11_8x16b); /* col=4*/
src_temp12_8x16b = _mm_unpacklo_epi8(src_temp16_8x16b, src_temp12_8x16b); /* col=5*/
src_temp13_8x16b = _mm_unpacklo_epi8(src_temp17_8x16b, src_temp13_8x16b); /* col=6*/
src_temp14_8x16b = _mm_unpacklo_epi8(src_temp18_8x16b, src_temp14_8x16b); /* col=7*/
/* (32 - fract) * ref[i] + fract * ref[i + 2], lanes 0..3 */
src_temp1_8x16b = _mm_maddubs_epi16(src_temp1_8x16b, temp1_8x16b);
src_temp2_8x16b = _mm_maddubs_epi16(src_temp2_8x16b, temp2_8x16b);
src_temp3_8x16b = _mm_maddubs_epi16(src_temp3_8x16b, temp3_8x16b);
src_temp4_8x16b = _mm_maddubs_epi16(src_temp4_8x16b, temp4_8x16b);
/* (32 - fract) * ref[i] + fract * ref[i + 2], lanes 4..7 */
src_temp11_8x16b = _mm_maddubs_epi16(src_temp11_8x16b, temp11_8x16b);
src_temp12_8x16b = _mm_maddubs_epi16(src_temp12_8x16b, temp12_8x16b);
src_temp13_8x16b = _mm_maddubs_epi16(src_temp13_8x16b, temp13_8x16b);
src_temp14_8x16b = _mm_maddubs_epi16(src_temp14_8x16b, temp14_8x16b);
/* add the rounding term: (... + 16) */
src_temp1_8x16b = _mm_add_epi16(src_temp1_8x16b, const_temp_4x32b);
src_temp2_8x16b = _mm_add_epi16(src_temp2_8x16b, const_temp_4x32b);
src_temp3_8x16b = _mm_add_epi16(src_temp3_8x16b, const_temp_4x32b);
src_temp4_8x16b = _mm_add_epi16(src_temp4_8x16b, const_temp_4x32b);
/* (... + 16) >> 5 */
src_temp1_8x16b = _mm_srai_epi16(src_temp1_8x16b, 5); /* col=0*/
src_temp2_8x16b = _mm_srai_epi16(src_temp2_8x16b, 5); /* col=1*/
src_temp3_8x16b = _mm_srai_epi16(src_temp3_8x16b, 5); /* col=2*/
src_temp4_8x16b = _mm_srai_epi16(src_temp4_8x16b, 5); /* col=3*/
/* add the rounding term: (... + 16) */
src_temp11_8x16b = _mm_add_epi16(src_temp11_8x16b, const_temp_4x32b);
src_temp12_8x16b = _mm_add_epi16(src_temp12_8x16b, const_temp_4x32b);
src_temp13_8x16b = _mm_add_epi16(src_temp13_8x16b, const_temp_4x32b);
src_temp14_8x16b = _mm_add_epi16(src_temp14_8x16b, const_temp_4x32b);
/* (... + 16) >> 5 */
src_temp11_8x16b = _mm_srai_epi16(src_temp11_8x16b, 5); /* col=4*/
src_temp12_8x16b = _mm_srai_epi16(src_temp12_8x16b, 5); /* col=5*/
src_temp13_8x16b = _mm_srai_epi16(src_temp13_8x16b, 5); /* col=6*/
src_temp14_8x16b = _mm_srai_epi16(src_temp14_8x16b, 5); /* col=7*/
/* converting 16 bit to 8 bit */
src_temp1_8x16b = _mm_packus_epi16(src_temp1_8x16b, zero_8x16b); /* col=0*/
src_temp2_8x16b = _mm_packus_epi16(src_temp2_8x16b, zero_8x16b); /* col=1*/
src_temp3_8x16b = _mm_packus_epi16(src_temp3_8x16b, zero_8x16b); /* col=2*/
src_temp4_8x16b = _mm_packus_epi16(src_temp4_8x16b, zero_8x16b); /* col=3*/
/* converting 16 bit to 8 bit */
src_temp11_8x16b = _mm_packus_epi16(src_temp11_8x16b, zero_8x16b); /* col=4*/
src_temp12_8x16b = _mm_packus_epi16(src_temp12_8x16b, zero_8x16b); /* col=5*/
src_temp13_8x16b = _mm_packus_epi16(src_temp13_8x16b, zero_8x16b); /* col=6*/
src_temp14_8x16b = _mm_packus_epi16(src_temp14_8x16b, zero_8x16b); /* col=7*/
/* transpose 2-byte pairs: eight per-lane vectors -> four 16-byte rows */
src_temp5_8x16b = _mm_unpacklo_epi16(src_temp1_8x16b, src_temp2_8x16b);
src_temp6_8x16b = _mm_unpacklo_epi16(src_temp3_8x16b, src_temp4_8x16b);
src_temp8_8x16b = _mm_unpacklo_epi32(src_temp5_8x16b, src_temp6_8x16b);
src_temp7_8x16b = _mm_unpackhi_epi32(src_temp5_8x16b, src_temp6_8x16b);
src_temp15_8x16b = _mm_unpacklo_epi16(src_temp11_8x16b, src_temp12_8x16b);
src_temp16_8x16b = _mm_unpacklo_epi16(src_temp13_8x16b, src_temp14_8x16b);
src_temp18_8x16b = _mm_unpacklo_epi32(src_temp15_8x16b, src_temp16_8x16b);
src_temp17_8x16b = _mm_unpackhi_epi32(src_temp15_8x16b, src_temp16_8x16b);
/* join the lane 0..3 half and the lane 4..7 half of every row */
src_temp11_8x16b = _mm_unpacklo_epi64(src_temp8_8x16b, src_temp18_8x16b);
src_temp12_8x16b = _mm_unpackhi_epi64(src_temp8_8x16b, src_temp18_8x16b);
src_temp13_8x16b = _mm_unpacklo_epi64(src_temp7_8x16b, src_temp17_8x16b);
src_temp14_8x16b = _mm_unpackhi_epi64(src_temp7_8x16b, src_temp17_8x16b);
/* store four rows of 16 bytes at byte offset col */
_mm_storeu_si128((__m128i *)(pu1_dst + col + (dst_strd * row)), src_temp11_8x16b); /* row + 0 */
_mm_storeu_si128((__m128i *)(pu1_dst + col + (dst_strd * (row + 1))), src_temp12_8x16b); /* row + 1 */
_mm_storeu_si128((__m128i *)(pu1_dst + col + (dst_strd * (row + 2))), src_temp13_8x16b); /* row + 2 */
_mm_storeu_si128((__m128i *)(pu1_dst + col + (dst_strd * (row + 3))), src_temp14_8x16b); /* row + 3 */
}
}
}
}
/**
*******************************************************************************
*
* @brief
* Intraprediction for mode 19 to 25 (negative angle, vertical mode ) with
* reference neighboring samples location pointed by 'pu1_ref' to the TU
* block location pointed by 'pu1_dst'
*
* @par Description:
*
*
* @param[in] pu1_ref
* UWORD8 pointer to the reference (neighboring) samples
*
* @param[out] pu1_dst
* UWORD8 pointer to the destination
*
* @param[in] src_strd
* integer source stride (unused by this function)
*
* @param[in] dst_strd
* integer destination stride
*
* @param[in] nt
* integer Transform Block size
*
* @param[in] mode
* integer intraprediction mode
*
* @returns
*
* @remarks
* None
*
*******************************************************************************
*/
void ihevc_intra_pred_chroma_mode_19_to_25_ssse3(UWORD8 *pu1_ref,
WORD32 src_strd,
UWORD8 *pu1_dst,
WORD32 dst_strd,
WORD32 nt,
WORD32 mode)
{
/* Overview (as implemented below):                                          */
/* 1. Copy nt + 1 two-byte pairs from pu1_ref (starting at offset 4 * nt,    */
/*    in forward order) into the temporary array ref_temp.                   */
/* 2. Fill the entries at negative ref_main offsets from pu1_ref using the   */
/*    inverse-angle table.                                                   */
/* 3. For every row compute pos = (row + 1) * intra_pred_ang,                */
/*    fract = pos & 31 and a byte offset 2 * (pos >> 5) + 2 into ref_main,   */
/*    then blend two reference bytes with the weights (32 - fract, fract).   */
/* Only nt == 4, nt == 8 and nt == 16 write to pu1_dst; for any other nt the */
/* function returns after step 2 without storing anything.                   */
/* Samples are addressed as 2-byte pairs throughout; presumably these are    */
/* interleaved Cb/Cr chroma samples (TODO confirm against the caller).       */
WORD32 row, k;
WORD32 intra_pred_ang, idx;
WORD32 inv_ang, inv_ang_sum, pos, fract;
WORD32 ref_main_idx, ref_idx;
/* Scratch copy of the reference samples; ref_main points into it so that */
/* negative offsets are addressable. */
UWORD8 ref_temp[(2 * MAX_CU_SIZE) + 2];
UWORD8 *ref_main;
__m128i zero_8x16b, fract_8x16b, const_temp_8x16b;
UNUSED(src_strd);
intra_pred_ang = gai4_ihevc_ang_table_chroma[mode];
inv_ang = gai4_ihevc_inv_ang_table_chroma[mode - 12];
/* Intermediate reference samples for negative angle modes */
/* This has to be removed during optimization */
/* For vertical modes, (ref main = ref above) (ref side = ref left) */
/* NOTE(review): the assignment below is overwritten before ref_main is */
/* read (the copy loop indexes ref_temp directly). */
ref_main = ref_temp + 2 * nt;
/* Copy nt + 1 byte pairs, walking pu1_ref upwards from offset 4 * nt, */
/* into ref_temp starting at offset 2 * (nt - 1). */
for(k = 0; k < (2 * (nt + 1)); k += 2)
{
ref_temp[k + (2 * (nt - 1))] = pu1_ref[(4 * nt) + k];
ref_temp[k + 1 + (2 * (nt - 1))] = pu1_ref[(4 * nt) + k + 1];
}
ref_idx = (nt * intra_pred_ang) >> 5;
/* Rounding offset for the (inv_ang_sum >> 8) used in the projection loop */
inv_ang_sum = 128;
/* ref_main[0] now aliases the first pair copied above (pu1_ref[4 * nt]) */
ref_main = ref_temp + (2 * (nt - 1));
/* SIMD Optimization can be done using look-up table for the loop */
/* For negative angles, derive the main reference samples from the side */
/* reference samples; refer to section 8.4.4.2.6 */
/* k is a byte offset (2 bytes per pair); the loop only executes when */
/* 2 * ref_idx is below -2. */
for(k = -2; k > (2 * ref_idx); k -= 2)
{
inv_ang_sum += inv_ang;
ref_main[k] = pu1_ref[(4 * nt) - (inv_ang_sum >> 8) * 2];
ref_main[k + 1] = pu1_ref[((4 * nt) + 1) - (inv_ang_sum >> 8) * 2];
}
const_temp_8x16b = _mm_set1_epi16(16); /* rounding term added before >> 5 */
if(nt == 4) /* 4 rows of 8 bytes, all rows computed in one pass */
{
__m128i const_temp2_4x32b, const_temp3_4x32b;
/* NOTE(review): this zero_8x16b shadows the function-level declaration */
__m128i src_values10, src_values11, zero_8x16b, intra_pred_ang_4x32b;
__m128i row_4x32b, two_nt_4x32b, src_values12;
const_temp2_4x32b = _mm_set1_epi32(31);
const_temp3_4x32b = _mm_set1_epi32(32);
two_nt_4x32b = _mm_set1_epi32(2); /* the constant "+ 2" of the byte offset */
zero_8x16b = _mm_set1_epi16(0);
/* broadcast the angle and the values row + 1 = 1..4 as 16-bit lanes */
intra_pred_ang_4x32b = _mm_set1_epi16(intra_pred_ang);
row_4x32b = _mm_set_epi16(4, 3, 2, 1, 4, 3, 2, 1);
{
WORD32 ref_main_idx1, ref_main_idx2, ref_main_idx3, ref_main_idx4;
WORD8 ai1_src_temp0_val[16], ai1_src_temp1_val[16];
__m128i fract1_8x16b, fract2_8x16b, fract3_8x16b, fract4_8x16b, res_temp5_4x32b;
__m128i src_values0, src_values1, src_values2, src_values3, src_values13;
__m128i temp1_8x16b, temp2_8x16b, temp3_8x16b, temp4_8x16b;
__m128i ref_main_temp0, ref_main_temp1, ref_main_temp2, sign_8x16b;
/* pos = ((row + 1) * intra_pred_ang); */
res_temp5_4x32b = _mm_mullo_epi16(row_4x32b, intra_pred_ang_4x32b);
/* sign-extend the low four 16-bit products to 32 bits */
sign_8x16b = _mm_cmpgt_epi16(zero_8x16b, res_temp5_4x32b);
res_temp5_4x32b = _mm_unpacklo_epi16(res_temp5_4x32b, sign_8x16b);
/* byte offset into ref_main = 2 + 2 * (pos >> 5) */
src_values12 = _mm_add_epi32(two_nt_4x32b, _mm_srai_epi32(res_temp5_4x32b, 5));
src_values12 = _mm_add_epi32(src_values12, _mm_srai_epi32(res_temp5_4x32b, 5));
ref_main_temp0 = _mm_srli_si128(src_values12, 4); /* next 32 bit values */
ref_main_temp1 = _mm_srli_si128(src_values12, 8); /* next 32 bit values */
ref_main_temp2 = _mm_srli_si128(src_values12, 12); /* next 32 bit values */
ref_main_idx1 = _mm_cvtsi128_si32(src_values12); /* row=0*/
ref_main_idx2 = _mm_cvtsi128_si32(ref_main_temp0); /* row=1*/
ref_main_idx3 = _mm_cvtsi128_si32(ref_main_temp1); /* row=2*/
ref_main_idx4 = _mm_cvtsi128_si32(ref_main_temp2); /* row=3*/
/* fract = pos & (31); */
src_values11 = _mm_and_si128(res_temp5_4x32b, const_temp2_4x32b);
/*(32 - fract) */
src_values10 = _mm_sub_epi32(const_temp3_4x32b, src_values11);
/* Spill the four 32-bit lanes; bytes 0, 4, 8, 12 are their low bytes */
_mm_storeu_si128((__m128i *)(ai1_src_temp1_val), src_values11);
_mm_storeu_si128((__m128i *)(ai1_src_temp0_val), src_values10);
fract1_8x16b = _mm_set1_epi8(ai1_src_temp1_val[0]); /* row=0*/
fract2_8x16b = _mm_set1_epi8(ai1_src_temp1_val[4]); /* row=1*/
fract3_8x16b = _mm_set1_epi8(ai1_src_temp1_val[8]); /* row=2*/
fract4_8x16b = _mm_set1_epi8(ai1_src_temp1_val[12]); /* row=3*/
temp1_8x16b = _mm_set1_epi8(ai1_src_temp0_val[0]); /* row=0*/
temp2_8x16b = _mm_set1_epi8(ai1_src_temp0_val[4]); /* row=1*/
temp3_8x16b = _mm_set1_epi8(ai1_src_temp0_val[8]); /* row=2*/
temp4_8x16b = _mm_set1_epi8(ai1_src_temp0_val[12]); /* row=3*/
/* Interleave into byte pairs (32 - fract, fract) for _mm_maddubs_epi16 */
temp1_8x16b = _mm_unpacklo_epi8(temp1_8x16b, fract1_8x16b);
temp2_8x16b = _mm_unpacklo_epi8(temp2_8x16b, fract2_8x16b);
temp3_8x16b = _mm_unpacklo_epi8(temp3_8x16b, fract3_8x16b);
temp4_8x16b = _mm_unpacklo_epi8(temp4_8x16b, fract4_8x16b);
/* load 16 bytes per row; only bytes 0..9 of each load are consumed below */
src_values0 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx1)); /* row=0 */
src_values1 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx2)); /* row=1 */
src_values2 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx3)); /* row=2 */
src_values3 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx4)); /* row=3 */
/* shift by one 2-byte pair so byte i lines up with byte i + 2 */
src_values10 = _mm_srli_si128(src_values0, 2);
src_values11 = _mm_srli_si128(src_values1, 2);
src_values12 = _mm_srli_si128(src_values2, 2);
src_values13 = _mm_srli_si128(src_values3, 2);
/* form byte pairs (ref[i], ref[i + 2]) */
src_values0 = _mm_unpacklo_epi8(src_values0, src_values10);
src_values1 = _mm_unpacklo_epi8(src_values1, src_values11);
src_values2 = _mm_unpacklo_epi8(src_values2, src_values12);
src_values3 = _mm_unpacklo_epi8(src_values3, src_values13);
/* (32 - fract) * ref[i] + fract * ref[i + 2], as 16-bit sums */
src_values0 = _mm_maddubs_epi16(src_values0, temp1_8x16b);
src_values1 = _mm_maddubs_epi16(src_values1, temp2_8x16b);
src_values2 = _mm_maddubs_epi16(src_values2, temp3_8x16b);
src_values3 = _mm_maddubs_epi16(src_values3, temp4_8x16b);
/* add the rounding term: (... + 16) */
src_values0 = _mm_add_epi16(src_values0, const_temp_8x16b);
src_values1 = _mm_add_epi16(src_values1, const_temp_8x16b);
src_values2 = _mm_add_epi16(src_values2, const_temp_8x16b);
src_values3 = _mm_add_epi16(src_values3, const_temp_8x16b);
/* (... + 16) >> 5 */
src_values0 = _mm_srai_epi16(src_values0, 5);
src_values1 = _mm_srai_epi16(src_values1, 5);
src_values2 = _mm_srai_epi16(src_values2, 5);
src_values3 = _mm_srai_epi16(src_values3, 5);
/* converting 16 bit to 8 bit */
src_values0 = _mm_packus_epi16(src_values0, zero_8x16b);
src_values1 = _mm_packus_epi16(src_values1, zero_8x16b);
src_values2 = _mm_packus_epi16(src_values2, zero_8x16b);
src_values3 = _mm_packus_epi16(src_values3, zero_8x16b);
/* store 8 bytes per row */
_mm_storel_epi64((__m128i *)(pu1_dst + (0 * dst_strd)), src_values0); /* row=0*/
_mm_storel_epi64((__m128i *)(pu1_dst + ((1) * dst_strd)), src_values1); /* row=1*/
_mm_storel_epi64((__m128i *)(pu1_dst + ((2) * dst_strd)), src_values2); /* row=2*/
_mm_storel_epi64((__m128i *)(pu1_dst + ((3) * dst_strd)), src_values3); /* row=3*/
}
}
else if(nt == 8) /* nt = 8: 16 bytes per row, two rows per iteration */
{
WORD32 ref_main_idx1, fract1, temp, temp1;
__m128i fract1_8x16b, temp_8x16b, temp1_8x16b;
zero_8x16b = _mm_set1_epi16(0);
for(row = 0; row < nt; row += 2)
{
__m128i src_values0, src_values1, src_values2, src_values3;
__m128i src_values10, src_values11, src_values12, src_values13;
/* first row of the pair */
pos = ((row + 1) * intra_pred_ang);
idx = pos >> 5;
fract = pos & (31);
temp = 32 - fract;
ref_main_idx = 2 * idx + 2; /* byte offset into ref_main for this row */
/* second row of the pair */
pos = ((row + 2) * intra_pred_ang);
idx = pos >> 5;
fract1 = pos & (31);
temp1 = 32 - fract1;
ref_main_idx1 = 2 * idx + 2; /* byte offset into ref_main for this row */
fract_8x16b = _mm_set1_epi8(fract);
fract1_8x16b = _mm_set1_epi8(fract1);
temp_8x16b = _mm_set1_epi8(temp);
temp1_8x16b = _mm_set1_epi8(temp1);
/* Interleave into byte pairs (32 - fract, fract) for _mm_maddubs_epi16 */
temp_8x16b = _mm_unpacklo_epi8(temp_8x16b, fract_8x16b);
temp1_8x16b = _mm_unpacklo_epi8(temp1_8x16b, fract1_8x16b);
/* first row: two loads, each yielding 8 output bytes */
src_values0 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx)); /* output bytes 0-7 */
src_values1 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx + 8)); /* output bytes 8-15 */
/* second row */
src_values2 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx1)); /* output bytes 0-7 */
src_values3 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx1 + 8)); /* output bytes 8-15 */
/* shift by one 2-byte pair so byte i lines up with byte i + 2 */
src_values10 = _mm_srli_si128(src_values0, 2);
src_values11 = _mm_srli_si128(src_values1, 2);
src_values12 = _mm_srli_si128(src_values2, 2);
src_values13 = _mm_srli_si128(src_values3, 2);
/* form byte pairs (ref[i], ref[i + 2]) */
src_values0 = _mm_unpacklo_epi8(src_values0, src_values10);
src_values1 = _mm_unpacklo_epi8(src_values1, src_values11);
src_values2 = _mm_unpacklo_epi8(src_values2, src_values12);
src_values3 = _mm_unpacklo_epi8(src_values3, src_values13);
/* (32 - fract) * ref[i] + fract * ref[i + 2], as 16-bit sums */
src_values0 = _mm_maddubs_epi16(src_values0, temp_8x16b);
src_values1 = _mm_maddubs_epi16(src_values1, temp_8x16b);
src_values2 = _mm_maddubs_epi16(src_values2, temp1_8x16b);
src_values3 = _mm_maddubs_epi16(src_values3, temp1_8x16b);
/* add the rounding term: (... + 16) */
src_values0 = _mm_add_epi16(src_values0, const_temp_8x16b);
src_values1 = _mm_add_epi16(src_values1, const_temp_8x16b);
src_values2 = _mm_add_epi16(src_values2, const_temp_8x16b);
src_values3 = _mm_add_epi16(src_values3, const_temp_8x16b);
/* (... + 16) >> 5 */
src_values0 = _mm_srai_epi16(src_values0, 5);
src_values1 = _mm_srai_epi16(src_values1, 5);
src_values2 = _mm_srai_epi16(src_values2, 5);
src_values3 = _mm_srai_epi16(src_values3, 5);
/* converting 16 bit to 8 bit */
src_values0 = _mm_packus_epi16(src_values0, zero_8x16b);
src_values1 = _mm_packus_epi16(src_values1, zero_8x16b);
src_values2 = _mm_packus_epi16(src_values2, zero_8x16b);
src_values3 = _mm_packus_epi16(src_values3, zero_8x16b);
/* store two rows of 16 bytes as four 8-byte halves */
_mm_storel_epi64((__m128i *)(pu1_dst), src_values0);
_mm_storel_epi64((__m128i *)(pu1_dst + 8), src_values1);
_mm_storel_epi64((__m128i *)(pu1_dst + dst_strd), src_values2);
_mm_storel_epi64((__m128i *)(pu1_dst + dst_strd + 8), src_values3);
/* advance the destination pointer past the two rows just written */
pu1_dst += 2 * dst_strd;
}
}
else if(nt == 16) /* nt = 16: 32 bytes per row, one row per iteration */
{
WORD32 temp;
/* the column loop is fully unrolled into four 8-byte groups */
zero_8x16b = _mm_set1_epi16(0);
for(row = 0; row < nt; row += 1)
{
__m128i src_values0, src_values1, src_values2, src_values3, temp_8x16b;
__m128i src_values10, src_values11, src_values12, src_values13;
pos = ((row + 1) * intra_pred_ang);
idx = pos >> 5;
fract = pos & (31);
temp = 32 - fract;
ref_main_idx = 2 * idx + 2; /* byte offset into ref_main for this row */
fract_8x16b = _mm_set1_epi8(fract);
temp_8x16b = _mm_set1_epi8(temp);
/* Interleave into byte pairs (32 - fract, fract) for _mm_maddubs_epi16 */
temp_8x16b = _mm_unpacklo_epi8(temp_8x16b, fract_8x16b);
/* four loads, each yielding 8 output bytes */
src_values0 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx)); /* output bytes 0-7 */
src_values1 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx + 8)); /* output bytes 8-15 */
src_values2 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx + 16)); /* output bytes 16-23 */
src_values3 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx + 24)); /* output bytes 24-31 */
/* shift by one 2-byte pair so byte i lines up with byte i + 2 */
src_values10 = _mm_srli_si128(src_values0, 2);
src_values11 = _mm_srli_si128(src_values1, 2);
src_values12 = _mm_srli_si128(src_values2, 2);
src_values13 = _mm_srli_si128(src_values3, 2);
/* form byte pairs (ref[i], ref[i + 2]) */
src_values0 = _mm_unpacklo_epi8(src_values0, src_values10);
src_values1 = _mm_unpacklo_epi8(src_values1, src_values11);
src_values2 = _mm_unpacklo_epi8(src_values2, src_values12);
src_values3 = _mm_unpacklo_epi8(src_values3, src_values13);
/* (32 - fract) * ref[i] + fract * ref[i + 2], as 16-bit sums */
src_values0 = _mm_maddubs_epi16(src_values0, temp_8x16b);
src_values1 = _mm_maddubs_epi16(src_values1, temp_8x16b);
src_values2 = _mm_maddubs_epi16(src_values2, temp_8x16b);
src_values3 = _mm_maddubs_epi16(src_values3, temp_8x16b);
/* add the rounding term: (... + 16) */
src_values0 = _mm_add_epi16(src_values0, const_temp_8x16b);
src_values1 = _mm_add_epi16(src_values1, const_temp_8x16b);
src_values2 = _mm_add_epi16(src_values2, const_temp_8x16b);
src_values3 = _mm_add_epi16(src_values3, const_temp_8x16b);
/* (... + 16) >> 5 */
src_values0 = _mm_srai_epi16(src_values0, 5);
src_values1 = _mm_srai_epi16(src_values1, 5);
src_values2 = _mm_srai_epi16(src_values2, 5);
src_values3 = _mm_srai_epi16(src_values3, 5);
/* converting 16 bit to 8 bit */
src_values0 = _mm_packus_epi16(src_values0, zero_8x16b);
src_values1 = _mm_packus_epi16(src_values1, zero_8x16b);
src_values2 = _mm_packus_epi16(src_values2, zero_8x16b);
src_values3 = _mm_packus_epi16(src_values3, zero_8x16b);
/* store one row of 32 bytes as four 8-byte groups */
_mm_storel_epi64((__m128i *)(pu1_dst), src_values0);
_mm_storel_epi64((__m128i *)(pu1_dst + 8), src_values1);
_mm_storel_epi64((__m128i *)(pu1_dst + 16), src_values2);
_mm_storel_epi64((__m128i *)(pu1_dst + 24), src_values3);
/* advance the destination pointer to the next row */
pu1_dst += dst_strd;
}
}
}
/**
*******************************************************************************
*
* @brief
* Intraprediction for mode 27 to 33 (positive angle, vertical mode ) with
* reference neighboring samples location pointed by 'pu1_ref' to the TU
* block location pointed by 'pu1_dst'
*
* @par Description:
*
*
* @param[in] pu1_ref
* UWORD8 pointer to the reference (neighboring) samples
*
* @param[out] pu1_dst
* UWORD8 pointer to the destination
*
* @param[in] src_strd
* integer source stride (unused by this function)
*
* @param[in] dst_strd
* integer destination stride
*
* @param[in] nt
* integer Transform Block size
*
* @param[in] mode
* integer intraprediction mode
*
* @returns
*
* @remarks
* None
*
*******************************************************************************
*/
void ihevc_intra_pred_chroma_mode_27_to_33_ssse3(UWORD8 *pu1_ref,
WORD32 src_strd,
UWORD8 *pu1_dst,
WORD32 dst_strd,
WORD32 nt,
WORD32 mode)
{
WORD32 row;
WORD32 pos, fract;
WORD32 intra_pred_ang;
WORD32 idx, ref_main_idx;
__m128i zero_8x16b, fract_8x16b, const_temp_8x16b;
UNUSED(src_strd);
intra_pred_ang = gai4_ihevc_ang_table_chroma[mode];
const_temp_8x16b = _mm_set1_epi16(16);
if(nt == 4) /* if nt =4*/
{
__m128i const_temp2_4x32b, const_temp3_4x32b;
__m128i src_values10, src_values11, zero_8x16b, intra_pred_ang_4x32b;
__m128i row_4x32b, two_nt_4x32b, src_values12;
const_temp2_4x32b = _mm_set1_epi32(31);
const_temp3_4x32b = _mm_set1_epi32(32);
two_nt_4x32b = _mm_set1_epi32((4 * nt) + 2);
zero_8x16b = _mm_set1_epi16(0);
/* intra_pred_ang = gai4_ihevc_ang_table[mode]; */
intra_pred_ang_4x32b = _mm_set1_epi16(intra_pred_ang);
row_4x32b = _mm_set_epi16(4, 3, 2, 1, 4, 3, 2, 1);
{
WORD32 ref_main_idx1, ref_main_idx2, ref_main_idx3, ref_main_idx4;
WORD8 ai1_src_temp0_val[16], ai1_src_temp1_val[16];
__m128i fract1_8x16b, fract2_8x16b, fract3_8x16b, fract4_8x16b, res_temp5_4x32b;
__m128i src_values0, src_values1, src_values2, src_values3, src_values13;
__m128i temp1_8x16b, temp2_8x16b, temp3_8x16b, temp4_8x16b;
__m128i ref_main_temp0, ref_main_temp1, ref_main_temp2, sign_8x16b;
/* pos = ((row + 1) * intra_pred_ang); */
res_temp5_4x32b = _mm_mullo_epi16(row_4x32b, intra_pred_ang_4x32b);
sign_8x16b = _mm_cmpgt_epi16(zero_8x16b, res_temp5_4x32b);
res_temp5_4x32b = _mm_unpacklo_epi16(res_temp5_4x32b, sign_8x16b);
src_values12 = _mm_add_epi32(two_nt_4x32b, _mm_srai_epi32(res_temp5_4x32b, 5));
src_values12 = _mm_add_epi32(src_values12, _mm_srai_epi32(res_temp5_4x32b, 5));
ref_main_temp0 = _mm_srli_si128(src_values12, 4); /* next 32 bit values */
ref_main_temp1 = _mm_srli_si128(src_values12, 8); /* next 32 bit values */
ref_main_temp2 = _mm_srli_si128(src_values12, 12); /* next 32 bit values */
ref_main_idx1 = _mm_cvtsi128_si32(src_values12); /* row=0*/
ref_main_idx2 = _mm_cvtsi128_si32(ref_main_temp0); /* row=1*/
ref_main_idx3 = _mm_cvtsi128_si32(ref_main_temp1); /* row=2*/
ref_main_idx4 = _mm_cvtsi128_si32(ref_main_temp2); /* row=3*/
/* fract = pos & (31); */
src_values11 = _mm_and_si128(res_temp5_4x32b, const_temp2_4x32b);
/*(32 - fract) */
src_values10 = _mm_sub_epi32(const_temp3_4x32b, src_values11);
_mm_storeu_si128((__m128i *)(ai1_src_temp1_val), src_values11);
_mm_storeu_si128((__m128i *)(ai1_src_temp0_val), src_values10);
fract1_8x16b = _mm_set1_epi8(ai1_src_temp1_val[0]); /* row=0*/
fract2_8x16b = _mm_set1_epi8(ai1_src_temp1_val[4]); /* row=1*/
fract3_8x16b = _mm_set1_epi8(ai1_src_temp1_val[8]); /* row=2*/
fract4_8x16b = _mm_set1_epi8(ai1_src_temp1_val[12]); /* row=3*/
temp1_8x16b = _mm_set1_epi8(ai1_src_temp0_val[0]); /* row=0*/
temp2_8x16b = _mm_set1_epi8(ai1_src_temp0_val[4]); /* row=1*/
temp3_8x16b = _mm_set1_epi8(ai1_src_temp0_val[8]); /* row=2*/
temp4_8x16b = _mm_set1_epi8(ai1_src_temp0_val[12]); /* row=3*/
temp1_8x16b = _mm_unpacklo_epi8(temp1_8x16b, fract1_8x16b);
temp2_8x16b = _mm_unpacklo_epi8(temp2_8x16b, fract2_8x16b);
temp3_8x16b = _mm_unpacklo_epi8(temp3_8x16b, fract3_8x16b);
temp4_8x16b = _mm_unpacklo_epi8(temp4_8x16b, fract4_8x16b);
// inner loop starts from here
src_values0 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx1)); /* col = 0-7 */
src_values1 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx2)); /* col = 8-15 */
src_values2 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx3)); /* col = 16-23 */
src_values3 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx4)); /* col = 24-31 */
src_values10 = _mm_srli_si128(src_values0, 2);
src_values11 = _mm_srli_si128(src_values1, 2);
src_values12 = _mm_srli_si128(src_values2, 2);
src_values13 = _mm_srli_si128(src_values3, 2);
src_values0 = _mm_unpacklo_epi8(src_values0, src_values10);
src_values1 = _mm_unpacklo_epi8(src_values1, src_values11);
src_values2 = _mm_unpacklo_epi8(src_values2, src_values12);
src_values3 = _mm_unpacklo_epi8(src_values3, src_values13);
src_values0 = _mm_maddubs_epi16(src_values0, temp1_8x16b);
src_values1 = _mm_maddubs_epi16(src_values1, temp2_8x16b);
src_values2 = _mm_maddubs_epi16(src_values2, temp3_8x16b);
src_values3 = _mm_maddubs_epi16(src_values3, temp4_8x16b);
/*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
src_values0 = _mm_add_epi16(src_values0, const_temp_8x16b);
src_values1 = _mm_add_epi16(src_values1, const_temp_8x16b);
src_values2 = _mm_add_epi16(src_values2, const_temp_8x16b);
src_values3 = _mm_add_epi16(src_values3, const_temp_8x16b);
/*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
src_values0 = _mm_srai_epi16(src_values0, 5);
src_values1 = _mm_srai_epi16(src_values1, 5);
src_values2 = _mm_srai_epi16(src_values2, 5);
src_values3 = _mm_srai_epi16(src_values3, 5);
/* converting 16 bit to 8 bit */
src_values0 = _mm_packus_epi16(src_values0, zero_8x16b);
src_values1 = _mm_packus_epi16(src_values1, zero_8x16b);
src_values2 = _mm_packus_epi16(src_values2, zero_8x16b);
src_values3 = _mm_packus_epi16(src_values3, zero_8x16b);
_mm_storel_epi64((__m128i *)(pu1_dst + (0 * dst_strd)), src_values0); /* row=0*/
_mm_storel_epi64((__m128i *)(pu1_dst + ((1) * dst_strd)), src_values1); /* row=1*/
_mm_storel_epi64((__m128i *)(pu1_dst + ((2) * dst_strd)), src_values2); /* row=2*/
_mm_storel_epi64((__m128i *)(pu1_dst + ((3) * dst_strd)), src_values3); /* row=3*/
}
}
else if(nt == 8) /* for nt = 16 case */
{
WORD32 ref_main_idx1, fract1, temp, temp1;
__m128i fract1_8x16b, temp_8x16b, temp1_8x16b;
zero_8x16b = _mm_set1_epi16(0);
for(row = 0; row < nt; row += 2)
{
__m128i src_values0, src_values1, src_values2, src_values3;
__m128i src_values10, src_values11, src_values12, src_values13;
pos = ((row + 1) * intra_pred_ang);
idx = pos >> 5;
fract = pos & (31);
temp = 32 - fract;
ref_main_idx = (4 * nt) + 2 * idx + 2; /* col from 0-15 */
pos = ((row + 2) * intra_pred_ang);
idx = pos >> 5;
fract1 = pos & (31);
temp1 = 32 - fract1;
ref_main_idx1 = (4 * nt) + 2 * idx + 2; /* col from 0-15 */
fract_8x16b = _mm_set1_epi8(fract);
fract1_8x16b = _mm_set1_epi8(fract1);
temp_8x16b = _mm_set1_epi8(temp);
temp1_8x16b = _mm_set1_epi8(temp1);
temp_8x16b = _mm_unpacklo_epi8(temp_8x16b, fract_8x16b);
temp1_8x16b = _mm_unpacklo_epi8(temp1_8x16b, fract1_8x16b);
/* row=0 */
src_values0 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx)); /* col = 0-7 */
src_values1 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx + 8)); /* col = 8-15 */
/* row=1 */
src_values2 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx1)); /* col = 0-7 */
src_values3 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx1 + 8)); /* col = 8-15 */
src_values10 = _mm_srli_si128(src_values0, 2);
src_values11 = _mm_srli_si128(src_values1, 2);
src_values12 = _mm_srli_si128(src_values2, 2);
src_values13 = _mm_srli_si128(src_values3, 2);
src_values0 = _mm_unpacklo_epi8(src_values0, src_values10);
src_values1 = _mm_unpacklo_epi8(src_values1, src_values11);
src_values2 = _mm_unpacklo_epi8(src_values2, src_values12);
src_values3 = _mm_unpacklo_epi8(src_values3, src_values13);
src_values0 = _mm_maddubs_epi16(src_values0, temp_8x16b);
src_values1 = _mm_maddubs_epi16(src_values1, temp_8x16b);
src_values2 = _mm_maddubs_epi16(src_values2, temp1_8x16b);
src_values3 = _mm_maddubs_epi16(src_values3, temp1_8x16b);
/*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
src_values0 = _mm_add_epi16(src_values0, const_temp_8x16b);
src_values1 = _mm_add_epi16(src_values1, const_temp_8x16b);
src_values2 = _mm_add_epi16(src_values2, const_temp_8x16b);
src_values3 = _mm_add_epi16(src_values3, const_temp_8x16b);
/*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
src_values0 = _mm_srai_epi16(src_values0, 5);
src_values1 = _mm_srai_epi16(src_values1, 5);
src_values2 = _mm_srai_epi16(src_values2, 5);
src_values3 = _mm_srai_epi16(src_values3, 5);
/* converting 16 bit to 8 bit */
src_values0 = _mm_packus_epi16(src_values0, zero_8x16b);
src_values1 = _mm_packus_epi16(src_values1, zero_8x16b);
src_values2 = _mm_packus_epi16(src_values2, zero_8x16b);
src_values3 = _mm_packus_epi16(src_values3, zero_8x16b);
/* loding 8-bit 8 pixels values */
_mm_storel_epi64((__m128i *)(pu1_dst), src_values0);
_mm_storel_epi64((__m128i *)(pu1_dst + 8), src_values1);
_mm_storel_epi64((__m128i *)(pu1_dst + dst_strd), src_values2);
_mm_storel_epi64((__m128i *)(pu1_dst + dst_strd + 8), src_values3);
pu1_dst += 2 * dst_strd;
}
}
else if(nt == 16)
{
WORD32 temp;
/* unroll the col loop (inner) */
zero_8x16b = _mm_set1_epi16(0);
for(row = 0; row < nt; row += 1)
{
__m128i src_values0, src_values1, src_values2, src_values3, temp_8x16b;
__m128i src_values10, src_values11, src_values12, src_values13;
pos = ((row + 1) * intra_pred_ang);
idx = pos >> 5;
fract = pos & (31);
temp = 32 - fract;
ref_main_idx = (4 * nt) + 2 * idx + 2; /* col from 0-31 */
fract_8x16b = _mm_set1_epi8(fract);
temp_8x16b = _mm_set1_epi8(temp);
temp_8x16b = _mm_unpacklo_epi8(temp_8x16b, fract_8x16b);
src_values0 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx)); /* col = 0-7 */
src_values1 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx + 8)); /* col = 8-15 */
src_values2 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx + 16)); /* col = 16-23 */
src_values3 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx + 24)); /* col = 24-31 */
src_values10 = _mm_srli_si128(src_values0, 2);
src_values11 = _mm_srli_si128(src_values1, 2);
src_values12 = _mm_srli_si128(src_values2, 2);
src_values13 = _mm_srli_si128(src_values3, 2);
src_values0 = _mm_unpacklo_epi8(src_values0, src_values10);
src_values1 = _mm_unpacklo_epi8(src_values1, src_values11);
src_values2 = _mm_unpacklo_epi8(src_values2, src_values12);
src_values3 = _mm_unpacklo_epi8(src_values3, src_values13);
/* fract*(pu1_ref[ref_main_idx + 1]- pu1_ref[ref_main_idx]) */
src_values0 = _mm_maddubs_epi16(src_values0, temp_8x16b);
src_values1 = _mm_maddubs_epi16(src_values1, temp_8x16b);
src_values2 = _mm_maddubs_epi16(src_values2, temp_8x16b);
src_values3 = _mm_maddubs_epi16(src_values3, temp_8x16b);
/*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
src_values0 = _mm_add_epi16(src_values0, const_temp_8x16b);
src_values1 = _mm_add_epi16(src_values1, const_temp_8x16b);
src_values2 = _mm_add_epi16(src_values2, const_temp_8x16b);
src_values3 = _mm_add_epi16(src_values3, const_temp_8x16b);
/*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
src_values0 = _mm_srai_epi16(src_values0, 5);
src_values1 = _mm_srai_epi16(src_values1, 5);
src_values2 = _mm_srai_epi16(src_values2, 5);
src_values3 = _mm_srai_epi16(src_values3, 5);
/* converting 16 bit to 8 bit */
src_values0 = _mm_packus_epi16(src_values0, zero_8x16b);
src_values1 = _mm_packus_epi16(src_values1, zero_8x16b);
src_values2 = _mm_packus_epi16(src_values2, zero_8x16b);
src_values3 = _mm_packus_epi16(src_values3, zero_8x16b);
/* loding 8-bit 8 pixels values */
_mm_storel_epi64((__m128i *)(pu1_dst), src_values0);
_mm_storel_epi64((__m128i *)(pu1_dst + 8), src_values1);
_mm_storel_epi64((__m128i *)(pu1_dst + 16), src_values2);
_mm_storel_epi64((__m128i *)(pu1_dst + 24), src_values3);
pu1_dst += dst_strd;
}
}
}