/******************************************************************************
*
* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at:
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
******************************************************************************/
/**
*******************************************************************************
* @file
* ihevc_sao_atom_intr.c
*
* @brief
* Contains function definitions for Sample Adaptive Offset (SAO) in-loop
* filtering
*
* @author
* 100592
*
* @par List of Functions:
* - ihevc_sao_band_offset_luma_ssse3()
* - ihevc_sao_band_offset_chroma_ssse3()
* - ihevc_sao_edge_offset_class0_ssse3()
* - ihevc_sao_edge_offset_class0_chroma_ssse3()
* - ihevc_sao_edge_offset_class1_ssse3()
* - ihevc_sao_edge_offset_class1_chroma_ssse3()
* - ihevc_sao_edge_offset_class2_ssse3()
* - ihevc_sao_edge_offset_class2_chroma_ssse3()
* - ihevc_sao_edge_offset_class3_ssse3()
* - ihevc_sao_edge_offset_class3_chroma_ssse3()
*
* @remarks
* None
*
*******************************************************************************
*/
/*****************************************************************************/
/* File Includes */
/*****************************************************************************/
#include <stdio.h>
#include "ihevc_typedefs.h"
#include "ihevc_platform_macros.h"
#include "ihevc_macros.h"
#include "ihevc_func_selector.h"
#include "ihevc_defs.h"
#include "ihevc_tables_x86_intr.h"
#include "ihevc_common_tables.h"
#include "ihevc_sao.h"
#include <immintrin.h>
#define NUM_BAND_TABLE 32 /* the 8-bit sample range is split into 32 bands of 8 values each */
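/* For orientation, a minimal scalar sketch of the band offset computed by the
 * SIMD routines below; this is an illustration only (the helper name is not
 * part of the library), assuming 8-bit samples. Each pixel selects one of
 * NUM_BAND_TABLE bands through its five MSBs, and the four bands starting at
 * sao_band_pos (wrapping past band 31, which the switch cases in the SIMD
 * code clamp for) receive offsets pi1_sao_offset[1..4]. */
static void ihevc_sao_band_offset_scalar_sketch(UWORD8 *pu1_src,
                                                WORD32 src_strd,
                                                WORD32 sao_band_pos,
                                                WORD8 *pi1_sao_offset,
                                                WORD32 wd,
                                                WORD32 ht)
{
    WORD32 row, col;
    for(row = 0; row < ht; row++)
    {
        for(col = 0; col < wd; col++)
        {
            WORD32 i4_pos = row * src_strd + col;
            /* band index = five MSBs of the sample */
            WORD32 band_idx = pu1_src[i4_pos] >> 3;
            WORD32 rel = (band_idx - sao_band_pos) & (NUM_BAND_TABLE - 1);
            if(rel < 4)
            {
                WORD32 val = pu1_src[i4_pos] + pi1_sao_offset[rel + 1];
                pu1_src[i4_pos] = (UWORD8)((val < 0) ? 0 : ((val > 255) ? 255 : val));
            }
        }
    }
}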
/**
*******************************************************************************
*
* @brief
* Contains two sets of functions, band offset and edge offset, for both luma
* and chroma. Edge offset supports the horizontal, vertical, 135 degree and
* 45 degree classes
*
* @par Description:
*
*
* @param[in-out] pu1_src
* Pointer to the source
*
* @param[in] src_strd
* Source stride
*
* @param[in-out] pu1_src_left
* source left boundary
*
* @param[in-out] pu1_src_top
* Source top boundary
*
* @param[in-out] pu1_src_top_left
* Source top left boundary
*
* @param[in] pu1_src_top_right
* Source top right boundary
*
* @param[in] pu1_src_bot_left
* Source bottom left boundary
*
* @param[in] pu1_avail
* boundary availability flags
*
* @param[in] pi1_sao_offset_u
* Chroma U sao offset values
*
* @param[in] pi1_sao_offset_v
* Chroma V sao offset values
*
* @param[in] pi1_sao_offset
* Luma sao offset values
*
* @param[in] wd
* width of the source
* @param[in] ht
* height of the source
* @returns
*
* @remarks
* None
*
*******************************************************************************
*/
void ihevc_sao_band_offset_luma_ssse3(UWORD8 *pu1_src,
WORD32 src_strd,
UWORD8 *pu1_src_left,
UWORD8 *pu1_src_top,
UWORD8 *pu1_src_top_left,
WORD32 sao_band_pos,
WORD8 *pi1_sao_offset,
WORD32 wd,
WORD32 ht)
{
WORD32 row, col;
UWORD8 *pu1_src_cpy;
WORD32 wd_rem;
WORD8 offset = 0;
__m128i src_temp0_8x16b, src_temp1_8x16b, src_temp2_8x16b, src_temp3_8x16b;
__m128i band_table0_8x16b, band_table1_8x16b, band_table2_8x16b, band_table3_8x16b;
__m128i tmp_set_128i_1, tmp_set_128i_2, tmp_set_128i_3, tmp_set_128i_4;
__m128i band_pos_16x8b;
__m128i sao_offset;
__m128i cmp_mask, cmp_store;
/* Updating left and top-left and top */
for(row = 0; row < ht; row++)
{
pu1_src_left[row] = pu1_src[row * src_strd + (wd - 1)];
}
pu1_src_top_left[0] = pu1_src_top[wd - 1];
for(col = 0; col < wd; col += 8)
{
tmp_set_128i_1 = _mm_loadl_epi64((__m128i *)(pu1_src + (ht - 1) * src_strd + offset));
_mm_storel_epi64((__m128i *)(pu1_src_top + offset), tmp_set_128i_1);
offset += 8;
}
//replicate sao_band_pos << 3 (band start in the pixel domain) in each 16-bit lane; packed down to 8 bits later
band_pos_16x8b = _mm_set1_epi16((WORD16)(sao_band_pos << 3));
//value set for sao_offset extraction
tmp_set_128i_1 = _mm_set_epi8(128, 1, 128, 1, 128, 1, 128, 1, 128, 1, 128, 1, 128, 1, 128, 1);
tmp_set_128i_2 = _mm_set_epi8(128, 2, 128, 2, 128, 2, 128, 2, 128, 2, 128, 2, 128, 2, 128, 2);
tmp_set_128i_3 = _mm_set_epi8(128, 3, 128, 3, 128, 3, 128, 3, 128, 3, 128, 3, 128, 3, 128, 3);
tmp_set_128i_4 = _mm_set_epi8(128, 4, 128, 4, 128, 4, 128, 4, 128, 4, 128, 4, 128, 4, 128, 4);
//loaded sao offset values
sao_offset = _mm_loadl_epi64((__m128i *)pi1_sao_offset);
//loading the 32 16-bit values of gu2_table_band_idx into 4 consecutive registers
band_table0_8x16b = _mm_load_si128((__m128i *)(gu2_table_band_idx));
band_table1_8x16b = _mm_load_si128((__m128i *)(gu2_table_band_idx + 8));
band_table2_8x16b = _mm_load_si128((__m128i *)(gu2_table_band_idx + 16));
band_table3_8x16b = _mm_load_si128((__m128i *)(gu2_table_band_idx + 24));
//band_position addition
band_table0_8x16b = _mm_add_epi16(band_table0_8x16b, band_pos_16x8b);
band_table1_8x16b = _mm_add_epi16(band_table1_8x16b, band_pos_16x8b);
band_table2_8x16b = _mm_add_epi16(band_table2_8x16b, band_pos_16x8b);
band_table3_8x16b = _mm_add_epi16(band_table3_8x16b, band_pos_16x8b);
//sao_offset duplication
tmp_set_128i_1 = _mm_shuffle_epi8(sao_offset, tmp_set_128i_1);
tmp_set_128i_2 = _mm_shuffle_epi8(sao_offset, tmp_set_128i_2);
tmp_set_128i_3 = _mm_shuffle_epi8(sao_offset, tmp_set_128i_3);
tmp_set_128i_4 = _mm_shuffle_epi8(sao_offset, tmp_set_128i_4);
//setting up constants for comparison
cmp_mask = _mm_set1_epi16(16);
cmp_store = _mm_set1_epi16(0x00ff);
//sao_offset addition
band_table0_8x16b = _mm_add_epi16(band_table0_8x16b, tmp_set_128i_1);
band_table1_8x16b = _mm_add_epi16(band_table1_8x16b, tmp_set_128i_2);
band_table2_8x16b = _mm_add_epi16(band_table2_8x16b, tmp_set_128i_3);
band_table3_8x16b = _mm_add_epi16(band_table3_8x16b, tmp_set_128i_4);
//masking upper 8bit values of each 16 bit band table value
band_table0_8x16b = _mm_and_si128(band_table0_8x16b, cmp_store);
band_table1_8x16b = _mm_and_si128(band_table1_8x16b, cmp_store);
band_table2_8x16b = _mm_and_si128(band_table2_8x16b, cmp_store);
band_table3_8x16b = _mm_and_si128(band_table3_8x16b, cmp_store);
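//at band positions 0 and 28 to 31 some band table entries land outside or
//wrap around the table range; the cases below zero or saturate the affected
//entries so that the packed byte-domain lookups in the pixel loops below
//behave consistently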
switch(sao_band_pos)
{
case 0:
tmp_set_128i_2 = _mm_cmpgt_epi16(cmp_mask, band_table0_8x16b);
band_table0_8x16b = _mm_and_si128(band_table0_8x16b, tmp_set_128i_2);
break;
case 28:
tmp_set_128i_2 = _mm_cmpgt_epi16(cmp_mask, band_table3_8x16b);
band_table3_8x16b = _mm_or_si128(band_table3_8x16b, tmp_set_128i_2);
break;
case 29:
tmp_set_128i_2 = _mm_cmpgt_epi16(cmp_mask, band_table2_8x16b);
band_table2_8x16b = _mm_or_si128(band_table2_8x16b, tmp_set_128i_2);
tmp_set_128i_2 = _mm_cmpgt_epi16(cmp_mask, band_table3_8x16b);
band_table3_8x16b = _mm_and_si128(band_table3_8x16b, tmp_set_128i_2);
break;
case 30:
tmp_set_128i_2 = _mm_cmpgt_epi16(cmp_mask, band_table1_8x16b);
band_table1_8x16b = _mm_or_si128(band_table1_8x16b, tmp_set_128i_2);
tmp_set_128i_2 = _mm_cmpgt_epi16(cmp_mask, band_table2_8x16b);
band_table2_8x16b = _mm_and_si128(band_table2_8x16b, tmp_set_128i_2);
break;
case 31:
tmp_set_128i_2 = _mm_cmpgt_epi16(cmp_mask, band_table0_8x16b);
band_table0_8x16b = _mm_or_si128(band_table0_8x16b, tmp_set_128i_2);
tmp_set_128i_2 = _mm_cmpgt_epi16(cmp_mask, band_table1_8x16b);
band_table1_8x16b = _mm_and_si128(band_table1_8x16b, tmp_set_128i_2);
break;
default:
break;
}
//sao_offset is reused for zero cmp mask.
sao_offset = _mm_setzero_si128();
tmp_set_128i_1 = _mm_set1_epi8(1);
//tmp_set_128i_2 = _mm_set_epi8 (128,7,128,6,128,5,128,4,128,3,128,2,128,1,128,0);
cmp_mask = _mm_packus_epi16(cmp_mask, cmp_mask); //cmp_msk=dup16(16);
//masking upper 8bit values of each 16 bit band table value
band_table0_8x16b = _mm_and_si128(band_table0_8x16b, cmp_store);
band_table1_8x16b = _mm_and_si128(band_table1_8x16b, cmp_store);
band_table2_8x16b = _mm_and_si128(band_table2_8x16b, cmp_store);
band_table3_8x16b = _mm_and_si128(band_table3_8x16b, cmp_store);
//the four 8x16b band table registers are packed into two 16x8b registers: band_table0_8x16b and band_table2_8x16b
band_table0_8x16b = _mm_packus_epi16(band_table0_8x16b, band_table1_8x16b);
band_table2_8x16b = _mm_packus_epi16(band_table2_8x16b, band_table3_8x16b);
band_table3_8x16b = _mm_slli_epi16(cmp_mask, 1); // to compare if value is greater than 31
band_pos_16x8b = _mm_packus_epi16(band_pos_16x8b, band_pos_16x8b); //band_pos is now 8 bit aligned
band_table3_8x16b = _mm_sub_epi8(band_table3_8x16b, tmp_set_128i_1); // to compare if value is greater than 31
cmp_mask = _mm_sub_epi8(cmp_mask, tmp_set_128i_1);
// band_pos_16x8b = _mm_or_si128(band_pos_16x8b,cmp_store);
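//at this point cmp_mask holds 15 in every byte and band_table3_8x16b holds
//31 in every byte; they act as thresholds for the signed byte compares below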
for(col = wd; col >= 16; col -= 16)
{
pu1_src_cpy = pu1_src;
for(row = ht; row > 0; row -= 2)
{
//row = 0 load 16 pixel values from 15:0 pos. relative to cur. pos.
src_temp0_8x16b = _mm_loadu_si128((__m128i *)(pu1_src_cpy));
// row = 1
src_temp2_8x16b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
//subtract band position (8 bit wrap-around; negatives are detected below)
tmp_set_128i_1 = _mm_sub_epi8(src_temp0_8x16b, band_pos_16x8b);
tmp_set_128i_3 = _mm_sub_epi8(src_temp2_8x16b, band_pos_16x8b);
//if the value is less than 0, set it to ff
tmp_set_128i_2 = _mm_cmpgt_epi8(sao_offset, tmp_set_128i_1);
tmp_set_128i_4 = _mm_cmpgt_epi8(sao_offset, tmp_set_128i_3);
tmp_set_128i_1 = _mm_or_si128(tmp_set_128i_1, tmp_set_128i_2);
tmp_set_128i_3 = _mm_or_si128(tmp_set_128i_3, tmp_set_128i_4);
//if the value is greater than 31, set it to ff
tmp_set_128i_2 = _mm_cmpgt_epi8(tmp_set_128i_1, band_table3_8x16b);
tmp_set_128i_4 = _mm_cmpgt_epi8(tmp_set_128i_3, band_table3_8x16b);
tmp_set_128i_1 = _mm_or_si128(tmp_set_128i_1, tmp_set_128i_2);
tmp_set_128i_3 = _mm_or_si128(tmp_set_128i_3, tmp_set_128i_4);
//row 0
//if the value is >15, set it to ff (cmp_mask = dup16(15))
cmp_store = _mm_cmpgt_epi8(tmp_set_128i_1, cmp_mask);
//values 16 to 31 for row 0; values <16 become 0
tmp_set_128i_2 = _mm_and_si128(tmp_set_128i_1, cmp_store);
// values 0 to 15 for row 0
tmp_set_128i_1 = _mm_or_si128(tmp_set_128i_1, cmp_store);
//values 16 to 31 for row 0; values <16 masked to ff
cmp_store = _mm_cmpeq_epi8(tmp_set_128i_2, sao_offset);
tmp_set_128i_2 = _mm_or_si128(tmp_set_128i_2, cmp_store);
//row 1
//if the value is >15, set it to ff (cmp_mask = dup16(15))
cmp_store = _mm_cmpgt_epi8(tmp_set_128i_3, cmp_mask);
//values 16 to 31 for row 1; values <16 become 0
tmp_set_128i_4 = _mm_and_si128(tmp_set_128i_3, cmp_store);
// values 0 to 15 for row 1
tmp_set_128i_3 = _mm_or_si128(tmp_set_128i_3, cmp_store);
//values 16 to 31 for row 1; values <16 masked to ff
cmp_store = _mm_cmpeq_epi8(tmp_set_128i_4, sao_offset);
tmp_set_128i_4 = _mm_or_si128(tmp_set_128i_4, cmp_store);
//row 0
//to preserve pixel values in which no offset needs to be added.
cmp_store = _mm_cmpeq_epi8(tmp_set_128i_1, tmp_set_128i_2);
src_temp0_8x16b = _mm_and_si128(src_temp0_8x16b, cmp_store);
//row 1
//to preserve pixel values in which no offset needs to be added.
cmp_store = _mm_cmpeq_epi8(tmp_set_128i_3, tmp_set_128i_4);
src_temp2_8x16b = _mm_and_si128(src_temp2_8x16b, cmp_store);
//indexing 0 - 15 bandtable indexes
tmp_set_128i_1 = _mm_shuffle_epi8(band_table0_8x16b, tmp_set_128i_1);
tmp_set_128i_3 = _mm_shuffle_epi8(band_table0_8x16b, tmp_set_128i_3);
tmp_set_128i_2 = _mm_shuffle_epi8(band_table2_8x16b, tmp_set_128i_2);
tmp_set_128i_4 = _mm_shuffle_epi8(band_table2_8x16b, tmp_set_128i_4);
// combining all offsets results
tmp_set_128i_1 = _mm_or_si128(tmp_set_128i_1, tmp_set_128i_2);
tmp_set_128i_3 = _mm_or_si128(tmp_set_128i_3, tmp_set_128i_4);
// combining results with the pixel values
src_temp0_8x16b = _mm_or_si128(src_temp0_8x16b, tmp_set_128i_1);
src_temp2_8x16b = _mm_or_si128(src_temp2_8x16b, tmp_set_128i_3);
//row = 0 store 16 pixel values from 15:0 pos. relative to cur. pos.
_mm_storeu_si128((__m128i *)(pu1_src_cpy), src_temp0_8x16b);
// row = 1
_mm_storeu_si128((__m128i *)(pu1_src_cpy + src_strd), src_temp2_8x16b);
pu1_src_cpy += (src_strd << 1);
}
pu1_src += 16;
}
wd_rem = wd & 0xF;
if(wd_rem)
{
pu1_src_cpy = pu1_src;
for(row = ht; row > 0; row -= 4)
{
//row = 0 load 8 pixel values from 7:0 pos. relative to cur. pos.
src_temp0_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy));
// row = 1
src_temp1_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + src_strd));
// row = 2
src_temp2_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd));
// row = 3
src_temp3_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd));
//row0 and row1 packed and row2 and row3 packed
src_temp0_8x16b = _mm_unpacklo_epi64(src_temp0_8x16b, src_temp1_8x16b);
src_temp2_8x16b = _mm_unpacklo_epi64(src_temp2_8x16b, src_temp3_8x16b);
//subtract band position (8 bit wrap-around; negatives are detected below)
tmp_set_128i_1 = _mm_sub_epi8(src_temp0_8x16b, band_pos_16x8b);
tmp_set_128i_3 = _mm_sub_epi8(src_temp2_8x16b, band_pos_16x8b);
//if the value is less than 0, set it to ff
tmp_set_128i_2 = _mm_cmpgt_epi8(sao_offset, tmp_set_128i_1);
tmp_set_128i_4 = _mm_cmpgt_epi8(sao_offset, tmp_set_128i_3);
tmp_set_128i_1 = _mm_or_si128(tmp_set_128i_1, tmp_set_128i_2);
tmp_set_128i_3 = _mm_or_si128(tmp_set_128i_3, tmp_set_128i_4);
//if the value is greater than 31, set it to ff
tmp_set_128i_2 = _mm_cmpgt_epi8(tmp_set_128i_1, band_table3_8x16b);
tmp_set_128i_4 = _mm_cmpgt_epi8(tmp_set_128i_3, band_table3_8x16b);
tmp_set_128i_1 = _mm_or_si128(tmp_set_128i_1, tmp_set_128i_2);
tmp_set_128i_3 = _mm_or_si128(tmp_set_128i_3, tmp_set_128i_4);
//row 0 and row1
//if the value is >15, set it to ff (cmp_mask = dup16(15))
cmp_store = _mm_cmpgt_epi8(tmp_set_128i_1, cmp_mask);
//values 16 to 31 for row 0 & 1 but values <16 ==0
tmp_set_128i_2 = _mm_and_si128(tmp_set_128i_1, cmp_store);
// values 0 to 15 for row 0 & 1
tmp_set_128i_1 = _mm_or_si128(tmp_set_128i_1, cmp_store);
//values 16 to 31 for row 0 & 1 but values <16 masked to ff
cmp_store = _mm_cmpeq_epi8(tmp_set_128i_2, sao_offset);
tmp_set_128i_2 = _mm_or_si128(tmp_set_128i_2, cmp_store);
//row 2 and row 3
//if the value is >15, set it to ff (cmp_mask = dup16(15))
cmp_store = _mm_cmpgt_epi8(tmp_set_128i_3, cmp_mask);
//values 16 to 31 for row 2 & 3 but values <16 ==0
tmp_set_128i_4 = _mm_and_si128(tmp_set_128i_3, cmp_store);
// values 0 to 15 for row 2 & 3
tmp_set_128i_3 = _mm_or_si128(tmp_set_128i_3, cmp_store);
//values 16 to 31 for row 2 & 3 but values <16 masked to ff
cmp_store = _mm_cmpeq_epi8(tmp_set_128i_4, sao_offset);
tmp_set_128i_4 = _mm_or_si128(tmp_set_128i_4, cmp_store);
//row 0 and row 1
//to preserve pixel values in which no offset needs to be added.
cmp_store = _mm_cmpeq_epi8(tmp_set_128i_1, tmp_set_128i_2);
src_temp0_8x16b = _mm_and_si128(src_temp0_8x16b, cmp_store);
//row 2 and row 3
//to preserve pixel values in which no offset needs to be added.
cmp_store = _mm_cmpeq_epi8(tmp_set_128i_3, tmp_set_128i_4);
src_temp2_8x16b = _mm_and_si128(src_temp2_8x16b, cmp_store);
//indexing 0 - 15 bandtable indexes
tmp_set_128i_1 = _mm_shuffle_epi8(band_table0_8x16b, tmp_set_128i_1);
tmp_set_128i_3 = _mm_shuffle_epi8(band_table0_8x16b, tmp_set_128i_3);
tmp_set_128i_2 = _mm_shuffle_epi8(band_table2_8x16b, tmp_set_128i_2);
tmp_set_128i_4 = _mm_shuffle_epi8(band_table2_8x16b, tmp_set_128i_4);
// combining all offsets results
tmp_set_128i_1 = _mm_or_si128(tmp_set_128i_1, tmp_set_128i_2);
tmp_set_128i_3 = _mm_or_si128(tmp_set_128i_3, tmp_set_128i_4);
// combining results with the pixel values
src_temp0_8x16b = _mm_or_si128(src_temp0_8x16b, tmp_set_128i_1);
src_temp2_8x16b = _mm_or_si128(src_temp2_8x16b, tmp_set_128i_3);
//Getting row1 separately
src_temp1_8x16b = _mm_srli_si128(src_temp0_8x16b, 8);
//Getting row3 separately
src_temp3_8x16b = _mm_srli_si128(src_temp2_8x16b, 8);
//row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
_mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_8x16b);
// row = 1
_mm_storel_epi64((__m128i *)(pu1_src_cpy + src_strd), src_temp1_8x16b);
// row = 2
_mm_storel_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd), src_temp2_8x16b);
// row = 3
_mm_storel_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd), src_temp3_8x16b);
pu1_src_cpy += (src_strd << 2);
}
pu1_src += 8;
}
}
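/* A hedged sketch, shown in isolation, of the even/odd deinterleave idiom the
 * chroma routine below applies to interleaved CbCr (semi-planar) rows: the odd
 * (V) bytes are exposed by a logical right shift of each 16-bit lane, the even
 * (U) bytes by a shift pair that clears the high byte, and two registers'
 * worth of lanes are packed back to bytes. The function name and parameters
 * are illustrative, not part of the library. */
static __inline void ihevc_sao_deinterleave_uv_sketch(__m128i row0,
                                                      __m128i row1,
                                                      __m128i *pu_16x8b,
                                                      __m128i *pv_16x8b)
{
    /* odd bytes (V) of each 16-bit lane */
    __m128i v0 = _mm_srli_epi16(row0, 8);
    __m128i v1 = _mm_srli_epi16(row1, 8);
    /* even bytes (U): clear the high byte of each 16-bit lane */
    __m128i u0 = _mm_srli_epi16(_mm_slli_epi16(row0, 8), 8);
    __m128i u1 = _mm_srli_epi16(_mm_slli_epi16(row1, 8), 8);
    /* every lane is <= 255, so the unsigned pack does not saturate */
    *pv_16x8b = _mm_packus_epi16(v0, v1);
    *pu_16x8b = _mm_packus_epi16(u0, u1);
}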
void ihevc_sao_band_offset_chroma_ssse3(UWORD8 *pu1_src,
WORD32 src_strd,
UWORD8 *pu1_src_left,
UWORD8 *pu1_src_top,
UWORD8 *pu1_src_top_left,
WORD32 sao_band_pos_u,
WORD32 sao_band_pos_v,
WORD8 *pi1_sao_offset_u,
WORD8 *pi1_sao_offset_v,
WORD32 wd,
WORD32 ht)
{
WORD32 row, col;
WORD8 offset = 0;
__m128i src_temp0_8x16b, src_temp1_8x16b, src_temp2_8x16b, src_temp3_8x16b;
__m128i cmp_msk2;
__m128i band_table0_16x8b, band_table1_16x8b, band_table2_16x8b, band_table3_16x8b;
__m128i tmp_set_128i_1, tmp_set_128i_2, tmp_set_128i_3, tmp_set_128i_4;
__m128i band_pos_u_16x8b, band_pos_v_16x8b;
__m128i sao_offset;
__m128i cmp_mask;
/* Updating left and top and top-left */
for(row = 0; row < ht; row++)
{
pu1_src_left[2 * row] = pu1_src[row * src_strd + (wd - 2)];
pu1_src_left[2 * row + 1] = pu1_src[row * src_strd + (wd - 1)];
}
pu1_src_top_left[0] = pu1_src_top[wd - 2];
pu1_src_top_left[1] = pu1_src_top[wd - 1];
for(col = 0; col < wd; col += 8)
{
tmp_set_128i_1 = _mm_loadl_epi64((__m128i *)(pu1_src + (ht - 1) * src_strd + offset));
_mm_storel_epi64((__m128i *)(pu1_src_top + offset), tmp_set_128i_1);
offset += 8;
}
{ // band table creation
__m128i temp0_8x16b, temp1_8x16b, temp2_8x16b, temp3_8x16b;
// Band table for U component : band_table0_16x8b and band_table2_16x8b
//replicate sao_band_pos_u << 3 (band start in the pixel domain) in each 16-bit lane
band_pos_u_16x8b = _mm_set1_epi16((WORD16)(sao_band_pos_u << 3));
//value set for sao_offset extraction
tmp_set_128i_1 = _mm_set_epi8(128, 1, 128, 1, 128, 1, 128, 1, 128, 1, 128, 1, 128, 1, 128, 1);
tmp_set_128i_2 = _mm_set_epi8(128, 2, 128, 2, 128, 2, 128, 2, 128, 2, 128, 2, 128, 2, 128, 2);
tmp_set_128i_3 = _mm_set_epi8(128, 3, 128, 3, 128, 3, 128, 3, 128, 3, 128, 3, 128, 3, 128, 3);
tmp_set_128i_4 = _mm_set_epi8(128, 4, 128, 4, 128, 4, 128, 4, 128, 4, 128, 4, 128, 4, 128, 4);
//loaded sao offset values
sao_offset = _mm_loadl_epi64((__m128i *)pi1_sao_offset_u);
//loading the 32 16-bit values of gu2_table_band_idx into 4 consecutive registers
band_table0_16x8b = _mm_load_si128((__m128i *)(gu2_table_band_idx));
band_table1_16x8b = _mm_load_si128((__m128i *)(gu2_table_band_idx + 8));
band_table2_16x8b = _mm_load_si128((__m128i *)(gu2_table_band_idx + 16));
band_table3_16x8b = _mm_load_si128((__m128i *)(gu2_table_band_idx + 24));
//band_position addition
band_table0_16x8b = _mm_add_epi16(band_table0_16x8b, band_pos_u_16x8b);
band_table1_16x8b = _mm_add_epi16(band_table1_16x8b, band_pos_u_16x8b);
band_table2_16x8b = _mm_add_epi16(band_table2_16x8b, band_pos_u_16x8b);
band_table3_16x8b = _mm_add_epi16(band_table3_16x8b, band_pos_u_16x8b);
//sao_offset duplication
temp0_8x16b = _mm_shuffle_epi8(sao_offset, tmp_set_128i_1);
temp1_8x16b = _mm_shuffle_epi8(sao_offset, tmp_set_128i_2);
temp2_8x16b = _mm_shuffle_epi8(sao_offset, tmp_set_128i_3);
temp3_8x16b = _mm_shuffle_epi8(sao_offset, tmp_set_128i_4);
//sao_offset addition
band_table0_16x8b = _mm_add_epi16(band_table0_16x8b, temp0_8x16b);
band_table1_16x8b = _mm_add_epi16(band_table1_16x8b, temp1_8x16b);
band_table2_16x8b = _mm_add_epi16(band_table2_16x8b, temp2_8x16b);
band_table3_16x8b = _mm_add_epi16(band_table3_16x8b, temp3_8x16b);
//reuse for clipping
temp1_8x16b = _mm_set1_epi16(0x00ff);
//setting up constants for comparison
cmp_mask = _mm_set1_epi16(16);
//masking upper 8bit values of each 16 bit band table value
band_table0_16x8b = _mm_and_si128(band_table0_16x8b, temp1_8x16b);
band_table1_16x8b = _mm_and_si128(band_table1_16x8b, temp1_8x16b);
band_table2_16x8b = _mm_and_si128(band_table2_16x8b, temp1_8x16b);
band_table3_16x8b = _mm_and_si128(band_table3_16x8b, temp1_8x16b);
//temp1_8x16b reuse for compare storage
switch(sao_band_pos_u)
{
case 0:
temp3_8x16b = _mm_cmpgt_epi16(cmp_mask, band_table0_16x8b);
band_table0_16x8b = _mm_and_si128(band_table0_16x8b, temp3_8x16b);
break;
case 28:
temp3_8x16b = _mm_cmpgt_epi16(cmp_mask, band_table3_16x8b);
band_table3_16x8b = _mm_or_si128(band_table3_16x8b, temp3_8x16b);
break;
case 29:
temp3_8x16b = _mm_cmpgt_epi16(cmp_mask, band_table2_16x8b);
band_table2_16x8b = _mm_or_si128(band_table2_16x8b, temp3_8x16b);
temp3_8x16b = _mm_cmpgt_epi16(cmp_mask, band_table3_16x8b);
band_table3_16x8b = _mm_and_si128(band_table3_16x8b, temp3_8x16b);
break;
case 30:
temp3_8x16b = _mm_cmpgt_epi16(cmp_mask, band_table1_16x8b);
band_table1_16x8b = _mm_or_si128(band_table1_16x8b, temp3_8x16b);
temp3_8x16b = _mm_cmpgt_epi16(cmp_mask, band_table2_16x8b);
band_table2_16x8b = _mm_and_si128(band_table2_16x8b, temp3_8x16b);
break;
case 31:
temp3_8x16b = _mm_cmpgt_epi16(cmp_mask, band_table0_16x8b);
band_table0_16x8b = _mm_or_si128(band_table0_16x8b, temp3_8x16b);
temp3_8x16b = _mm_cmpgt_epi16(cmp_mask, band_table1_16x8b);
band_table1_16x8b = _mm_and_si128(band_table1_16x8b, temp3_8x16b);
break;
default:
break;
}
//masking upper 8bit values of each 16 bit band table value
band_table0_16x8b = _mm_and_si128(band_table0_16x8b, temp1_8x16b);
band_table1_16x8b = _mm_and_si128(band_table1_16x8b, temp1_8x16b);
band_table2_16x8b = _mm_and_si128(band_table2_16x8b, temp1_8x16b);
band_table3_16x8b = _mm_and_si128(band_table3_16x8b, temp1_8x16b);
//the four 8x16b band table registers are packed into two 16x8b registers: band_table0_16x8b and band_table2_16x8b
band_table0_16x8b = _mm_packus_epi16(band_table0_16x8b, band_table1_16x8b);
band_table2_16x8b = _mm_packus_epi16(band_table2_16x8b, band_table3_16x8b);
// Band table for U component over
// Band table for V component : band_table1_16x8b and band_table3_16x8b
// replicate sao_band_pos_v << 3 (band start in the pixel domain) in each 16-bit lane
band_pos_v_16x8b = _mm_set1_epi16((WORD16)(sao_band_pos_v << 3));
//loaded sao offset values
sao_offset = _mm_loadl_epi64((__m128i *)pi1_sao_offset_v);
//loading the 32 16-bit values of gu2_table_band_idx into 4 consecutive registers
temp0_8x16b = _mm_load_si128((__m128i *)(gu2_table_band_idx));
band_table1_16x8b = _mm_load_si128((__m128i *)(gu2_table_band_idx + 8));
temp2_8x16b = _mm_load_si128((__m128i *)(gu2_table_band_idx + 16));
band_table3_16x8b = _mm_load_si128((__m128i *)(gu2_table_band_idx + 24));
//band_position addition
temp0_8x16b = _mm_add_epi16(temp0_8x16b, band_pos_v_16x8b);
band_table1_16x8b = _mm_add_epi16(band_table1_16x8b, band_pos_v_16x8b);
temp2_8x16b = _mm_add_epi16(temp2_8x16b, band_pos_v_16x8b);
band_table3_16x8b = _mm_add_epi16(band_table3_16x8b, band_pos_v_16x8b);
//sao_offset duplication
tmp_set_128i_1 = _mm_shuffle_epi8(sao_offset, tmp_set_128i_1);
tmp_set_128i_2 = _mm_shuffle_epi8(sao_offset, tmp_set_128i_2);
tmp_set_128i_3 = _mm_shuffle_epi8(sao_offset, tmp_set_128i_3);
tmp_set_128i_4 = _mm_shuffle_epi8(sao_offset, tmp_set_128i_4);
//sao_offset addition
temp0_8x16b = _mm_add_epi16(temp0_8x16b, tmp_set_128i_1);
band_table1_16x8b = _mm_add_epi16(band_table1_16x8b, tmp_set_128i_2);
temp2_8x16b = _mm_add_epi16(temp2_8x16b, tmp_set_128i_3);
band_table3_16x8b = _mm_add_epi16(band_table3_16x8b, tmp_set_128i_4);
//masking upper 8bit values of 16 bit band table value
temp0_8x16b = _mm_and_si128(temp0_8x16b, temp1_8x16b);
band_table1_16x8b = _mm_and_si128(band_table1_16x8b, temp1_8x16b);
temp2_8x16b = _mm_and_si128(temp2_8x16b, temp1_8x16b);
band_table3_16x8b = _mm_and_si128(band_table3_16x8b, temp1_8x16b);
//temp1_8x16b reuse for compare storage
switch(sao_band_pos_v)
{
case 0:
temp3_8x16b = _mm_cmpgt_epi16(cmp_mask, temp0_8x16b);
temp0_8x16b = _mm_and_si128(temp0_8x16b, temp3_8x16b);
break;
case 28:
temp3_8x16b = _mm_cmpgt_epi16(cmp_mask, band_table3_16x8b);
band_table3_16x8b = _mm_or_si128(band_table3_16x8b, temp3_8x16b);
break;
case 29:
temp3_8x16b = _mm_cmpgt_epi16(cmp_mask, temp2_8x16b);
temp2_8x16b = _mm_or_si128(temp2_8x16b, temp3_8x16b);
temp3_8x16b = _mm_cmpgt_epi16(cmp_mask, band_table3_16x8b);
band_table3_16x8b = _mm_and_si128(band_table3_16x8b, temp3_8x16b);
break;
case 30:
temp3_8x16b = _mm_cmpgt_epi16(cmp_mask, band_table1_16x8b);
band_table1_16x8b = _mm_or_si128(band_table1_16x8b, temp3_8x16b);
temp3_8x16b = _mm_cmpgt_epi16(cmp_mask, temp2_8x16b);
temp2_8x16b = _mm_and_si128(temp2_8x16b, temp3_8x16b);
break;
case 31:
temp3_8x16b = _mm_cmpgt_epi16(cmp_mask, temp0_8x16b);
temp0_8x16b = _mm_or_si128(temp0_8x16b, temp3_8x16b);
temp3_8x16b = _mm_cmpgt_epi16(cmp_mask, band_table1_16x8b);
band_table1_16x8b = _mm_and_si128(band_table1_16x8b, temp3_8x16b);
break;
default:
break;
}
//masking upper 8bit values of each 16 bit band table value
temp0_8x16b = _mm_and_si128(temp0_8x16b, temp1_8x16b);
band_table1_16x8b = _mm_and_si128(band_table1_16x8b, temp1_8x16b);
temp2_8x16b = _mm_and_si128(temp2_8x16b, temp1_8x16b);
band_table3_16x8b = _mm_and_si128(band_table3_16x8b, temp1_8x16b);
//the four 8x16b band table registers are packed into two 16x8b registers: band_table1_16x8b and band_table3_16x8b
band_table1_16x8b = _mm_packus_epi16(temp0_8x16b, band_table1_16x8b);
band_table3_16x8b = _mm_packus_epi16(temp2_8x16b, band_table3_16x8b);
//band table for u and v created
}
{
UWORD8 *pu1_src_cpy;
WORD32 wd_rem;
//sao_offset is reused for zero cmp mask.
sao_offset = _mm_setzero_si128();
tmp_set_128i_1 = _mm_set1_epi8(1);
//tmp_set_128i_2 = _mm_set_epi8 (128,7,128,6,128,5,128,4,128,3,128,2,128,1,128,0);
cmp_mask = _mm_packus_epi16(cmp_mask, cmp_mask); //cmp_msk=dup16(16);
//so that ffff saturates to ff instead of to 0
cmp_msk2 = _mm_slli_epi16(cmp_mask, 1); // to compare if value is greater than 31
band_pos_u_16x8b = _mm_packus_epi16(band_pos_u_16x8b, band_pos_u_16x8b); //band_pos_u is now 8 bit aligned
band_pos_v_16x8b = _mm_packus_epi16(band_pos_v_16x8b, band_pos_v_16x8b); //band_pos_v is now 8 bit aligned
cmp_msk2 = _mm_sub_epi8(cmp_msk2, tmp_set_128i_1); // to compare if value is greater than 31
cmp_mask = _mm_sub_epi8(cmp_mask, tmp_set_128i_1);
for(col = wd; col >= 16; col -= 16)
{
pu1_src_cpy = pu1_src;
for(row = ht; row > 0; row -= 2)
{
//row = 0 load 16 pixel values from 15:0 pos. relative to cur. pos.
src_temp0_8x16b = _mm_loadu_si128((__m128i *)(pu1_src_cpy));
// row = 1
src_temp3_8x16b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
//odd values
src_temp1_8x16b = _mm_srli_epi16(src_temp0_8x16b, 8);
src_temp2_8x16b = _mm_srli_epi16(src_temp3_8x16b, 8);
//even values
src_temp0_8x16b = _mm_slli_epi16(src_temp0_8x16b, 8);
src_temp3_8x16b = _mm_slli_epi16(src_temp3_8x16b, 8);
src_temp0_8x16b = _mm_srli_epi16(src_temp0_8x16b, 8);
src_temp3_8x16b = _mm_srli_epi16(src_temp3_8x16b, 8);
//combining odd values
src_temp2_8x16b = _mm_packus_epi16(src_temp1_8x16b, src_temp2_8x16b);
//combining even values
src_temp0_8x16b = _mm_packus_epi16(src_temp0_8x16b, src_temp3_8x16b);
//subtract band position (8 bit wrap-around; negatives are detected below)
tmp_set_128i_1 = _mm_sub_epi8(src_temp0_8x16b, band_pos_u_16x8b);
tmp_set_128i_3 = _mm_sub_epi8(src_temp2_8x16b, band_pos_v_16x8b);
//if the value is less than 0, set it to ff
tmp_set_128i_2 = _mm_cmpgt_epi8(sao_offset, tmp_set_128i_1);
tmp_set_128i_4 = _mm_cmpgt_epi8(sao_offset, tmp_set_128i_3);
tmp_set_128i_1 = _mm_or_si128(tmp_set_128i_1, tmp_set_128i_2);
tmp_set_128i_3 = _mm_or_si128(tmp_set_128i_3, tmp_set_128i_4);
//if the value is greater than 31, set it to ff
tmp_set_128i_2 = _mm_cmpgt_epi8(tmp_set_128i_1, cmp_msk2);
tmp_set_128i_4 = _mm_cmpgt_epi8(tmp_set_128i_3, cmp_msk2);
tmp_set_128i_1 = _mm_or_si128(tmp_set_128i_1, tmp_set_128i_2);
tmp_set_128i_3 = _mm_or_si128(tmp_set_128i_3, tmp_set_128i_4);
// registers reused to increase performance
//if the value is >15, set it to ff (cmp_mask = dup16(15)): U of rows 0 & 1
src_temp1_8x16b = _mm_cmpgt_epi8(tmp_set_128i_1, cmp_mask);
//if the value is >15, set it to ff (cmp_mask = dup16(15)): V of rows 0 & 1
src_temp3_8x16b = _mm_cmpgt_epi8(tmp_set_128i_3, cmp_mask);
//values 16 to 31 for U; values <16 become 0
tmp_set_128i_2 = _mm_and_si128(tmp_set_128i_1, src_temp1_8x16b);
// values 0 to 15 for U
tmp_set_128i_1 = _mm_or_si128(tmp_set_128i_1, src_temp1_8x16b);
//values 16 to 31 for V; values <16 become 0
tmp_set_128i_4 = _mm_and_si128(tmp_set_128i_3, src_temp3_8x16b);
// values 0 to 15 for V
tmp_set_128i_3 = _mm_or_si128(tmp_set_128i_3, src_temp3_8x16b);
//values 16 to 31 for U; values <16 masked to ff
src_temp1_8x16b = _mm_cmpeq_epi8(tmp_set_128i_2, sao_offset);
//values 16 to 31 for V; values <16 masked to ff
src_temp3_8x16b = _mm_cmpeq_epi8(tmp_set_128i_4, sao_offset);
tmp_set_128i_2 = _mm_or_si128(tmp_set_128i_2, src_temp1_8x16b);
tmp_set_128i_4 = _mm_or_si128(tmp_set_128i_4, src_temp3_8x16b);
//to choose which U pixel values to preserve
src_temp1_8x16b = _mm_cmpeq_epi8(tmp_set_128i_1, tmp_set_128i_2);
//to choose which V pixel values to preserve
src_temp3_8x16b = _mm_cmpeq_epi8(tmp_set_128i_3, tmp_set_128i_4);
//values to which no offset needs to be added are preserved.
src_temp0_8x16b = _mm_and_si128(src_temp0_8x16b, src_temp1_8x16b);
src_temp2_8x16b = _mm_and_si128(src_temp2_8x16b, src_temp3_8x16b);
//indexing 0 - 15 bandtable indexes
tmp_set_128i_1 = _mm_shuffle_epi8(band_table0_16x8b, tmp_set_128i_1); //U low
tmp_set_128i_3 = _mm_shuffle_epi8(band_table1_16x8b, tmp_set_128i_3); //V low
//indexing 16 -31 bandtable indexes
tmp_set_128i_2 = _mm_shuffle_epi8(band_table2_16x8b, tmp_set_128i_2); //U high
tmp_set_128i_4 = _mm_shuffle_epi8(band_table3_16x8b, tmp_set_128i_4); //V high
// combining all offsets results
tmp_set_128i_1 = _mm_or_si128(tmp_set_128i_1, tmp_set_128i_2); //U
tmp_set_128i_3 = _mm_or_si128(tmp_set_128i_3, tmp_set_128i_4); //V
// combining results with the pixel values
src_temp0_8x16b = _mm_or_si128(src_temp0_8x16b, tmp_set_128i_1);
src_temp2_8x16b = _mm_or_si128(src_temp2_8x16b, tmp_set_128i_3);
//reorganising even and odd values
src_temp1_8x16b = _mm_unpacklo_epi8(src_temp0_8x16b, src_temp2_8x16b);
src_temp3_8x16b = _mm_unpackhi_epi8(src_temp0_8x16b, src_temp2_8x16b);
//row = 0 store 16 pixel values from 15:0 pos. relative to cur. pos.
_mm_storeu_si128((__m128i *)(pu1_src_cpy), src_temp1_8x16b);
// row = 1
_mm_storeu_si128((__m128i *)(pu1_src_cpy + src_strd), src_temp3_8x16b);
pu1_src_cpy += (src_strd << 1);
}
pu1_src += 16;
}
wd_rem = wd & 0xF;
if(wd_rem)
{
pu1_src_cpy = pu1_src;
for(row = ht; row > 0; row -= 4)
{
//row = 0 load 8 pixel values from 7:0 pos. relative to cur. pos.
src_temp0_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy));
// row = 1
src_temp1_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + src_strd));
// row = 2
src_temp2_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd));
// row = 3
src_temp3_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd));
//row0 and row1 packed and row2 and row3 packed
src_temp0_8x16b = _mm_unpacklo_epi64(src_temp0_8x16b, src_temp1_8x16b);
src_temp3_8x16b = _mm_unpacklo_epi64(src_temp2_8x16b, src_temp3_8x16b);
//odd values
src_temp1_8x16b = _mm_srli_epi16(src_temp0_8x16b, 8);
src_temp2_8x16b = _mm_srli_epi16(src_temp3_8x16b, 8);
//even values
src_temp0_8x16b = _mm_slli_epi16(src_temp0_8x16b, 8);
src_temp3_8x16b = _mm_slli_epi16(src_temp3_8x16b, 8);
src_temp0_8x16b = _mm_srli_epi16(src_temp0_8x16b, 8);
src_temp3_8x16b = _mm_srli_epi16(src_temp3_8x16b, 8);
//combining odd values
src_temp2_8x16b = _mm_packus_epi16(src_temp1_8x16b, src_temp2_8x16b);
//combining even values
src_temp0_8x16b = _mm_packus_epi16(src_temp0_8x16b, src_temp3_8x16b);
//subtract band position (8 bit wrap-around; negatives are detected below)
tmp_set_128i_1 = _mm_sub_epi8(src_temp0_8x16b, band_pos_u_16x8b);
tmp_set_128i_3 = _mm_sub_epi8(src_temp2_8x16b, band_pos_v_16x8b);
//if the value is less than 0, set it to ff
tmp_set_128i_2 = _mm_cmpgt_epi8(sao_offset, tmp_set_128i_1);
tmp_set_128i_4 = _mm_cmpgt_epi8(sao_offset, tmp_set_128i_3);
tmp_set_128i_1 = _mm_or_si128(tmp_set_128i_1, tmp_set_128i_2);
tmp_set_128i_3 = _mm_or_si128(tmp_set_128i_3, tmp_set_128i_4);
//if the value is greater than 31, set it to ff
tmp_set_128i_2 = _mm_cmpgt_epi8(tmp_set_128i_1, cmp_msk2);
tmp_set_128i_4 = _mm_cmpgt_epi8(tmp_set_128i_3, cmp_msk2);
tmp_set_128i_1 = _mm_or_si128(tmp_set_128i_1, tmp_set_128i_2);
tmp_set_128i_3 = _mm_or_si128(tmp_set_128i_3, tmp_set_128i_4);
// registers reused to increase performance
//if the value is >15, set it to ff (cmp_mask = dup16(15)): U of rows 0 - 3
src_temp1_8x16b = _mm_cmpgt_epi8(tmp_set_128i_1, cmp_mask);
//if the value is >15, set it to ff (cmp_mask = dup16(15)): V of rows 0 - 3
src_temp3_8x16b = _mm_cmpgt_epi8(tmp_set_128i_3, cmp_mask);
//values 16 to 31 for U; values <16 become 0
tmp_set_128i_2 = _mm_and_si128(tmp_set_128i_1, src_temp1_8x16b);
// values 0 to 15 for U
tmp_set_128i_1 = _mm_or_si128(tmp_set_128i_1, src_temp1_8x16b);
//values 16 to 31 for V; values <16 become 0
tmp_set_128i_4 = _mm_and_si128(tmp_set_128i_3, src_temp3_8x16b);
// values 0 to 15 for V
tmp_set_128i_3 = _mm_or_si128(tmp_set_128i_3, src_temp3_8x16b);
//values 16 to 31 for U; values <16 masked to ff
src_temp1_8x16b = _mm_cmpeq_epi8(tmp_set_128i_2, sao_offset);
//values 16 to 31 for V; values <16 masked to ff
src_temp3_8x16b = _mm_cmpeq_epi8(tmp_set_128i_4, sao_offset);
tmp_set_128i_2 = _mm_or_si128(tmp_set_128i_2, src_temp1_8x16b);
tmp_set_128i_4 = _mm_or_si128(tmp_set_128i_4, src_temp3_8x16b);
//to choose which U pixel values to preserve
src_temp1_8x16b = _mm_cmpeq_epi8(tmp_set_128i_1, tmp_set_128i_2);
//to choose which V pixel values to preserve
src_temp3_8x16b = _mm_cmpeq_epi8(tmp_set_128i_3, tmp_set_128i_4);
//values to which no offset needs to be added are preserved.
src_temp0_8x16b = _mm_and_si128(src_temp0_8x16b, src_temp1_8x16b);
src_temp2_8x16b = _mm_and_si128(src_temp2_8x16b, src_temp3_8x16b);
//indexing 0 - 15 bandtable indexes
tmp_set_128i_1 = _mm_shuffle_epi8(band_table0_16x8b, tmp_set_128i_1); //U low
tmp_set_128i_3 = _mm_shuffle_epi8(band_table1_16x8b, tmp_set_128i_3); //V low
//indexing 16 -31 bandtable indexes
tmp_set_128i_2 = _mm_shuffle_epi8(band_table2_16x8b, tmp_set_128i_2); //U high
tmp_set_128i_4 = _mm_shuffle_epi8(band_table3_16x8b, tmp_set_128i_4); //V high
// combining all offsets results
tmp_set_128i_1 = _mm_or_si128(tmp_set_128i_1, tmp_set_128i_2); //U
tmp_set_128i_3 = _mm_or_si128(tmp_set_128i_3, tmp_set_128i_4); //V
// combining results with the pixel values
src_temp0_8x16b = _mm_or_si128(src_temp0_8x16b, tmp_set_128i_1);
src_temp2_8x16b = _mm_or_si128(src_temp2_8x16b, tmp_set_128i_3);
//reorganising even and odd values
src_temp1_8x16b = _mm_unpacklo_epi8(src_temp0_8x16b, src_temp2_8x16b);
src_temp3_8x16b = _mm_unpackhi_epi8(src_temp0_8x16b, src_temp2_8x16b);
//Getting row1 separately
src_temp0_8x16b = _mm_srli_si128(src_temp1_8x16b, 8);
//Getting row3 separately
src_temp2_8x16b = _mm_srli_si128(src_temp3_8x16b, 8);
//row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
_mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp1_8x16b);
// row = 1
_mm_storel_epi64((__m128i *)(pu1_src_cpy + src_strd), src_temp0_8x16b);
// row = 2
_mm_storel_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd), src_temp3_8x16b);
// row = 3
_mm_storel_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd), src_temp2_8x16b);
pu1_src_cpy += (src_strd << 2);
}
pu1_src += 16;
}
}
}
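/* A scalar sketch (illustration only; the availability mask and the left and
 * top boundary bookkeeping of the real routine are omitted, and the helper
 * name is not part of the library) of the class 0 (horizontal) edge offset
 * computed below: each pixel is classified by the signs of its differences
 * with the left and right neighbours, gi1_table_edge_idx maps the biased
 * sign sum to an edge index, and the matching offset is added with clipping
 * to the 8-bit range. */
static void ihevc_sao_edge_class0_scalar_sketch(UWORD8 *pu1_src,
                                                WORD32 src_strd,
                                                WORD8 *pi1_sao_offset,
                                                WORD32 wd,
                                                WORD32 ht)
{
    WORD32 row, col;
    UWORD8 au1_row[MAX_CTB_SIZE + 2];
    for(row = 0; row < ht; row++)
    {
        /* classification must see unfiltered neighbours, so copy the row */
        for(col = 0; col < wd; col++)
            au1_row[col] = pu1_src[row * src_strd + col];
        for(col = 1; col < wd - 1; col++)
        {
            WORD32 sign_left = (au1_row[col] > au1_row[col - 1]) -
                               (au1_row[col] < au1_row[col - 1]);
            WORD32 sign_right = (au1_row[col] > au1_row[col + 1]) -
                                (au1_row[col] < au1_row[col + 1]);
            WORD32 edge_idx = gi1_table_edge_idx[2 + sign_left + sign_right];
            WORD32 val = au1_row[col] + pi1_sao_offset[edge_idx];
            pu1_src[row * src_strd + col] =
                (UWORD8)((val < 0) ? 0 : ((val > 255) ? 255 : val));
        }
    }
}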
void ihevc_sao_edge_offset_class0_ssse3(UWORD8 *pu1_src,
WORD32 src_strd,
UWORD8 *pu1_src_left,
UWORD8 *pu1_src_top,
UWORD8 *pu1_src_top_left,
UWORD8 *pu1_src_top_right,
UWORD8 *pu1_src_bot_left,
UWORD8 *pu1_avail,
WORD8 *pi1_sao_offset,
WORD32 wd,
WORD32 ht)
{
WORD32 row, col;
UWORD8 *pu1_src_cpy, *pu1_src_left_cpy, *pu1_src_left_str, *pu1_left_tmp;
UWORD8 au1_mask[MAX_CTB_SIZE], *au1_mask_cpy;
UWORD8 au1_src_left_tmp[MAX_CTB_SIZE + 8];
UWORD8 au1_src_left_tmp1[MAX_CTB_SIZE + 8];
UWORD8 u1_avail0, u1_avail1;
WORD32 wd_rem;
WORD32 offset = 0;
__m128i src_temp0_16x8b, src_temp1_16x8b;
__m128i left0_16x8b, left1_16x8b;
__m128i cmp_gt0_16x8b, cmp_lt0_16x8b, cmp_gt1_16x8b, cmp_lt1_16x8b;
__m128i edge0_16x8b, edge1_16x8b;
__m128i au1_mask8x16b;
__m128i edge_idx_8x16b, sao_offset_8x16b;
__m128i const2_16x8b, const0_16x8b;
__m128i left_store_16x8b;
UNUSED(pu1_src_top_right);
UNUSED(pu1_src_bot_left);
au1_mask8x16b = _mm_set1_epi8(0xff);
/* Update top and top-left arrays */
*pu1_src_top_left = pu1_src_top[wd - 1];
for(col = wd; col >= 16; col -= 16)
{
const0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + offset + (ht - 1) * src_strd));
_mm_storeu_si128((__m128i *)(pu1_src_top + offset), const0_16x8b);
offset += 16;
}
//setting availability mask to ff size MAX_CTB_SIZE
for(col = 0; col < MAX_CTB_SIZE; col += 16)
_mm_storeu_si128((__m128i *)(au1_mask + col), au1_mask8x16b);
for(row = 0; row < ht; row++)
{
au1_src_left_tmp[row] = pu1_src_left[row];
}
edge_idx_8x16b = _mm_loadl_epi64((__m128i *)gi1_table_edge_idx);
sao_offset_8x16b = _mm_loadl_epi64((__m128i *)pi1_sao_offset);
//availability mask creation
u1_avail0 = pu1_avail[0];
u1_avail1 = pu1_avail[1];
au1_mask[0] = u1_avail0;
au1_mask[wd - 1] = u1_avail1;
const2_16x8b = _mm_set1_epi8(2);
const0_16x8b = _mm_setzero_si128();
pu1_src_left_cpy = au1_src_left_tmp;
pu1_src_left_str = au1_src_left_tmp1;
{
au1_mask_cpy = au1_mask;
for(col = wd; col >= 16; col -= 16)
{
pu1_src_cpy = pu1_src;
au1_mask8x16b = _mm_loadu_si128((__m128i *)au1_mask_cpy);
//pu1_src_left_cpy =au1_src_left_tmp;
for(row = ht; row > 0; row -= 2)
{
left_store_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_left_cpy));
//row = 0 load 16 pixel values from 15:0 pos. relative to cur. pos.
src_temp0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy));
// row = 1
src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
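//rotate the left-boundary register so that its byte 15 supplies the left
//neighbour of column 0, while each row's last pixel is pushed in to form
//the updated left column for the next iteration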
left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, left_store_16x8b, 2);
//row 1 left
left1_16x8b = _mm_alignr_epi8(src_temp1_16x8b, left_store_16x8b, 15);
left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp1_16x8b, 15);
//row 0 left
left0_16x8b = _mm_alignr_epi8(src_temp0_16x8b, left_store_16x8b, 15);
left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp0_16x8b, 15);
//separating +ve and -ve values.
cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, left0_16x8b);
cmp_lt0_16x8b = _mm_subs_epu8(left0_16x8b, src_temp0_16x8b);
cmp_gt1_16x8b = _mm_subs_epu8(src_temp1_16x8b, left1_16x8b);
cmp_lt1_16x8b = _mm_subs_epu8(left1_16x8b, src_temp1_16x8b);
//creating mask 00 for +ve and -ve values and FF for zero.
cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
cmp_gt1_16x8b = _mm_cmpeq_epi8(cmp_gt1_16x8b, const0_16x8b);
cmp_lt1_16x8b = _mm_cmpeq_epi8(cmp_lt1_16x8b, const0_16x8b);
//combining the appropriate sign change
left0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
left1_16x8b = _mm_sub_epi8(cmp_gt1_16x8b, cmp_lt1_16x8b);
//row = 0 right
edge0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 1));
// row = 1 right
edge1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd + 1));
//separating +ve and -ve values.
cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, edge0_16x8b);
cmp_lt0_16x8b = _mm_subs_epu8(edge0_16x8b, src_temp0_16x8b);
cmp_gt1_16x8b = _mm_subs_epu8(src_temp1_16x8b, edge1_16x8b);
cmp_lt1_16x8b = _mm_subs_epu8(edge1_16x8b, src_temp1_16x8b);
//creating mask 00 for +ve and -ve values and FF for zero.
cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
cmp_gt1_16x8b = _mm_cmpeq_epi8(cmp_gt1_16x8b, const0_16x8b);
cmp_lt1_16x8b = _mm_cmpeq_epi8(cmp_lt1_16x8b, const0_16x8b);
//combining the appropriate sign change
edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
edge1_16x8b = _mm_sub_epi8(cmp_gt1_16x8b, cmp_lt1_16x8b);
//combining sign_left and sign_right
edge0_16x8b = _mm_add_epi8(edge0_16x8b, left0_16x8b);
edge1_16x8b = _mm_add_epi8(edge1_16x8b, left1_16x8b);
//adding constant 2
edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
edge1_16x8b = _mm_add_epi8(edge1_16x8b, const2_16x8b);
//shuffle to get sao index
edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
edge1_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge1_16x8b);
//using availability mask
edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
edge1_16x8b = _mm_and_si128(edge1_16x8b, au1_mask8x16b);
//shuffle to get sao offset
edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
edge1_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge1_16x8b);
//convert to 16 bit, then add, then saturated pack
left0_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, left0_16x8b);
src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
cmp_lt0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, left0_16x8b);
src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);
left0_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge1_16x8b);
cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, const0_16x8b);
cmp_lt0_16x8b = _mm_unpacklo_epi8(edge1_16x8b, left0_16x8b);
src_temp1_16x8b = _mm_unpackhi_epi8(src_temp1_16x8b, const0_16x8b);
cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
cmp_lt0_16x8b = _mm_unpackhi_epi8(edge1_16x8b, left0_16x8b);
src_temp1_16x8b = _mm_add_epi16(src_temp1_16x8b, cmp_lt0_16x8b);
src_temp1_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp1_16x8b);
_mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);
//row = 0 store 16 pixel values from 15:0 pos. relative to cur. pos.
_mm_storeu_si128((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
// row = 1
_mm_storeu_si128((__m128i *)(pu1_src_cpy + src_strd), src_temp1_16x8b);
pu1_src_cpy += (src_strd << 1);
pu1_src_left_cpy += 2;
pu1_src_left_str += 2;
}
au1_mask_cpy += 16;
pu1_src += 16;
pu1_src_left_cpy -= ht;
pu1_src_left_str -= ht;
pu1_left_tmp = pu1_src_left_cpy;
pu1_src_left_cpy = pu1_src_left_str;
pu1_src_left_str = pu1_left_tmp;
}
wd_rem = wd & 0xF;
if(wd_rem)
{
cmp_gt1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + (ht - 1) * src_strd));
_mm_storel_epi64((__m128i *)(pu1_src_top + offset), cmp_gt1_16x8b);
au1_mask8x16b = _mm_loadl_epi64((__m128i *)au1_mask_cpy);
pu1_src_cpy = pu1_src;
au1_mask8x16b = _mm_unpacklo_epi64(au1_mask8x16b, au1_mask8x16b);
//pu1_src_left_cpy =au1_src_left_tmp;
for(row = ht; row > 0; row -= 4)
{
left_store_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_left_cpy));
//row = 0 load 8 pixel values from 7:0 pos. relative to cur. pos.
src_temp0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy));
// row = 1
cmp_gt0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + src_strd));
// row = 2
src_temp1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd));
// row = 3
cmp_gt1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd));
left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, left_store_16x8b, 4);
//row 3 left
edge0_16x8b = _mm_slli_si128(cmp_gt1_16x8b, 8);
cmp_lt1_16x8b = _mm_alignr_epi8(cmp_gt1_16x8b, left_store_16x8b, 15);
left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, edge0_16x8b, 15);
//row 2 left
edge0_16x8b = _mm_slli_si128(src_temp1_16x8b, 8);
left1_16x8b = _mm_alignr_epi8(src_temp1_16x8b, left_store_16x8b, 15);
left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, edge0_16x8b, 15);
//row 1 left
edge0_16x8b = _mm_slli_si128(cmp_gt0_16x8b, 8);
cmp_lt0_16x8b = _mm_alignr_epi8(cmp_gt0_16x8b, left_store_16x8b, 15);
left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, edge0_16x8b, 15);
//row 0 left
edge0_16x8b = _mm_slli_si128(src_temp0_16x8b, 8);
left0_16x8b = _mm_alignr_epi8(src_temp0_16x8b, left_store_16x8b, 15);
left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, edge0_16x8b, 15);
// packing rows together for 16 SIMD operations
src_temp0_16x8b = _mm_unpacklo_epi64(src_temp0_16x8b, cmp_gt0_16x8b);
src_temp1_16x8b = _mm_unpacklo_epi64(src_temp1_16x8b, cmp_gt1_16x8b);
// packing rows together for 16 SIMD operations
left0_16x8b = _mm_unpacklo_epi64(left0_16x8b, cmp_lt0_16x8b);
left1_16x8b = _mm_unpacklo_epi64(left1_16x8b, cmp_lt1_16x8b);
//separating +ve and -ve values.
cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, left0_16x8b);
cmp_lt0_16x8b = _mm_subs_epu8(left0_16x8b, src_temp0_16x8b);
cmp_gt1_16x8b = _mm_subs_epu8(src_temp1_16x8b, left1_16x8b);
cmp_lt1_16x8b = _mm_subs_epu8(left1_16x8b, src_temp1_16x8b);
//creating mask 00 for +ve and -ve values and FF for zero.
cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
cmp_gt1_16x8b = _mm_cmpeq_epi8(cmp_gt1_16x8b, const0_16x8b);
cmp_lt1_16x8b = _mm_cmpeq_epi8(cmp_lt1_16x8b, const0_16x8b);
//combining the appropriate sign change
left0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
left1_16x8b = _mm_sub_epi8(cmp_gt1_16x8b, cmp_lt1_16x8b);
//row = 0 right
edge0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 1));
// row = 1 right
cmp_gt0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + src_strd + 1));
// row = 2 right
edge1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd + 1));
// row = 3 right
cmp_gt1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd + 1));
// packing rows together for 16 SIMD operations
edge0_16x8b = _mm_unpacklo_epi64(edge0_16x8b, cmp_gt0_16x8b);
edge1_16x8b = _mm_unpacklo_epi64(edge1_16x8b, cmp_gt1_16x8b);
//separating +ve and -ve values.
cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, edge0_16x8b);
cmp_lt0_16x8b = _mm_subs_epu8(edge0_16x8b, src_temp0_16x8b);
cmp_gt1_16x8b = _mm_subs_epu8(src_temp1_16x8b, edge1_16x8b);
cmp_lt1_16x8b = _mm_subs_epu8(edge1_16x8b, src_temp1_16x8b);
//creating mask 00 for +ve and -ve values and FF for zero.
cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
cmp_gt1_16x8b = _mm_cmpeq_epi8(cmp_gt1_16x8b, const0_16x8b);
cmp_lt1_16x8b = _mm_cmpeq_epi8(cmp_lt1_16x8b, const0_16x8b);
//combining the appropriate sign change
edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
edge1_16x8b = _mm_sub_epi8(cmp_gt1_16x8b, cmp_lt1_16x8b);
//combining sign_left and sign_right
edge0_16x8b = _mm_add_epi8(edge0_16x8b, left0_16x8b);
edge1_16x8b = _mm_add_epi8(edge1_16x8b, left1_16x8b);
//adding constant 2
edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
edge1_16x8b = _mm_add_epi8(edge1_16x8b, const2_16x8b);
//shuffle to get sao index
edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
edge1_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge1_16x8b);
//shuffle to get sao offset
//using availability mask
edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
edge1_16x8b = _mm_and_si128(edge1_16x8b, au1_mask8x16b);
edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
edge1_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge1_16x8b);
//convert to 16 bit, then add, then saturated pack
left0_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, left0_16x8b);
src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
cmp_lt0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, left0_16x8b);
src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);
left0_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge1_16x8b);
cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, const0_16x8b);
cmp_lt0_16x8b = _mm_unpacklo_epi8(edge1_16x8b, left0_16x8b);
src_temp1_16x8b = _mm_unpackhi_epi8(src_temp1_16x8b, const0_16x8b);
cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
cmp_lt0_16x8b = _mm_unpackhi_epi8(edge1_16x8b, left0_16x8b);
src_temp1_16x8b = _mm_add_epi16(src_temp1_16x8b, cmp_lt0_16x8b);
src_temp1_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp1_16x8b);
//separating row 1 and row 3
cmp_lt0_16x8b = _mm_srli_si128(src_temp0_16x8b, 8);
cmp_lt1_16x8b = _mm_srli_si128(src_temp1_16x8b, 8);
_mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);
//row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
_mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
// row = 1
_mm_storel_epi64((__m128i *)(pu1_src_cpy + src_strd), cmp_lt0_16x8b);
// row = 2
_mm_storel_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd), src_temp1_16x8b);
// row = 3
_mm_storel_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd), cmp_lt1_16x8b);
pu1_src_cpy += (src_strd << 2);
pu1_src_left_cpy += 4;
pu1_src_left_str += 4;
}
pu1_src += wd;
pu1_src_left_cpy -= ht;
pu1_src_left_str -= ht;
pu1_left_tmp = pu1_src_left_cpy;
pu1_src_left_cpy = pu1_src_left_str;
pu1_src_left_str = pu1_left_tmp;
}
for(row = 0; row < ht; row++)
{
pu1_src_left[row] = pu1_src_left_cpy[row];
}
}
}
void ihevc_sao_edge_offset_class0_chroma_ssse3(UWORD8 *pu1_src,
WORD32 src_strd,
UWORD8 *pu1_src_left,
UWORD8 *pu1_src_top,
UWORD8 *pu1_src_top_left,
UWORD8 *pu1_src_top_right,
UWORD8 *pu1_src_bot_left,
UWORD8 *pu1_avail,
WORD8 *pi1_sao_offset_u,
WORD8 *pi1_sao_offset_v,
WORD32 wd,
WORD32 ht)
{
WORD32 row, col;
UWORD8 *pu1_src_cpy, *pu1_src_left_cpy, *pu1_src_left_str, *pu1_left_tmp;
UWORD8 au1_mask[MAX_CTB_SIZE], *au1_mask_cpy;
UWORD8 au1_src_left_tmp[2 * (MAX_CTB_SIZE + 8)];
UWORD8 au1_src_left_tmp1[2 * (MAX_CTB_SIZE + 8)];
UWORD8 u1_avail0, u1_avail1;
WORD32 wd_rem;
WORD32 offset = 0;
__m128i src_temp0_16x8b, src_temp1_16x8b;
__m128i left0_16x8b, left1_16x8b;
__m128i cmp_gt0_16x8b, cmp_lt0_16x8b;
__m128i edge0_16x8b, edge1_16x8b;
__m128i au1_mask8x16b;
__m128i edge_idx_8x16b, sao_offset_8x16b;
__m128i const2_16x8b, const0_16x8b;
__m128i left_store_16x8b;
__m128i chroma_offset_8x16b;
UNUSED(pu1_src_top_right);
UNUSED(pu1_src_bot_left);
au1_mask8x16b = _mm_set1_epi8(0xff);
/* Update top and top-left arrays */
pu1_src_top_left[0] = pu1_src_top[wd - 2];
pu1_src_top_left[1] = pu1_src_top[wd - 1];
for(col = wd; col >= 16; col -= 16)
{
const0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + offset + (ht - 1) * src_strd));
_mm_storeu_si128((__m128i *)(pu1_src_top + offset), const0_16x8b);
offset += 16;
}
for(row = 0; row < 2 * ht; row++)
{
au1_src_left_tmp[row] = pu1_src_left[row];
}
//setting availability mask to ff size MAX_CTB_SIZE
for(col = 0; col < MAX_CTB_SIZE; col += 16)
_mm_storeu_si128((__m128i *)(au1_mask + col), au1_mask8x16b);
edge_idx_8x16b = _mm_loadl_epi64((__m128i *)gi1_table_edge_idx);
sao_offset_8x16b = _mm_loadl_epi64((__m128i *)pi1_sao_offset_u);
const0_16x8b = _mm_loadl_epi64((__m128i *)pi1_sao_offset_v);
chroma_offset_8x16b = _mm_set1_epi16(0x0800);
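//0x0800 adds 8 to the edge index of every odd (V) byte so that the offset
//shuffle below picks from the V half of the combined offset register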
//availability mask creation
u1_avail0 = pu1_avail[0];
u1_avail1 = pu1_avail[1];
au1_mask[0] = u1_avail0;
au1_mask[1] = u1_avail0;
au1_mask[wd - 1] = u1_avail1;
au1_mask[wd - 2] = u1_avail1;
sao_offset_8x16b = _mm_unpacklo_epi64(sao_offset_8x16b, const0_16x8b);
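//sao_offset_8x16b now holds the U offsets in its low 8 bytes and the V
//offsets in its high 8 bytes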
const2_16x8b = _mm_set1_epi8(2);
const0_16x8b = _mm_setzero_si128();
{
pu1_src_left_cpy = au1_src_left_tmp;
pu1_src_left_str = au1_src_left_tmp1;
au1_mask_cpy = au1_mask;
for(col = wd; col >= 16; col -= 16)
{
pu1_src_cpy = pu1_src;
au1_mask8x16b = _mm_loadu_si128((__m128i *)au1_mask_cpy);
for(row = ht; row > 0; row -= 2)
{
left_store_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_left_cpy));
//row = 0 load 16 pixel values from 15:0 pos. relative to cur. pos.
src_temp0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy));
// row = 1
src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, left_store_16x8b, 4);
//row 1 left
left1_16x8b = _mm_alignr_epi8(src_temp1_16x8b, left_store_16x8b, 14);
left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp1_16x8b, 14);
//row 0 left
left0_16x8b = _mm_alignr_epi8(src_temp0_16x8b, left_store_16x8b, 14);
left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp0_16x8b, 14);
//separating +ve and -ve values, row 0 left
cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, left0_16x8b);
cmp_lt0_16x8b = _mm_subs_epu8(left0_16x8b, src_temp0_16x8b);
//creating mask 00 for +ve and -ve values and FF for zero.
cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
//combining the appropriate sign change
left0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
//separating +ve and -ve values, row 1 left
cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, left1_16x8b);
cmp_lt0_16x8b = _mm_subs_epu8(left1_16x8b, src_temp1_16x8b);
//creating mask 00 for +ve and -ve values and FF for zero.
cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
//combining the appropriate sign change
left1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
//row = 0 right
edge0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 2));
// row = 1 right
edge1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd + 2));
//separating +ve and -ve values, row 0 right
cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, edge0_16x8b);
cmp_lt0_16x8b = _mm_subs_epu8(edge0_16x8b, src_temp0_16x8b);
//creating mask 00 for +ve and -ve values and FF for zero.
cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
//combining the appropriate sign change
edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
//separating +ve and -ve values, row 1 right
cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, edge1_16x8b);
cmp_lt0_16x8b = _mm_subs_epu8(edge1_16x8b, src_temp1_16x8b);
//creating mask 00 for +ve and -ve values and FF for zero.
cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
//combining the appropriate sign change
edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
//combining sign_left and sign_right
edge0_16x8b = _mm_add_epi8(edge0_16x8b, left0_16x8b);
edge1_16x8b = _mm_add_epi8(edge1_16x8b, left1_16x8b);
//adding constant 2
edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
edge1_16x8b = _mm_add_epi8(edge1_16x8b, const2_16x8b);
//shuffle to get sao index
edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
edge1_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge1_16x8b);
//using availability mask
edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
edge1_16x8b = _mm_and_si128(edge1_16x8b, au1_mask8x16b);
//adding chroma offset to access U and V
edge0_16x8b = _mm_add_epi8(edge0_16x8b, chroma_offset_8x16b);
edge1_16x8b = _mm_add_epi8(edge1_16x8b, chroma_offset_8x16b);
//shuffle to get sao offset
edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
edge1_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge1_16x8b);
//convert to 16 bit, then add, then saturated pack
left0_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, left0_16x8b);
src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
edge0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, left0_16x8b);
cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, edge0_16x8b);
src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);
left0_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge1_16x8b);
cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, const0_16x8b);
cmp_lt0_16x8b = _mm_unpacklo_epi8(edge1_16x8b, left0_16x8b);
src_temp1_16x8b = _mm_unpackhi_epi8(src_temp1_16x8b, const0_16x8b);
edge1_16x8b = _mm_unpackhi_epi8(edge1_16x8b, left0_16x8b);
cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
src_temp1_16x8b = _mm_add_epi16(src_temp1_16x8b, edge1_16x8b);
src_temp1_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp1_16x8b);
_mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);
//row = 0 store 16 pixel values from 15:0 pos. relative to cur. pos.
_mm_storeu_si128((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
// row = 1
_mm_storeu_si128((__m128i *)(pu1_src_cpy + src_strd), src_temp1_16x8b);
pu1_src_cpy += (src_strd << 1);
pu1_src_left_cpy += 4;
pu1_src_left_str += 4;
}
au1_mask_cpy += 16;
pu1_src += 16;
pu1_src_left_cpy -= 2 * ht;
pu1_src_left_str -= 2 * ht;
pu1_left_tmp = pu1_src_left_cpy;
pu1_src_left_cpy = pu1_src_left_str;
pu1_src_left_str = pu1_left_tmp;
}
wd_rem = wd & 0xF;
if(wd_rem)
{
cmp_gt0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + (ht - 1) * src_strd));
_mm_storel_epi64((__m128i *)(pu1_src_top + offset), cmp_gt0_16x8b);
au1_mask8x16b = _mm_loadl_epi64((__m128i *)au1_mask_cpy);
pu1_src_cpy = pu1_src;
au1_mask8x16b = _mm_unpacklo_epi64(au1_mask8x16b, au1_mask8x16b);
for(row = ht; row > 0; row -= 4)
{
left_store_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_left_cpy));
//row = 0 load 8 pixel values from 7:0 pos. relative to cur. pos.
src_temp0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy));
// row = 1
cmp_gt0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + src_strd));
// row = 2
src_temp1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd));
// row = 3
cmp_lt0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd));
left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, left_store_16x8b, 8);
//row 3 left
edge0_16x8b = _mm_slli_si128(cmp_lt0_16x8b, 8);
left0_16x8b = _mm_alignr_epi8(cmp_lt0_16x8b, left_store_16x8b, 14);
left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, edge0_16x8b, 14);
//row 2 left
edge0_16x8b = _mm_slli_si128(src_temp1_16x8b, 8);
left1_16x8b = _mm_alignr_epi8(src_temp1_16x8b, left_store_16x8b, 14);
left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, edge0_16x8b, 14);
// packing two rows together to operate on 16 pixels at once
src_temp1_16x8b = _mm_unpacklo_epi64(src_temp1_16x8b, cmp_lt0_16x8b);
left1_16x8b = _mm_unpacklo_epi64(left1_16x8b, left0_16x8b);
//row 1 left
edge0_16x8b = _mm_slli_si128(cmp_gt0_16x8b, 8);
edge1_16x8b = _mm_alignr_epi8(cmp_gt0_16x8b, left_store_16x8b, 14);
left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, edge0_16x8b, 14);
//row 0 left
edge0_16x8b = _mm_slli_si128(src_temp0_16x8b, 8);
left0_16x8b = _mm_alignr_epi8(src_temp0_16x8b, left_store_16x8b, 14);
left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, edge0_16x8b, 14);
// packing two rows together to operate on 16 pixels at once
src_temp0_16x8b = _mm_unpacklo_epi64(src_temp0_16x8b, cmp_gt0_16x8b);
left0_16x8b = _mm_unpacklo_epi64(left0_16x8b, edge1_16x8b);
//separating +ve and -ve values for row 2 and row 3
cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, left1_16x8b);
cmp_lt0_16x8b = _mm_subs_epu8(left1_16x8b, src_temp1_16x8b);
//creating mask 00 for +ve and -ve values and FF for zero.
cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
//combining the appropriate sign change
left1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
//separating +ve and -ve values.
cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, left0_16x8b);
cmp_lt0_16x8b = _mm_subs_epu8(left0_16x8b, src_temp0_16x8b);
//creating mask 00 for +ve and -ve values and FF for zero.
cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
//combining the appropriate sign change
left0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
//row = 0 right
edge0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 2));
// row = 1 right
cmp_gt0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + src_strd + 2));
// row = 2 right
edge1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd + 2));
// row = 3 right
cmp_lt0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd + 2));
// packing two rows together to operate on 16 pixels at once
edge0_16x8b = _mm_unpacklo_epi64(edge0_16x8b, cmp_gt0_16x8b);
edge1_16x8b = _mm_unpacklo_epi64(edge1_16x8b, cmp_lt0_16x8b);
//separating +ve and -ve values.
cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, edge0_16x8b);
cmp_lt0_16x8b = _mm_subs_epu8(edge0_16x8b, src_temp0_16x8b);
//creating mask 00 for +ve and -ve values and FF for zero.
cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
//combining the appropriate sign change
edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, edge1_16x8b);
cmp_lt0_16x8b = _mm_subs_epu8(edge1_16x8b, src_temp1_16x8b);
//creating mask 00 for +ve and -ve values and FF for zero.
cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
//combining the appropriate sign change
edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
//combining sign_left and sign_right
edge0_16x8b = _mm_add_epi8(edge0_16x8b, left0_16x8b);
edge1_16x8b = _mm_add_epi8(edge1_16x8b, left1_16x8b);
//adding constant 2
edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
edge1_16x8b = _mm_add_epi8(edge1_16x8b, const2_16x8b);
//shuffle to get sao index
edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
edge1_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge1_16x8b);
//using availability mask
edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
edge1_16x8b = _mm_and_si128(edge1_16x8b, au1_mask8x16b);
//adding chroma offset to access U and V
edge0_16x8b = _mm_add_epi8(edge0_16x8b, chroma_offset_8x16b);
edge1_16x8b = _mm_add_epi8(edge1_16x8b, chroma_offset_8x16b);
//shuffle to get sao offset
edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
edge1_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge1_16x8b);
//convert to 16 bit, then add, then saturating pack
left0_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, left0_16x8b);
src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
edge0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, left0_16x8b);
cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, edge0_16x8b);
src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);
left0_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge1_16x8b);
cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, const0_16x8b);
cmp_lt0_16x8b = _mm_unpacklo_epi8(edge1_16x8b, left0_16x8b);
src_temp1_16x8b = _mm_unpackhi_epi8(src_temp1_16x8b, const0_16x8b);
edge1_16x8b = _mm_unpackhi_epi8(edge1_16x8b, left0_16x8b);
cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
src_temp1_16x8b = _mm_add_epi16(src_temp1_16x8b, edge1_16x8b);
src_temp1_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp1_16x8b);
//separating row 1 and row 3
cmp_gt0_16x8b = _mm_srli_si128(src_temp0_16x8b, 8);
cmp_lt0_16x8b = _mm_srli_si128(src_temp1_16x8b, 8);
_mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);
//row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
_mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
// row = 1
_mm_storel_epi64((__m128i *)(pu1_src_cpy + src_strd), cmp_gt0_16x8b);
// row = 2
_mm_storel_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd), src_temp1_16x8b);
// row = 3
_mm_storel_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd), cmp_lt0_16x8b);
pu1_src_cpy += (src_strd << 2);
pu1_src_left_cpy += 8;
pu1_src_left_str += 8;
}
pu1_src += wd;
pu1_src_left_cpy -= 2 * ht;
pu1_src_left_str -= 2 * ht;
pu1_left_tmp = pu1_src_left_cpy;
pu1_src_left_cpy = pu1_src_left_str;
pu1_src_left_str = pu1_left_tmp;
}
for(row = 0; row < 2 * ht; row++)
{
pu1_src_left[row] = pu1_src_left_cpy[row];
}
}
}
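/* Vertical (class 1) edge offset: every pixel is compared with its top and
bottom neighbours, the two signs select an edge category through
gi1_table_edge_idx, and the mapped SAO offset is added with saturation.
A minimal scalar sketch of the per-pixel rule the SIMD code below
implements (illustrative names, assuming 8-bit pixels; kept in a comment,
not compiled):

static UWORD8 sao_eo_class1_pixel(UWORD8 u1_cur, UWORD8 u1_top,
UWORD8 u1_bot, const WORD8 *pi1_offset)
{
WORD32 edge_idx = 2 + SIGN(u1_cur - u1_top) + SIGN(u1_cur - u1_bot);
edge_idx = gi1_table_edge_idx[edge_idx];
return (UWORD8)CLIP3(u1_cur + pi1_offset[edge_idx], 0, 255);
}
*/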
void ihevc_sao_edge_offset_class1_ssse3(UWORD8 *pu1_src,
WORD32 src_strd,
UWORD8 *pu1_src_left,
UWORD8 *pu1_src_top,
UWORD8 *pu1_src_top_left,
UWORD8 *pu1_src_top_right,
UWORD8 *pu1_src_bot_left,
UWORD8 *pu1_avail,
WORD8 *pi1_sao_offset,
WORD32 wd,
WORD32 ht)
{
WORD32 row, col;
UWORD8 *pu1_src_top_cpy;
UWORD8 *pu1_src_cpy;
WORD32 wd_rem;
__m128i src_top_16x8b, src_bottom_16x8b;
__m128i src_temp0_16x8b, src_temp1_16x8b;
__m128i signup0_16x8b, signdwn1_16x8b;
__m128i cmp_gt0_16x8b, cmp_lt0_16x8b;
__m128i edge0_16x8b, edge1_16x8b;
__m128i edge_idx_8x16b, sao_offset_8x16b;
__m128i const2_16x8b, const0_16x8b;
UNUSED(pu1_src_top_right);
UNUSED(pu1_src_bot_left);
/* Updating left and top-left */
for(row = 0; row < ht; row++)
{
pu1_src_left[row] = pu1_src[row * src_strd + (wd - 1)];
}
*pu1_src_top_left = pu1_src_top[wd - 1];
pu1_src_top_cpy = pu1_src_top;
edge_idx_8x16b = _mm_loadl_epi64((__m128i *)gi1_table_edge_idx);
sao_offset_8x16b = _mm_loadl_epi64((__m128i *)pi1_sao_offset);
/* Update height and source pointers based on the availability flags */
if(0 == pu1_avail[2])
{
pu1_src_top_cpy = pu1_src;
pu1_src += src_strd;
ht--;
}
if(0 == pu1_avail[3])
{
ht--;
}
const2_16x8b = _mm_set1_epi8(2);
const0_16x8b = _mm_setzero_si128();
{
WORD32 ht_rem;
for(col = wd; col >= 16; col -= 16)
{
pu1_src_cpy = pu1_src;
src_top_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_top_cpy + wd - col));
//row = 0
src_temp0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy));
//separating +ve and -ve values.
cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_top_16x8b);
cmp_lt0_16x8b = _mm_subs_epu8(src_top_16x8b, src_temp0_16x8b);
//creating mask 00 for +ve and -ve values and FF for zero.
cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
//combining the appropriate sign change
signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
for(row = ht; row >= 2; row -= 2)
{
//row = 1 load 16 pixel values from 15:0 pos. relative to cur. pos.
src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
// row = 2
src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 2 * src_strd));
//row 0 -row1
//separating +ve and -ve values.
cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_temp1_16x8b);
cmp_lt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_temp0_16x8b);
//creating mask 00 for +ve and -ve values and FF for zero.
cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
//combining the appropriate sign change
edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
//row1-row0
edge1_16x8b = _mm_sub_epi8(cmp_lt0_16x8b, cmp_gt0_16x8b);
//row1 -bottom
cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_bottom_16x8b);
cmp_lt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_temp1_16x8b);
//creating mask 00 for +ve and -ve values and FF for zero.
cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
//combining the appropriate sign change
signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
//combining sign_up and sign_down
edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b);
edge1_16x8b = _mm_add_epi8(edge1_16x8b, signdwn1_16x8b);
//for the next iteration signup0_16x8b = -signdwn1_16x8b
signup0_16x8b = _mm_sub_epi8(cmp_lt0_16x8b, cmp_gt0_16x8b);
//adding constant 2
edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
edge1_16x8b = _mm_add_epi8(edge1_16x8b, const2_16x8b);
//shuffle to get sao index
edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
edge1_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge1_16x8b);
//shuffle to get sao offset
edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
edge1_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge1_16x8b);
//copying the next top
src_top_16x8b = src_temp1_16x8b;
//convert to 16 bit, then add, then saturating pack
signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
cmp_lt0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);
signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge1_16x8b);
cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, const0_16x8b);
cmp_lt0_16x8b = _mm_unpacklo_epi8(edge1_16x8b, signdwn1_16x8b);
src_temp1_16x8b = _mm_unpackhi_epi8(src_temp1_16x8b, const0_16x8b);
cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
cmp_lt0_16x8b = _mm_unpackhi_epi8(edge1_16x8b, signdwn1_16x8b);
src_temp1_16x8b = _mm_add_epi16(src_temp1_16x8b, cmp_lt0_16x8b);
src_temp1_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp1_16x8b);
//row = 0 store 16 pixel values from 15:0 pos. relative to cur. pos.
_mm_storeu_si128((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
// row = 1
_mm_storeu_si128((__m128i *)(pu1_src_cpy + src_strd), src_temp1_16x8b);
src_temp0_16x8b = src_bottom_16x8b;
pu1_src_cpy += (src_strd << 1);
}
ht_rem = ht & 0x1;
if(ht_rem)
{
src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
//current row -next row
//separating +ve and -ve values.
cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_bottom_16x8b);
cmp_lt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_temp0_16x8b);
//creating mask 00 for +ve and -ve values and FF for zero.
cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
//combining the appropriate sign change
edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
//adding top and bottom signs and constant 2
edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b);
edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
//copying the next top
src_top_16x8b = src_temp0_16x8b;
//convert to 16 bit, then add, then saturating pack
signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
cmp_lt0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);
_mm_storeu_si128((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
}
if(0 == pu1_avail[3])
{
src_top_16x8b = src_bottom_16x8b;
}
//updating top buffer
_mm_storeu_si128((__m128i *)(pu1_src_top + wd - col), src_top_16x8b);
pu1_src += 16;
}
wd_rem = wd & 0xF;
if(wd_rem)
{
pu1_src_cpy = pu1_src;
src_top_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_top_cpy + wd - col));
//row = 0
src_temp0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy));
//separating +ve and -ve values.
cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_top_16x8b);
cmp_lt0_16x8b = _mm_subs_epu8(src_top_16x8b, src_temp0_16x8b);
//creating mask 00 for +ve and -ve values and FF for zero.
cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
//combining the appropriate sign change
signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
signup0_16x8b = _mm_slli_si128(signup0_16x8b, 8);
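/* In this remainder path two 8-pixel rows share one xmm register; the top
sign result is moved to the upper half so the _mm_alignr_epi8 calls below
can pair it with the row-to-row signs computed in the lower half. */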
for(row = ht; row >= 4; row -= 4)
{
//row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos.
src_temp1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + src_strd));
// row = 2
src_bottom_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd));
//row 0 -row1
//separating +ve and -ve values.
cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_temp1_16x8b);
cmp_lt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_temp0_16x8b);
//creating mask 00 for +ve and -ve values and FF for zero.
cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
//combining the appropriate sign change
edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
//row1-row0
edge1_16x8b = _mm_sub_epi8(cmp_lt0_16x8b, cmp_gt0_16x8b);
edge0_16x8b = _mm_slli_si128(edge0_16x8b, 8); //aligned left (0-1)
signup0_16x8b = _mm_alignr_epi8(edge1_16x8b, signup0_16x8b, 8); //(1-0),(0-top)
//row1 -row2
cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_bottom_16x8b);
cmp_lt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_temp1_16x8b);
//creating mask 00 for +ve and -ve values and FF for zero.
cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
//combining the appropriate sign change
signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(1-2)
edge0_16x8b = _mm_alignr_epi8(signdwn1_16x8b, edge0_16x8b, 8); //(1-2),(0-1)
//packing row 0 and row 1
src_temp0_16x8b = _mm_unpacklo_epi64(src_temp0_16x8b, src_temp1_16x8b);
//row = 3
src_top_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd));
// row = 4
src_temp1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 4 * src_strd));
edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b); //(1,0) sign_up empty
signdwn1_16x8b = _mm_slli_si128(signdwn1_16x8b, 8); //align left (1-2)
//separating +ve and -ve values. (2,3)
cmp_gt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_top_16x8b);
cmp_lt0_16x8b = _mm_subs_epu8(src_top_16x8b, src_bottom_16x8b);
//creating mask 00 for +ve and -ve values and FF for zero.
cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
//combining the appropriate sign change
edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(2-3)
signup0_16x8b = _mm_alignr_epi8(edge1_16x8b, signdwn1_16x8b, 8); //(2-3), (1-2) (subtract with down)
edge1_16x8b = _mm_slli_si128(edge1_16x8b, 8);
//separating +ve and -ve values. (3,4)
cmp_gt0_16x8b = _mm_subs_epu8(src_top_16x8b, src_temp1_16x8b);
cmp_lt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_top_16x8b);
//creating mask 00 for +ve and -ve values and FF for zero.
cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(3-4)
//combining sign_up and sign_down
edge1_16x8b = _mm_alignr_epi8(signdwn1_16x8b, edge1_16x8b, 8); //(3-4),(2-3)
edge1_16x8b = _mm_sub_epi8(edge1_16x8b, signup0_16x8b); //(3,2)
//packing row 2 and row 3
src_bottom_16x8b = _mm_unpacklo_epi64(src_bottom_16x8b, src_top_16x8b);
//for the next iteration signup0_16x8b = -signdwn1_16x8b
signup0_16x8b = _mm_sub_epi8(cmp_lt0_16x8b, cmp_gt0_16x8b); //(4-3)
//adding constant 2
edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
edge1_16x8b = _mm_add_epi8(edge1_16x8b, const2_16x8b);
//shuffle to get sao index
edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
edge1_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge1_16x8b);
//shuffle to get sao offset
edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
edge1_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge1_16x8b);
//the next top already in src_top_16x8b
//src_top_16x8b = src_temp1_16x8b;
//convert to 16 bit, then add, then saturating pack
signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
cmp_lt0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);
signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge1_16x8b);
cmp_gt0_16x8b = _mm_unpacklo_epi8(src_bottom_16x8b, const0_16x8b);
cmp_lt0_16x8b = _mm_unpacklo_epi8(edge1_16x8b, signdwn1_16x8b);
src_bottom_16x8b = _mm_unpackhi_epi8(src_bottom_16x8b, const0_16x8b);
cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
cmp_lt0_16x8b = _mm_unpackhi_epi8(edge1_16x8b, signdwn1_16x8b);
src_bottom_16x8b = _mm_add_epi16(src_bottom_16x8b, cmp_lt0_16x8b);
src_bottom_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_bottom_16x8b);
cmp_gt0_16x8b = _mm_srli_si128(src_temp0_16x8b, 8);
cmp_lt0_16x8b = _mm_srli_si128(src_bottom_16x8b, 8);
//row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
_mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
// row = 1
_mm_storel_epi64((__m128i *)(pu1_src_cpy + src_strd), cmp_gt0_16x8b);
//row = 2
_mm_storel_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd), src_bottom_16x8b);
// row = 3
_mm_storel_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd), cmp_lt0_16x8b);
src_temp0_16x8b = src_temp1_16x8b;
signup0_16x8b = _mm_slli_si128(signup0_16x8b, 8);
pu1_src_cpy += (src_strd << 2);
}
ht_rem = ht & 0x2;
if(ht_rem)
{
//row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos.
src_temp1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + src_strd));
// row = 2
src_bottom_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd));
//row 0 -row1
//separating +ve and -ve values.
cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_temp1_16x8b);
cmp_lt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_temp0_16x8b);
//creating mask 00 for +ve and -ve values and FF for zero.
cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
//combining the appropriate sign change
edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
//row1-row0
edge1_16x8b = _mm_sub_epi8(cmp_lt0_16x8b, cmp_gt0_16x8b);
edge0_16x8b = _mm_slli_si128(edge0_16x8b, 8); //aligned left (0-1)
signup0_16x8b = _mm_alignr_epi8(edge1_16x8b, signup0_16x8b, 8); //(1-0),(0-top)
//row1 -row2
cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_bottom_16x8b);
cmp_lt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_temp1_16x8b);
//creating mask 00 for +ve and -ve values and FF for zero.
cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
//combining the appropriate sign change
signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(1-2)
edge0_16x8b = _mm_alignr_epi8(signdwn1_16x8b, edge0_16x8b, 8); //(1-2),(0-1)
//adding the top and down sign differences
edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b); //(1,0) sign_up empty
//for the next iteration signup0_16x8b = -signdwn1_16x8b
signup0_16x8b = _mm_sub_epi8(cmp_lt0_16x8b, cmp_gt0_16x8b); //(2-1) for next
src_top_16x8b = src_temp1_16x8b;
//adding constant 2
edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
//shuffle to get sao index
edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
//shuffle to get sao offset
edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
//the next top already in src_top_16x8b
//convert to 16 bit, then add, then saturating pack
signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
src_temp0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
src_temp1_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, const0_16x8b);
src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
cmp_lt0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
src_temp1_16x8b = _mm_add_epi16(src_temp1_16x8b, cmp_lt0_16x8b);
src_temp0_16x8b = _mm_packus_epi16(src_temp0_16x8b, src_temp1_16x8b);
cmp_gt0_16x8b = _mm_srli_si128(src_temp0_16x8b, 8);
//row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
_mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
// row = 1
_mm_storel_epi64((__m128i *)(pu1_src_cpy + src_strd), cmp_gt0_16x8b);
src_temp0_16x8b = src_bottom_16x8b;
pu1_src_cpy += (src_strd << 1);
}
ht_rem = ht & 0x1;
if(ht_rem)
{
//row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos.
src_bottom_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + src_strd));
//row 0 -row1
//separating +ve and -ve values.
cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_bottom_16x8b);
cmp_lt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_temp0_16x8b);
//creating mask 00 for +ve and -ve values and FF for zero.
cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
//combining the appropriate sign change
edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
//adding the top and down sign differences
edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b);
//adding constant 2
edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
edge0_16x8b = _mm_slli_si128(edge0_16x8b, 8);
edge0_16x8b = _mm_srli_si128(edge0_16x8b, 8);
//shuffle to get sao index
edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
//shuffle to get sao offset
edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
src_top_16x8b = src_temp0_16x8b;
//convert to 16 bit, then add, then saturating pack
signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
src_temp0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
src_temp0_16x8b = _mm_packus_epi16(src_temp0_16x8b, const0_16x8b);
//row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
_mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
pu1_src_cpy += (src_strd);
}
if(0 == pu1_avail[3])
{
src_top_16x8b = src_bottom_16x8b;
}
_mm_storel_epi64((__m128i *)(pu1_src_top + wd - col), src_top_16x8b);
pu1_src += 8;
}
}
}
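/* Vertical (class 1) edge offset for interleaved UV chroma: the edge
classification is identical to the luma version above, but U and V use
separate offset tables, selected per byte through the index bias set up
below. */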
void ihevc_sao_edge_offset_class1_chroma_ssse3(UWORD8 *pu1_src,
WORD32 src_strd,
UWORD8 *pu1_src_left,
UWORD8 *pu1_src_top,
UWORD8 *pu1_src_top_left,
UWORD8 *pu1_src_top_right,
UWORD8 *pu1_src_bot_left,
UWORD8 *pu1_avail,
WORD8 *pi1_sao_offset_u,
WORD8 *pi1_sao_offset_v,
WORD32 wd,
WORD32 ht)
{
WORD32 row, col;
UWORD8 *pu1_src_top_cpy;
UWORD8 *pu1_src_cpy;
WORD32 wd_rem;
__m128i src_top_16x8b, src_bottom_16x8b;
__m128i src_temp0_16x8b, src_temp1_16x8b;
__m128i signup0_16x8b, signdwn1_16x8b;
__m128i cmp_gt0_16x8b, cmp_lt0_16x8b;
__m128i edge0_16x8b, edge1_16x8b;
__m128i edge_idx_8x16b, sao_offset_8x16b;
__m128i const2_16x8b, const0_16x8b;
__m128i chroma_offset_8x16b;
UNUSED(pu1_src_top_right);
UNUSED(pu1_src_bot_left);
/* Updating left and top-left */
for(row = 0; row < ht; row++)
{
pu1_src_left[2 * row] = pu1_src[row * src_strd + (wd - 2)];
pu1_src_left[2 * row + 1] = pu1_src[row * src_strd + (wd - 1)];
}
pu1_src_top_left[0] = pu1_src_top[wd - 2];
pu1_src_top_left[1] = pu1_src_top[wd - 1];
pu1_src_top_cpy = pu1_src_top;
edge_idx_8x16b = _mm_loadl_epi64((__m128i *)gi1_table_edge_idx);
sao_offset_8x16b = _mm_loadl_epi64((__m128i *)pi1_sao_offset_u);
const0_16x8b = _mm_loadl_epi64((__m128i *)pi1_sao_offset_v);
chroma_offset_8x16b = _mm_set1_epi16(0x0800);
/* Update height and source pointers based on the availability flags */
if(0 == pu1_avail[2])
{
pu1_src_top_cpy = pu1_src;
pu1_src += src_strd;
ht--;
}
if(0 == pu1_avail[3])
{
ht--;
}
sao_offset_8x16b = _mm_unpacklo_epi64(sao_offset_8x16b, const0_16x8b);
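/* sao_offset_8x16b now holds the U offsets in bytes 0..7 and the V offsets
in bytes 8..15; chroma_offset_8x16b (0x0800 per 16-bit lane) adds 0 to the
even (U) index bytes and 8 to the odd (V) index bytes, so a single
_mm_shuffle_epi8 fetches the correct per-plane offset. */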
const2_16x8b = _mm_set1_epi8(2);
const0_16x8b = _mm_setzero_si128();
{
WORD32 ht_rem;
for(col = wd; col >= 16; col -= 16)
{
pu1_src_cpy = pu1_src;
src_top_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_top_cpy + wd - col));
//row = 0
src_temp0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy));
//separating +ve and -ve values.
cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_top_16x8b);
cmp_lt0_16x8b = _mm_subs_epu8(src_top_16x8b, src_temp0_16x8b);
//creating mask 00 for +ve and -ve values and FF for zero.
cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
//combining the appropriate sign change
signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
for(row = ht; row >= 2; row -= 2)
{
//row = 1 load 16 pixel values from 15:0 pos. relative to cur. pos.
src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
// row = 2
src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 2 * src_strd));
//row 0 -row1
//separating +ve and -ve values.
cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_temp1_16x8b);
cmp_lt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_temp0_16x8b);
//creating mask 00 for +ve and -ve values and FF for zero.
cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
//combining the appropriate sign change
edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
//row1-row0
edge1_16x8b = _mm_sub_epi8(cmp_lt0_16x8b, cmp_gt0_16x8b);
//row1 -bottom
cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_bottom_16x8b);
cmp_lt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_temp1_16x8b);
//creating mask 00 for +ve and -ve values and FF for zero.
cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
//combining the appropriate sign change
signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
//combining sign_up and sign_down
edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b);
edge1_16x8b = _mm_add_epi8(edge1_16x8b, signdwn1_16x8b);
//for the next iteration signup0_16x8b = -signdwn1_16x8b
signup0_16x8b = _mm_sub_epi8(cmp_lt0_16x8b, cmp_gt0_16x8b);
//adding constant 2
edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
edge1_16x8b = _mm_add_epi8(edge1_16x8b, const2_16x8b);
//copying the next top
src_top_16x8b = src_temp1_16x8b;
//shuffle to get sao index
edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
edge1_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge1_16x8b);
//adding chroma offset to access U and V
edge0_16x8b = _mm_add_epi8(edge0_16x8b, chroma_offset_8x16b);
edge1_16x8b = _mm_add_epi8(edge1_16x8b, chroma_offset_8x16b);
//shuffle to get sao offset
edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
edge1_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge1_16x8b);
//convert to 16 bit, then add, then saturating pack
signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
edge0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, edge0_16x8b);
src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);
signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge1_16x8b);
cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, const0_16x8b);
cmp_lt0_16x8b = _mm_unpacklo_epi8(edge1_16x8b, signdwn1_16x8b);
src_temp1_16x8b = _mm_unpackhi_epi8(src_temp1_16x8b, const0_16x8b);
edge1_16x8b = _mm_unpackhi_epi8(edge1_16x8b, signdwn1_16x8b);
cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
src_temp1_16x8b = _mm_add_epi16(src_temp1_16x8b, edge1_16x8b);
src_temp1_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp1_16x8b);
//row = 0 store 16 pixel values from 15:0 pos. relative to cur. pos.
_mm_storeu_si128((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
// row = 1
_mm_storeu_si128((__m128i *)(pu1_src_cpy + src_strd), src_temp1_16x8b);
src_temp0_16x8b = src_bottom_16x8b;
pu1_src_cpy += (src_strd << 1);
}
ht_rem = ht & 0x1;
if(ht_rem)
{
src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
//current row -next row
//separating +ve and -ve values.
cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_bottom_16x8b);
cmp_lt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_temp0_16x8b);
//creating mask 00 for +ve and -ve values and FF for zero.
cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
//combining the appropriate sign change
edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
//adding top and bottom signs and constant 2
edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b);
edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
//copying the next top
src_top_16x8b = src_temp0_16x8b;
edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
//adding chroma offset to access U and V
edge0_16x8b = _mm_add_epi8(edge0_16x8b, chroma_offset_8x16b);
edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
//convert to 16 bit, then add, then saturating pack
signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
edge0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, edge0_16x8b);
src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);
_mm_storeu_si128((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
}
if(0 == pu1_avail[3])
{
src_top_16x8b = src_bottom_16x8b;
}
//updating top buffer
_mm_storeu_si128((__m128i *)(pu1_src_top + wd - col), src_top_16x8b);
pu1_src += 16;
}
wd_rem = wd & 0xF;
if(wd_rem)
{
pu1_src_cpy = pu1_src;
src_top_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_top_cpy + wd - col));
//row = 0
src_temp0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy));
//separating +ve and -ve values.
cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_top_16x8b);
cmp_lt0_16x8b = _mm_subs_epu8(src_top_16x8b, src_temp0_16x8b);
//creating mask 00 for +ve and -ve values and FF for zero.
cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
//combining the appropriate sign change
signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
signup0_16x8b = _mm_slli_si128(signup0_16x8b, 8);
for(row = ht; row >= 4; row -= 4)
{
//row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos.
src_temp1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + src_strd));
// row = 2
src_bottom_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd));
//row 0 -row1
//separating +ve and -ve values.
cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_temp1_16x8b);
cmp_lt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_temp0_16x8b);
//creating mask 00 for +ve and -ve values and FF for zero.
cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
//combining the appropriate sign change
edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
//row1-row0
edge1_16x8b = _mm_sub_epi8(cmp_lt0_16x8b, cmp_gt0_16x8b);
edge0_16x8b = _mm_slli_si128(edge0_16x8b, 8); //aligned left (0-1)
signup0_16x8b = _mm_alignr_epi8(edge1_16x8b, signup0_16x8b, 8); //(1-0),(0-top)
//row1 -row2
cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_bottom_16x8b);
cmp_lt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_temp1_16x8b);
//creating mask 00 for +ve and -ve values and FF for zero.
cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
//combining the appropriate sign change
signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(1-2)
edge0_16x8b = _mm_alignr_epi8(signdwn1_16x8b, edge0_16x8b, 8); //(1-2),(0-1)
//packing row 0 and row 1
src_temp0_16x8b = _mm_unpacklo_epi64(src_temp0_16x8b, src_temp1_16x8b);
//row = 3
src_top_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd));
// row = 4
src_temp1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 4 * src_strd));
edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b); //(1,0) sign_up empty
signdwn1_16x8b = _mm_slli_si128(signdwn1_16x8b, 8); //align left (1-2)
//separating +ve and -ve values. (2,3)
cmp_gt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_top_16x8b);
cmp_lt0_16x8b = _mm_subs_epu8(src_top_16x8b, src_bottom_16x8b);
//creating mask 00 for +ve and -ve values and FF for zero.
cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
//combining the appropriate sign change
edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(2-3)
signup0_16x8b = _mm_alignr_epi8(edge1_16x8b, signdwn1_16x8b, 8); //(2-3), (1-2) (subtract with down)
edge1_16x8b = _mm_slli_si128(edge1_16x8b, 8);
//separating +ve and -ve values. (3,4)
cmp_gt0_16x8b = _mm_subs_epu8(src_top_16x8b, src_temp1_16x8b);
cmp_lt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_top_16x8b);
//creating mask 00 for +ve and -ve values and FF for zero.
cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(3-4)
//combining sign_up and sign_down
edge1_16x8b = _mm_alignr_epi8(signdwn1_16x8b, edge1_16x8b, 8); //(3-4),(2-3)
edge1_16x8b = _mm_sub_epi8(edge1_16x8b, signup0_16x8b); //(3,2)
//packing row 2 and row 3
src_bottom_16x8b = _mm_unpacklo_epi64(src_bottom_16x8b, src_top_16x8b);
//for the next iteration signup0_16x8b = -signdwn1_16x8b
signup0_16x8b = _mm_sub_epi8(cmp_lt0_16x8b, cmp_gt0_16x8b); //(4-3)
//adding constant 2
edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
edge1_16x8b = _mm_add_epi8(edge1_16x8b, const2_16x8b);
//shuffle to get sao index
edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
edge1_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge1_16x8b);
//adding chroma offset to access U and V
edge0_16x8b = _mm_add_epi8(edge0_16x8b, chroma_offset_8x16b);
edge1_16x8b = _mm_add_epi8(edge1_16x8b, chroma_offset_8x16b);
//shuffle to get sao offset
edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
edge1_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge1_16x8b);
//the next top already in src_top_16x8b
//convert to 16 bit, then add, then saturating pack
signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
edge0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, edge0_16x8b);
src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);
signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge1_16x8b);
cmp_gt0_16x8b = _mm_unpacklo_epi8(src_bottom_16x8b, const0_16x8b);
cmp_lt0_16x8b = _mm_unpacklo_epi8(edge1_16x8b, signdwn1_16x8b);
src_bottom_16x8b = _mm_unpackhi_epi8(src_bottom_16x8b, const0_16x8b);
edge1_16x8b = _mm_unpackhi_epi8(edge1_16x8b, signdwn1_16x8b);
cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
src_bottom_16x8b = _mm_add_epi16(src_bottom_16x8b, edge1_16x8b);
src_bottom_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_bottom_16x8b);
cmp_gt0_16x8b = _mm_srli_si128(src_temp0_16x8b, 8);
cmp_lt0_16x8b = _mm_srli_si128(src_bottom_16x8b, 8);
//row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
_mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
// row = 1
_mm_storel_epi64((__m128i *)(pu1_src_cpy + src_strd), cmp_gt0_16x8b);
//row = 2
_mm_storel_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd), src_bottom_16x8b);
// row = 3
_mm_storel_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd), cmp_lt0_16x8b);
src_temp0_16x8b = src_temp1_16x8b;
signup0_16x8b = _mm_slli_si128(signup0_16x8b, 8);
pu1_src_cpy += (src_strd << 2);
}
ht_rem = ht & 0x2;
if(ht_rem)
{
//row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos.
src_temp1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + src_strd));
// row = 2
src_bottom_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd));
//row 0 -row1
//separating +ve and -ve values.
cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_temp1_16x8b);
cmp_lt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_temp0_16x8b);
//creating mask 00 for +ve and -ve values and FF for zero.
cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
//combining the appropriate sign change
edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
//row1-row0
edge1_16x8b = _mm_sub_epi8(cmp_lt0_16x8b, cmp_gt0_16x8b);
edge0_16x8b = _mm_slli_si128(edge0_16x8b, 8); //aligned left (0-1)
signup0_16x8b = _mm_alignr_epi8(edge1_16x8b, signup0_16x8b, 8); //(1-0),(0-top)
//row1 -row2
cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_bottom_16x8b);
cmp_lt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_temp1_16x8b);
//creating mask 00 for +ve and -ve values and FF for zero.
cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
//combining the appropriate sign change
signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(1-2)
edge0_16x8b = _mm_alignr_epi8(signdwn1_16x8b, edge0_16x8b, 8); //(1-2),(0-1)
//adding the top and down sign differences
edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b); //(1,0) sign_up empty
//for the next iteration signup0_16x8b = -signdwn1_16x8b
signup0_16x8b = _mm_sub_epi8(cmp_lt0_16x8b, cmp_gt0_16x8b); //(2-1) for next
src_top_16x8b = src_temp1_16x8b;
//adding constant 2
edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
//shuffle to get sao index
edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
//adding chroma offset to access U and V
edge0_16x8b = _mm_add_epi8(edge0_16x8b, chroma_offset_8x16b);
//shuffle to get sao offset
edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
//the next top already in src_top_16x8b
//convert to 16 bit, then add, then saturating pack
signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
src_temp0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
src_temp1_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, const0_16x8b);
edge0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
src_temp1_16x8b = _mm_add_epi16(src_temp1_16x8b, edge0_16x8b);
src_temp0_16x8b = _mm_packus_epi16(src_temp0_16x8b, src_temp1_16x8b);
cmp_gt0_16x8b = _mm_srli_si128(src_temp0_16x8b, 8);
//row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
_mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
// row = 1
_mm_storel_epi64((__m128i *)(pu1_src_cpy + src_strd), cmp_gt0_16x8b);
src_temp0_16x8b = src_bottom_16x8b;
pu1_src_cpy += (src_strd << 1);
}
ht_rem = ht & 0x1;
if(ht_rem)
{
//row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos.
src_bottom_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + src_strd));
//row 0 -row1
//separating +ve and -ve values.
cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_bottom_16x8b);
cmp_lt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_temp0_16x8b);
//creating mask 00 for +ve and -ve values and FF for zero.
cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
//combining the appropriate sign change
edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
//adding the top and down sign differences
edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b);
//adding constant 2
edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
src_top_16x8b = src_temp0_16x8b;
edge0_16x8b = _mm_slli_si128(edge0_16x8b, 8);
edge0_16x8b = _mm_srli_si128(edge0_16x8b, 8);
//shuffle to get sao index
edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
//adding chroma offset to access U and V
edge0_16x8b = _mm_add_epi8(edge0_16x8b, chroma_offset_8x16b);
//shuffle to get sao offset
edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
//convert to 16 bit, then add, then saturating pack
signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
src_temp0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
src_temp0_16x8b = _mm_packus_epi16(src_temp0_16x8b, const0_16x8b);
//row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
_mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
pu1_src_cpy += (src_strd);
}
if(0 == pu1_avail[3])
{
src_top_16x8b = src_bottom_16x8b;
}
_mm_storel_epi64((__m128i *)(pu1_src_top + wd - col), src_top_16x8b);
pu1_src += 8;
}
}
}
/* 135 degree filtering */
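/* Class 2 compares each pixel with its top-left and bottom-right
neighbours. The (0, 0) and (wd - 1, ht - 1) corners have one diagonal
neighbour outside the current block, so they are handled separately below
through the scalar SIGN/CLIP3 path before the SIMD loops run. */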
void ihevc_sao_edge_offset_class2_ssse3(UWORD8 *pu1_src,
WORD32 src_strd,
UWORD8 *pu1_src_left,
UWORD8 *pu1_src_top,
UWORD8 *pu1_src_top_left,
UWORD8 *pu1_src_top_right,
UWORD8 *pu1_src_bot_left,
UWORD8 *pu1_avail,
WORD8 *pi1_sao_offset,
WORD32 wd,
WORD32 ht)
{
WORD32 row, col;
UWORD8 *pu1_src_top_cpy, *pu1_src_left_cpy, *pu1_src_left_cpy2;
UWORD8 *pu1_left_tmp, *pu1_src_left_str, *pu1_src_left_str2;
UWORD8 *pu1_firstleft;
UWORD8 *pu1_src_cpy, *pu1_src_org;
UWORD8 au1_mask[MAX_CTB_SIZE], *au1_mask_cpy;
UWORD8 au1_src_left_tmp[MAX_CTB_SIZE + 8];
UWORD8 au1_src_left_tmp1[MAX_CTB_SIZE + 8];
WORD32 wd_rem;
UWORD8 u1_pos_0_0_tmp, u1_pos_wd_ht_tmp;
WORD32 ht_tmp, ht_0;
WORD32 bit_depth;
UWORD8 u1_avail0, u1_avail1;
__m128i src_top_16x8b, src_bottom_16x8b;
__m128i src_temp0_16x8b, src_temp1_16x8b;
__m128i signup0_16x8b, signdwn1_16x8b;
__m128i cmp_gt0_16x8b, cmp_lt0_16x8b;
__m128i edge0_16x8b, edge1_16x8b;
__m128i au1_mask8x16b;
__m128i edge_idx_8x16b, sao_offset_8x16b;
__m128i const2_16x8b, const0_16x8b;
__m128i left_store_16x8b;
UNUSED(pu1_src_top_right);
UNUSED(pu1_src_bot_left);
ht_0 = ht; ht_tmp = ht;
au1_mask8x16b = _mm_set1_epi8(0xff);
//setting availability mask to 0xFF for MAX_CTB_SIZE bytes
for(col = 0; col < MAX_CTB_SIZE; col += 16)
_mm_storeu_si128((__m128i *)(au1_mask + col), au1_mask8x16b);
for(row = 0; row < ht; row++)
{
au1_src_left_tmp[row] = pu1_src_left[row];
}
bit_depth = BIT_DEPTH_LUMA;
pu1_src_org = pu1_src;
pu1_src_top_cpy = pu1_src_top;
pu1_src_left_cpy2 = au1_src_left_tmp;
pu1_src_left_cpy = au1_src_left_tmp;
pu1_src_left_str2 = au1_src_left_tmp1;
pu1_src_left_str = au1_src_left_tmp1;
edge_idx_8x16b = _mm_loadl_epi64((__m128i *)gi1_table_edge_idx);
sao_offset_8x16b = _mm_loadl_epi64((__m128i *)pi1_sao_offset);
/* If top-left is available, process separately */
if(0 != pu1_avail[4])
{
WORD8 edge_idx;
edge_idx = 2 + SIGN(pu1_src[0] - pu1_src_top_left[0]) +
SIGN(pu1_src[0] - pu1_src[1 + src_strd]);
edge_idx = gi1_table_edge_idx[edge_idx];
if(0 != edge_idx)
{
u1_pos_0_0_tmp = CLIP3(pu1_src[0] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1);
}
else
{
u1_pos_0_0_tmp = pu1_src[0];
}
}
else
{
u1_pos_0_0_tmp = pu1_src[0];
}
/* If bottom-right is available, process separately */
if(0 != pu1_avail[7])
{
WORD8 edge_idx;
edge_idx = 2 + SIGN(pu1_src[wd - 1 + (ht - 1) * src_strd] - pu1_src[wd - 1 + (ht - 1) * src_strd - 1 - src_strd]) +
SIGN(pu1_src[wd - 1 + (ht - 1) * src_strd] - pu1_src[wd - 1 + (ht - 1) * src_strd + 1 + src_strd]);
edge_idx = gi1_table_edge_idx[edge_idx];
if(0 != edge_idx)
{
u1_pos_wd_ht_tmp = CLIP3(pu1_src[wd - 1 + (ht - 1) * src_strd] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1);
}
else
{
u1_pos_wd_ht_tmp = pu1_src[wd - 1 + (ht - 1) * src_strd];
}
}
else
{
u1_pos_wd_ht_tmp = pu1_src[wd - 1 + (ht - 1) * src_strd];
}
pu1_firstleft = pu1_src_top_left;
/* Update height and source pointers based on the availability flags */
if(0 == pu1_avail[2])
{
pu1_firstleft = pu1_src_left_cpy2;
pu1_src_left_cpy2++;
pu1_src_left_str2++;
pu1_src_top_cpy = pu1_src;
pu1_src += src_strd;
ht--;
}
if(0 == pu1_avail[3])
{
ht--;
ht_0--;
}
//storing top left in an xmm register
left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_firstleft);
const2_16x8b = _mm_set1_epi8(2);
const0_16x8b = _mm_setzero_si128();
left_store_16x8b = _mm_slli_si128(left_store_16x8b, 15);
//update top-left
*pu1_src_top_left = pu1_src_top[wd - 1];
//availability mask creation
u1_avail0 = pu1_avail[0];
u1_avail1 = pu1_avail[1];
au1_mask[0] = u1_avail0;
au1_mask[wd - 1] = u1_avail1;
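/* when the left/right neighbour block is unavailable, au1_mask zeroes the
edge-index byte of the first/last column, forcing edge category 0 (no
offset), so those pixels pass through unchanged. */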
{
WORD32 ht_rem;
pu1_src_left_cpy = pu1_src_left_cpy2;
pu1_src_left_str = pu1_src_left_str2;
au1_mask_cpy = au1_mask;
for(col = wd; col >= 16; col -= 16)
{
pu1_src_cpy = pu1_src;
src_top_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_top_cpy + wd - col));
//row = 0
src_temp0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy));
src_top_16x8b = _mm_alignr_epi8(src_top_16x8b, left_store_16x8b, 15);
//loading the mask
au1_mask8x16b = _mm_loadu_si128((__m128i *)au1_mask_cpy);
//separating +ve and -ve values.
cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_top_16x8b);
cmp_lt0_16x8b = _mm_subs_epu8(src_top_16x8b, src_temp0_16x8b);
//creating mask 00 for +ve and -ve values and FF for zero.
cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
//combining the appropriate sign change
signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
for(row = ht; row >= 2; row -= 2)
{
left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
//row = 1
src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
// row = 1 right
src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd + 1));
//to insert left in row 0
signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 15);
//row 0 -row1
//separating +ve and -ve values.
cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_bottom_16x8b);
cmp_lt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_temp0_16x8b);
//creating mask 00 for +ve and -ve values and FF for zero.
cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
//manipulation for row 1 - row 0
signdwn1_16x8b = _mm_alignr_epi8(src_temp0_16x8b, signdwn1_16x8b, 15);
//combining the appropriate sign change
edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(0-1)
//row1-row0
//separating +ve and -ve values.
cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
//creating mask 00 for +ve and -ve values and FF for zero.
cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
// row = 2 right
src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 2 * src_strd + 1));
edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(1-0)
//row1 -bottom
cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_bottom_16x8b);
cmp_lt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_temp1_16x8b);
//creating mask 00 for +ve and -ve values and FF for zero.
cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
//combining the appropriate sign change
signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
// row = 2
src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 2 * src_strd));
//combining sign_up and sign_down
edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b);
//storing the row 1 left for next row.
signup0_16x8b = _mm_slli_si128(left_store_16x8b, 14);
//combining sign_up and sign_down
edge1_16x8b = _mm_add_epi8(edge1_16x8b, signdwn1_16x8b);
//manipulation for bottom - row 1
signup0_16x8b = _mm_alignr_epi8(src_temp1_16x8b, signup0_16x8b, 15);
//eliminating old left for row 0 and row 1
left_store_16x8b = _mm_srli_si128(left_store_16x8b, 2);
//bottom - row1
cmp_gt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, signup0_16x8b);
cmp_lt0_16x8b = _mm_subs_epu8(signup0_16x8b, src_bottom_16x8b);
//creating mask 00 for +ve and -ve values and FF for zero.
cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
//for the next iteration bottom -row1
signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
//row 1: saving the rightmost pixel as left for the next block
left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp1_16x8b, 15);
//adding constant 2
edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
edge1_16x8b = _mm_add_epi8(edge1_16x8b, const2_16x8b);
//shuffle to get sao index
edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
edge1_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge1_16x8b);
//using availability mask
edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
edge1_16x8b = _mm_and_si128(edge1_16x8b, au1_mask8x16b);
//shuffle to get sao offset
edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
edge1_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge1_16x8b);
//row 0: saving the rightmost pixel as left for the next block
left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp0_16x8b, 15);
//copying the next top
src_top_16x8b = src_temp1_16x8b;
//convert to 16 bit, then add, then saturating pack
signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
cmp_lt0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);
signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge1_16x8b);
cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, const0_16x8b);
cmp_lt0_16x8b = _mm_unpacklo_epi8(edge1_16x8b, signdwn1_16x8b);
src_temp1_16x8b = _mm_unpackhi_epi8(src_temp1_16x8b, const0_16x8b);
cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
cmp_lt0_16x8b = _mm_unpackhi_epi8(edge1_16x8b, signdwn1_16x8b);
src_temp1_16x8b = _mm_add_epi16(src_temp1_16x8b, cmp_lt0_16x8b);
src_temp1_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp1_16x8b);
//store left boundary
_mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);
//row = 0 store 16 pixel values from 15:0 pos. relative to cur. pos.
_mm_storeu_si128((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
// row = 1
_mm_storeu_si128((__m128i *)(pu1_src_cpy + src_strd), src_temp1_16x8b);
src_temp0_16x8b = src_bottom_16x8b;
pu1_src_cpy += (src_strd << 1);
pu1_src_left_cpy += 2;
pu1_src_left_str += 2;
}
ht_rem = ht & 0x1;
if(ht_rem)
{
left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd + 1));
//current row -next row
//separating +ve and -ve values.
cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_bottom_16x8b);
cmp_lt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_temp0_16x8b);
//creating mask 00 for +ve and -ve values and FF for zero.
cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
//combining the appropriate sign change
edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
//adding top and bottom signs and constant 2
edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b);
edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
//eliminating old left for row 0
left_store_16x8b = _mm_srli_si128(left_store_16x8b, 1);
edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
//using availability mask
edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
//row 0: saving the rightmost pixel as left for the next block
left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp0_16x8b, 15);
//copying the next top
src_top_16x8b = src_temp0_16x8b;
//convert to 16 bit, then add, then saturating pack
signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
cmp_lt0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);
//store left boundary
_mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);
_mm_storeu_si128((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
pu1_src_cpy += (src_strd);