blob: 0e89de3cd0fadf7b9cebe4e23c697f9ef0ba0b4e [file] [log] [blame]
/******************************************************************************
*
* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at:
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
******************************************************************************/
/**
*******************************************************************************
* @file
* ihevc_intra_pred_filters_neon_intr.c
*
* @brief
* Contains function Definition for intra prediction interpolation filters
*
*
* @author
* Yogeswaran RS
*
* @par List of Functions:
* - ihevc_intra_pred_luma_planar()
* - ihevc_intra_pred_luma_dc()
* - ihevc_intra_pred_luma_horz()
* - ihevc_intra_pred_luma_ver()
* - ihevc_intra_pred_luma_mode2()
* - ihevc_intra_pred_luma_mode_18_34()
*
* @remarks
* None
*
*******************************************************************************
*/
/*****************************************************************************/
/* File Includes */
/*****************************************************************************/
#include <stdio.h>
#include "ihevc_typedefs.h"
#include "ihevc_intra_pred.h"
#include "ihevc_macros.h"
#include "ihevc_func_selector.h"
#include "arm_neon.h"
#include "ihevc_platform_macros.h"
#include "ihevc_common_tables.h"
/****************************************************************************/
/* Constant Macros */
/****************************************************************************/
#define MAX_CU_SIZE 64
#define BIT_DEPTH 8
#define T32_4NT 128
#define T16_4NT 64
/*****************************************************************************/
/* Table Look-up */
/*****************************************************************************/
/* Extract bit 'x' of 'y' as 0 or 1. Both operands fully parenthesized so the
 * macro is safe when 'x' is an expression (the original form left 'x' bare). */
#define GET_BITS(y,x) (((y) >> (x)) & 1)
/*****************************************************************************/
/* Function Definition */
/*****************************************************************************/
/**
*******************************************************************************
*
* @brief
* Intra prediction interpolation filter for pu1_ref substitution
*
*
* @par Description:
* Reference substitution process for samples unavailable for prediction
* Refer to section 8.4.4.2.2
*
* @param[in] pu1_top_left
* UWORD8 pointer to the top-left
*
* @param[in] pu1_top
* UWORD8 pointer to the top
*
* @param[in] pu1_left
* UWORD8 pointer to the left
*
* @param[in] src_strd
* WORD32 Source stride
*
* @param[in] nbr_flags
* WORD32 neighbor availability flags
*
* @param[in] nt
* WORD32 transform Block size
*
* @param[in] dst_strd
* WORD32 Destination stride
*
* @returns
*
* @remarks
* None
*
*******************************************************************************
*/
void ihevc_intra_pred_luma_ref_substitution_neonintr(UWORD8 *pu1_top_left,
                                                     UWORD8 *pu1_top,
                                                     UWORD8 *pu1_left,
                                                     WORD32 src_strd,
                                                     WORD32 nt,
                                                     WORD32 nbr_flags,
                                                     UWORD8 *pu1_dst,
                                                     WORD32 dst_strd)
{
    UWORD8 pu1_ref;
    WORD32 dc_val, i;
    /* 4*nt neighbouring samples plus the single top-left sample */
    WORD32 total_samples = (4 * nt) + 1;
    WORD32 two_nt = 2 * nt;
    WORD32 three_nt = 3 * nt;
    WORD32 get_bits;
    WORD32 next;
    WORD32 bot_left, left, top, tp_right, tp_left;
    WORD32 idx, nbr_id_from_bl, frwd_nbr_flag;
    UNUSED(dst_strd);
    /* Mid-grey fallback value: 1 << (BIT_DEPTH - 1) == 128 for 8-bit */
    dc_val = 1 << (BIT_DEPTH - 1);
    /* Neighbor Flag Structure (bit widths per field):
     * Top-Left | Top-Right | Top | Left | Bottom-Left
     *     1         4         4     4         4
     */
    /* If no neighbors are available, fill all neighbor samples with the DC value */
    if(nbr_flags == 0)
    {
        for(i = 0; i < total_samples; i++)
        {
            pu1_dst[i] = dc_val;
        }
    }
    else
    {
        /* Gather the neighbour samples into pu1_dst in scan order:
         * pu1_dst[0 .. two_nt-1]    : left column, stored bottom-to-top
         * pu1_dst[two_nt]           : top-left sample
         * pu1_dst[two_nt+1 .. 4*nt] : top row, left-to-right
         */
        pu1_dst[two_nt] = *pu1_top_left;
        UWORD8 *pu1_dst_tmp2 = pu1_dst;
        UWORD8 *pu1_top_tmp = pu1_top;
        pu1_dst_tmp2 += two_nt + 1;
        /* Left neighbours are strided in the source picture; copy scalar, reversed */
        for(i = 0; i < two_nt; i++)
            pu1_dst[two_nt - 1 - i] = pu1_left[i * src_strd];
        uint8x8_t src;
        /* Top neighbours are contiguous; copy 8 pixels per iteration */
        for(i = two_nt; i > 0; i -= 8)
        {
            src = vld1_u8(pu1_top_tmp);
            pu1_top_tmp += 8;
            vst1_u8(pu1_dst_tmp2, src);
            pu1_dst_tmp2 += 8;
        }
        if(nt <= 8)
        {
            /* 1-bit extraction for each of the neighboring blocks */
            tp_left = (nbr_flags & 0x10000) >> 16;
            bot_left = nbr_flags & 0x1;
            left = (nbr_flags & 0x10) >> 4;
            top = (nbr_flags & 0x100) >> 8;
            tp_right = (nbr_flags & 0x1000) >> 12;
            next = 1;
            /* If bottom-left is not available, reverse substitution process */
            if(bot_left == 0)
            {
                WORD32 a_nbr_flag[5] = { bot_left, left, tp_left, top, tp_right };
                /* Find the 1st available neighbour scanning from bottom-left */
                while(!a_nbr_flag[next])
                    next++;
                /* If left or top-left is the first available */
                if(next <= 2)
                {
                    idx = nt * next;
                    pu1_ref = pu1_dst[idx];
                    for(i = 0; i < idx; i++)
                        pu1_dst[i] = pu1_ref;
                }
                else /* If top or top-right is the first available */
                {
                    /* idx accounts for the single top-left pixel when
                     * top-left is unavailable */
                    idx = (nt * (next - 1)) + 1;
                    pu1_ref = pu1_dst[idx];
                    for(i = 0; i < idx; i++)
                        pu1_dst[i] = pu1_ref;
                }
            }
            /* Forward Substitution Process: each unavailable segment is filled
             * with its nearest preceding (already available) sample */
            /* If left is unavailable, replicate the last bottom-left value */
            if(left == 0)
            {
                uint8x8_t dup_pu1_dst1;
                UWORD8 *pu1_dst_const_nt = pu1_dst;
                pu1_dst_const_nt += nt;
                if(0 == (nt & 7))
                {
                    dup_pu1_dst1 = vdup_n_u8(pu1_dst[nt - 1]);
                    for(i = nt; i > 0; i -= 8)
                    {
                        vst1_u8(pu1_dst_const_nt, dup_pu1_dst1);
                        pu1_dst_const_nt += 8;
                    }
                }
                else
                {
                    /* nt is a multiple of 4: store 4 replicated pixels at a time */
                    dup_pu1_dst1 = vdup_n_u8(pu1_dst[nt - 1]);
                    for(i = nt; i > 0; i -= 4)
                    {
                        vst1_lane_u32((uint32_t *)pu1_dst_const_nt, vreinterpret_u32_u8(dup_pu1_dst1), 0);
                        pu1_dst_const_nt += 4;
                    }
                }
            }
            if(tp_left == 0)
                pu1_dst[two_nt] = pu1_dst[two_nt - 1];
            if(top == 0)
            {
                if(0 == (nt & 7))
                {
                    uint8x8_t dup_pu1_dst2;
                    UWORD8 *pu1_dst_const_two_nt_1 = pu1_dst;
                    pu1_dst_const_two_nt_1 += (two_nt + 1);
                    dup_pu1_dst2 = vdup_n_u8(pu1_dst[two_nt]);
                    for(i = nt; i > 0; i -= 8)
                    {
                        vst1_u8(pu1_dst_const_two_nt_1, dup_pu1_dst2);
                        pu1_dst_const_two_nt_1 += 8;
                    }
                }
                else
                {
                    for(i = 0; i < nt; i++)
                        pu1_dst[two_nt + 1 + i] = pu1_dst[two_nt];
                }
            }
            if(tp_right == 0)
            {
                uint8x8_t dup_pu1_dst3;
                UWORD8 *pu1_dst_const_three_nt_1 = pu1_dst;
                pu1_dst_const_three_nt_1 += (three_nt + 1);
                /* BUGFIX: replicate the nearest preceding sample, i.e. the last
                 * top pixel pu1_dst[three_nt], not the top-left pu1_dst[two_nt].
                 * The two differ whenever top is available but top-right is not
                 * (matches the scalar reference substitution, sec 8.4.4.2.2). */
                dup_pu1_dst3 = vdup_n_u8(pu1_dst[three_nt]);
                if(0 == (nt & 7))
                {
                    for(i = nt; i > 0; i -= 8)
                    {
                        vst1_u8(pu1_dst_const_three_nt_1, dup_pu1_dst3);
                        pu1_dst_const_three_nt_1 += 8;
                    }
                }
                else
                {
                    for(i = nt; i > 0; i -= 4)
                    {
                        vst1_lane_u32((uint32_t *)pu1_dst_const_three_nt_1, vreinterpret_u32_u8(dup_pu1_dst3), 0);
                        pu1_dst_const_three_nt_1 += 4;
                    }
                }
            }
        }
        if(nt == 16)
        {
            /* Collapse the 4-bit-per-block flags to 2 bits per block so each
             * remaining bit covers 8 pels (1 bit for top-left) */
            WORD32 nbr_flags_temp = 0;
            nbr_flags_temp = (nbr_flags & 0x3) + ((nbr_flags & 0x30) >> 2)
                           + ((nbr_flags & 0x300) >> 4)
                           + ((nbr_flags & 0x3000) >> 6)
                           + ((nbr_flags & 0x10000) >> 8);
            /* Compute trailing zeros based on nbr_flags for the substitution
             * process of below-left; see section 8.4.4.2.2. */
            /* Each bit in nbr_flags corresponds to 8 pels for bot_left, left,
             * top and top-right, but 1 pel for top-left */
            {
                nbr_id_from_bl = look_up_trailing_zeros(nbr_flags_temp & 0XF) * 8; /* for below left and left */
                if(nbr_id_from_bl == 64)
                    nbr_id_from_bl = 32;
                if(nbr_id_from_bl == 32)
                {
                    /* for top left : 1 pel per nbr bit */
                    if(!((nbr_flags_temp >> 8) & 0x1))
                    {
                        nbr_id_from_bl++;
                        nbr_id_from_bl += look_up_trailing_zeros((nbr_flags_temp >> 4) & 0xF) * 8; /* top and top right; 8 pels per nbr bit */
                    }
                }
                /* Reverse Substitution Process */
                if(nbr_id_from_bl)
                {
                    /* Replicate the bottom-left and subsequent unavailable
                     * pixels with the 1st available pixel above */
                    pu1_ref = pu1_dst[nbr_id_from_bl];
                    for(i = (nbr_id_from_bl - 1); i >= 0; i--)
                    {
                        pu1_dst[i] = pu1_ref;
                    }
                }
            }
            /* Loop over the 4*nt+1 pixels (excluding pixels already handled by
             * reverse substitution) */
            while(nbr_id_from_bl < ((T16_4NT) + 1))
            {
                /* Obtain the next unavailable idx flag after reverse neighbor
                 * substitution; divide by 8 to obtain the original flag index */
                frwd_nbr_flag = (nbr_id_from_bl >> 3);
                /* The top-left flag is at the last bit location of nbr_flags */
                if(nbr_id_from_bl == (T16_4NT / 2))
                {
                    get_bits = GET_BITS(nbr_flags_temp, 8);
                    /* only 1-pel substitution for TL */
                    if(!get_bits)
                        pu1_dst[nbr_id_from_bl] = pu1_dst[nbr_id_from_bl - 1];
                }
                else
                {
                    get_bits = GET_BITS(nbr_flags_temp, frwd_nbr_flag);
                    if(!get_bits)
                    {
                        /* 8-pel substitution (other than TL) */
                        pu1_ref = pu1_dst[nbr_id_from_bl - 1];
                        for(i = 0; i < 8; i++)
                            pu1_dst[nbr_id_from_bl + i] = pu1_ref;
                    }
                }
                nbr_id_from_bl += (nbr_id_from_bl == (T16_4NT / 2)) ? 1 : 8;
            }
        }
        if(nt == 32)
        {
            /* Compute trailing zeros based on nbr_flags for the substitution
             * process of below-left; see section 8.4.4.2.2. */
            /* Each bit in nbr_flags corresponds to 8 pels for bot_left, left,
             * top and top-right, but 1 pel for top-left */
            {
                nbr_id_from_bl = look_up_trailing_zeros((nbr_flags & 0XFF)) * 8; /* for below left and left */
                if(nbr_id_from_bl == 64)
                {
                    /* for top left : 1 pel per nbr bit */
                    if(!((nbr_flags >> 16) & 0x1))
                    {
                        /* top left not available */
                        nbr_id_from_bl++;
                        /* top and top right; 8 pels per nbr bit */
                        nbr_id_from_bl += look_up_trailing_zeros((nbr_flags >> 8) & 0xFF) * 8;
                    }
                }
                /* Reverse Substitution Process */
                if(nbr_id_from_bl)
                {
                    /* Replicate the bottom-left and subsequent unavailable
                     * pixels with the 1st available pixel above */
                    pu1_ref = pu1_dst[nbr_id_from_bl];
                    for(i = (nbr_id_from_bl - 1); i >= 0; i--)
                        pu1_dst[i] = pu1_ref;
                }
            }
            /* Loop over the 4*nt+1 pixels (excluding pixels already handled by
             * reverse substitution) */
            while(nbr_id_from_bl < ((T32_4NT)+1))
            {
                /* Obtain the next unavailable idx flag after reverse neighbor
                 * substitution; divide by 8 to obtain the original flag index */
                frwd_nbr_flag = (nbr_id_from_bl >> 3);
                /* The top-left flag is at the last bit location of nbr_flags */
                if(nbr_id_from_bl == (T32_4NT / 2))
                {
                    get_bits = GET_BITS(nbr_flags, 16);
                    /* only 1-pel substitution for TL */
                    if(!get_bits)
                        pu1_dst[nbr_id_from_bl] = pu1_dst[nbr_id_from_bl - 1];
                }
                else
                {
                    get_bits = GET_BITS(nbr_flags, frwd_nbr_flag);
                    if(!get_bits)
                    {
                        /* 8-pel substitution (other than TL) */
                        pu1_ref = pu1_dst[nbr_id_from_bl - 1];
                        for(i = 0; i < 8; i++)
                            pu1_dst[nbr_id_from_bl + i] = pu1_ref;
                    }
                }
                nbr_id_from_bl += (nbr_id_from_bl == (T32_4NT / 2)) ? 1 : 8;
            }
        }
    }
}
/**
*******************************************************************************
*
* @brief
* Intra prediction interpolation filter for ref_filtering
*
*
* @par Description:
* Reference DC filtering for neighboring samples dependent on TU size and
* mode Refer to section 8.4.4.2.3 in the standard
*
* @param[in] pu1_src
* UWORD8 pointer to the source
*
* @param[out] pu1_dst
* UWORD8 pointer to the destination
*
* @param[in] nt
* integer Transform Block size
*
* @param[in] mode
* integer intraprediction mode
*
* @returns
*
* @remarks
* None
*
*******************************************************************************
*/
void ihevc_intra_pred_ref_filtering_neonintr(UWORD8 *pu1_src,
                                             WORD32 nt,
                                             UWORD8 *pu1_dst,
                                             WORD32 mode,
                                             WORD32 strong_intra_smoothing_enable_flag)
{
    WORD32 filter_flag;
    WORD32 i = 0;
    WORD32 four_nt = 4 * nt;
    WORD32 src_4nt;
    /* Naming reflects the functionality: e.g. pu1_src_tmp_1 denotes pu1_src + 1;
     * src_val_1 holds the value loaded from pu1_src_tmp_1; add_res holds the
     * result of adding 2 values. */
    UWORD8 *pu1_src_tmp_0 = pu1_src;
    UWORD8 *pu1_src_tmp_1;
    UWORD8 *pu1_src_tmp_2;
    UWORD8 *pu1_dst_tmp_0 = pu1_dst;
    UWORD8 *pu1_dst_tmp_1;
    uint8x8_t src_val_0, src_val_2;
    uint8x8_t src_val_1, shift_res;
    uint8x8_t dup_const_2;
    uint16x8_t mul_res, add_res;
    WORD32 bi_linear_int_flag = 0;
    WORD32 abs_cond_left_flag = 0;
    WORD32 abs_cond_top_flag = 0;
    /* Strong-smoothing decision threshold: 1 << (BIT_DEPTH - 5) == 8 for 8-bit */
    WORD32 dc_val = 1 << (BIT_DEPTH - 5);
    shift_res = vdup_n_u8(0);
    /* Table lookup decides whether this (mode, nt) pair is filtered at all;
     * bit (log2(nt) - 2) of the per-mode entry selects the block size */
    filter_flag = gau1_intra_pred_ref_filter[mode] & (1 << (CTZ(nt) - 2));
    if(0 == filter_flag)
    {
        /* No filtering: plain copy (skip entirely if operating in place) */
        if(pu1_src == pu1_dst)
        {
            return;
        }
        else
        {
            for(i = four_nt; i > 0; i -= 8)
            {
                src_val_0 = vld1_u8(pu1_src_tmp_0);
                pu1_src_tmp_0 += 8;
                vst1_u8(pu1_dst_tmp_0, src_val_0);
                pu1_dst_tmp_0 += 8;
            }
            pu1_dst[four_nt] = pu1_src[four_nt];
        }
    }
    else
    {
        /* If strong intra smoothing is enabled and transform size is 32,
         * test whether both the top and left reference rows are close to a
         * linear ramp (second-difference at the midpoints below threshold) */
        if((1 == strong_intra_smoothing_enable_flag) && (32 == nt))
        {
            /* Strong Intra Filtering decision */
            abs_cond_top_flag = (ABS(pu1_src[2 * nt] + pu1_src[4 * nt]
                                     - (2 * pu1_src[3 * nt]))) < dc_val;
            abs_cond_left_flag = (ABS(pu1_src[2 * nt] + pu1_src[0]
                                      - (2 * pu1_src[nt]))) < dc_val;
            bi_linear_int_flag = ((1 == abs_cond_left_flag)
                                  && (1 == abs_cond_top_flag));
        }
        /* Save the last sample: the filtering below may run in place and the
         * weak filter reads past the stored region in 8-wide chunks */
        src_4nt = pu1_src[4 * nt];
        /* Strong (bilinear) filtering of reference samples */
        if(1 == bi_linear_int_flag)
        {
            WORD32 two_nt = four_nt >> 1;
            /* Corner samples used as interpolation anchors */
            WORD32 pu1_src_0_val = pu1_src[0];
            WORD32 pu1_src_2_nt_val = pu1_src[2 * nt];
            WORD32 pu1_src_4_nt_val = pu1_src[4 * nt];
            WORD32 prod_two_nt_src_0_val = two_nt * pu1_src_0_val;
            uint16x8_t prod_two_nt_src_0_val_t = vdupq_n_u16(prod_two_nt_src_0_val);
            WORD32 prod_two_nt_src_2_nt_val = two_nt * pu1_src_2_nt_val;
            uint16x8_t prod_two_nt_src_2_nt_val_t = vdupq_n_u16(prod_two_nt_src_2_nt_val);
            const UWORD8 *const_col_i;
            uint8x8_t const_col_i_val;
            uint16x8_t prod_val_1;
            uint16x8_t prod_val_2;
            uint16x8_t prod_val_3;
            uint16x8_t prod_val_4;
            uint8x8_t res_val_1;
            uint8x8_t res_val_2;
            uint8x8_t pu1_src_0_val_t = vdup_n_u8(pu1_src_0_val);
            uint8x8_t pu1_src_2_nt_val_t = vdup_n_u8(pu1_src_2_nt_val);
            uint8x8_t pu1_src_4_nt_val_t = vdup_n_u8(pu1_src_4_nt_val);
            pu1_dst_tmp_0 = pu1_dst + 1;
            pu1_dst_tmp_1 = pu1_dst + two_nt + 1;
            const_col_i = gau1_ihevc_planar_factor + 1;
            /* Each half is interpolated as
             *   dst[i]          = ((two_nt - i)*src[0]    + i*src[2nt] + 32) >> 6
             *   dst[two_nt + i] = ((two_nt - i)*src[2nt]  + i*src[4nt] + 32) >> 6
             * built via vmlsl (subtract i*anchor) then vmlal (add i*other). */
            for(i = two_nt; i > 0; i -= 8)
            {
                const_col_i_val = vld1_u8(const_col_i);
                const_col_i += 8;
                prod_val_1 = vmlsl_u8(prod_two_nt_src_0_val_t, const_col_i_val, pu1_src_0_val_t);
                prod_val_2 = vmlal_u8(prod_val_1, const_col_i_val, pu1_src_2_nt_val_t);
                res_val_1 = vrshrn_n_u16(prod_val_2, 6);
                prod_val_3 = vmlsl_u8(prod_two_nt_src_2_nt_val_t, const_col_i_val, pu1_src_2_nt_val_t);
                vst1_u8(pu1_dst_tmp_0, res_val_1);
                pu1_dst_tmp_0 += 8;
                prod_val_4 = vmlal_u8(prod_val_3, const_col_i_val, pu1_src_4_nt_val_t);
                res_val_2 = vrshrn_n_u16(prod_val_4, 6);
                vst1_u8(pu1_dst_tmp_1, res_val_2);
                pu1_dst_tmp_1 += 8;
            }
            /* Midpoint (top-left corner) is copied untouched */
            pu1_dst[2 * nt] = pu1_src[2 * nt];
        }
        else
        {
            /* Weak filtering: 3-tap [1 2 1]/4 smoothing of the references */
            pu1_src_tmp_1 = pu1_src + 1;
            pu1_src_tmp_2 = pu1_src + 2;
            pu1_dst_tmp_0 += 1;
            dup_const_2 = vdup_n_u8(2);
            /* Extremities untouched */
            pu1_dst[0] = pu1_src[0];
            /* To support in-place operation (dst == src) all three loads for an
             * iteration are issued before the previous iteration's result is
             * stored; hence the store of shift_res lags by one iteration. */
            /* Perform bilinear filtering of reference samples */
            for(i = (four_nt - 1); i > 0; i -= 8)
            {
                src_val_0 = vld1_u8(pu1_src_tmp_0);
                pu1_src_tmp_0 += 8;
                src_val_2 = vld1_u8(pu1_src_tmp_2);
                pu1_src_tmp_2 += 8;
                src_val_1 = vld1_u8(pu1_src_tmp_1);
                pu1_src_tmp_1 += 8;
                if(i < four_nt - 1)
                {
                    vst1_u8(pu1_dst_tmp_0, shift_res);
                    pu1_dst_tmp_0 += 8;
                }
                add_res = vaddl_u8(src_val_0, src_val_2);
                mul_res = vmlal_u8(add_res, src_val_1, dup_const_2);
                shift_res = vrshrn_n_u16(mul_res, 2);
            }
            /* Flush the final pending result */
            vst1_u8(pu1_dst_tmp_0, shift_res);
            pu1_dst_tmp_0 += 8;
        }
        /* Restore the untouched last sample */
        pu1_dst[4 * nt] = src_4nt;
    }
}
/**
*******************************************************************************
*
* @brief
* Intra prediction interpolation filter for luma planar
*
* @par Description:
* Planar Intraprediction with reference neighboring samples location
* pointed by 'pu1_ref' to the TU block location pointed by 'pu1_dst'
*
* @param[in] pu1_ref
*  UWORD8 pointer to the reference samples
*
* @param[out] pu1_dst
* UWORD8 pointer to the destination
*
* @param[in] src_strd
* integer source stride
*
* @param[in] dst_strd
* integer destination stride
*
* @param[in] nt
* integer Transform Block size
*
* @param[in] wd
* integer width of the array
*
* @returns
*
* @remarks
* None
*
*******************************************************************************
*/
void ihevc_intra_pred_luma_planar_neonintr(UWORD8 *pu1_ref,
                                           WORD32 src_strd,
                                           UWORD8 *pu1_dst,
                                           WORD32 dst_strd,
                                           WORD32 nt,
                                           WORD32 mode)
{
    /* Computes, per pixel, the HEVC planar prediction
     *   dst[r][c] = ((nt-1-c)*left[r] + (c+1)*topright
     *                + (nt-1-r)*top[c] + (r+1)*botleft + nt) >> (log2(nt)+1)
     * Variable naming: (nt - 1 - col) --> const_nt_1_col (const denotes the
     * gau1_ihevc_planar_factor table); pu1_ref[nt - 1] --> pu1_ref_nt_1, and
     * its duplicate-to-D-register form is pu1_ref_nt_1_dup.
     * The (log2nt + 1) shift is applied as a negative vector shift.
     * In the width-multiple-of-4 case the row loop is also unrolled by 2 and
     * the store handles two rows per iteration. */
    WORD32 row, col = 0;
    WORD32 log2nt_plus1 = 6;
    WORD32 two_nt, three_nt;
    UWORD8 *pu1_ref_two_nt_1;
    UWORD8 *pu1_dst_tmp;
    const UWORD8 *const_nt_1_col;
    uint8x8_t const_nt_1_col_t;
    const UWORD8 *const_col_1;
    uint8x8_t const_col_1_t;
    uint8_t const_nt_1_row;
    uint8x8_t const_nt_1_row_dup;
    uint8_t const_row_1;
    uint8x8_t const_row_1_dup;
    uint8_t const_nt = nt;
    uint16x8_t const_nt_dup;
    uint8_t pu1_ref_nt_1 = pu1_ref[nt - 1];       /* bottom-left reference */
    uint8x8_t pu1_ref_nt_1_dup;
    uint8_t pu1_ref_two_nt_1_row;
    uint8_t pu1_ref_three_nt_1;
    uint8x8_t pu1_ref_two_nt_1_row_dup;
    uint8x8_t pu1_ref_two_nt_1_t;
    uint8x8_t pu1_ref_three_nt_1_dup;
    uint16x8_t prod_t1;
    uint16x8_t prod_t2;
    uint16x8_t sto_res_tmp;
    uint8x8_t sto_res;
    int16x8_t log2nt_dup;
    UNUSED(src_strd);
    UNUSED(mode);
    /* log2(nt) + 1, computed via count-leading-zeros */
    log2nt_plus1 = 32 - CLZ(nt);
    two_nt = 2 * nt;
    three_nt = 3 * nt;
    /* Loops unrolled by 8 when the width is a multiple of 8 */
    if(0 == (nt & 7))
    {
        pu1_dst_tmp = pu1_dst;
        const_nt_1_col = gau1_ihevc_planar_factor + nt - 8;
        const_col_1 = gau1_ihevc_planar_factor + 1;
        pu1_ref_three_nt_1 = pu1_ref[three_nt + 1];   /* top-right reference */
        pu1_ref_nt_1_dup = vdup_n_u8(pu1_ref_nt_1);
        const_nt_dup = vdupq_n_u16(const_nt);
        /* Negative shift amount implements >> (log2nt + 1) via vshlq_s16 */
        log2nt_dup = vdupq_n_s16(log2nt_plus1);
        log2nt_dup = vnegq_s16(log2nt_dup);
        pu1_ref_three_nt_1_dup = vdup_n_u8(pu1_ref_three_nt_1);
        for(row = 0; row < nt; row++)
        {
            /* Per-row scalars: left[row] and the row weights */
            pu1_ref_two_nt_1_row = pu1_ref[two_nt - 1 - row];
            pu1_ref_two_nt_1_row_dup = vdup_n_u8(pu1_ref_two_nt_1_row);
            const_nt_1_row = nt - 1 - row;
            const_nt_1_row_dup = vdup_n_u8(const_nt_1_row);
            const_row_1 = row + 1;
            const_row_1_dup = vdup_n_u8(const_row_1);
            const_nt_1_col = gau1_ihevc_planar_factor + nt - 8;
            const_col_1 = gau1_ihevc_planar_factor + 1;
            pu1_ref_two_nt_1 = pu1_ref + two_nt + 1;  /* top row references */
            for(col = nt; col > 0; col -= 8)
            {
                /* (nt-1-col) weights: load ascending factors and reverse */
                const_nt_1_col_t = vld1_u8(const_nt_1_col);
                const_nt_1_col -= 8;
                const_nt_1_col_t = vrev64_u8(const_nt_1_col_t);
                const_col_1_t = vld1_u8(const_col_1);
                const_col_1 += 8;
                /* prod_t1 = (nt-1-col)*left[row] + (nt-1-row)*top[col] */
                prod_t1 = vmull_u8(const_nt_1_col_t, pu1_ref_two_nt_1_row_dup);
                pu1_ref_two_nt_1_t = vld1_u8(pu1_ref_two_nt_1);
                pu1_ref_two_nt_1 += 8;
                /* prod_t2 = (col+1)*topright + (row+1)*botleft */
                prod_t2 = vmull_u8(const_col_1_t, pu1_ref_three_nt_1_dup);
                prod_t1 = vmlal_u8(prod_t1, const_nt_1_row_dup, pu1_ref_two_nt_1_t);
                prod_t2 = vmlal_u8(prod_t2, const_row_1_dup, pu1_ref_nt_1_dup);
                /* + nt (rounding) then >> (log2nt+1) via negative shift */
                prod_t1 = vaddq_u16(prod_t1, const_nt_dup);
                prod_t1 = vaddq_u16(prod_t1, prod_t2);
                sto_res_tmp = vreinterpretq_u16_s16(vshlq_s16(vreinterpretq_s16_u16(prod_t1), log2nt_dup));
                sto_res = vmovn_u16(sto_res_tmp);
                vst1_u8(pu1_dst_tmp, sto_res);
                pu1_dst_tmp += 8;
            }
            pu1_dst_tmp += dst_strd - nt;
        }
    }
    /* Loops unrolled by 4 when the width is a multiple of 4 only.
     * If the column is a multiple of 4 then the height is a multiple of 2,
     * so two rows are processed per iteration: each D register holds row r in
     * lanes 0-3 and row r+1 in lanes 4-7 (assembled with vext). */
    else
    {
        uint8x8_t const_row_1_dup1;
        uint8x8_t pu1_ref_two_nt_1_t1;
        uint8x8_t const_nt_1_col_t1;
        uint8x8_t const_col_1_t1;
        uint8x8_t pu1_ref_two_nt_1_row_dup1;
        uint8x8_t const_nt_1_row_dup1;
        pu1_ref_three_nt_1 = pu1_ref[three_nt + 1];   /* top-right reference */
        pu1_ref_nt_1_dup = vdup_n_u8(pu1_ref_nt_1);
        const_nt_dup = vdupq_n_u16(const_nt);
        /* Negative shift amount implements >> (log2nt + 1) via vshlq_s16 */
        log2nt_dup = vdupq_n_s16(log2nt_plus1);
        log2nt_dup = vnegq_s16(log2nt_dup);
        pu1_ref_three_nt_1_dup = vdup_n_u8(pu1_ref_three_nt_1);
        for(row = 0; row < nt; row += 2)
        {
            /* Pack left[row] (lanes 0-3) and left[row+1] (lanes 4-7) */
            pu1_ref_two_nt_1_row = pu1_ref[two_nt - 1 - row];
            pu1_ref_two_nt_1_row_dup = vdup_n_u8(pu1_ref_two_nt_1_row);
            pu1_ref_two_nt_1_row = pu1_ref[two_nt - 2 - row];
            pu1_ref_two_nt_1_row_dup1 = vdup_n_u8(pu1_ref_two_nt_1_row);
            pu1_ref_two_nt_1_row_dup = vext_u8(pu1_ref_two_nt_1_row_dup, pu1_ref_two_nt_1_row_dup1, 4);
            /* Pack (nt-1-row) and (nt-2-row) weights the same way */
            const_nt_1_row = nt - 1 - row;
            const_nt_1_row_dup = vdup_n_u8(const_nt_1_row);
            const_nt_1_row = nt - 2 - row;
            const_nt_1_row_dup1 = vdup_n_u8(const_nt_1_row);
            const_nt_1_row_dup = vext_u8(const_nt_1_row_dup, const_nt_1_row_dup1, 4);
            /* Pack (row+1) and (row+2) weights */
            const_row_1 = row + 1;
            const_row_1_dup = vdup_n_u8(const_row_1);
            const_row_1 = row + 2;
            const_row_1_dup1 = vdup_n_u8(const_row_1);
            const_row_1_dup = vext_u8(const_row_1_dup, const_row_1_dup1, 4);
            const_nt_1_col = gau1_ihevc_planar_factor + nt - 4;
            const_col_1 = gau1_ihevc_planar_factor + 1;
            pu1_ref_two_nt_1 = pu1_ref + two_nt + 1;  /* top row references */
            for(col = nt; col > 0; col -= 4)
            {
                /* Build 4 column weights duplicated into both register halves */
                const_nt_1_col_t = vld1_u8(const_nt_1_col);
                const_nt_1_col -= 4;
                const_nt_1_col_t = vrev64_u8(const_nt_1_col_t);
                const_col_1_t = vld1_u8(const_col_1);
                const_col_1 += 4;
                const_nt_1_col_t1 = vreinterpret_u8_u64(vshr_n_u64(vreinterpret_u64_u8(const_nt_1_col_t), 32));
                pu1_dst_tmp = pu1_dst;
                const_nt_1_col_t = vext_u8(const_nt_1_col_t, const_nt_1_col_t1, 4);
                const_col_1_t1 = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(const_col_1_t), 32));
                /* prod_t1 = (nt-1-col)*left + (nt-1-row)*top, per lane */
                prod_t1 = vmull_u8(const_nt_1_col_t, pu1_ref_two_nt_1_row_dup);
                pu1_ref_two_nt_1_t = vld1_u8(pu1_ref_two_nt_1);
                pu1_ref_two_nt_1 += 4;
                const_col_1_t = vext_u8(const_col_1_t1, const_col_1_t, 4);
                pu1_ref_two_nt_1_t1 = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(pu1_ref_two_nt_1_t), 32));
                /* prod_t2 = (col+1)*topright + (row+1)*botleft, per lane */
                prod_t2 = vmull_u8(const_col_1_t, pu1_ref_three_nt_1_dup);
                pu1_ref_two_nt_1_t = vext_u8(pu1_ref_two_nt_1_t1, pu1_ref_two_nt_1_t, 4);
                prod_t2 = vmlal_u8(prod_t2, const_row_1_dup, pu1_ref_nt_1_dup);
                prod_t1 = vmlal_u8(prod_t1, const_nt_1_row_dup, pu1_ref_two_nt_1_t);
                /* + nt (rounding) then >> (log2nt+1) via negative shift */
                prod_t1 = vaddq_u16(prod_t1, const_nt_dup);
                prod_t1 = vaddq_u16(prod_t1, prod_t2);
                sto_res_tmp = vreinterpretq_u16_s16(vshlq_s16(vreinterpretq_s16_u16(prod_t1), log2nt_dup));
                sto_res = vmovn_u16(sto_res_tmp);
                /* Lane 0 holds row 'row', lane 1 holds row 'row+1' */
                vst1_lane_u32((uint32_t *)pu1_dst, vreinterpret_u32_u8(sto_res), 0);
                pu1_dst_tmp += dst_strd;
                vst1_lane_u32((uint32_t *)pu1_dst_tmp, vreinterpret_u32_u8(sto_res), 1);
                pu1_dst += 4;
            }
            pu1_dst += 2 * dst_strd - nt;
        }
    }
}
/* INTRA_PRED_LUMA_PLANAR */
/**
*******************************************************************************
*
* @brief
* Intra prediction interpolation filter for luma dc
*
* @par Description:
* Intraprediction for DC mode with reference neighboring samples location
* pointed by 'pu1_ref' to the TU block location pointed by 'pu1_dst'
*
* @param[in] pu1_ref
*  UWORD8 pointer to the reference samples
*
* @param[out] pu1_dst
* UWORD8 pointer to the destination
*
* @param[in] src_strd
* integer source stride
*
* @param[in] dst_strd
* integer destination stride
*
* @param[in] nt
* integer Transform Block size
*
* @param[in] wd
* integer width of the array
*
* @returns
*
* @remarks
* None
*
*******************************************************************************
*/
void ihevc_intra_pred_luma_dc_neonintr(UWORD8 *pu1_ref,
                                       WORD32 src_strd,
                                       UWORD8 *pu1_dst,
                                       WORD32 dst_strd,
                                       WORD32 nt,
                                       WORD32 mode)
{
    /* DC mode: fill the block with the mean of the left and top references.
     * For nt < 32 the first row and first column are additionally smoothed:
     *   dst[0][c] = (ref[2nt+1+c] + 3*dc + 2) >> 2
     *   dst[r][0] = (ref[2nt-1-r] + 3*dc + 2) >> 2
     *   dst[0][0] = (ref[2nt-1] + 2*dc + ref[2nt+1] + 2) >> 2
     */
    WORD32 dc_val = 0, two_dc_val = 0, three_dc_val = 0;
    WORD32 i = 0;
    WORD32 row = 0, col = 0, col_count;
    WORD32 log2nt_plus1 = 6;
    WORD32 two_nt = 0;
    uint16x8_t ref_load_q;
    uint16x8_t three_dc_val_t;
    uint8x8_t sto_res_tmp;
    uint8x8_t sto_res_tmp1;
    uint8x8_t sto_res_tmp2;
    uint8x8_t sto_res_tmp3;
    uint8x8_t sto_res_tmp4;
    uint8x8_t dc_val_t;
    UWORD8 *pu1_ref_tmp;
    UWORD8 *pu1_ref_tmp1;
    UWORD8 *pu1_dst_tmp;
    UWORD8 *pu1_dst_tmp1;
    UWORD8 *pu1_dst_tmp2;
    UNUSED(src_strd);
    UNUSED(mode);
    /* log2(nt) + 1, used for the DC averaging shift */
    log2nt_plus1 = 32 - CLZ(nt);
    /* Loops unrolled by 8 when the width is a multiple of 8 */
    if(0 == (nt & 7))
    {
        uint8x8_t ref_load1;
        uint8x8_t ref_load2;
        uint16x4_t acc_dc_pair1;
        uint32x2_t acc_dc_pair2;
        /* col is still 0 here: initialise the accumulator to zero */
        uint64x1_t acc_dc = vdup_n_u64(col);
        two_nt = 2 * nt;
        pu1_ref_tmp = pu1_ref + nt;             /* left references */
        pu1_ref_tmp1 = pu1_ref + two_nt + 1;    /* top references */
        /* Sum nt left + nt top samples via pairwise widening adds */
        for(i = two_nt; i > nt; i -= 8)
        {
            ref_load1 = vld1_u8(pu1_ref_tmp);
            pu1_ref_tmp += 8;
            acc_dc_pair1 = vpaddl_u8(ref_load1);
            ref_load2 = vld1_u8(pu1_ref_tmp1);
            pu1_ref_tmp1 += 8;
            acc_dc_pair2 = vpaddl_u16(acc_dc_pair1);
            acc_dc = vpadal_u32(acc_dc, acc_dc_pair2);
            acc_dc_pair1 = vpaddl_u8(ref_load2);
            acc_dc_pair2 = vpaddl_u16(acc_dc_pair1);
            acc_dc = vpadal_u32(acc_dc, acc_dc_pair2);
        }
        /* dc = (sum + nt) >> (log2(nt) + 1) */
        dc_val = (vget_lane_u32(vreinterpret_u32_u64(acc_dc), 0) + nt) >> (log2nt_plus1);
        dc_val_t = vdup_n_u8(dc_val);
        two_dc_val = 2 * dc_val;
        three_dc_val = 3 * dc_val;
        three_dc_val += 2;                      /* 3*dc + 2 rounding term */
        three_dc_val_t = vdupq_n_u16((WORD16)three_dc_val);
        pu1_ref_tmp = pu1_ref + two_nt + 1 + 0;
        pu1_dst_tmp = pu1_dst;
        if(nt == 32)
        {
            /* nt == 32: no boundary smoothing, pure DC fill */
            for(row = 0; row < nt; row++)
            {
                for(col = nt; col > 0; col -= 8)
                {
                    vst1_u8(pu1_dst_tmp, dc_val_t);
                    pu1_dst_tmp += 8;
                }
                pu1_dst_tmp += dst_strd - nt;
            }
        }
        else
        {
            /* First row: (top_ref + 3*dc + 2) >> 2 */
            for(col = nt; col > 0; col -= 8)
            {
                ref_load1 = vld1_u8(pu1_ref_tmp);
                pu1_ref_tmp += 8;
                ref_load_q = vmovl_u8(ref_load1);
                ref_load_q = vaddq_u16(ref_load_q, three_dc_val_t);
                ref_load_q = vshrq_n_u16(ref_load_q, 2);
                sto_res_tmp = vmovn_u16(ref_load_q);
                vst1_u8(pu1_dst_tmp, sto_res_tmp);
                pu1_dst_tmp += 8;
            }
            /* Point at the 8 left references for the next 8 rows (reversed) */
            pu1_ref_tmp = pu1_ref + two_nt - 9;
            pu1_dst_tmp = pu1_dst + dst_strd;
            col_count = nt - 8;
            /* Except the first row, the remaining rows are done here. */
            /* Both column and row loops are unrolled by 8. */
            /* Store placement handles the unrolling. */
            /* Except the 1st column, the remaining values of each row (other
             * than the 1st row) are the constant dc; they are produced by
             * shifting the filtered column values into lane 0 and filling the
             * other 7 lanes with dc via vext against dc_val_t. */
            /* If the column count exceeds 8, the remaining constant columns
             * are handled in the inner for loop. */
            for(row = nt; row > 0; row -= 8)
            {
                pu1_dst_tmp1 = pu1_dst_tmp + 8;
                /* 8 filtered first-column values: (left_ref + 3*dc + 2) >> 2 */
                ref_load1 = vld1_u8(pu1_ref_tmp);
                pu1_ref_tmp -= 8;
                ref_load_q = vmovl_u8(ref_load1);
                ref_load_q = vaddq_u16(ref_load_q, three_dc_val_t);
                ref_load_q = vshrq_n_u16(ref_load_q, 2);
                sto_res_tmp = vmovn_u16(ref_load_q);
                /* Row k uses byte (7-k) of sto_res_tmp in lane 0: shift left by
                 * 8*k bits, then vext places it at lane 0 with dc elsewhere */
                sto_res_tmp1 = vext_u8(sto_res_tmp, dc_val_t, 7);
                sto_res_tmp2 = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(sto_res_tmp), 8));
                sto_res_tmp2 = vext_u8(sto_res_tmp2, dc_val_t, 7);
                vst1_u8(pu1_dst_tmp, sto_res_tmp1);
                pu1_dst_tmp += dst_strd;
                sto_res_tmp3 = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(sto_res_tmp), 16));
                sto_res_tmp3 = vext_u8(sto_res_tmp3, dc_val_t, 7);
                vst1_u8(pu1_dst_tmp, sto_res_tmp2);
                pu1_dst_tmp += dst_strd;
                sto_res_tmp4 = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(sto_res_tmp), 24));
                sto_res_tmp4 = vext_u8(sto_res_tmp4, dc_val_t, 7);
                vst1_u8(pu1_dst_tmp, sto_res_tmp3);
                pu1_dst_tmp += dst_strd;
                sto_res_tmp1 = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(sto_res_tmp), 32));
                sto_res_tmp1 = vext_u8(sto_res_tmp1, dc_val_t, 7);
                vst1_u8(pu1_dst_tmp, sto_res_tmp4);
                pu1_dst_tmp += dst_strd;
                sto_res_tmp2 = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(sto_res_tmp), 40));
                sto_res_tmp2 = vext_u8(sto_res_tmp2, dc_val_t, 7);
                vst1_u8(pu1_dst_tmp, sto_res_tmp1);
                pu1_dst_tmp += dst_strd;
                sto_res_tmp3 = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(sto_res_tmp), 48));
                sto_res_tmp3 = vext_u8(sto_res_tmp3, dc_val_t, 7);
                vst1_u8(pu1_dst_tmp, sto_res_tmp2);
                pu1_dst_tmp += dst_strd;
                sto_res_tmp4 = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(sto_res_tmp), 56));
                sto_res_tmp4 = vext_u8(sto_res_tmp4, dc_val_t, 7);
                vst1_u8(pu1_dst_tmp, sto_res_tmp3);
                pu1_dst_tmp += dst_strd;
                /* For the last set of 8 rows only 7 rows need updating since
                 * the first row was already written */
                if(row != 8)
                    vst1_u8(pu1_dst_tmp, sto_res_tmp4);
                pu1_dst_tmp += dst_strd;
                /* Remaining columns of these 8 rows are constant dc */
                for(col = col_count; col > 0; col -= 8)
                {
                    pu1_dst_tmp2 = pu1_dst_tmp1;
                    vst1_u8(pu1_dst_tmp1, dc_val_t);
                    pu1_dst_tmp1 += dst_strd;
                    vst1_u8(pu1_dst_tmp1, dc_val_t);
                    pu1_dst_tmp1 += dst_strd;
                    vst1_u8(pu1_dst_tmp1, dc_val_t);
                    pu1_dst_tmp1 += dst_strd;
                    vst1_u8(pu1_dst_tmp1, dc_val_t);
                    pu1_dst_tmp1 += dst_strd;
                    vst1_u8(pu1_dst_tmp1, dc_val_t);
                    pu1_dst_tmp1 += dst_strd;
                    vst1_u8(pu1_dst_tmp1, dc_val_t);
                    pu1_dst_tmp1 += dst_strd;
                    vst1_u8(pu1_dst_tmp1, dc_val_t);
                    pu1_dst_tmp1 += dst_strd;
                    /* For the last set of 8 rows only 7 rows need updating
                     * since the first row was already written */
                    if(row != 8)
                        vst1_u8(pu1_dst_tmp1, dc_val_t);
                    pu1_dst_tmp1 = pu1_dst_tmp2 + 8;
                }
            }
            /* Corner pixel: (left_ref + 2*dc + top_ref + 2) >> 2 */
            pu1_dst[0] = (pu1_ref[two_nt - 1] + two_dc_val + pu1_ref[two_nt + 1] + 2) >> 2;
        }
    }
    /* Scalar accumulation path when the width is a multiple of 4 only */
    else
    {
        WORD32 acc_dc;
        two_nt = 2 * nt;
        acc_dc = 0;
        pu1_ref_tmp = pu1_ref + nt + 1;
        /* Sum left refs pu1_ref[nt..2nt-1] and top refs pu1_ref[2nt+1..3nt] */
        for(i = nt; i < two_nt; i++)
        {
            acc_dc += pu1_ref[i];
            acc_dc += pu1_ref_tmp[i];
        }
        /* dc = (sum + nt) >> (log2(nt) + 1) */
        dc_val = (acc_dc + nt) >> (log2nt_plus1);
        two_dc_val = 2 * dc_val;
        three_dc_val = 3 * dc_val;
        three_dc_val = three_dc_val + 2;        /* 3*dc + 2 rounding term */
        dc_val_t = vdup_n_u8(dc_val);
        if(nt == 32)
        {
            /* nt == 32: no boundary smoothing, pure DC fill */
            pu1_dst_tmp = pu1_dst;
            for(row = 0; row < nt; row++)
            {
                for(col = nt; col > 0; col -= 4)
                {
                    vst1_lane_u32((uint32_t *)pu1_dst_tmp, vreinterpret_u32_u8(dc_val_t), 0);
                    pu1_dst_tmp += 4;
                }
                pu1_dst_tmp += dst_strd - nt;
            }
        }
        else
        {
            /* First row (excluding corner): (top_ref + 3*dc + 2) >> 2 */
            for(col = 1; col < nt; col++)
            {
                pu1_dst[col] = (pu1_ref[two_nt + 1 + col] + three_dc_val) >> 2;
            }
            pu1_dst_tmp = pu1_dst + dst_strd + 0;
            /* Since the first row is already written, loop count is nt-1 */
            for(row = nt - 1; row > 0; row -= 1)
            {
                for(col = nt; col > 0; col -= 4)
                {
                    vst1_lane_u32((uint32_t *)pu1_dst_tmp, vreinterpret_u32_u8(dc_val_t), 0);
                    pu1_dst_tmp += 4;
                }
                pu1_dst_tmp += dst_strd - nt;
            }
            /* First column (excluding corner): (left_ref + 3*dc + 2) >> 2,
             * overwriting the dc values just stored */
            for(row = 1; row < nt; row++)
            {
                pu1_dst[row * dst_strd] = (pu1_ref[two_nt - 1 - row] + three_dc_val) >> 2;
            }
            /* Corner pixel: (left_ref + 2*dc + top_ref + 2) >> 2 */
            pu1_dst[0] = (pu1_ref[two_nt - 1] + two_dc_val + pu1_ref[two_nt + 1] + 2) >> 2;
        }
    }
}
/* INTRA_PRED_LUMA_DC */
/**
*******************************************************************************
*
* @brief
* Intra prediction interpolation filter for horizontal luma variable.
*
* @par Description:
* Horizontal intraprediction with reference neighboring samples location
* pointed by 'pu1_ref' to the TU block location pointed by 'pu1_dst'
*
* @param[in] pu1_ref
*  UWORD8 pointer to the reference samples
*
* @param[out] pu1_dst
* UWORD8 pointer to the destination
*
* @param[in] src_strd
* integer source stride
*
* @param[in] dst_strd
* integer destination stride
*
* @param[in] nt
* integer Transform Block size
*
* @param[in] wd
* integer width of the array
*
* @returns
*
* @remarks
* None
*
*******************************************************************************
*/
void ihevc_intra_pred_luma_horz_neonintr(UWORD8 *pu1_ref,
                                         WORD32 src_strd,
                                         UWORD8 *pu1_dst,
                                         WORD32 dst_strd,
                                         WORD32 nt,
                                         WORD32 mode)
{
    WORD32 row, col;
    WORD32 two_nt;
    UNUSED(src_strd);
    UNUSED(mode);
    two_nt = 2 * nt;
    UWORD8 *pu1_dst_tmp = pu1_dst;
    UWORD32 pu1_val;
    uint8x8_t pu1_val_two_nt_1_row;
    if(nt == 32)
    {
        /* nt == 32: no edge filtering is applied; each output row is the  */
        /* left reference sample pu1_ref[two_nt - 1 - row] replicated      */
        /* across the row, stored 8 pixels at a time.                      */
        pu1_dst_tmp = pu1_dst;
        for(row = 0; row < nt; row++)
        {
            pu1_val = pu1_ref[two_nt - 1 - row];
            pu1_val_two_nt_1_row = vdup_n_u8(pu1_val);
            for(col = nt; col > 0; col -= 8)
            {
                vst1_u8(pu1_dst_tmp, pu1_val_two_nt_1_row);
                pu1_dst_tmp += 8;
            }
            pu1_dst_tmp += dst_strd - nt; /* step to the start of the next row */
        }
    }
    else
        /* row loop has been unrolled, hence had pu1_ref_val1 and pu1_ref_val2 variables*/
        /* naming of variables made according to the operation(instructions) it performs*/
        /* (eg. shift_val which contains the shifted value, */
        /* add_sat which has add and saturated value) */
        /* Loops are unrolled by 4 and 8 considering the fact the input width is either multiple of 4 or 8 */
        /* rows and columns are unrolled by 4, when the width is multiple of 4 */
    {
        if(0 != (nt & 7)) /* cond for multiple of 4 */
        {
            UWORD8 *pu1_ref_4_two_nt_plus1 = pu1_ref;
            UWORD8 *pu1_ref_4_two_nt_minus_nt = pu1_ref;
            UWORD8 *pu1_dst_4 = pu1_dst;
            UWORD8 *pu1_dst_4_tmp = pu1_dst;
            uint32x2_t pu1_ref_val1, pu1_ref_val2;
            uint8x8_t dup_sub, round_val, dup_val;
            uint16x8_t dup_add, sub_val;
            int16x8_t shift_val, add_sat;
            pu1_ref_val1 = vdup_n_u32(0);
            pu1_ref_val2 = vdup_n_u32(0);
            /* dup_sub = top-left corner sample, dup_add = first left sample; */
            /* both feed the first-row gradient filter below                  */
            dup_sub = vdup_n_u8(pu1_ref[two_nt]);
            dup_add = vdupq_n_u16(pu1_ref[two_nt - 1]);
            pu1_ref_4_two_nt_plus1 += (two_nt + 1);
            pu1_ref_4_two_nt_minus_nt += (two_nt - nt);
            for(row = nt; row > 0; row -= 4)
            {
                for(col = nt; col > 0; col -= 4)
                {
                    /* First output row is filtered:                             */
                    /* dst = sat(ref_left + ((ref_top[col] - corner) >> 1))      */
                    pu1_ref_val1 = vld1_lane_u32((uint32_t *)pu1_ref_4_two_nt_plus1, pu1_ref_val1, 0);
                    sub_val = vsubl_u8(vreinterpret_u8_u32(pu1_ref_val1), dup_sub);
                    shift_val = vshrq_n_s16(vreinterpretq_s16_u16(sub_val), 1);
                    add_sat = vqaddq_s16(shift_val, vreinterpretq_s16_u16(dup_add));
                    round_val = vqmovun_s16(add_sat);
                    vst1_lane_u32((uint32_t *)pu1_dst_4, vreinterpret_u32_u8(round_val), 0);
                    pu1_dst_4 += dst_strd;
                    /* Remaining three rows replicate left reference pixels;     */
                    /* lanes 2, 1, 0 walk upward through the left column         */
                    pu1_ref_val2 = vld1_lane_u32((uint32_t *)pu1_ref_4_two_nt_minus_nt, pu1_ref_val2, 0);
                    dup_val = vdup_lane_u8(vreinterpret_u8_u32(pu1_ref_val2), 2);
                    vst1_lane_u32((uint32_t *)pu1_dst_4, vreinterpret_u32_u8(dup_val), 0);
                    pu1_dst_4 += dst_strd;
                    dup_val = vdup_lane_u8(vreinterpret_u8_u32(pu1_ref_val2), 1);
                    vst1_lane_u32((uint32_t *)pu1_dst_4, vreinterpret_u32_u8(dup_val), 0);
                    pu1_dst_4 += dst_strd;
                    dup_val = vdup_lane_u8(vreinterpret_u8_u32(pu1_ref_val2), 0);
                    vst1_lane_u32((uint32_t *)pu1_dst_4, vreinterpret_u32_u8(dup_val), 0);
                    pu1_dst_4 += dst_strd;
                }
                /* worst cases */
                pu1_ref_4_two_nt_minus_nt += 3;
                pu1_ref_4_two_nt_plus1 += 4;
                pu1_dst_4 = (pu1_dst_4_tmp + 4);
            }
        }
        /* dup_1 - dup_8 are variables to load the duplicated values from the loaded source */
        /* naming of variables made according to the operation(instructions) it performs */
        /* Loops are unrolled by 4 and 8 considering the fact the input width is either multiple of 4 or 8 */
        /* rows and columns are unrolled by 8, when the width is multiple of 8 */
        else
        {
            UWORD8 *pu1_ref_tmp_1 = pu1_ref;
            UWORD8 *pu1_ref_tmp_2 = pu1_ref;
            UWORD8 *pu1_dst_tmp_1 = pu1_dst;
            UWORD8 *pu1_dst_tmp_2 = pu1_dst + dst_strd;
            UWORD8 *pu1_dst_tmp_3 = pu1_dst + dst_strd;
            uint8x8_t dup_sub, src_tmp, src_tmp_1, round_val, dup_1, dup_2, dup_3, dup_4, dup_5, dup_6, dup_7, dup_8, rev_res;
            uint16x8_t sub_res, dup_add;
            int16x8_t shift_res, add_res;
            dup_sub = vdup_n_u8(pu1_ref[two_nt]);
            dup_add = vdupq_n_u16(pu1_ref[two_nt - 1]);
            pu1_ref_tmp_1 += (two_nt + 1);
            pu1_ref_tmp_2 += (two_nt - 1);
            /* First row: gradient-filtered from the top reference row,      */
            /* dst = sat(ref_left + ((ref_top[col] - corner) >> 1)),         */
            /* 8 pixels per iteration                                        */
            for(col = nt; col > 0; col -= 8)
            {
                src_tmp = vld1_u8(pu1_ref_tmp_1);
                pu1_ref_tmp_1 += 8;
                sub_res = vsubl_u8(src_tmp, dup_sub);
                shift_res = vshrq_n_s16(vreinterpretq_s16_u16(sub_res), 1);
                add_res = vqaddq_s16(shift_res, vreinterpretq_s16_u16(dup_add));
                round_val = vqmovun_s16(add_res);
                vst1_u8(pu1_dst_tmp_1, round_val);
                pu1_dst_tmp_1 += 8;
            }
            /* Rows 1..nt-1: each row replicates one left reference sample;  */
            /* 8 rows are produced per iteration of the outer loop           */
            for(row = nt; row > 0; row -= 8)
            {
                pu1_ref_tmp_2 -= 8;
                src_tmp_1 = vld1_u8(pu1_ref_tmp_2);
                rev_res = vrev64_u8(src_tmp_1); /* Reversing the loaded values */
                dup_1 = vdup_lane_u8(rev_res, 0);
                dup_2 = vdup_lane_u8(rev_res, 1);
                dup_3 = vdup_lane_u8(rev_res, 2);
                dup_4 = vdup_lane_u8(rev_res, 3);
                dup_5 = vdup_lane_u8(rev_res, 4);
                dup_6 = vdup_lane_u8(rev_res, 5);
                dup_7 = vdup_lane_u8(rev_res, 6);
                dup_8 = vdup_lane_u8(rev_res, 7);
                for(col = nt; col > 0; col -= 8)
                {
                    pu1_dst_tmp_2 = pu1_dst_tmp_3;
                    vst1_u8(pu1_dst_tmp_2, dup_1);
                    pu1_dst_tmp_2 += dst_strd;
                    vst1_u8(pu1_dst_tmp_2, dup_2);
                    pu1_dst_tmp_2 += dst_strd;
                    vst1_u8(pu1_dst_tmp_2, dup_3);
                    pu1_dst_tmp_2 += dst_strd;
                    vst1_u8(pu1_dst_tmp_2, dup_4);
                    pu1_dst_tmp_2 += dst_strd;
                    vst1_u8(pu1_dst_tmp_2, dup_5);
                    pu1_dst_tmp_2 += dst_strd;
                    vst1_u8(pu1_dst_tmp_2, dup_6);
                    pu1_dst_tmp_2 += dst_strd;
                    vst1_u8(pu1_dst_tmp_2, dup_7);
                    pu1_dst_tmp_2 += dst_strd;
                    /* For last set of 8 rows only 7 rows need to be updated since first row is already written */
                    if(row != 8)
                        vst1_u8(pu1_dst_tmp_2, dup_8);
                    pu1_dst_tmp_2 += dst_strd;
                    pu1_dst_tmp_3 += 8;
                }
                pu1_dst_tmp_2 -= (nt - 8);
                pu1_dst_tmp_3 = pu1_dst_tmp_2;
            }
        }
    }
}
/* INTRA_PRED_LUMA_HORZ */
/**
*******************************************************************************
*
* @brief
* Intra prediction interpolation filter for vertical luma variable.
*
* @par Description:
* Vertical intraprediction with reference neighboring samples location
* pointed by 'pu1_ref' to the TU block location pointed by 'pu1_dst'
*
* @param[in] pu1_src
* UWORD8 pointer to the source
*
* @param[out] pu1_dst
* UWORD8 pointer to the destination
*
* @param[in] src_strd
* integer source stride
*
* @param[in] dst_strd
* integer destination stride
*
* @param[in] nt
* integer Transform Block size
*
* @param[in] wd
* integer width of the array
*
* @returns
*
* @remarks
* None
*
*******************************************************************************
*/
void ihevc_intra_pred_luma_ver_neonintr(UWORD8 *pu1_ref,
                                        WORD32 src_strd,
                                        UWORD8 *pu1_dst,
                                        WORD32 dst_strd,
                                        WORD32 nt,
                                        WORD32 mode)
{
    WORD32 row, col;
    WORD32 two_nt;
    UNUSED(src_strd);
    UNUSED(mode);
    two_nt = 2 * nt;
    UWORD8 *pu1_dst_tmp = pu1_dst;
    UWORD8 *pu1_ref_tmp_1 = pu1_ref + two_nt + 1;
    uint8x8_t pu1_val_two_nt_1_col;
    if(nt == 32)
    {
        /* nt == 32: no edge filtering; every output row is a straight copy */
        /* of the top reference row pu1_ref[two_nt + 1 ..], 8 pixels a time */
        pu1_dst_tmp = pu1_dst;
        for(row = 0; row < nt; row++)
        {
            for(col = nt; col > 0; col -= 8)
            {
                pu1_val_two_nt_1_col = vld1_u8(pu1_ref_tmp_1);
                pu1_ref_tmp_1 += 8;
                vst1_u8(pu1_dst_tmp, pu1_val_two_nt_1_col);
                pu1_dst_tmp += 8;
            }
            pu1_ref_tmp_1 -= nt;          /* rewind to the start of the top row */
            pu1_dst_tmp += dst_strd - nt; /* step to the next output row        */
        }
    }
    else
    {
        /* naming of variables made according to the operation(instructions) it performs */
        /* (eg. shift_val which contains the shifted value, */
        /* add_sat which has add and saturated value) */
        /* Loops are unrolled by 4 and 8 considering the fact the input width is either multiple of 4 or 8 */
        /* rows and columns are unrolled by 4, when the width is multiple of 4 */
        if(0 != (nt & 7))
        {
            WORD32 cond_4 = 0;
            UWORD8 *pu1_ref_val1 = pu1_ref;
            UWORD8 *pu1_ref_val2 = pu1_ref;
            UWORD8 *pu1_ref_val3 = pu1_ref;
            UWORD8 *pu1_dst_val1 = pu1_dst;
            UWORD8 *pu1_dst_val2 = pu1_dst;
            UWORD8 *pu1_dst_val3 = pu1_dst;
            uint8x8_t dup_2_sub, round_val, vext_val;
            uint16x8_t dup_2_add;
            uint32x2_t src_val1, src_val2, src_val3;
            uint16x8_t sub_val;
            int16x8_t shift_val1, add_sat;
            uint64x1_t shift_val2;
            src_val1 = vdup_n_u32(0);
            src_val2 = vdup_n_u32(0);
            src_val3 = vdup_n_u32(0);
            pu1_ref_val1 += (two_nt - nt);
            pu1_ref_val3 += (two_nt + 2);
            pu1_ref_val2 += (two_nt + 1);
            /* dup_2_sub = top-left corner, dup_2_add = first top sample;  */
            /* used by the first-column gradient filter below              */
            dup_2_sub = vdup_n_u8(pu1_ref[two_nt]);
            dup_2_add = vdupq_n_u16(pu1_ref[two_nt + 1]);
            /* loops to store the first nt sets of values in the destination */
            for(row = nt; row > 0; row -= 4)
            {
                /* First pass (cond_4 == 0): column 0 is filtered, columns */
                /* 1..3 come from the top row shifted in via vext          */
                for(col = nt; (col > 0) && (cond_4 == 0); col -= 4)
                {
                    /* unrolling s2_predpixel = pu1_ref[two_nt + 1] + ((pu1_ref[two_nt - 1 - row] - pu1_ref[two_nt]) >> 1); here*/
                    src_val1 = vld1_lane_u32((uint32_t *)pu1_ref_val1, src_val1, 1);
                    sub_val = vsubl_u8(vreinterpret_u8_u32(src_val1), dup_2_sub);
                    shift_val1 = vshrq_n_s16(vreinterpretq_s16_u16(sub_val), 1);
                    add_sat = vqaddq_s16(shift_val1, vreinterpretq_s16_u16(dup_2_add));
                    round_val = vqmovun_s16(add_sat);
                    /* unrolling pu1_dst[row * dst_strd + col] = pu1_ref[two_nt + 1 + col]; here*/
                    src_val2 = vld1_lane_u32((uint32_t *)pu1_ref_val3, src_val2, 0);
                    vext_val = vext_u8(round_val, vreinterpret_u8_u32(src_val2), 7);
                    vst1_lane_u32((uint32_t *)pu1_dst_val1, vreinterpret_u32_u8(vext_val), 0);
                    pu1_dst_val1 += dst_strd;
                    /* The u64 shifts select successive filtered lanes for  */
                    /* each of the next three rows before the vext combine  */
                    shift_val2 = vshl_n_u64(vreinterpret_u64_u8(round_val), 8);
                    vext_val = vext_u8(vreinterpret_u8_u64(shift_val2), vreinterpret_u8_u32(src_val2), 7);
                    vst1_lane_u32((uint32_t *)pu1_dst_val1, vreinterpret_u32_u8(vext_val), 0);
                    pu1_dst_val1 += dst_strd;
                    shift_val2 = vshl_n_u64(vreinterpret_u64_u8(round_val), 16);
                    vext_val = vext_u8(vreinterpret_u8_u64(shift_val2), vreinterpret_u8_u32(src_val2), 7);
                    vst1_lane_u32((uint32_t *)pu1_dst_val1, vreinterpret_u32_u8(vext_val), 0);
                    pu1_dst_val1 += dst_strd;
                    shift_val2 = vshl_n_u64(vreinterpret_u64_u8(round_val), 24);
                    vext_val = vext_u8(vreinterpret_u8_u64(shift_val2), vreinterpret_u8_u32(src_val2), 7);
                    vst1_lane_u32((uint32_t *)pu1_dst_val1, vreinterpret_u32_u8(vext_val), 0);
                    pu1_dst_val1 += dst_strd;
                    pu1_ref_val1 -= 4;
                }
                /* loop to store next sets of eight values in the destination */
                for(col = nt - 3; (col > 0) && (cond_4 == 1); col -= 4)
                {
                    src_val3 = vld1_lane_u32((uint32_t *)pu1_ref_val2, src_val3, 0);
                    vst1_u8(pu1_dst_val2, vreinterpret_u8_u32(src_val3));
                    pu1_dst_val2 += dst_strd;
                    vst1_u8(pu1_dst_val2, vreinterpret_u8_u32(src_val3));
                    pu1_dst_val2 += dst_strd;
                    vst1_u8(pu1_dst_val2, vreinterpret_u8_u32(src_val3));
                    pu1_dst_val2 += dst_strd;
                    vst1_u8(pu1_dst_val2, vreinterpret_u8_u32(src_val3));
                    pu1_dst_val2 += dst_strd;
                }
                pu1_ref_val2 += 4;
                pu1_dst_val3 += 4;
                pu1_dst_val2 = pu1_dst_val3;
                cond_4 = 1; /* after the first strip, switch to plain-copy columns */
            }
        }
        /* rows and columns are unrolled by 8, when the width is multiple of 8 */
        else
        {
            WORD32 cond = 0, col_1;
            UWORD8 *pu1_dst_tmp_1 = pu1_dst;
            UWORD8 *pu1_dst_tmp_2 = pu1_dst;
            UWORD8 *pu1_dst_tmp_3 = pu1_dst;
            UWORD8 *pu1_ref_tmp_1 = pu1_ref;
            UWORD8 *pu1_ref_tmp_2 = pu1_ref;
            UWORD8 *pu1_ref_tmp_3 = pu1_ref;
            uint8x8_t pu1_src_tmp1;
            uint8x8_t pu1_src_tmp2;
            uint8x8_t dup_sub;
            uint16x8_t dup_add;
            int16x8_t subsh_val;
            int16x8_t addsat_val;
            uint16x8_t sub_val;
            uint8x8_t round_val;
            uint8x8_t vext_t;
            uint64x1_t shift_64;
            dup_sub = vdup_n_u8(pu1_ref[two_nt]);
            dup_add = vdupq_n_u16(pu1_ref[two_nt + 1]);
            pu1_ref_tmp_1 += (two_nt);
            pu1_ref_tmp_1 -= 8;
            pu1_ref_tmp_2 += (two_nt + 2);
            pu1_ref_tmp_3 += (two_nt + 1);
            /* loops to store the first nt sets of values in the destination */
            for(row = nt; row > 0; row -= 8)
            {
                /* First pass (cond == 0): the left-most 8-wide strip where */
                /* column 0 carries the gradient-filtered left samples      */
                for(col = (nt - 1); (col > 0) && (cond == 0); col -= 8)
                {
                    pu1_src_tmp1 = vld1_u8(pu1_ref_tmp_1);
                    sub_val = vsubl_u8(pu1_src_tmp1, dup_sub);
                    subsh_val = vshrq_n_s16(vreinterpretq_s16_u16(sub_val), 1);
                    addsat_val = vqaddq_s16(subsh_val, vreinterpretq_s16_u16(dup_add));
                    round_val = vqmovun_s16(addsat_val);
                    /* unrolling pu1_dst[row * dst_strd + col] = pu1_ref[two_nt + 1 + col]; here*/
                    pu1_src_tmp2 = vld1_u8(pu1_ref_tmp_2);
                    vext_t = vext_u8(round_val, pu1_src_tmp2, 7);
                    vst1_u8(pu1_dst_tmp_1, vext_t);
                    pu1_dst_tmp_1 += dst_strd;
                    /* Each subsequent row shifts one more filtered lane    */
                    /* into position before combining with the top row      */
                    shift_64 = vshl_n_u64(vreinterpret_u64_u8(round_val), 8);
                    vext_t = vext_u8(vreinterpret_u8_u64(shift_64), pu1_src_tmp2, 7);
                    vst1_u8(pu1_dst_tmp_1, vext_t);
                    pu1_dst_tmp_1 += dst_strd;
                    shift_64 = vshl_n_u64(vreinterpret_u64_u8(round_val), 16);
                    vext_t = vext_u8(vreinterpret_u8_u64(shift_64), pu1_src_tmp2, 7);
                    vst1_u8(pu1_dst_tmp_1, vext_t);
                    pu1_dst_tmp_1 += dst_strd;
                    shift_64 = vshl_n_u64(vreinterpret_u64_u8(round_val), 24);
                    vext_t = vext_u8(vreinterpret_u8_u64(shift_64), pu1_src_tmp2, 7);
                    vst1_u8(pu1_dst_tmp_1, vext_t);
                    pu1_dst_tmp_1 += dst_strd;
                    shift_64 = vshl_n_u64(vreinterpret_u64_u8(round_val), 32);
                    vext_t = vext_u8(vreinterpret_u8_u64(shift_64), pu1_src_tmp2, 7);
                    vst1_u8(pu1_dst_tmp_1, vext_t);
                    pu1_dst_tmp_1 += dst_strd;
                    shift_64 = vshl_n_u64(vreinterpret_u64_u8(round_val), 40);
                    vext_t = vext_u8(vreinterpret_u8_u64(shift_64), pu1_src_tmp2, 7);
                    vst1_u8(pu1_dst_tmp_1, vext_t);
                    pu1_dst_tmp_1 += dst_strd;
                    shift_64 = vshl_n_u64(vreinterpret_u64_u8(round_val), 48);
                    vext_t = vext_u8(vreinterpret_u8_u64(shift_64), pu1_src_tmp2, 7);
                    vst1_u8(pu1_dst_tmp_1, vext_t);
                    pu1_dst_tmp_1 += dst_strd;
                    shift_64 = vshl_n_u64(vreinterpret_u64_u8(round_val), 56);
                    vext_t = vext_u8(vreinterpret_u8_u64(shift_64), pu1_src_tmp2, 7);
                    vst1_u8(pu1_dst_tmp_1, vext_t);
                    pu1_dst_tmp_1 += dst_strd;
                    pu1_ref_tmp_1 -= 8;
                }
                /* loop to store next sets of eight values in the destination */
                for(col_1 = nt - 7; (col_1 > 0) && (cond == 1); col_1 -= 8)
                {
                    pu1_src_tmp2 = vld1_u8(pu1_ref_tmp_3);
                    vst1_u8(pu1_dst_tmp_2, pu1_src_tmp2);
                    pu1_dst_tmp_2 += dst_strd;
                    vst1_u8(pu1_dst_tmp_2, pu1_src_tmp2);
                    pu1_dst_tmp_2 += dst_strd;
                    vst1_u8(pu1_dst_tmp_2, pu1_src_tmp2);
                    pu1_dst_tmp_2 += dst_strd;
                    vst1_u8(pu1_dst_tmp_2, pu1_src_tmp2);
                    pu1_dst_tmp_2 += dst_strd;
                    vst1_u8(pu1_dst_tmp_2, pu1_src_tmp2);
                    pu1_dst_tmp_2 += dst_strd;
                    vst1_u8(pu1_dst_tmp_2, pu1_src_tmp2);
                    pu1_dst_tmp_2 += dst_strd;
                    vst1_u8(pu1_dst_tmp_2, pu1_src_tmp2);
                    pu1_dst_tmp_2 += dst_strd;
                    vst1_u8(pu1_dst_tmp_2, pu1_src_tmp2);
                    pu1_dst_tmp_2 += dst_strd;
                }
                pu1_ref_tmp_3 += 8;
                pu1_dst_tmp_3 += 8;
                pu1_dst_tmp_2 = pu1_dst_tmp_3;
                cond = 1; /* after the first strip, remaining columns are plain copies */
            }
        }
    }
}
/* INTRA_PRED_LUMA_VER */
/**
*******************************************************************************
*
* @brief
* Intra prediction interpolation filter for luma mode2.
*
* @par Description:
* Intraprediction for mode 2 (sw angle) with reference neighboring samples
* location pointed by 'pu1_ref' to the TU block location pointed by
* 'pu1_dst'
*
* @param[in] pu1_src
* UWORD8 pointer to the source
*
* @param[out] pu1_dst
* UWORD8 pointer to the destination
*
* @param[in] src_strd
* integer source stride
*
* @param[in] dst_strd
* integer destination stride
*
* @param[in] nt
* integer Transform Block size
*
* @param[in] wd
* integer width of the array
*
* @returns
*
* @remarks
* None
*
*******************************************************************************
*/
void ihevc_intra_pred_luma_mode2_neonintr(UWORD8 *pu1_ref,
                                          WORD32 src_strd,
                                          UWORD8 *pu1_dst,
                                          WORD32 dst_strd,
                                          WORD32 nt,
                                          WORD32 mode)
{
    WORD32 row, col;
    WORD32 two_nt;
    UNUSED(src_strd);
    UNUSED(mode);
    /* rev_res naming has been made to have the reverse result value in it */
    /* Loops are unrolled by 4 and 8 considering the fact the input width is either multiple of 4 or 8 */
    /* rows and columns are unrolled by 4, when the width is multiple of 4 */
    if(0 != (nt & 7))
    {
        UWORD8 *pu1_ref_tmp = pu1_ref;
        UWORD8 *pu1_dst_tmp = pu1_dst;
        uint8x8_t pu1_src_val, rev_res;
        uint64x1_t shift_res;
        for(col = nt; col > 0; col -= 4)
        {
            for(row = nt; row > 0; row -= 4)
            {
                /* unrolling all col & rows for pu1_dst[row + (col * dst_strd)] = pu1_ref[two_nt - col - idx - 1]; */
                /* Reference samples are loaded forward, reversed, then    */
                /* each output row shifts one byte further along the       */
                /* reversed (descending-address) reference run             */
                pu1_src_val = vld1_u8(pu1_ref_tmp);
                shift_res = vshl_n_u64(vreinterpret_u64_u8(pu1_src_val), 8);
                rev_res = vrev64_u8(vreinterpret_u8_u64(shift_res));
                vst1_lane_u32((uint32_t *)pu1_dst_tmp, vreinterpret_u32_u8(rev_res), 0);
                pu1_dst_tmp += dst_strd;
                shift_res = vshr_n_u64(vreinterpret_u64_u8(rev_res), 8);
                vst1_lane_u32((uint32_t *)pu1_dst_tmp, vreinterpret_u32_u64(shift_res), 0);
                pu1_dst_tmp += dst_strd;
                shift_res = vshr_n_u64(shift_res, 8);
                vst1_lane_u32((uint32_t *)pu1_dst_tmp, vreinterpret_u32_u64(shift_res), 0);
                pu1_dst_tmp += dst_strd;
                shift_res = vshr_n_u64(shift_res, 8);
                vst1_lane_u32((uint32_t *)pu1_dst_tmp, vreinterpret_u32_u64(shift_res), 0);
                pu1_dst_tmp += dst_strd;
            }
        }
    }
    /* rev_val_second, rev_val_first to reverse the loaded values in order to get the values in right order */
    /* shift_64 to shift the reversed 2nd values to get the value what we need */
    /* rows and columns are unrolled by 8, when the width is multiple of 8 */
    else
    {
        UWORD8 *pu1_ref_two_nt_minus2 = pu1_ref;
        UWORD8 *pu1_dst_tmp = pu1_dst;
        UWORD8 *pu1_dst_tmp_plus8 = pu1_dst;
        uint8x8_t pu1_src_val1, pu1_src_val2, vext_t, rev_val_second, rev_val_first;
        uint64x1_t shift_val;
        two_nt = 2 * nt;
        pu1_ref_two_nt_minus2 += (two_nt);
        pu1_ref_two_nt_minus2 -= 8;
        for(col = nt; col > 0; col -= 8)
        {
            for(row = nt; row > 0; row -= 8)
            {
                /* Two adjacent 8-byte groups are loaded and reversed;     */
                /* each successive output row slides the 16-byte window    */
                /* one byte further via vext                               */
                pu1_src_val2 = vld1_u8(pu1_ref_two_nt_minus2);
                rev_val_first = vrev64_u8(pu1_src_val2);
                pu1_ref_two_nt_minus2 -= 8;
                pu1_src_val1 = vld1_u8(pu1_ref_two_nt_minus2);
                rev_val_second = vrev64_u8(pu1_src_val1);
                vext_t = vext_u8(rev_val_first, rev_val_second, 1);
                vst1_u8(pu1_dst_tmp, vext_t);
                pu1_dst_tmp += dst_strd;
                shift_val = vshr_n_u64(vreinterpret_u64_u8(rev_val_second), 8);
                vext_t = vext_u8(vext_t, vreinterpret_u8_u64(shift_val), 1);
                vst1_u8(pu1_dst_tmp, vext_t);
                pu1_dst_tmp += dst_strd;
                shift_val = vshr_n_u64(vreinterpret_u64_u8(rev_val_second), 16);
                vext_t = vext_u8(vext_t, vreinterpret_u8_u64(shift_val), 1);
                vst1_u8(pu1_dst_tmp, vext_t);
                pu1_dst_tmp += dst_strd;
                shift_val = vshr_n_u64(vreinterpret_u64_u8(rev_val_second), 24);
                vext_t = vext_u8(vext_t, vreinterpret_u8_u64(shift_val), 1);
                vst1_u8(pu1_dst_tmp, vext_t);
                pu1_dst_tmp += dst_strd;
                shift_val = vshr_n_u64(vreinterpret_u64_u8(rev_val_second), 32);
                vext_t = vext_u8(vext_t, vreinterpret_u8_u64(shift_val), 1);
                vst1_u8(pu1_dst_tmp, vext_t);
                pu1_dst_tmp += dst_strd;
                shift_val = vshr_n_u64(vreinterpret_u64_u8(rev_val_second), 40);
                vext_t = vext_u8(vext_t, vreinterpret_u8_u64(shift_val), 1);
                vst1_u8(pu1_dst_tmp, vext_t);
                pu1_dst_tmp += dst_strd;
                shift_val = vshr_n_u64(vreinterpret_u64_u8(rev_val_second), 48);
                vext_t = vext_u8(vext_t, vreinterpret_u8_u64(shift_val), 1);
                vst1_u8(pu1_dst_tmp, vext_t);
                pu1_dst_tmp += dst_strd;
                shift_val = vshr_n_u64(vreinterpret_u64_u8(rev_val_second), 56);
                vext_t = vext_u8(vext_t, vreinterpret_u8_u64(shift_val), 1);
                vst1_u8(pu1_dst_tmp, vext_t);
                pu1_dst_tmp += dst_strd;
            }
            pu1_dst_tmp_plus8 += 8;
            pu1_dst_tmp = pu1_dst_tmp_plus8;
            pu1_ref_two_nt_minus2 += (nt - 8);
        }
    }
}
/* INTRA_PRED_LUMA_MODE2 */
/**
*******************************************************************************
*
* @brief
* Intra prediction interpolation filter for luma mode 18 & mode 34.
*
* @par Description:
* Intraprediction for mode 18 (sw angle) and mode 34 (ne angle) with reference neighboring
* samples location pointed by 'pu1_ref' to the TU block location pointed by
* 'pu1_dst'
*
* @param[in] pu1_src
* UWORD8 pointer to the source
*
* @param[out] pu1_dst
* UWORD8 pointer to the destination
*
* @param[in] src_strd
* integer source stride
*
* @param[in] dst_strd
* integer destination stride
*
* @param[in] nt
* integer Transform Block size
*
* @param[in] wd
* integer width of the array
*
* @returns
*
* @remarks
* None
*
*******************************************************************************
*/
void ihevc_intra_pred_luma_mode_18_34_neonintr(UWORD8 *pu1_ref,
                                               WORD32 src_strd,
                                               UWORD8 *pu1_dst,
                                               WORD32 dst_strd,
                                               WORD32 nt,
                                               WORD32 mode)
{
    WORD32 row, col, idx;
    WORD32 intraPredAngle = 32;
    WORD32 two_nt;
    UNUSED(src_strd);
    two_nt = 2 * nt;
    UWORD8 *pu1_ref_tmp = pu1_ref;
    UWORD8 *pu1_ref_tmp1 = pu1_ref;
    UWORD8 *pu1_dst_tmp = pu1_dst;
    UWORD8 *pu1_dst_tmp_plus8 = pu1_dst;
    uint8x8_t src_tmp_1st, src_tmp_2nd, vext1, vext2, vext3, vext4, vext5, vext6, vext7;
    /* src_tmp_1st, src_tmp_2nd are named as to load the 1st eight and next 8 values from source(pu1_ref) */
    /* vext1 - vext7 are named to do vext operation between 2 loaded values and to handle dual issue */
    /* Loops are unrolled by 4 and 8 considering the fact the input width is either multiple of 4 or 8 */
    /* rows and columns are unrolled by 8, when the width is multiple of 8 */
    /* loops are maintained separately for mode18 and mode34 */
    /* cond to allow multiples of 8 */
    if(0 == (nt & 7))
    {
        if(mode == 34)
        {
            /* Mode 34 (+32 angle): each output row is the previous one   */
            /* shifted left by one reference sample                        */
            pu1_ref_tmp += (two_nt + 2);
            for(row = nt; row > 0; row -= 8)
            {
                for(col = nt; col > 0; col -= 8)
                {
                    /* Loading 1st eight values */
                    src_tmp_1st = vld1_u8(pu1_ref_tmp);
                    pu1_ref_tmp += 8;
                    /* Loading next eight values */
                    src_tmp_2nd = vld1_u8(pu1_ref_tmp);
                    /* UNROLLED pu1_dst[col + (row * dst_strd)] = pu1_ref[two_nt + col + idx + 1] */
                    vext1 = vext_u8(src_tmp_1st, src_tmp_2nd, 1);
                    vst1_u8(pu1_dst_tmp, src_tmp_1st);
                    pu1_dst_tmp += dst_strd;
                    vext2 = vext_u8(src_tmp_1st, src_tmp_2nd, 2);
                    vst1_u8(pu1_dst_tmp, vext1);
                    pu1_dst_tmp += dst_strd;
                    vext3 = vext_u8(src_tmp_1st, src_tmp_2nd, 3);
                    vst1_u8(pu1_dst_tmp, vext2);
                    pu1_dst_tmp += dst_strd;
                    vext4 = vext_u8(src_tmp_1st, src_tmp_2nd, 4);
                    vst1_u8(pu1_dst_tmp, vext3);
                    pu1_dst_tmp += dst_strd;
                    vext5 = vext_u8(src_tmp_1st, src_tmp_2nd, 5);
                    vst1_u8(pu1_dst_tmp, vext4);
                    pu1_dst_tmp += dst_strd;
                    vext6 = vext_u8(src_tmp_1st, src_tmp_2nd, 6);
                    vst1_u8(pu1_dst_tmp, vext5);
                    pu1_dst_tmp += dst_strd;
                    vext7 = vext_u8(src_tmp_1st, src_tmp_2nd, 7);
                    vst1_u8(pu1_dst_tmp, vext6);
                    pu1_dst_tmp += dst_strd;
                    vst1_u8(pu1_dst_tmp, vext7);
                    pu1_dst_tmp += dst_strd;
                }
                pu1_dst_tmp_plus8 += 8;
                pu1_dst_tmp = pu1_dst_tmp_plus8;
                pu1_ref_tmp -= (nt - 8);
            }
        }
        else /* Loop for mode 18 */
        {
            /* Mode 18 (-32 angle): each output row is the previous one   */
            /* shifted right by one reference sample                       */
            pu1_ref_tmp += (two_nt);
            for(row = nt; row > 0; row -= 8)
            {
                for(col = nt; col > 0; col -= 8)
                {
                    /* Loading 1st eight values */
                    src_tmp_1st = vld1_u8(pu1_ref_tmp);
                    pu1_ref_tmp -= 8;
                    /* Loading next eight values */
                    src_tmp_2nd = vld1_u8(pu1_ref_tmp);
                    /* UNROLLED pu1_dst[col + (row * dst_strd)] = pu1_ref[two_nt + col + idx + 1] */
                    vext1 = vext_u8(src_tmp_2nd, src_tmp_1st, 7);
                    vst1_u8(pu1_dst_tmp, src_tmp_1st);
                    pu1_dst_tmp += dst_strd;
                    vext2 = vext_u8(src_tmp_2nd, src_tmp_1st, 6);
                    vst1_u8(pu1_dst_tmp, vext1);
                    pu1_dst_tmp += dst_strd;
                    vext3 = vext_u8(src_tmp_2nd, src_tmp_1st, 5);
                    vst1_u8(pu1_dst_tmp, vext2);
                    pu1_dst_tmp += dst_strd;
                    vext4 = vext_u8(src_tmp_2nd, src_tmp_1st, 4);
                    vst1_u8(pu1_dst_tmp, vext3);
                    pu1_dst_tmp += dst_strd;
                    vext5 = vext_u8(src_tmp_2nd, src_tmp_1st, 3);
                    vst1_u8(pu1_dst_tmp, vext4);
                    pu1_dst_tmp += dst_strd;
                    vext6 = vext_u8(src_tmp_2nd, src_tmp_1st, 2);
                    vst1_u8(pu1_dst_tmp, vext5);
                    pu1_dst_tmp += dst_strd;
                    vext7 = vext_u8(src_tmp_2nd, src_tmp_1st, 1);
                    vst1_u8(pu1_dst_tmp, vext6);
                    pu1_dst_tmp += dst_strd;
                    vst1_u8(pu1_dst_tmp, vext7);
                    pu1_dst_tmp += dst_strd;
                }
                pu1_dst_tmp_plus8 += 8;
                pu1_dst_tmp = pu1_dst_tmp_plus8;
                pu1_ref_tmp += (nt + 8);
            }
        }
    }
    /* rows and columns are unrolled by 4, when the width is multiple of 4 */
    else /* loop for multiples of 4 */
    {
        uint8x8_t src_val1;
        uint8x8_t src_val2;
        if(mode == 18)
            intraPredAngle = -32;
        else if(mode == 34)
            intraPredAngle = 32;
        for(row = 0; row < nt; row += 2)
        {
            /* unrolling 2 rows */
            /* idx is the whole-sample displacement for this row:          */
            /* ((row + 1) * angle) >> 5, per the generic angular formula   */
            idx = ((row + 1) * intraPredAngle) >> 5;
            pu1_ref_tmp = pu1_ref + two_nt + idx + 1;
            src_val1 = vld1_u8(pu1_ref_tmp);
            idx = ((row + 2) * intraPredAngle) >> 5;
            pu1_ref_tmp1 = pu1_ref + two_nt + idx + 1;
            src_val2 = vld1_u8(pu1_ref_tmp1);
            /* unrolling 4 col */
            for(col = nt; col > 0; col -= 4)
            {
                pu1_dst_tmp = pu1_dst;
                vst1_lane_u32((uint32_t *)pu1_dst_tmp, vreinterpret_u32_u8(src_val1), 0);
                pu1_dst_tmp += dst_strd;
                vst1_lane_u32((uint32_t *)pu1_dst_tmp, vreinterpret_u32_u8(src_val2), 0);
                pu1_dst += 4;
            }
            pu1_dst += 2 * dst_strd - nt; /* advance past the two rows just written */
        }
    }
}
/* INTRA_PRED_LUMA_MODE_18_34 */
/**
*******************************************************************************
*
* @brief
* Intra prediction interpolation filter for luma mode 3 to mode 9
*
* @par Description:
* Intraprediction for mode 3 to 9 (positive angle, horizontal mode ) with
* reference neighboring samples location pointed by 'pu1_ref' to the TU
* block location pointed by 'pu1_dst'
*
* @param[in] pu1_src
* UWORD8 pointer to the source
*
* @param[out] pu1_dst
* UWORD8 pointer to the destination
*
* @param[in] src_strd
* integer source stride
*
* @param[in] dst_strd
* integer destination stride
*
* @param[in] nt
* integer Transform Block size
*
* @param[in] mode
* integer intraprediction mode
*
* @returns
*
* @remarks
* None
*
*******************************************************************************
*/
void ihevc_intra_pred_luma_mode_3_to_9_neonintr(UWORD8 *pu1_ref,
                                                WORD32 src_strd,
                                                UWORD8 *pu1_dst,
                                                WORD32 dst_strd,
                                                WORD32 nt,
                                                WORD32 mode)
{
    WORD32 row, col;
    WORD32 intra_pred_ang;
    /* fract starts at 100 (> any valid fract of 0..31) so the reference  */
    /* pointers are not advanced on the very first column                 */
    WORD32 pos, fract = 100, fract_prev;
    UNUSED(src_strd);
    if(0 == (nt & 7))
    {
        UWORD8 *pu1_ref_main_idx = pu1_ref;
        UWORD8 *pu1_ref_main_idx_1 = pu1_ref;
        UWORD8 *pu1_dst_tmp1 = pu1_dst;
        UWORD8 *pu1_dst_tmp2 = pu1_dst;
        WORD32 two_nt = 2 * nt;
        pu1_ref_main_idx += two_nt;
        pu1_ref_main_idx_1 += two_nt - 1;
        uint8x8_t dup_const_fract, dup_const_32_fract, ref_main_idx, ref_main_idx_1;
        uint8x8_t shift_res;
        uint16x8_t mul_res1, mul_res2, add_res;
        /* Intra Pred Angle according to the mode */
        intra_pred_ang = gai4_ihevc_ang_table[mode];
        pu1_ref_main_idx -= 8;
        pu1_ref_main_idx_1 -= 8;
        for(col = 0; col < nt; col++)
        {
            fract_prev = fract;
            pos = ((col + 1) * intra_pred_ang);
            fract = pos & (31);
            /* fract wrapped around => whole-sample index advanced by one */
            if(fract_prev < fract)
            {
                pu1_ref_main_idx += 1;
                pu1_ref_main_idx_1 += 1;
            }
            dup_const_fract = vdup_n_u8((uint8_t)fract);
            dup_const_32_fract = vdup_n_u8((uint8_t)(32 - fract));
            for(row = nt; row > 0; row -= 8)
            {
                /* Two-tap interpolation:                                  */
                /* out = (ref[i]*(32-fract) + ref[i-1]*fract + 16) >> 5    */
                ref_main_idx = vld1_u8(pu1_ref_main_idx);
                ref_main_idx_1 = vld1_u8(pu1_ref_main_idx_1);
                mul_res1 = vmull_u8(ref_main_idx, dup_const_32_fract);
                mul_res2 = vmull_u8(ref_main_idx_1, dup_const_fract);
                add_res = vaddq_u16(mul_res1, mul_res2);
                shift_res = vrshrn_n_u16(add_res, 5);
                /* Lanes stored 7..0: result vector runs opposite to the   */
                /* destination's row direction                             */
                vst1_lane_u8(pu1_dst_tmp1, shift_res, 7);
                pu1_dst_tmp1 += dst_strd;
                vst1_lane_u8(pu1_dst_tmp1, shift_res, 6);
                pu1_dst_tmp1 += dst_strd;
                vst1_lane_u8(pu1_dst_tmp1, shift_res, 5);
                pu1_dst_tmp1 += dst_strd;
                vst1_lane_u8(pu1_dst_tmp1, shift_res, 4);
                pu1_dst_tmp1 += dst_strd;
                vst1_lane_u8(pu1_dst_tmp1, shift_res, 3);
                pu1_dst_tmp1 += dst_strd;
                vst1_lane_u8(pu1_dst_tmp1, shift_res, 2);
                pu1_dst_tmp1 += dst_strd;
                vst1_lane_u8(pu1_dst_tmp1, shift_res, 1);
                pu1_dst_tmp1 += dst_strd;
                vst1_lane_u8(pu1_dst_tmp1, shift_res, 0);
                pu1_dst_tmp1 += dst_strd;
                pu1_ref_main_idx -= 8;
                pu1_ref_main_idx_1 -= 8;
            }
            /* Move to the next destination column and rewind references  */
            pu1_dst_tmp2 += 1;
            pu1_dst_tmp1 = pu1_dst_tmp2;
            pu1_ref_main_idx += nt;
            pu1_ref_main_idx_1 += nt;
            pu1_ref_main_idx -= 1;
            pu1_ref_main_idx_1 -= 1;
        }
    }
    else
    {
        UWORD8 *pu1_ref_tmp1 = pu1_ref;
        UWORD8 *pu1_ref_tmp2 = pu1_ref;
        UWORD8 *pu1_dst_tmp1 = pu1_dst;
        UWORD8 *pu1_dst_tmp2 = pu1_dst;
        pu1_ref_tmp1 += nt;
        pu1_ref_tmp2 += (nt - 1);
        uint8x8_t dup_fract, dup_32_fract, shift_res;
        uint16x8_t mul_res1, mul_res2, add_res;
        uint32x2_t pu1_ref_val1, pu1_ref_val2;
        pu1_ref_val1 = vdup_n_u32(0);
        pu1_ref_val2 = vdup_n_u32(0);
        /* Intra Pred Angle according to the mode */
        intra_pred_ang = gai4_ihevc_ang_table[mode];
        for(col = 0; col < nt; col++)
        {
            fract_prev = fract;
            pos = ((col + 1) * intra_pred_ang);
            fract = pos & (31);
            if(fract_prev < fract)
            {
                pu1_ref_tmp1 += 1;
                pu1_ref_tmp2 += 1;
            }
            dup_fract = vdup_n_u8((uint8_t)fract);
            dup_32_fract = vdup_n_u8((uint8_t)(32 - fract));
            for(row = nt; row > 0; row -= 4)
            {
                /* Same two-tap filter as above, 4 samples at a time       */
                pu1_ref_val1 = vld1_lane_u32((uint32_t *)pu1_ref_tmp1, pu1_ref_val1, 0);
                pu1_ref_val2 = vld1_lane_u32((uint32_t *)pu1_ref_tmp2, pu1_ref_val2, 0);
                mul_res1 = vmull_u8(vreinterpret_u8_u32(pu1_ref_val1), dup_32_fract);
                mul_res2 = vmull_u8(vreinterpret_u8_u32(pu1_ref_val2), dup_fract);
                add_res = vaddq_u16(mul_res1, mul_res2);
                shift_res = vrshrn_n_u16(add_res, 5);
                vst1_lane_u8(pu1_dst_tmp1, shift_res, 3);
                pu1_dst_tmp1 += dst_strd;
                vst1_lane_u8(pu1_dst_tmp1, shift_res, 2);
                pu1_dst_tmp1 += dst_strd;
                vst1_lane_u8(pu1_dst_tmp1, shift_res, 1);
                pu1_dst_tmp1 += dst_strd;
                vst1_lane_u8(pu1_dst_tmp1, shift_res, 0);
            }
            pu1_ref_tmp1 -= 1;
            pu1_ref_tmp2 -= 1;
            pu1_dst_tmp2 += 1;
            pu1_dst_tmp1 = pu1_dst_tmp2;
        }
    }
}
/**
*******************************************************************************
*
* @brief
* Intra prediction interpolation filter for luma mode 11 to mode 17
*
* @par Description:
* Intraprediction for mode 11 to 17 (negative angle, horizontal mode )
* with reference neighboring samples location pointed by 'pu1_ref' to the
* TU block location pointed by 'pu1_dst'
*
* @param[in] pu1_src
* UWORD8 pointer to the source
*
* @param[out] pu1_dst
* UWORD8 pointer to the destination
*
* @param[in] src_strd
* integer source stride
*
* @param[in] dst_strd
* integer destination stride
*
* @param[in] nt
* integer Transform Block size
*
* @param[in] mode
* integer intraprediction mode
*
* @returns
*
* @remarks
* None
*
*******************************************************************************
*/
void ihevc_intra_pred_luma_mode_11_to_17_neonintr(UWORD8 *pu1_ref,
                                                  WORD32 src_strd,
                                                  UWORD8 *pu1_dst,
                                                  WORD32 dst_strd,
                                                  WORD32 nt,
                                                  WORD32 mode)
{
    WORD32 row, col, k;
    WORD32 two_nt;
    WORD32 intra_pred_ang, inv_ang, inv_ang_sum;
    /* fract starts at 1000 (> any valid fract of 0..31) so the main      */
    /* reference pointers are not moved on the very first column          */
    WORD32 pos, fract = 1000, fract_prev;
    WORD32 ref_idx;
    UWORD8 *ref_main;
    UWORD8 *ref_main_tmp;
    UWORD8 *pu1_ref_tmp1 = pu1_ref;
    UWORD8 *pu1_ref_tmp2 = pu1_ref;
    UWORD8 *pu1_dst_tmp1 = pu1_dst;
    UWORD8 *pu1_dst_tmp2 = pu1_dst;
    /* Scratch buffer holding the reconstructed main reference array      */
    UWORD8 ref_temp[2 * MAX_CU_SIZE + 1];
    uint16x8_t mul_res1, mul_res2, add_res;
    uint8x8_t dup_const_fract, dup_const_32_fract;
    uint8x8_t ref_main_idx, ref_main_idx_1, shift_res;
    uint8x8_t ref_left_t;
    uint32x2_t ref_left_tmp;
    UNUSED(src_strd);
    ref_left_tmp = vdup_n_u32(0);
    inv_ang_sum = 128;
    two_nt = 2 * nt;
    intra_pred_ang = gai4_ihevc_ang_table[mode];
    inv_ang = gai4_ihevc_inv_ang_table[mode - 11];
    pu1_ref_tmp1 += two_nt;
    ref_main = ref_temp + (nt - 1);
    ref_main_tmp = ref_main;
    if(0 == (nt & 7))
    {
        /* Copy the left reference column (reversed) into ref_main,       */
        /* 8 samples per iteration                                        */
        pu1_ref_tmp2 += (two_nt - 7);
        for(k = nt - 1; k >= 0; k -= 8)
        {
            ref_left_t = vld1_u8(pu1_ref_tmp2);
            ref_left_t = vrev64_u8(ref_left_t);
            vst1_u8(ref_main_tmp, ref_left_t);
            ref_main_tmp += 8;
            pu1_ref_tmp2 -= 8;
        }
    }
    else
    {
        uint8x8_t rev_val;
        pu1_ref_tmp2 += (two_nt - (nt - 1));
        /* NOTE(review): this loop body never advances pu1_ref_tmp2 or    */
        /* ref_main_tmp; with nt == 4 (the only width reaching this path) */
        /* k goes 3 -> -5, so the body runs exactly once and a single     */
        /* 4-byte reversed store suffices                                 */
        for(k = nt - 1; k >= 0; k -= 8)
        {
            ref_left_tmp = vld1_lane_u32((uint32_t *)pu1_ref_tmp2, ref_left_tmp, 1);
            rev_val = vrev64_u8(vreinterpret_u8_u32(ref_left_tmp));
            vst1_lane_u32((uint32_t *)ref_main_tmp, vreinterpret_u32_u8(rev_val), 0);
        }
    }
    ref_main[nt] = pu1_ref[two_nt - nt];
    /* For horizontal modes, (ref main = ref left) (ref side = ref above) */
    ref_idx = (nt * intra_pred_ang) >> 5;
    /* SIMD Optimization can be done using look-up table for the loop */
    /* For negative angled derive the main reference samples from side */
    /* reference samples refer to section 8.4.4.2.6 */
    for(k = -1; k > ref_idx; k--)
    {
        inv_ang_sum += inv_ang;
        ref_main[k] = pu1_ref[two_nt + (inv_ang_sum >> 8)];
    }
    UWORD8 *ref_main_tmp1 = ref_main;
    UWORD8 *ref_main_tmp2 = ref_main;
    ref_main_tmp2 += 1; /* ref_main_tmp2 trails one sample ahead for the 2nd filter tap */
    if(0 == (nt & 7))
    {
        /* For the angles other then 45 degree, interpolation btw 2 neighboring */
        /* samples dependent on distance to obtain destination sample */
        for(col = 0; col < nt; col++)
        {
            fract_prev = fract;
            pos = ((col + 1) * intra_pred_ang);
            fract = pos & (31);
            /* fract wrapped around => step back one whole sample (negative angle) */
            if(fract_prev < fract)
            {
                ref_main_tmp1 -= 1;
                ref_main_tmp2 -= 1;
            }
            dup_const_fract = vdup_n_u8((uint8_t)fract);
            dup_const_32_fract = vdup_n_u8((uint8_t)(32 - fract));
            // Do linear filtering
            for(row = nt; row > 0; row -= 8)
            {
                /* out = (ref[i]*(32-fract) + ref[i+1]*fract + 16) >> 5   */
                ref_main_idx = vld1_u8(ref_main_tmp1);
                ref_main_idx_1 = vld1_u8(ref_main_tmp2);
                mul_res1 = vmull_u8(ref_main_idx, dup_const_32_fract);
                mul_res2 = vmull_u8(ref_main_idx_1, dup_const_fract);
                add_res = vaddq_u16(mul_res1, mul_res2);
                shift_res = vrshrn_n_u16(add_res, 5);
                vst1_lane_u8(pu1_dst_tmp1, shift_res, 0);
                pu1_dst_tmp1 += dst_strd;
                vst1_lane_u8(pu1_dst_tmp1, shift_res, 1);
                pu1_dst_tmp1 += dst_strd;
                vst1_lane_u8(pu1_dst_tmp1, shift_res, 2);
                pu1_dst_tmp1 += dst_strd;
                vst1_lane_u8(pu1_dst_tmp1, shift_res, 3);
                pu1_dst_tmp1 += dst_strd;
                vst1_lane_u8(pu1_dst_tmp1, shift_res, 4);
                pu1_dst_tmp1 += dst_strd;
                vst1_lane_u8(pu1_dst_tmp1, shift_res, 5);
                pu1_dst_tmp1 += dst_strd;
                vst1_lane_u8(pu1_dst_tmp1, shift_res, 6);
                pu1_dst_tmp1 += dst_strd;
                vst1_lane_u8(pu1_dst_tmp1, shift_res, 7);
                pu1_dst_tmp1 += dst_strd;
                ref_main_tmp1 += 8;
                ref_main_tmp2 += 8;
            }
            ref_main_tmp1 -= nt;
            ref_main_tmp2 -= nt;
            pu1_dst_tmp2 += 1;
            pu1_dst_tmp1 = pu1_dst_tmp2;
        }
    }
    else
    {
        uint32x2_t ref_main_idx1, ref_main_idx2;
        ref_main_idx1 = vdup_n_u32(0);
        ref_main_idx2 = vdup_n_u32(0);
        for(col = 0; col < nt; col++)
        {
            fract_prev = fract;
            pos = ((col + 1) * intra_pred_ang);
            fract = pos & (31);
            if(fract_prev < fract)
            {
                ref_main_tmp1 -= 1;
                ref_main_tmp2 -= 1;
            }
            dup_const_fract = vdup_n_u8((uint8_t)fract);
            dup_const_32_fract = vdup_n_u8((uint8_t)(32 - fract));
            for(row = nt; row > 0; row -= 4)
            {
                /* Same linear filter, 4 samples at a time                 */
                ref_main_idx1 = vld1_lane_u32((uint32_t *)ref_main_tmp1, ref_main_idx1, 0);
                ref_main_idx2 = vld1_lane_u32((uint32_t *)ref_main_tmp2, ref_main_idx2, 0);
                mul_res1 = vmull_u8(vreinterpret_u8_u32(ref_main_idx1), dup_const_32_fract);
                mul_res2 = vmull_u8(vreinterpret_u8_u32(ref_main_idx2), dup_const_fract);
                add_res = vaddq_u16(mul_res1, mul_res2);
                shift_res = vrshrn_n_u16(add_res, 5);
                vst1_lane_u8(pu1_dst_tmp1, shift_res, 0);
                pu1_dst_tmp1 += dst_strd;
                vst1_lane_u8(pu1_dst_tmp1, shift_res, 1);
                pu1_dst_tmp1 += dst_strd;
                vst1_lane_u8(pu1_dst_tmp1, shift_res, 2);
                pu1_dst_tmp1 += dst_strd;
                vst1_lane_u8(pu1_dst_tmp1, shift_res, 3);
                pu1_dst_tmp1 += dst_strd;
            }
            pu1_dst_tmp2 += 1;
            pu1_dst_tmp1 = pu1_dst_tmp2;
        }
    }
}
/**
*******************************************************************************
*
* @brief
* Intra prediction interpolation filter for luma mode 19 to mode 25
*
* @par Description:
* Intraprediction for mode 19 to 25 (negative angle, vertical mode ) with
* reference neighboring samples location pointed by 'pu1_ref' to the TU
* block location pointed by 'pu1_dst'
*
* @param[in] pu1_src
* UWORD8 pointer to the source
*
* @param[out] pu1_dst
* UWORD8 pointer to the destination
*
* @param[in] src_strd
* integer source stride
*
* @param[in] dst_strd
* integer destination stride
*
* @param[in] nt
* integer Transform Block size
*
* @param[in] mode
* integer intraprediction mode
*
* @returns
*
* @remarks
* None
*
*******************************************************************************
*/
/* Intra prediction for luma modes 19 to 25 (vertical modes with a
 * negative prediction angle).
 *
 * First builds an extended main reference row in ref_temp: the top
 * reference samples are copied in, then left-side samples are projected
 * onto the negative indices using the inverse angle (section 8.4.4.2.6).
 * Each destination row is then produced by bilinear interpolation
 * between two adjacent main-reference samples, weighted by the
 * fractional sample position, vectorized with NEON.
 *
 * pu1_ref  : [in]  pointer to the neighboring reference samples
 * src_strd : [in]  source stride (unused)
 * pu1_dst  : [out] pointer to the destination TU block
 * dst_strd : [in]  destination stride
 * nt       : [in]  transform block size
 * mode     : [in]  intra prediction mode (19..25)
 */
void ihevc_intra_pred_luma_mode_19_to_25_neonintr(UWORD8 *pu1_ref,
                                                  WORD32 src_strd,
                                                  UWORD8 *pu1_dst,
                                                  WORD32 dst_strd,
                                                  WORD32 nt,
                                                  WORD32 mode)
{
    WORD32 row, col, k;
    WORD32 two_nt, intra_pred_ang;
    /* fract starts above any valid fractional value (0..31) so the
     * pointer step-back below is skipped on the first iteration. */
    WORD32 inv_ang, inv_ang_sum, pos, fract = 1000, fract_prev;
    WORD32 ref_idx;
    UWORD8 *ref_main;
    UWORD8 *ref_main_tmp;
    UWORD8 ref_temp[(2 * MAX_CU_SIZE) + 1];
    UWORD8 *pu1_ref_tmp1 = pu1_ref;
    UWORD8 *pu1_dst_tmp1 = pu1_dst;
    uint16x8_t mul_res1, mul_res2, add_res;
    uint8x8_t dup_const_fract, dup_const_32_fract;
    uint8x8_t ref_main_idx, ref_main_idx_1, shift_res;
    uint8x8_t ref_above_t;
    uint32x2_t ref_above_tmp;
    UNUSED(src_strd);
    ref_above_tmp = vdup_n_u32(0);

    two_nt = 2 * nt;
    intra_pred_ang = gai4_ihevc_ang_table[mode];
    inv_ang = gai4_ihevc_inv_ang_table[mode - 12];

    /* Copy the top reference row into the working buffer; ref_main[0]
     * corresponds to pu1_ref[two_nt], with room for negative indices. */
    pu1_ref_tmp1 += two_nt;
    ref_main = ref_temp + (nt - 1);
    ref_main_tmp = ref_main;

    if(0 == (nt & 7))
    {
        /* nt is a multiple of 8: copy 8 samples per iteration. */
        for(k = nt - 1; k >= 0; k -= 8)
        {
            ref_above_t = vld1_u8(pu1_ref_tmp1);
            vst1_u8(ref_main_tmp, ref_above_t);
            ref_main_tmp += 8;
            pu1_ref_tmp1 += 8;
        }
    }
    else
    {
        /* nt == 4 in practice: copy 4 samples per iteration.  The
         * pointer increments were missing in the original code; the
         * loop body only executed once for nt == 4 so no output
         * changed, but without them any larger non-multiple-of-8 size
         * would re-copy the same 4 bytes forever. */
        for(k = nt - 1; k >= 0; k -= 4)
        {
            ref_above_tmp = vld1_lane_u32((uint32_t *)pu1_ref_tmp1, ref_above_tmp, 0);
            vst1_lane_u32((uint32_t *)ref_main_tmp, ref_above_tmp, 0);
            ref_main_tmp += 4;
            pu1_ref_tmp1 += 4;
        }
    }
    ref_main[nt] = pu1_ref[two_nt + nt];

    /* Most negative main-reference index needed for this angle. */
    ref_idx = (nt * intra_pred_ang) >> 5;
    inv_ang_sum = 128;

    /* Project the left reference samples onto the negative indices of
     * the main reference (section 8.4.4.2.6).  SIMD optimization via a
     * look-up table is possible here. */
    for(k = -1; k > ref_idx; k--)
    {
        inv_ang_sum += inv_ang;
        ref_main[k] = pu1_ref[two_nt - (inv_ang_sum >> 8)];
    }

    UWORD8 *ref_main_tmp1 = ref_main;
    UWORD8 *ref_main_tmp2 = ref_main;
    ref_main_tmp2 += 1;

    if(0 == (nt & 7))
    {
        /* Interpolate between two neighboring reference samples,
         * weighted by the fractional position, 8 pixels at a time:
         *   dst = ((32 - fract) * ref[x] + fract * ref[x+1] + 16) >> 5 */
        for(row = 0; row < nt; row++)
        {
            fract_prev = fract;
            pos = ((row + 1) * intra_pred_ang);
            fract = pos & (31);
            /* Negative angle: the integer offset decreases by one
             * whenever the fractional part wraps upwards. */
            if(fract_prev < fract)
            {
                ref_main_tmp1 -= 1;
                ref_main_tmp2 -= 1;
            }
            dup_const_fract = vdup_n_u8((uint8_t)fract);
            dup_const_32_fract = vdup_n_u8((uint8_t)(32 - fract));

            for(col = nt; col > 0; col -= 8)
            {
                ref_main_idx = vld1_u8(ref_main_tmp1);
                ref_main_idx_1 = vld1_u8(ref_main_tmp2);
                mul_res1 = vmull_u8(ref_main_idx, dup_const_32_fract);
                mul_res2 = vmull_u8(ref_main_idx_1, dup_const_fract);
                add_res = vaddq_u16(mul_res1, mul_res2);
                shift_res = vrshrn_n_u16(add_res, 5);
                vst1_u8(pu1_dst_tmp1, shift_res);
                pu1_dst_tmp1 += 8;
                ref_main_tmp1 += 8;
                ref_main_tmp2 += 8;
            }
            /* Rewind reference pointers to the start of the row. */
            ref_main_tmp1 -= nt;
            ref_main_tmp2 -= nt;
            pu1_dst_tmp1 += (dst_strd - nt);
        }
    }
    else
    {
        /* nt == 4: same filtering, 4 pixels per iteration.
         * NOTE(review): the uint32_t casts assume the target tolerates
         * unaligned 32-bit lane loads/stores - confirm for new ports. */
        uint32x2_t ref_main_idx1, ref_main_idx2;
        ref_main_idx1 = vdup_n_u32(0);
        ref_main_idx2 = vdup_n_u32(0);
        for(row = 0; row < nt; row++)
        {
            fract_prev = fract;
            pos = ((row + 1) * intra_pred_ang);
            fract = pos & (31);
            if(fract_prev < fract)
            {
                ref_main_tmp1 -= 1;
                ref_main_tmp2 -= 1;
            }
            dup_const_fract = vdup_n_u8((uint8_t)fract);
            dup_const_32_fract = vdup_n_u8((uint8_t)(32 - fract));
            for(col = nt; col > 0; col -= 4)
            {
                ref_main_idx1 = vld1_lane_u32((uint32_t *)ref_main_tmp1, ref_main_idx1, 0);
                ref_main_idx2 = vld1_lane_u32((uint32_t *)ref_main_tmp2, ref_main_idx2, 0);
                mul_res1 = vmull_u8(vreinterpret_u8_u32(ref_main_idx1), dup_const_32_fract);
                mul_res2 = vmull_u8(vreinterpret_u8_u32(ref_main_idx2), dup_const_fract);
                add_res = vaddq_u16(mul_res1, mul_res2);
                shift_res = vrshrn_n_u16(add_res, 5);
                vst1_lane_u32((uint32_t *)pu1_dst_tmp1, vreinterpret_u32_u8(shift_res), 0);
                pu1_dst_tmp1 += 4;
            }
            pu1_dst_tmp1 += (dst_strd - nt);
        }
    }
}
/**
*******************************************************************************
*
* @brief
* Intra prediction interpolation filter for luma mode 27 to mode 33
*
* @par Description:
* Intraprediction for mode 27 to 33 (positive angle, vertical mode ) with
* reference neighboring samples location pointed by 'pu1_ref' to the TU
* block location pointed by 'pu1_dst'
*
* @param[in] pu1_ref
*  UWORD8 pointer to the neighboring reference samples
*
* @param[out] pu1_dst
* UWORD8 pointer to the destination
*
* @param[in] src_strd
* integer source stride
*
* @param[in] dst_strd
* integer destination stride
*
* @param[in] nt
* integer Transform Block size
*
* @param[in] mode
* integer intraprediction mode
*
* @returns
*
* @remarks
* None
*
*******************************************************************************
*/
void ihevc_intra_pred_luma_mode_27_to_33_neonintr(UWORD8 *pu1_ref,
WORD32 src_strd,
UWORD8 *pu1_dst,
WORD32 dst_strd,
WORD32 nt,
WORD32 mode)
{
WORD32 row, col;
WORD32 intra_pred_ang;
WORD32 pos, fract = 0, fract_prev;
WORD32 two_nt = 2 * nt;
UNUSED(src_strd);
if(0 == (nt & 7))
{
UWORD8 *pu1_ref_main_idx = pu1_ref;
UWORD8 *pu1_ref_main_idx_1 = pu1_ref;
UWORD8 *pu1_dst_tmp1 = pu1_dst;
pu1_ref_main_idx += (two_nt + 1);
pu1_ref_main_idx_1 += (two_nt + 2);
uint8x8_t dup_const_fract, dup_const_32_fract, ref_main_idx, ref_main_idx_1;
uint8x8_t shift_res;
uint16x8_t mul_res1, mul_res2, add_res;
/* Intra Pred Angle according to the mode */
intra_pred_ang = gai4_ihevc_ang_table[mode];
for(row = 0; row < nt; row++)
{
fract_prev = fract;
pos = ((row + 1) * intra_pred_ang);
fract = pos & (31);
if(fract_prev > fract)
{
pu1_ref_main_idx += 1;
pu1_ref_main_idx_1 += 1;
}
dup_const_fract = vdup_n_u8((uint8_t)fract);
dup_const_32_fract = vdup_n_u8((uint8_t)(32 - fract));
for(col = nt; col > 0; col -= 8)
{
ref_main_idx = vld1_u8(pu1_ref_main_idx);
ref_main_idx_1 = vld1_u8(pu1_ref_main_idx_1);
mul_res1 = vmull_u8(ref_main_idx, dup_const_32_fract);
mul_res2 = vmull_u8(ref_main_idx_1, dup_const_fract);
add_res = vaddq_u16(mul_res1, mul_res2);
shift_res = vrshrn_n_u16(add_res, <