| /****************************************************************************** |
| * |
| * Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore |
| * |
| * Licensed under the Apache License, Version 2.0 (the "License"); |
| * you may not use this file except in compliance with the License. |
| * You may obtain a copy of the License at: |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| * |
| ******************************************************************************/ |
| /** |
| ******************************************************************************* |
| * @file |
| * ihevc_intra_pred_filters_neon_intr.c |
| * |
| * @brief |
| * Contains function definitions for intra prediction interpolation filters |
| * |
| * |
| * @author |
| * Yogeswaran RS |
| * |
| * @par List of Functions: |
| * - ihevc_intra_pred_luma_ref_substitution() |
| * - ihevc_intra_pred_ref_filtering() |
| * - ihevc_intra_pred_luma_planar() |
| * - ihevc_intra_pred_luma_dc() |
| * - ihevc_intra_pred_luma_horz() |
| * - ihevc_intra_pred_luma_ver() |
| * - ihevc_intra_pred_luma_mode2() |
| * - ihevc_intra_pred_luma_mode_18_34() |
| * |
| * @remarks |
| * None |
| * |
| ******************************************************************************* |
| */ |
| /*****************************************************************************/ |
| /* File Includes */ |
| /*****************************************************************************/ |
| #include <stdio.h> |
| |
| #include "ihevc_typedefs.h" |
| #include "ihevc_intra_pred.h" |
| #include "ihevc_macros.h" |
| #include "ihevc_func_selector.h" |
| #include "arm_neon.h" |
| #include "ihevc_platform_macros.h" |
| #include "ihevc_common_tables.h" |
| |
| /****************************************************************************/ |
| /* Constant Macros */ |
| /****************************************************************************/ |
| #define MAX_CU_SIZE 64 |
| #define BIT_DEPTH 8 |
| #define T32_4NT 128 |
| #define T16_4NT 64 |
| |
| |
| |
| /*****************************************************************************/ |
| /* Table Look-up */ |
| /*****************************************************************************/ |
| |
| #define GET_BITS(y,x) (((y) & (1 << (x))) && (1 << (x))) |
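| /* GET_BITS(y, x) tests bit x of y and yields 1 if the bit is set, 0 otherwise; */ |
| /* e.g. GET_BITS(0x12, 4) evaluates to 1, GET_BITS(0x12, 0) to 0 */ |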
| |
| /*****************************************************************************/ |
| /* Function Definition */ |
| /*****************************************************************************/ |
| |
| /** |
| ******************************************************************************* |
| * |
| * @brief |
| * Intra prediction interpolation filter for pu1_ref substitution |
| * |
| * |
| * @par Description: |
| * Reference substitution process for samples unavailable for prediction. |
| * Refer to section 8.4.4.2.2 |
| * |
| * @param[in] pu1_top_left |
| * UWORD8 pointer to the top-left |
| * |
| * @param[in] pu1_top |
| * UWORD8 pointer to the top |
| * |
| * @param[in] pu1_left |
| * UWORD8 pointer to the left |
| * |
| * @param[in] src_strd |
| * WORD32 Source stride |
| * |
| * @param[in] nbr_flags |
| * WORD32 neighbor availability flags |
| * |
| * @param[in] nt |
| * WORD32 transform Block size |
| * |
| * @param[out] pu1_dst |
| * UWORD8 pointer to the destination |
| * |
| * @param[in] dst_strd |
| * WORD32 Destination stride |
| * |
| * @returns |
| * |
| * @remarks |
| * None |
| * |
| ******************************************************************************* |
| */ |
| |
| |
| void ihevc_intra_pred_luma_ref_substitution_neonintr(UWORD8 *pu1_top_left, |
| UWORD8 *pu1_top, |
| UWORD8 *pu1_left, |
| WORD32 src_strd, |
| WORD32 nt, |
| WORD32 nbr_flags, |
| UWORD8 *pu1_dst, |
| WORD32 dst_strd) |
| { |
| UWORD8 pu1_ref; |
| WORD32 dc_val, i; |
| WORD32 total_samples = (4 * nt) + 1; |
| WORD32 two_nt = 2 * nt; |
| WORD32 three_nt = 3 * nt; |
| WORD32 get_bits; |
| WORD32 next; |
| WORD32 bot_left, left, top, tp_right, tp_left; |
| WORD32 idx, nbr_id_from_bl, frwd_nbr_flag; |
| UNUSED(dst_strd); |
| dc_val = 1 << (BIT_DEPTH - 1); |
| |
| /* Neighbor Flag Structure*/ |
| /* Top-Left | Top-Right | Top | Left | Bottom-Left |
| 1 4 4 4 4 |
| */ |
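| /* Bit layout (LSB first) as used below: bits [3:0] bottom-left, [7:4] left, */ |
| /* [11:8] top, [15:12] top-right, bit [16] top-left. For nt <= 8 only the LSB of */ |
| /* each 4-bit field is read; for nt == 16 two bits and for nt == 32 all four bits */ |
| /* of a field are used, each bit covering 8 reference pels */ |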
| |
| /* If no neighbor flags are present, fill the neighbor samples with DC value */ |
| if(nbr_flags == 0) |
| { |
| for(i = 0; i < total_samples; i++) |
| { |
| pu1_dst[i] = dc_val; |
| } |
| } |
| else |
| { |
| /* Else fill the corresponding samples */ |
| pu1_dst[two_nt] = *pu1_top_left; |
| UWORD8 *pu1_dst_tmp2 = pu1_dst; |
| UWORD8 *pu1_top_tmp = pu1_top; |
| pu1_dst_tmp2 += two_nt + 1; |
| |
| for(i = 0; i < two_nt; i++) |
| pu1_dst[two_nt - 1 - i] = pu1_left[i * src_strd]; |
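| /* The NEON loop below copies the two_nt top reference samples 8 at a time; */ |
| /* scalar equivalent: for(i = 0; i < two_nt; i++) pu1_dst[two_nt + 1 + i] = pu1_top[i]; */ |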
| |
| uint8x8_t src; |
| for(i = two_nt; i > 0; i -= 8) |
| { |
| src = vld1_u8(pu1_top_tmp); |
| pu1_top_tmp += 8; |
| vst1_u8(pu1_dst_tmp2, src); |
| pu1_dst_tmp2 += 8; |
| } |
| |
| if(nt <= 8) |
| { |
| /* 1 bit extraction for all the neighboring blocks */ |
| tp_left = (nbr_flags & 0x10000) >> 16; |
| bot_left = nbr_flags & 0x1; |
| left = (nbr_flags & 0x10) >> 4; |
| top = (nbr_flags & 0x100) >> 8; |
| tp_right = (nbr_flags & 0x1000) >> 12; |
| |
| next = 1; |
| |
| /* If bottom-left is not available, apply the reverse substitution process */ |
| if(bot_left == 0) |
| { |
| WORD32 a_nbr_flag[5] = { bot_left, left, tp_left, top, tp_right }; |
| |
| /* Check for the 1st available sample from bottom-left*/ |
| while(!a_nbr_flag[next]) |
| next++; |
| |
| /* If Left, top-left are available*/ |
| if(next <= 2) |
| { |
| idx = nt * next; |
| pu1_ref = pu1_dst[idx]; |
| for(i = 0; i < idx; i++) |
| pu1_dst[i] = pu1_ref; |
| } |
| else /* If top, top-right are available */ |
| { |
| /* idx is changed to copy 1 pixel value for top-left, if top-left is not available */ |
| idx = (nt * (next - 1)) + 1; |
| pu1_ref = pu1_dst[idx]; |
| for(i = 0; i < idx; i++) |
| pu1_dst[i] = pu1_ref; |
| } |
| } |
| |
| /* Forward Substitution Process */ |
| /* If left is Unavailable, copy the last bottom-left value */ |
| |
| if(left == 0) |
| { |
| uint8x8_t dup_pu1_dst1; |
| UWORD8 *pu1_dst_const_nt = pu1_dst; |
| pu1_dst_const_nt += nt; |
| |
| if(0 == (nt & 7)) |
| { |
| dup_pu1_dst1 = vdup_n_u8(pu1_dst[nt - 1]); |
| for(i = nt; i > 0; i -= 8) |
| { |
| vst1_u8(pu1_dst_const_nt, dup_pu1_dst1); |
| pu1_dst_const_nt += 8; |
| |
| } |
| } |
| else |
| { |
| //uint32x2_t dup_pu1_dst4; |
| dup_pu1_dst1 = vdup_n_u8(pu1_dst[nt - 1]); |
| //dup_pu1_dst4 = vdup_n_u32((uint32_t) pu1_dst[nt - 1]); |
| for(i = nt; i > 0; i -= 4) |
| { |
| vst1_lane_u32((uint32_t *)pu1_dst_const_nt, vreinterpret_u32_u8(dup_pu1_dst1), 0); |
| pu1_dst_const_nt += 4; |
| |
| } |
| |
| } |
| |
| } |
| if(tp_left == 0) |
| pu1_dst[two_nt] = pu1_dst[two_nt - 1]; |
| if(top == 0) |
| { |
| |
| if(0 == (nt & 7)) |
| { |
| uint8x8_t dup_pu1_dst2; |
| UWORD8 *pu1_dst_const_two_nt_1 = pu1_dst; |
| pu1_dst_const_two_nt_1 += (two_nt + 1); |
| dup_pu1_dst2 = vdup_n_u8(pu1_dst[two_nt]); |
| for(i = nt; i > 0; i -= 8) |
| { |
| vst1_u8(pu1_dst_const_two_nt_1, dup_pu1_dst2); |
| pu1_dst_const_two_nt_1 += 8; |
| |
| } |
| } |
| else |
| { |
| for(i = 0; i < nt; i++) |
| pu1_dst[two_nt + 1 + i] = pu1_dst[two_nt]; |
| } |
| } |
| if(tp_right == 0) |
| { |
| uint8x8_t dup_pu1_dst3; |
| UWORD8 *pu1_dst_const_three_nt_1 = pu1_dst; |
| pu1_dst_const_three_nt_1 += (three_nt + 1); |
| dup_pu1_dst3 = vdup_n_u8(pu1_dst[two_nt]); |
| if(0 == (nt & 7)) |
| { |
| for(i = nt; i > 0; i -= 8) |
| { |
| vst1_u8(pu1_dst_const_three_nt_1, dup_pu1_dst3); |
| pu1_dst_const_three_nt_1 += 8; |
| |
| } |
| } |
| else |
| { |
| for(i = nt; i > 0; i -= 4) |
| { |
| vst1_lane_u32((uint32_t *)pu1_dst_const_three_nt_1, vreinterpret_u32_u8(dup_pu1_dst3), 0); |
| pu1_dst_const_three_nt_1 += 4; |
| } |
| |
| } |
| |
| } |
| } |
| if(nt == 16) |
| { |
| WORD32 nbr_flags_temp = 0; |
| nbr_flags_temp = (nbr_flags & 0x3) + ((nbr_flags & 0x30) >> 2) |
| + ((nbr_flags & 0x300) >> 4) |
| + ((nbr_flags & 0x3000) >> 6) |
| + ((nbr_flags & 0x10000) >> 8); |
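| /* The remap above packs the nt == 16 flags contiguously: bits [1:0] bottom-left, */ |
| /* [3:2] left, [5:4] top, [7:6] top-right and bit [8] top-left */ |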
| |
| /* Compute trailing zeros based on nbr_flags for the substitution process of the below-left samples (see section 8.4.4.2.2) */ |
| /* as each bit in nbr_flags corresponds to 8 pels for bot_left, left, top and top-right, but 1 pel for top-left */ |
| { |
| nbr_id_from_bl = look_up_trailing_zeros(nbr_flags_temp & 0XF) * 8; /* for below left and left */ |
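| /* Illustration (assuming look_up_trailing_zeros() returns the trailing zero bit count): */ |
| /* if (nbr_flags_temp & 0xF) == 0xC the lowest 16 left-side pels are unavailable, */ |
| /* nbr_id_from_bl = 2 * 8 = 16 and pu1_dst[0..15] are replicated from pu1_dst[16] below */ |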
| |
| if(nbr_id_from_bl == 64) |
| nbr_id_from_bl = 32; |
| |
| if(nbr_id_from_bl == 32) |
| { |
| /* for top left : 1 pel per nbr bit */ |
| if(!((nbr_flags_temp >> 8) & 0x1)) |
| { |
| nbr_id_from_bl++; |
| nbr_id_from_bl += look_up_trailing_zeros((nbr_flags_temp >> 4) & 0xF) * 8; /* top and top right; 8 pels per nbr bit */ |
| } |
| } |
| /* Reverse Substitution Process*/ |
| if(nbr_id_from_bl) |
| { |
| /* Replicate the bottom-left and subsequent unavailable pixels with the 1st available pixel above */ |
| pu1_ref = pu1_dst[nbr_id_from_bl]; |
| for(i = (nbr_id_from_bl - 1); i >= 0; i--) |
| { |
| pu1_dst[i] = pu1_ref; |
| } |
| } |
| } |
| |
| /* for the loop of 4*Nt+1 pixels (excluding pixels computed from reverse substitution) */ |
| while(nbr_id_from_bl < ((T16_4NT) + 1)) |
| { |
| /* To Obtain the next unavailable idx flag after reverse neighbor substitution */ |
| /* Divide by 8 to obtain the original index */ |
| frwd_nbr_flag = (nbr_id_from_bl >> 3); /*+ (nbr_id_from_bl & 0x1);*/ |
| |
| /* The Top-left flag is at the last bit location of nbr_flags*/ |
| if(nbr_id_from_bl == (T16_4NT / 2)) |
| { |
| get_bits = GET_BITS(nbr_flags_temp, 8); |
| |
| /* only pel substitution for TL */ |
| if(!get_bits) |
| pu1_dst[nbr_id_from_bl] = pu1_dst[nbr_id_from_bl - 1]; |
| } |
| else |
| { |
| get_bits = GET_BITS(nbr_flags_temp, frwd_nbr_flag); |
| if(!get_bits) |
| { |
| /* 8 pel substitution (other than TL) */ |
| pu1_ref = pu1_dst[nbr_id_from_bl - 1]; |
| for(i = 0; i < 8; i++) |
| pu1_dst[nbr_id_from_bl + i] = pu1_ref; |
| } |
| |
| } |
| nbr_id_from_bl += (nbr_id_from_bl == (T16_4NT / 2)) ? 1 : 8; |
| } |
| } |
| |
| if(nt == 32) |
| { |
| /* Compute trailing zeros based on nbr_flags for the substitution process of the below-left samples (see section 8.4.4.2.2) */ |
| /* as each bit in nbr_flags corresponds to 8 pels for bot_left, left, top and top-right, but 1 pel for top-left */ |
| { |
| nbr_id_from_bl = look_up_trailing_zeros((nbr_flags & 0XFF)) * 8; /* for below left and left */ |
| |
| if(nbr_id_from_bl == 64) |
| { |
| /* for top left : 1 pel per nbr bit */ |
| if(!((nbr_flags >> 16) & 0x1)) |
| { |
| /* top left not available */ |
| nbr_id_from_bl++; |
| /* top and top right; 8 pels per nbr bit */ |
| nbr_id_from_bl += look_up_trailing_zeros((nbr_flags >> 8) & 0xFF) * 8; |
| } |
| } |
| /* Reverse Substitution Process*/ |
| if(nbr_id_from_bl) |
| { |
| /* Replicate the bottom-left and subsequent unavailable pixels with the 1st available pixel above */ |
| pu1_ref = pu1_dst[nbr_id_from_bl]; |
| for(i = (nbr_id_from_bl - 1); i >= 0; i--) |
| pu1_dst[i] = pu1_ref; |
| } |
| } |
| |
| /* for the loop of 4*Nt+1 pixels (excluding pixels computed from reverse substitution) */ |
| while(nbr_id_from_bl < ((T32_4NT)+1)) |
| { |
| /* To Obtain the next unavailable idx flag after reverse neighbor substitution */ |
| /* Divide by 8 to obtain the original index */ |
| frwd_nbr_flag = (nbr_id_from_bl >> 3); /*+ (nbr_id_from_bl & 0x1);*/ |
| |
| /* The Top-left flag is at the last bit location of nbr_flags*/ |
| if(nbr_id_from_bl == (T32_4NT / 2)) |
| { |
| get_bits = GET_BITS(nbr_flags, 16); |
| /* only pel substitution for TL */ |
| if(!get_bits) |
| pu1_dst[nbr_id_from_bl] = pu1_dst[nbr_id_from_bl - 1]; |
| } |
| else |
| { |
| get_bits = GET_BITS(nbr_flags, frwd_nbr_flag); |
| if(!get_bits) |
| { |
| /* 8 pel substitution (other than TL) */ |
| pu1_ref = pu1_dst[nbr_id_from_bl - 1]; |
| for(i = 0; i < 8; i++) |
| pu1_dst[nbr_id_from_bl + i] = pu1_ref; |
| } |
| |
| } |
| nbr_id_from_bl += (nbr_id_from_bl == (T32_4NT / 2)) ? 1 : 8; |
| } |
| } |
| |
| } |
| |
| } |
| |
| /** |
| ******************************************************************************* |
| * |
| * @brief |
| * Intra prediction interpolation filter for ref_filtering |
| * |
| * |
| * @par Description: |
| * Reference DC filtering for neighboring samples dependent on TU size and |
| * mode. Refer to section 8.4.4.2.3 in the standard |
| * |
| * @param[in] pu1_src |
| * UWORD8 pointer to the source |
| * |
| * @param[out] pu1_dst |
| * UWORD8 pointer to the destination |
| * |
| * @param[in] nt |
| * integer Transform Block size |
| * |
| * @param[in] mode |
| * integer intraprediction mode |
| * |
| * @param[in] strong_intra_smoothing_enable_flag |
| * WORD32 flag to enable strong intra smoothing |
| * |
| * @returns |
| * |
| * @remarks |
| * None |
| * |
| ******************************************************************************* |
| */ |
| |
| |
| void ihevc_intra_pred_ref_filtering_neonintr(UWORD8 *pu1_src, |
| WORD32 nt, |
| UWORD8 *pu1_dst, |
| WORD32 mode, |
| WORD32 strong_intra_smoothing_enable_flag) |
| { |
| WORD32 filter_flag; |
| WORD32 i = 0; |
| WORD32 four_nt = 4 * nt; |
| |
| WORD32 src_4nt; |
| WORD32 src_0nt; |
| /* Naming has been made as per the functionality, e.g. pu1_src_tmp_1 denotes pu1_src + 1 */ |
| /* src_val_1 loads the value from pointer pu1_src_tmp_1, add_res has the result of adding 2 values */ |
| UWORD8 *pu1_src_tmp_0 = pu1_src; |
| UWORD8 *pu1_src_tmp_1; |
| UWORD8 *pu1_src_tmp_2; |
| UWORD8 *pu1_dst_tmp_0 = pu1_dst; |
| UWORD8 *pu1_dst_tmp_1; |
| |
| uint8x8_t src_val_0, src_val_2; |
| uint8x8_t src_val_1, shift_res; |
| uint8x8_t dup_const_2; |
| uint16x8_t mul_res, add_res; |
| WORD32 bi_linear_int_flag = 0; |
| WORD32 abs_cond_left_flag = 0; |
| WORD32 abs_cond_top_flag = 0; |
| WORD32 dc_val = 1 << (BIT_DEPTH - 5); |
| shift_res = vdup_n_u8(0); |
| |
| filter_flag = gau1_intra_pred_ref_filter[mode] & (1 << (CTZ(nt) - 2)); |
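| /* gau1_intra_pred_ref_filter[mode] holds one filtering-enable bit per block size; */ |
| /* bit (CTZ(nt) - 2) selects it: bit 0 for nt = 4, 1 for nt = 8, 2 for nt = 16, 3 for nt = 32 */ |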
| |
| if(0 == filter_flag) |
| { |
| if(pu1_src == pu1_dst) |
| { |
| return; |
| } |
| else |
| { |
| for(i = four_nt; i > 0; i -= 8) |
| { |
| src_val_0 = vld1_u8(pu1_src_tmp_0); |
| pu1_src_tmp_0 += 8; |
| vst1_u8(pu1_dst_tmp_0, src_val_0); |
| pu1_dst_tmp_0 += 8; |
| } |
| pu1_dst[four_nt] = pu1_src[four_nt]; |
| } |
| } |
| |
| else |
| { |
| /* If strong intra smoothing is enabled and transform size is 32 */ |
| if((1 == strong_intra_smoothing_enable_flag) && (32 == nt)) |
| { |
| /*Strong Intra Filtering*/ |
| abs_cond_top_flag = (ABS(pu1_src[2 * nt] + pu1_src[4 * nt] |
| - (2 * pu1_src[3 * nt]))) < dc_val; |
| abs_cond_left_flag = (ABS(pu1_src[2 * nt] + pu1_src[0] |
| - (2 * pu1_src[nt]))) < dc_val; |
| |
| bi_linear_int_flag = ((1 == abs_cond_left_flag) |
| && (1 == abs_cond_top_flag)); |
| } |
| |
| src_4nt = pu1_src[4 * nt]; |
| src_0nt = pu1_src[0]; |
| /* Strong filtering of reference samples */ |
| if(1 == bi_linear_int_flag) |
| { |
| WORD32 two_nt = four_nt >> 1; |
| |
| WORD32 pu1_src_0_val = pu1_src[0]; |
| WORD32 pu1_src_2_nt_val = pu1_src[2 * nt]; |
| WORD32 pu1_src_4_nt_val = pu1_src[4 * nt]; |
| |
| WORD32 prod_two_nt_src_0_val = two_nt * pu1_src_0_val; |
| uint16x8_t prod_two_nt_src_0_val_t = vdupq_n_u16(prod_two_nt_src_0_val); |
| |
| WORD32 prod_two_nt_src_2_nt_val = two_nt * pu1_src_2_nt_val; |
| uint16x8_t prod_two_nt_src_2_nt_val_t = vdupq_n_u16(prod_two_nt_src_2_nt_val); |
| |
| const UWORD8 *const_col_i; |
| uint8x8_t const_col_i_val; |
| uint16x8_t prod_val_1; |
| uint16x8_t prod_val_2; |
| uint16x8_t prod_val_3; |
| uint16x8_t prod_val_4; |
| uint8x8_t res_val_1; |
| uint8x8_t res_val_2; |
| uint8x8_t pu1_src_0_val_t = vdup_n_u8(pu1_src_0_val); |
| uint8x8_t pu1_src_2_nt_val_t = vdup_n_u8(pu1_src_2_nt_val); |
| uint8x8_t pu1_src_4_nt_val_t = vdup_n_u8(pu1_src_4_nt_val); |
| pu1_dst_tmp_0 = pu1_dst + 1; |
| pu1_dst_tmp_1 = pu1_dst + two_nt + 1; |
| |
| const_col_i = gau1_ihevc_planar_factor + 1; |
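| /* Scalar form of the strong (bi-linear) filtering below (only reached for nt == 32, */ |
| /* where two_nt == 64 matches the >> 6), for i = 1 .. two_nt - 1: */ |
| /* pu1_dst[i] = ((two_nt - i) * pu1_src[0] + i * pu1_src[2 * nt] + 32) >> 6 */ |
| /* pu1_dst[two_nt + i] = ((two_nt - i) * pu1_src[2 * nt] + i * pu1_src[4 * nt] + 32) >> 6 */ |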
| |
| for(i = two_nt; i > 0; i -= 8) |
| { |
| const_col_i_val = vld1_u8(const_col_i); |
| const_col_i += 8; |
| |
| prod_val_1 = vmlsl_u8(prod_two_nt_src_0_val_t, const_col_i_val, pu1_src_0_val_t); |
| prod_val_2 = vmlal_u8(prod_val_1, const_col_i_val, pu1_src_2_nt_val_t); |
| |
| res_val_1 = vrshrn_n_u16(prod_val_2, 6); |
| prod_val_3 = vmlsl_u8(prod_two_nt_src_2_nt_val_t, const_col_i_val, pu1_src_2_nt_val_t); |
| |
| vst1_u8(pu1_dst_tmp_0, res_val_1); |
| pu1_dst_tmp_0 += 8; |
| prod_val_4 = vmlal_u8(prod_val_3, const_col_i_val, pu1_src_4_nt_val_t); |
| |
| res_val_2 = vrshrn_n_u16(prod_val_4, 6); |
| vst1_u8(pu1_dst_tmp_1, res_val_2); |
| pu1_dst_tmp_1 += 8; |
| } |
| pu1_dst[2 * nt] = pu1_src[2 * nt]; |
| } |
| else |
| { |
| pu1_src_tmp_1 = pu1_src + 1; |
| pu1_src_tmp_2 = pu1_src + 2; |
| pu1_dst_tmp_0 += 1; |
| |
| dup_const_2 = vdup_n_u8(2); |
| |
| /* Extremities Untouched*/ |
| pu1_dst[0] = pu1_src[0]; |
| |
| /* To avoid issues when dest and src have the same pointer, this load is done */ |
| * outside the loop and the next consecutive load is done before the store of the previous one */ |
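| /* Scalar form of the [1 2 1] / 4 smoothing below, for i = 1 .. four_nt - 1: */ |
| /* pu1_dst[i] = (pu1_src[i - 1] + 2 * pu1_src[i] + pu1_src[i + 1] + 2) >> 2 */ |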
| |
| /* Perform bilinear filtering of Reference Samples */ |
| for(i = (four_nt - 1); i > 0; i -= 8) |
| { |
| src_val_0 = vld1_u8(pu1_src_tmp_0); |
| pu1_src_tmp_0 += 8; |
| |
| src_val_2 = vld1_u8(pu1_src_tmp_2); |
| pu1_src_tmp_2 += 8; |
| |
| src_val_1 = vld1_u8(pu1_src_tmp_1); |
| pu1_src_tmp_1 += 8; |
| |
| if(i < four_nt - 1) |
| { |
| vst1_u8(pu1_dst_tmp_0, shift_res); |
| pu1_dst_tmp_0 += 8; |
| } |
| |
| add_res = vaddl_u8(src_val_0, src_val_2); |
| |
| mul_res = vmlal_u8(add_res, src_val_1, dup_const_2); |
| shift_res = vrshrn_n_u16(mul_res, 2); |
| |
| } |
| vst1_u8(pu1_dst_tmp_0, shift_res); |
| pu1_dst_tmp_0 += 8; |
| } |
| pu1_dst[4 * nt] = src_4nt; |
| pu1_dst[0] = src_0nt; |
| } |
| |
| } |
| |
| |
| |
| /** |
| ******************************************************************************* |
| * |
| * @brief |
| * Intra prediction interpolation filter for luma planar |
| * |
| * @par Description: |
| * Planar Intraprediction with reference neighboring samples location |
| * pointed by 'pu1_ref' to the TU block location pointed by 'pu1_dst' |
| * |
| * @param[in] pu1_ref |
| * UWORD8 pointer to the reference (neighboring) samples |
| * |
| * @param[out] pu1_dst |
| * UWORD8 pointer to the destination |
| * |
| * @param[in] src_strd |
| * integer source stride |
| * |
| * @param[in] dst_strd |
| * integer destination stride |
| * |
| * @param[in] nt |
| * integer Transform Block size |
| * |
| * @param[in] mode |
| * integer intraprediction mode (unused here) |
| * |
| * @returns |
| * |
| * @remarks |
| * None |
| * |
| ******************************************************************************* |
| */ |
| |
| void ihevc_intra_pred_luma_planar_neonintr(UWORD8 *pu1_ref, |
| WORD32 src_strd, |
| UWORD8 *pu1_dst, |
| WORD32 dst_strd, |
| WORD32 nt, |
| WORD32 mode) |
| { |
| /* Naming: (nt - 1 - col) --> const_nt_1_col (const denotes gau1_ihevc_planar_factor) */ |
| /* const_nt_1_col values are loaded into a d register */ |
| /* Naming: pu1_ref[nt - 1] --> pu1_ref_nt_1 */ |
| /* the value of pu1_ref_nt_1 is duplicated into a d register, hence pu1_ref_nt_1_dup */ |
| /* log2nt + 1 is taken care of while assigning the value itself */ |
| /* In the width-multiple-of-4 case the row loop has also been unrolled by 2 and the stores are handled accordingly */ |
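| /* Scalar form of the planar prediction computed below: */ |
| /* pu1_dst[row * dst_strd + col] = ((nt - 1 - col) * pu1_ref[two_nt - 1 - row] */ |
| /*     + (col + 1) * pu1_ref[three_nt + 1] + (nt - 1 - row) * pu1_ref[two_nt + 1 + col] */ |
| /*     + (row + 1) * pu1_ref[nt - 1] + nt) >> (log2nt + 1) */ |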
| |
| WORD32 row, col = 0; |
| WORD32 log2nt_plus1 = 6; |
| WORD32 two_nt, three_nt; |
| UWORD8 *pu1_ref_two_nt_1; |
| UWORD8 *pu1_dst_tmp; |
| const UWORD8 *const_nt_1_col; |
| uint8x8_t const_nt_1_col_t; |
| const UWORD8 *const_col_1; |
| uint8x8_t const_col_1_t; |
| uint8_t const_nt_1_row; |
| uint8x8_t const_nt_1_row_dup; |
| uint8_t const_row_1; |
| uint8x8_t const_row_1_dup; |
| uint8_t const_nt = nt; |
| uint16x8_t const_nt_dup; |
| uint8_t pu1_ref_nt_1 = pu1_ref[nt - 1]; |
| uint8x8_t pu1_ref_nt_1_dup; |
| uint8_t pu1_ref_two_nt_1_row; |
| uint8_t pu1_ref_three_nt_1; |
| uint8x8_t pu1_ref_two_nt_1_row_dup; |
| uint8x8_t pu1_ref_two_nt_1_t; |
| uint8x8_t pu1_ref_three_nt_1_dup; |
| uint16x8_t prod_t1; |
| uint16x8_t prod_t2; |
| uint16x8_t sto_res_tmp; |
| uint8x8_t sto_res; |
| int16x8_t log2nt_dup; |
| UNUSED(src_strd); |
| UNUSED(mode); |
| log2nt_plus1 = 32 - CLZ(nt); |
| two_nt = 2 * nt; |
| three_nt = 3 * nt; |
| /* loops have been unrolled considering the fact that the width is a multiple of 8 */ |
| if(0 == (nt & 7)) |
| { |
| pu1_dst_tmp = pu1_dst; |
| const_nt_1_col = gau1_ihevc_planar_factor + nt - 8; |
| |
| const_col_1 = gau1_ihevc_planar_factor + 1; |
| pu1_ref_three_nt_1 = pu1_ref[three_nt + 1]; |
| |
| pu1_ref_nt_1_dup = vdup_n_u8(pu1_ref_nt_1); |
| const_nt_dup = vdupq_n_u16(const_nt); |
| |
| log2nt_dup = vdupq_n_s16(log2nt_plus1); |
| log2nt_dup = vnegq_s16(log2nt_dup); |
| |
| pu1_ref_three_nt_1_dup = vdup_n_u8(pu1_ref_three_nt_1); |
| |
| for(row = 0; row < nt; row++) |
| { |
| pu1_ref_two_nt_1_row = pu1_ref[two_nt - 1 - row]; |
| pu1_ref_two_nt_1_row_dup = vdup_n_u8(pu1_ref_two_nt_1_row); |
| |
| const_nt_1_row = nt - 1 - row; |
| const_nt_1_row_dup = vdup_n_u8(const_nt_1_row); |
| |
| const_row_1 = row + 1; |
| const_row_1_dup = vdup_n_u8(const_row_1); |
| |
| const_nt_1_col = gau1_ihevc_planar_factor + nt - 8; |
| |
| const_col_1 = gau1_ihevc_planar_factor + 1; |
| pu1_ref_two_nt_1 = pu1_ref + two_nt + 1; |
| |
| for(col = nt; col > 0; col -= 8) |
| { |
| const_nt_1_col_t = vld1_u8(const_nt_1_col); |
| const_nt_1_col -= 8; |
| const_nt_1_col_t = vrev64_u8(const_nt_1_col_t); |
| |
| const_col_1_t = vld1_u8(const_col_1); |
| const_col_1 += 8; |
| prod_t1 = vmull_u8(const_nt_1_col_t, pu1_ref_two_nt_1_row_dup); |
| |
| pu1_ref_two_nt_1_t = vld1_u8(pu1_ref_two_nt_1); |
| pu1_ref_two_nt_1 += 8; |
| prod_t2 = vmull_u8(const_col_1_t, pu1_ref_three_nt_1_dup); |
| |
| prod_t1 = vmlal_u8(prod_t1, const_nt_1_row_dup, pu1_ref_two_nt_1_t); |
| prod_t2 = vmlal_u8(prod_t2, const_row_1_dup, pu1_ref_nt_1_dup); |
| prod_t1 = vaddq_u16(prod_t1, const_nt_dup); |
| prod_t1 = vaddq_u16(prod_t1, prod_t2); |
| |
| sto_res_tmp = vreinterpretq_u16_s16(vshlq_s16(vreinterpretq_s16_u16(prod_t1), log2nt_dup)); |
| sto_res = vmovn_u16(sto_res_tmp); |
| vst1_u8(pu1_dst_tmp, sto_res); |
| pu1_dst_tmp += 8; |
| } |
| pu1_dst_tmp += dst_strd - nt; |
| } |
| } |
| /* loops have been unrolled considering the fact that the width is a multiple of 4 */ |
| /* If the column count is a multiple of 4 then the height should be a multiple of 2 */ |
| else |
| { |
| uint8x8_t const_row_1_dup1; |
| uint8x8_t pu1_ref_two_nt_1_t1; |
| uint8x8_t const_nt_1_col_t1; |
| uint8x8_t const_col_1_t1; |
| uint8x8_t pu1_ref_two_nt_1_row_dup1; |
| uint8x8_t const_nt_1_row_dup1; |
| |
| pu1_ref_three_nt_1 = pu1_ref[three_nt + 1]; |
| |
| pu1_ref_nt_1_dup = vdup_n_u8(pu1_ref_nt_1); |
| const_nt_dup = vdupq_n_u16(const_nt); |
| |
| log2nt_dup = vdupq_n_s16(log2nt_plus1); |
| log2nt_dup = vnegq_s16(log2nt_dup); |
| |
| pu1_ref_three_nt_1_dup = vdup_n_u8(pu1_ref_three_nt_1); |
| |
| for(row = 0; row < nt; row += 2) |
| { |
| pu1_ref_two_nt_1_row = pu1_ref[two_nt - 1 - row]; |
| pu1_ref_two_nt_1_row_dup = vdup_n_u8(pu1_ref_two_nt_1_row); |
| pu1_ref_two_nt_1_row = pu1_ref[two_nt - 2 - row]; |
| pu1_ref_two_nt_1_row_dup1 = vdup_n_u8(pu1_ref_two_nt_1_row); |
| pu1_ref_two_nt_1_row_dup = vext_u8(pu1_ref_two_nt_1_row_dup, pu1_ref_two_nt_1_row_dup1, 4); |
| |
| const_nt_1_row = nt - 1 - row; |
| const_nt_1_row_dup = vdup_n_u8(const_nt_1_row); |
| const_nt_1_row = nt - 2 - row; |
| const_nt_1_row_dup1 = vdup_n_u8(const_nt_1_row); |
| const_nt_1_row_dup = vext_u8(const_nt_1_row_dup, const_nt_1_row_dup1, 4); |
| |
| const_row_1 = row + 1; |
| const_row_1_dup = vdup_n_u8(const_row_1); |
| const_row_1 = row + 2; |
| const_row_1_dup1 = vdup_n_u8(const_row_1); |
| const_row_1_dup = vext_u8(const_row_1_dup, const_row_1_dup1, 4); |
| |
| const_nt_1_col = gau1_ihevc_planar_factor + nt - 4; |
| |
| const_col_1 = gau1_ihevc_planar_factor + 1; |
| |
| pu1_ref_two_nt_1 = pu1_ref + two_nt + 1; |
| |
| for(col = nt; col > 0; col -= 4) |
| { |
| const_nt_1_col_t = vld1_u8(const_nt_1_col); |
| const_nt_1_col -= 4; |
| const_nt_1_col_t = vrev64_u8(const_nt_1_col_t); |
| |
| const_col_1_t = vld1_u8(const_col_1); |
| const_col_1 += 4; |
| const_nt_1_col_t1 = vreinterpret_u8_u64(vshr_n_u64(vreinterpret_u64_u8(const_nt_1_col_t), 32)); |
| |
| pu1_dst_tmp = pu1_dst; |
| const_nt_1_col_t = vext_u8(const_nt_1_col_t, const_nt_1_col_t1, 4); |
| |
| const_col_1_t1 = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(const_col_1_t), 32)); |
| prod_t1 = vmull_u8(const_nt_1_col_t, pu1_ref_two_nt_1_row_dup); |
| |
| pu1_ref_two_nt_1_t = vld1_u8(pu1_ref_two_nt_1); |
| pu1_ref_two_nt_1 += 4; |
| const_col_1_t = vext_u8(const_col_1_t1, const_col_1_t, 4); |
| |
| pu1_ref_two_nt_1_t1 = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(pu1_ref_two_nt_1_t), 32)); |
| prod_t2 = vmull_u8(const_col_1_t, pu1_ref_three_nt_1_dup); |
| |
| pu1_ref_two_nt_1_t = vext_u8(pu1_ref_two_nt_1_t1, pu1_ref_two_nt_1_t, 4); |
| prod_t2 = vmlal_u8(prod_t2, const_row_1_dup, pu1_ref_nt_1_dup); |
| |
| prod_t1 = vmlal_u8(prod_t1, const_nt_1_row_dup, pu1_ref_two_nt_1_t); |
| prod_t1 = vaddq_u16(prod_t1, const_nt_dup); |
| prod_t1 = vaddq_u16(prod_t1, prod_t2); |
| |
| sto_res_tmp = vreinterpretq_u16_s16(vshlq_s16(vreinterpretq_s16_u16(prod_t1), log2nt_dup)); |
| sto_res = vmovn_u16(sto_res_tmp); |
| |
| vst1_lane_u32((uint32_t *)pu1_dst, vreinterpret_u32_u8(sto_res), 0); |
| pu1_dst_tmp += dst_strd; |
| |
| vst1_lane_u32((uint32_t *)pu1_dst_tmp, vreinterpret_u32_u8(sto_res), 1); |
| pu1_dst += 4; |
| } |
| pu1_dst += 2 * dst_strd - nt; |
| } |
| } |
| |
| } |
| /* INTRA_PRED_LUMA_PLANAR */ |
| |
| /** |
| ******************************************************************************* |
| * |
| * @brief |
| * Intra prediction interpolation filter for luma dc |
| * |
| * @par Description: |
| * Intraprediction for DC mode with reference neighboring samples location |
| * pointed by 'pu1_ref' to the TU block location pointed by 'pu1_dst' |
| * |
| * @param[in] pu1_ref |
| * UWORD8 pointer to the reference (neighboring) samples |
| * |
| * @param[out] pu1_dst |
| * UWORD8 pointer to the destination |
| * |
| * @param[in] src_strd |
| * integer source stride |
| * |
| * @param[in] dst_strd |
| * integer destination stride |
| * |
| * @param[in] nt |
| * integer Transform Block size |
| * |
| * @param[in] mode |
| * integer intraprediction mode (unused here) |
| * |
| * @returns |
| * |
| * @remarks |
| * None |
| * |
| ******************************************************************************* |
| */ |
| |
| void ihevc_intra_pred_luma_dc_neonintr(UWORD8 *pu1_ref, |
| WORD32 src_strd, |
| UWORD8 *pu1_dst, |
| WORD32 dst_strd, |
| WORD32 nt, |
| WORD32 mode) |
| { |
| WORD32 dc_val = 0, two_dc_val = 0, three_dc_val = 0; |
| WORD32 i = 0; |
| WORD32 row = 0, col = 0, col_count; |
| WORD32 log2nt_plus1 = 6; |
| WORD32 two_nt = 0; |
| uint16x8_t ref_load_q; |
| uint16x8_t three_dc_val_t; |
| uint8x8_t sto_res_tmp; |
| uint8x8_t sto_res_tmp1; |
| uint8x8_t sto_res_tmp2; |
| uint8x8_t sto_res_tmp3; |
| uint8x8_t sto_res_tmp4; |
| uint8x8_t dc_val_t; |
| |
| UWORD8 *pu1_ref_tmp; |
| UWORD8 *pu1_ref_tmp1; |
| UWORD8 *pu1_dst_tmp; |
| UWORD8 *pu1_dst_tmp1; |
| UWORD8 *pu1_dst_tmp2; |
| UNUSED(src_strd); |
| UNUSED(mode); |
| |
| /* log2nt + 1 is taken care of while assigning the value itself. */ |
| log2nt_plus1 = 32 - CLZ(nt); |
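| /* DC value: dc_val = (sum of the nt left and nt top reference samples + nt) >> (log2nt + 1). */ |
| /* For nt < 32 the first row, first column and corner are additionally filtered: */ |
| /* pu1_dst[col] = (pu1_ref[two_nt + 1 + col] + 3 * dc_val + 2) >> 2 */ |
| /* pu1_dst[row * dst_strd] = (pu1_ref[two_nt - 1 - row] + 3 * dc_val + 2) >> 2 */ |
| /* pu1_dst[0] = (pu1_ref[two_nt - 1] + 2 * dc_val + pu1_ref[two_nt + 1] + 2) >> 2 */ |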
| |
| /* loops have been unrolled considering the fact that the width is a multiple of 8 */ |
| if(0 == (nt & 7)) |
| { |
| uint8x8_t ref_load1; |
| uint8x8_t ref_load2; |
| uint16x4_t acc_dc_pair1; |
| uint32x2_t acc_dc_pair2; |
| uint64x1_t acc_dc = vdup_n_u64(col); |
| |
| two_nt = 2 * nt; |
| pu1_ref_tmp = pu1_ref + nt; |
| pu1_ref_tmp1 = pu1_ref + two_nt + 1; |
| |
| for(i = two_nt; i > nt; i -= 8) |
| { |
| ref_load1 = vld1_u8(pu1_ref_tmp); |
| pu1_ref_tmp += 8; |
| acc_dc_pair1 = vpaddl_u8(ref_load1); |
| |
| ref_load2 = vld1_u8(pu1_ref_tmp1); |
| pu1_ref_tmp1 += 8; |
| |
| acc_dc_pair2 = vpaddl_u16(acc_dc_pair1); |
| acc_dc = vpadal_u32(acc_dc, acc_dc_pair2); |
| |
| acc_dc_pair1 = vpaddl_u8(ref_load2); |
| acc_dc_pair2 = vpaddl_u16(acc_dc_pair1); |
| acc_dc = vpadal_u32(acc_dc, acc_dc_pair2); |
| } |
| |
| dc_val = (vget_lane_u32(vreinterpret_u32_u64(acc_dc), 0) + nt) >> (log2nt_plus1); |
| dc_val_t = vdup_n_u8(dc_val); |
| two_dc_val = 2 * dc_val; |
| three_dc_val = 3 * dc_val; |
| three_dc_val += 2; |
| |
| three_dc_val_t = vdupq_n_u16((WORD16)three_dc_val); |
| pu1_ref_tmp = pu1_ref + two_nt + 1 + 0; |
| pu1_dst_tmp = pu1_dst; |
| |
| |
| if(nt == 32) |
| { |
| for(row = 0; row < nt; row++) |
| { |
| for(col = nt; col > 0; col -= 8) |
| { |
| vst1_u8(pu1_dst_tmp, dc_val_t); |
| pu1_dst_tmp += 8; |
| } |
| pu1_dst_tmp += dst_strd - nt; |
| } |
| } |
| else |
| |
| { |
| for(col = nt; col > 0; col -= 8) |
| { |
| ref_load1 = vld1_u8(pu1_ref_tmp); |
| pu1_ref_tmp += 8; |
| ref_load_q = vmovl_u8(ref_load1); |
| ref_load_q = vaddq_u16(ref_load_q, three_dc_val_t); |
| ref_load_q = vshrq_n_u16(ref_load_q, 2); |
| sto_res_tmp = vmovn_u16(ref_load_q); |
| vst1_u8(pu1_dst_tmp, sto_res_tmp); |
| pu1_dst_tmp += 8; |
| } |
| |
| pu1_ref_tmp = pu1_ref + two_nt - 9; |
| pu1_dst_tmp = pu1_dst + dst_strd; |
| col_count = nt - 8; |
| |
| /* Except for the first row, the remaining rows are done here */ |
| /* Both column and row loops have been unrolled by 8 */ |
| /* Stores have been arranged to suit the unrolling */ |
| /* Except for the 1st column of the remaining rows (other than the 1st row), the values are */ |
| /* constant, hence they are filled with a constant value and stored */ |
| /* If the column count is greater than 8, the remaining values are constant, which is */ |
| /* handled in the inner for loop */ |
| |
| for(row = nt; row > 0; row -= 8) |
| { |
| pu1_dst_tmp1 = pu1_dst_tmp + 8; |
| ref_load1 = vld1_u8(pu1_ref_tmp); |
| pu1_ref_tmp -= 8; |
| ref_load_q = vmovl_u8(ref_load1); |
| ref_load_q = vaddq_u16(ref_load_q, three_dc_val_t); |
| ref_load_q = vshrq_n_u16(ref_load_q, 2); |
| sto_res_tmp = vmovn_u16(ref_load_q); |
| |
| sto_res_tmp1 = vext_u8(sto_res_tmp, dc_val_t, 7); |
| |
| sto_res_tmp2 = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(sto_res_tmp), 8)); |
| sto_res_tmp2 = vext_u8(sto_res_tmp2, dc_val_t, 7); |
| vst1_u8(pu1_dst_tmp, sto_res_tmp1); |
| pu1_dst_tmp += dst_strd; |
| |
| sto_res_tmp3 = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(sto_res_tmp), 16)); |
| sto_res_tmp3 = vext_u8(sto_res_tmp3, dc_val_t, 7); |
| vst1_u8(pu1_dst_tmp, sto_res_tmp2); |
| pu1_dst_tmp += dst_strd; |
| |
| sto_res_tmp4 = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(sto_res_tmp), 24)); |
| sto_res_tmp4 = vext_u8(sto_res_tmp4, dc_val_t, 7); |
| vst1_u8(pu1_dst_tmp, sto_res_tmp3); |
| pu1_dst_tmp += dst_strd; |
| |
| sto_res_tmp1 = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(sto_res_tmp), 32)); |
| sto_res_tmp1 = vext_u8(sto_res_tmp1, dc_val_t, 7); |
| vst1_u8(pu1_dst_tmp, sto_res_tmp4); |
| pu1_dst_tmp += dst_strd; |
| |
| sto_res_tmp2 = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(sto_res_tmp), 40)); |
| sto_res_tmp2 = vext_u8(sto_res_tmp2, dc_val_t, 7); |
| vst1_u8(pu1_dst_tmp, sto_res_tmp1); |
| pu1_dst_tmp += dst_strd; |
| |
| sto_res_tmp3 = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(sto_res_tmp), 48)); |
| sto_res_tmp3 = vext_u8(sto_res_tmp3, dc_val_t, 7); |
| vst1_u8(pu1_dst_tmp, sto_res_tmp2); |
| pu1_dst_tmp += dst_strd; |
| |
| sto_res_tmp4 = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(sto_res_tmp), 56)); |
| sto_res_tmp4 = vext_u8(sto_res_tmp4, dc_val_t, 7); |
| vst1_u8(pu1_dst_tmp, sto_res_tmp3); |
| pu1_dst_tmp += dst_strd; |
| /* For last set of 8 rows only 7 rows need to be updated since first row is already written */ |
| if(row != 8) |
| vst1_u8(pu1_dst_tmp, sto_res_tmp4); |
| pu1_dst_tmp += dst_strd; |
| |
| for(col = col_count; col > 0; col -= 8) |
| { |
| pu1_dst_tmp2 = pu1_dst_tmp1; |
| vst1_u8(pu1_dst_tmp1, dc_val_t); |
| pu1_dst_tmp1 += dst_strd; |
| vst1_u8(pu1_dst_tmp1, dc_val_t); |
| pu1_dst_tmp1 += dst_strd; |
| vst1_u8(pu1_dst_tmp1, dc_val_t); |
| pu1_dst_tmp1 += dst_strd; |
| vst1_u8(pu1_dst_tmp1, dc_val_t); |
| pu1_dst_tmp1 += dst_strd; |
| vst1_u8(pu1_dst_tmp1, dc_val_t); |
| pu1_dst_tmp1 += dst_strd; |
| vst1_u8(pu1_dst_tmp1, dc_val_t); |
| pu1_dst_tmp1 += dst_strd; |
| vst1_u8(pu1_dst_tmp1, dc_val_t); |
| pu1_dst_tmp1 += dst_strd; |
| |
| /* For last set of 8 rows only 7 rows need to be updated since first row is already written */ |
| if(row != 8) |
| vst1_u8(pu1_dst_tmp1, dc_val_t); |
| pu1_dst_tmp1 = pu1_dst_tmp2 + 8; |
| } |
| } |
| pu1_dst[0] = (pu1_ref[two_nt - 1] + two_dc_val + pu1_ref[two_nt + 1] + 2) >> 2; |
| } |
| } |
| /* loops have been unrolled considering the fact that the width is a multiple of 4 */ |
| else |
| { |
| WORD32 acc_dc; |
| two_nt = 2 * nt; |
| |
| acc_dc = 0; |
| pu1_ref_tmp = pu1_ref + nt + 1; |
| for(i = nt; i < two_nt; i++) |
| { |
| acc_dc += pu1_ref[i]; |
| acc_dc += pu1_ref_tmp[i]; |
| } |
| dc_val = (acc_dc + nt) >> (log2nt_plus1); |
| two_dc_val = 2 * dc_val; |
| three_dc_val = 3 * dc_val; |
| three_dc_val = three_dc_val + 2; |
| dc_val_t = vdup_n_u8(dc_val); |
| |
| if(nt == 32) |
| { |
| pu1_dst_tmp = pu1_dst; |
| for(row = 0; row < nt; row++) |
| { |
| for(col = nt; col > 0; col -= 4) |
| { |
| vst1_lane_u32((uint32_t *)pu1_dst_tmp, vreinterpret_u32_u8(dc_val_t), 0); |
| pu1_dst_tmp += 4; |
| } |
| pu1_dst_tmp += dst_strd - nt; |
| } |
| } |
| else |
| |
| { |
| for(col = 1; col < nt; col++) |
| { |
| pu1_dst[col] = (pu1_ref[two_nt + 1 + col] + three_dc_val) >> 2; |
| } |
| |
| pu1_dst_tmp = pu1_dst + dst_strd + 0; |
| /* Since first row is already updated before, loop count is nt-1 */ |
| for(row = nt - 1; row > 0; row -= 1) |
| { |
| for(col = nt; col > 0; col -= 4) |
| { |
| vst1_lane_u32((uint32_t *)pu1_dst_tmp, vreinterpret_u32_u8(dc_val_t), 0); |
| pu1_dst_tmp += 4; |
| } |
| pu1_dst_tmp += dst_strd - nt; |
| } |
| |
| for(row = 1; row < nt; row++) |
| { |
| pu1_dst[row * dst_strd] = (pu1_ref[two_nt - 1 - row] + three_dc_val) >> 2; |
| } |
| pu1_dst[0] = (pu1_ref[two_nt - 1] + two_dc_val + pu1_ref[two_nt + 1] + 2) >> 2; |
| } |
| } |
| } |
| /* INTRA_PRED_LUMA_DC */ |
| |
| /** |
| ******************************************************************************* |
| * |
| * @brief |
| * Intra prediction interpolation filter for the horizontal luma mode. |
| * |
| * @par Description: |
| * Horizontal intraprediction with reference neighboring samples location |
| * pointed by 'pu1_ref' to the TU block location pointed by 'pu1_dst' |
| * |
| * @param[in] pu1_ref |
| * UWORD8 pointer to the reference (neighboring) samples |
| * |
| * @param[out] pu1_dst |
| * UWORD8 pointer to the destination |
| * |
| * @param[in] src_strd |
| * integer source stride |
| * |
| * @param[in] dst_strd |
| * integer destination stride |
| * |
| * @param[in] nt |
| * integer Transform Block size |
| * |
| * @param[in] mode |
| * integer intraprediction mode (unused here) |
| * |
| * @returns |
| * |
| * @remarks |
| * None |
| * |
| ******************************************************************************* |
| */ |
| |
| void ihevc_intra_pred_luma_horz_neonintr(UWORD8 *pu1_ref, |
| WORD32 src_strd, |
| UWORD8 *pu1_dst, |
| WORD32 dst_strd, |
| WORD32 nt, |
| WORD32 mode) |
| { |
| |
| WORD32 row, col; |
| WORD32 two_nt; |
| UNUSED(src_strd); |
| UNUSED(mode); |
| |
| two_nt = 2 * nt; |
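| /* Horizontal prediction copies the left reference sample of each row across the row: */ |
| /* pu1_dst[row * dst_strd + col] = pu1_ref[two_nt - 1 - row]. */ |
| /* For nt < 32 the first row is additionally gradient-filtered and saturated to [0, 255]: */ |
| /* pu1_dst[col] = pu1_ref[two_nt - 1] + ((pu1_ref[two_nt + 1 + col] - pu1_ref[two_nt]) >> 1) */ |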
| |
| |
| UWORD8 *pu1_dst_tmp = pu1_dst; |
| UWORD32 pu1_val; |
| uint8x8_t pu1_val_two_nt_1_row; |
| if(nt == 32) |
| { |
| pu1_dst_tmp = pu1_dst; |
| for(row = 0; row < nt; row++) |
| { |
| pu1_val = pu1_ref[two_nt - 1 - row]; |
| pu1_val_two_nt_1_row = vdup_n_u8(pu1_val); |
| for(col = nt; col > 0; col -= 8) |
| { |
| vst1_u8(pu1_dst_tmp, pu1_val_two_nt_1_row); |
| pu1_dst_tmp += 8; |
| } |
| pu1_dst_tmp += dst_strd - nt; |
| } |
| } |
| else |
| |
| |
| /* The row loop has been unrolled, hence the pu1_ref_val1 and pu1_ref_val2 variables */ |
| /* Variables are named according to the operation (instruction) they perform */ |
| /* (e.g. shift_val contains the shifted value, */ |
| /* add_sat holds the add-and-saturate result) */ |
| /* Loops are unrolled by 4 and 8 considering that the input width is a multiple of either 4 or 8 */ |
| /* rows and columns are unrolled by 4 when the width is a multiple of 4 */ |
| { |
| if(0 != (nt & 7)) /* cond for multiple of 4 */ |
| { |
| UWORD8 *pu1_ref_4_two_nt_plus1 = pu1_ref; |
| UWORD8 *pu1_ref_4_two_nt_minus_nt = pu1_ref; |
| UWORD8 *pu1_dst_4 = pu1_dst; |
| UWORD8 *pu1_dst_4_tmp = pu1_dst; |
| |
| uint32x2_t pu1_ref_val1, pu1_ref_val2; |
| uint8x8_t dup_sub, round_val, dup_val; |
| uint16x8_t dup_add, sub_val; |
| int16x8_t shift_val, add_sat; |
| |
| pu1_ref_val1 = vdup_n_u32(0); |
| pu1_ref_val2 = vdup_n_u32(0); |
| |
| dup_sub = vdup_n_u8(pu1_ref[two_nt]); |
| |
| dup_add = vdupq_n_u16(pu1_ref[two_nt - 1]); |
| |
| pu1_ref_4_two_nt_plus1 += (two_nt + 1); |
| |
| pu1_ref_4_two_nt_minus_nt += (two_nt - nt); |
| |
| for(row = nt; row > 0; row -= 4) |
| { |
| for(col = nt; col > 0; col -= 4) |
| { |
| pu1_ref_val1 = vld1_lane_u32((uint32_t *)pu1_ref_4_two_nt_plus1, pu1_ref_val1, 0); |
| sub_val = vsubl_u8(vreinterpret_u8_u32(pu1_ref_val1), dup_sub); |
| shift_val = vshrq_n_s16(vreinterpretq_s16_u16(sub_val), 1); |
| |
| add_sat = vqaddq_s16(shift_val, vreinterpretq_s16_u16(dup_add)); |
| round_val = vqmovun_s16(add_sat); |
| vst1_lane_u32((uint32_t *)pu1_dst_4, vreinterpret_u32_u8(round_val), 0); |
| pu1_dst_4 += dst_strd; |
| |
| pu1_ref_val2 = vld1_lane_u32((uint32_t *)pu1_ref_4_two_nt_minus_nt, pu1_ref_val2, 0); |
| dup_val = vdup_lane_u8(vreinterpret_u8_u32(pu1_ref_val2), 2); |
| vst1_lane_u32((uint32_t *)pu1_dst_4, vreinterpret_u32_u8(dup_val), 0); |
| pu1_dst_4 += dst_strd; |
| |
| dup_val = vdup_lane_u8(vreinterpret_u8_u32(pu1_ref_val2), 1); |
| vst1_lane_u32((uint32_t *)pu1_dst_4, vreinterpret_u32_u8(dup_val), 0); |
| pu1_dst_4 += dst_strd; |
| |
| dup_val = vdup_lane_u8(vreinterpret_u8_u32(pu1_ref_val2), 0); |
| vst1_lane_u32((uint32_t *)pu1_dst_4, vreinterpret_u32_u8(dup_val), 0); |
| pu1_dst_4 += dst_strd; |
| |
| |
| } |
| /* worst cases */ |
| pu1_ref_4_two_nt_minus_nt += 3; |
| pu1_ref_4_two_nt_plus1 += 4; |
| pu1_dst_4 = (pu1_dst_4_tmp + 4); |
| } |
| |
| } |
| |
| /* dup_1 - dup_8 hold the duplicated values taken from the loaded source */ |
| /* Variables are named according to the operation (instruction) they perform */ |
| /* Loops are unrolled by 4 and 8 considering that the input width is a multiple of either 4 or 8 */ |
| /* rows and columns are unrolled by 8 when the width is a multiple of 8 */ |
| |
| else |
| { |
| UWORD8 *pu1_ref_tmp_1 = pu1_ref; |
| UWORD8 *pu1_ref_tmp_2 = pu1_ref; |
| |
| UWORD8 *pu1_dst_tmp_1 = pu1_dst; |
| UWORD8 *pu1_dst_tmp_2 = pu1_dst + dst_strd; |
| UWORD8 *pu1_dst_tmp_3 = pu1_dst + dst_strd; |
| |
| uint8x8_t dup_sub, src_tmp, src_tmp_1, round_val, dup_1, dup_2, dup_3, dup_4, dup_5, dup_6, dup_7, dup_8, rev_res; |
| uint16x8_t sub_res, dup_add; |
| int16x8_t shift_res, add_res; |
| |
| dup_sub = vdup_n_u8(pu1_ref[two_nt]); |
| dup_add = vdupq_n_u16(pu1_ref[two_nt - 1]); |
| |
| pu1_ref_tmp_1 += (two_nt + 1); |
| pu1_ref_tmp_2 += (two_nt - 1); |
| |
| for(col = nt; col > 0; col -= 8) |
| { |
| src_tmp = vld1_u8(pu1_ref_tmp_1); |
| pu1_ref_tmp_1 += 8; |
| |
| sub_res = vsubl_u8(src_tmp, dup_sub); |
| shift_res = vshrq_n_s16(vreinterpretq_s16_u16(sub_res), 1); |
| add_res = vqaddq_s16(shift_res, vreinterpretq_s16_u16(dup_add)); |
| round_val = vqmovun_s16(add_res); |
| vst1_u8(pu1_dst_tmp_1, round_val); |
| pu1_dst_tmp_1 += 8; |
| } |
| |
| for(row = nt; row > 0; row -= 8) |
| { |
| pu1_ref_tmp_2 -= 8; |
| |
| src_tmp_1 = vld1_u8(pu1_ref_tmp_2); |
| rev_res = vrev64_u8(src_tmp_1); /* Reversing the loaded values */ |
| |
| dup_1 = vdup_lane_u8(rev_res, 0); |
| dup_2 = vdup_lane_u8(rev_res, 1); |
| dup_3 = vdup_lane_u8(rev_res, 2); |
| dup_4 = vdup_lane_u8(rev_res, 3); |
| dup_5 = vdup_lane_u8(rev_res, 4); |
| dup_6 = vdup_lane_u8(rev_res, 5); |
| dup_7 = vdup_lane_u8(rev_res, 6); |
| dup_8 = vdup_lane_u8(rev_res, 7); |
| |
| for(col = nt; col > 0; col -= 8) |
| { |
| pu1_dst_tmp_2 = pu1_dst_tmp_3; |
| |
| vst1_u8(pu1_dst_tmp_2, dup_1); |
| pu1_dst_tmp_2 += dst_strd; |
| |
| vst1_u8(pu1_dst_tmp_2, dup_2); |
| pu1_dst_tmp_2 += dst_strd; |
| |
| vst1_u8(pu1_dst_tmp_2, dup_3); |
| pu1_dst_tmp_2 += dst_strd; |
| |
| vst1_u8(pu1_dst_tmp_2, dup_4); |
| pu1_dst_tmp_2 += dst_strd; |
| |
| vst1_u8(pu1_dst_tmp_2, dup_5); |
| pu1_dst_tmp_2 += dst_strd; |
| |
| vst1_u8(pu1_dst_tmp_2, dup_6); |
| pu1_dst_tmp_2 += dst_strd; |
| |
| vst1_u8(pu1_dst_tmp_2, dup_7); |
| pu1_dst_tmp_2 += dst_strd; |
| |
| /* For last set of 8 rows only 7 rows need to be updated since first row is already written */ |
| if(row != 8) |
| vst1_u8(pu1_dst_tmp_2, dup_8); |
| pu1_dst_tmp_2 += dst_strd; |
| |
| pu1_dst_tmp_3 += 8; |
| } |
| pu1_dst_tmp_2 -= (nt - 8); |
| pu1_dst_tmp_3 = pu1_dst_tmp_2; |
| } |
| } |
| } |
| } |
| /* INTRA_PRED_LUMA_HORZ */ |
| |
| /** |
| ******************************************************************************* |
| * |
| * @brief |
| * Intra prediction interpolation filter for the vertical luma mode. |
| * |
| * @par Description: |
| * Vertical intraprediction with reference neighboring samples location |
| * pointed by 'pu1_ref' to the TU block location pointed by 'pu1_dst' |
| * |
| * @param[in] pu1_ref |
| * UWORD8 pointer to the reference (neighboring) samples |
| * |
| * @param[out] pu1_dst |
| * UWORD8 pointer to the destination |
| * |
| * @param[in] src_strd |
| * integer source stride |
| * |
| * @param[in] dst_strd |
| * integer destination stride |
| * |
| * @param[in] nt |
| * integer Transform Block size |
| * |
| * @param[in] mode |
| * integer intraprediction mode (unused here) |
| * |
| * @returns |
| * |
| * @remarks |
| * None |
| * |
| ******************************************************************************* |
| */ |
| |
| void ihevc_intra_pred_luma_ver_neonintr(UWORD8 *pu1_ref, |
| WORD32 src_strd, |
| UWORD8 *pu1_dst, |
| WORD32 dst_strd, |
| WORD32 nt, |
| WORD32 mode) |
| { |
| WORD32 row, col; |
| WORD32 two_nt; |
| UNUSED(src_strd); |
| UNUSED(mode); |
| |
| two_nt = 2 * nt; |
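| /* Vertical prediction copies the top reference row into every row: */ |
| /* pu1_dst[row * dst_strd + col] = pu1_ref[two_nt + 1 + col]. */ |
| /* For nt < 32 the first column is additionally gradient-filtered and saturated to [0, 255]: */ |
| /* pu1_dst[row * dst_strd] = pu1_ref[two_nt + 1] + ((pu1_ref[two_nt - 1 - row] - pu1_ref[two_nt]) >> 1) */ |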
| |
| UWORD8 *pu1_dst_tmp = pu1_dst; |
| UWORD8 *pu1_ref_tmp_1 = pu1_ref + two_nt + 1; |
| uint8x8_t pu1_val_two_nt_1_col; |
| if(nt == 32) |
| { |
| pu1_dst_tmp = pu1_dst; |
| for(row = 0; row < nt; row++) |
| { |
| for(col = nt; col > 0; col -= 8) |
| { |
| pu1_val_two_nt_1_col = vld1_u8(pu1_ref_tmp_1); |
| pu1_ref_tmp_1 += 8; |
| vst1_u8(pu1_dst_tmp, pu1_val_two_nt_1_col); |
| pu1_dst_tmp += 8; |
| } |
| pu1_ref_tmp_1 -= nt; |
| pu1_dst_tmp += dst_strd - nt; |
| } |
| } |
| else |
| |
| { |
| /* Variables are named according to the operation (instruction) they perform */ |
| /* (e.g. shift_val contains the shifted value, */ |
| /* add_sat holds the add-and-saturate result) */ |
| /* Loops are unrolled by 4 and 8 considering that the input width is a multiple of either 4 or 8 */ |
| /* rows and columns are unrolled by 4 when the width is a multiple of 4 */ |
| |
| if(0 != (nt & 7)) |
| { |
| WORD32 cond_4 = 0; |
| UWORD8 *pu1_ref_val1 = pu1_ref; |
| UWORD8 *pu1_ref_val2 = pu1_ref; |
| UWORD8 *pu1_ref_val3 = pu1_ref; |
| |
| UWORD8 *pu1_dst_val1 = pu1_dst; |
| UWORD8 *pu1_dst_val2 = pu1_dst; |
| UWORD8 *pu1_dst_val3 = pu1_dst; |
| |
| uint8x8_t dup_2_sub, round_val, vext_val; |
| uint16x8_t dup_2_add; |
| uint32x2_t src_val1, src_val2, src_val3; |
| uint16x8_t sub_val; |
| int16x8_t shift_val1, add_sat; |
| uint64x1_t shift_val2; |
| |
| src_val1 = vdup_n_u32(0); |
| src_val2 = vdup_n_u32(0); |
| src_val3 = vdup_n_u32(0); |
| pu1_ref_val1 += (two_nt - nt); |
| pu1_ref_val3 += (two_nt + 2); |
| pu1_ref_val2 += (two_nt + 1); |
| |
| dup_2_sub = vdup_n_u8(pu1_ref[two_nt]); |
| dup_2_add = vdupq_n_u16(pu1_ref[two_nt + 1]); |
| |
| /* loops to store the first nt sets of values in the destination */ |
| |
| for(row = nt; row > 0; row -= 4) |
| { |
| for(col = nt; (col > 0) && (cond_4 == 0); col -= 4) |
| { |
| /* unrolling s2_predpixel = pu1_ref[two_nt + 1] + ((pu1_ref[two_nt - 1 - row] - pu1_ref[two_nt]) >> 1); here*/ |
| src_val1 = vld1_lane_u32((uint32_t *)pu1_ref_val1, src_val1, 1); |
| sub_val = vsubl_u8(vreinterpret_u8_u32(src_val1), dup_2_sub); |
| shift_val1 = vshrq_n_s16(vreinterpretq_s16_u16(sub_val), 1); |
| add_sat = vqaddq_s16(shift_val1, vreinterpretq_s16_u16(dup_2_add)); |
| round_val = vqmovun_s16(add_sat); |
| |
| /* unrolling pu1_dst[row * dst_strd + col] = pu1_ref[two_nt + 1 + col]; here*/ |
| src_val2 = vld1_lane_u32((uint32_t *)pu1_ref_val3, src_val2, 0); |
| vext_val = vext_u8(round_val, vreinterpret_u8_u32(src_val2), 7); |
| vst1_lane_u32((uint32_t *)pu1_dst_val1, vreinterpret_u32_u8(vext_val), 0); |
| pu1_dst_val1 += dst_strd; |
| |
| shift_val2 = vshl_n_u64(vreinterpret_u64_u8(round_val), 8); |
| |
| vext_val = vext_u8(vreinterpret_u8_u64(shift_val2), vreinterpret_u8_u32(src_val2), 7); |
| vst1_lane_u32((uint32_t *)pu1_dst_val1, vreinterpret_u32_u8(vext_val), 0); |
| pu1_dst_val1 += dst_strd; |
| |
| shift_val2 = vshl_n_u64(vreinterpret_u64_u8(round_val), 16); |
| |
| vext_val = vext_u8(vreinterpret_u8_u64(shift_val2), vreinterpret_u8_u32(src_val2), 7); |
| vst1_lane_u32((uint32_t *)pu1_dst_val1, vreinterpret_u32_u8(vext_val), 0); |
| pu1_dst_val1 += dst_strd; |
| |
| shift_val2 = vshl_n_u64(vreinterpret_u64_u8(round_val), 24); |
| |
| vext_val = vext_u8(vreinterpret_u8_u64(shift_val2), vreinterpret_u8_u32(src_val2), 7); |
| vst1_lane_u32((uint32_t *)pu1_dst_val1, vreinterpret_u32_u8(vext_val), 0); |
| pu1_dst_val1 += dst_strd; |
| |
| pu1_ref_val1 -= 4; |
| } |
| |
| /* loop to store next sets of eight values in the destination */ |
| |
| for(col = nt - 3; (col > 0) && (cond_4 == 1); col -= 4) |
| { |
| src_val3 = vld1_lane_u32((uint32_t *)pu1_ref_val2, src_val3, 0); |
| |
| vst1_u8(pu1_dst_val2, vreinterpret_u8_u32(src_val3)); |
| pu1_dst_val2 += dst_strd; |
| |
| vst1_u8(pu1_dst_val2, vreinterpret_u8_u32(src_val3)); |
| pu1_dst_val2 += dst_strd; |
| |
| vst1_u8(pu1_dst_val2, vreinterpret_u8_u32(src_val3)); |
| pu1_dst_val2 += dst_strd; |
| |
| vst1_u8(pu1_dst_val2, vreinterpret_u8_u32(src_val3)); |
| pu1_dst_val2 += dst_strd; |
| } |
| pu1_ref_val2 += 4; |
| pu1_dst_val3 += 4; |
| pu1_dst_val2 = pu1_dst_val3; |
| cond_4 = 1; |
| } |
| } |
| |
| /* rows and columns are unrolled by 8 when the width is a multiple of 8 */ |
| else |
| { |
| WORD32 cond = 0, col_1; |
| UWORD8 *pu1_dst_tmp_1 = pu1_dst; |
| UWORD8 *pu1_dst_tmp_2 = pu1_dst; |
| UWORD8 *pu1_dst_tmp_3 = pu1_dst; |
| |
| UWORD8 *pu1_ref_tmp_1 = pu1_ref; |
| UWORD8 *pu1_ref_tmp_2 = pu1_ref; |
| UWORD8 *pu1_ref_tmp_3 = pu1_ref; |
| |
| uint8x8_t pu1_src_tmp1; |
| uint8x8_t pu1_src_tmp2; |
| |
| uint8x8_t dup_sub; |
| uint16x8_t dup_add; |
| int16x8_t subsh_val; |
| int16x8_t addsat_val; |
| uint16x8_t sub_val; |
| uint8x8_t round_val; |
| uint8x8_t vext_t; |
| uint64x1_t shift_64; |
| |
| dup_sub = vdup_n_u8(pu1_ref[two_nt]); |
| dup_add = vdupq_n_u16(pu1_ref[two_nt + 1]); |
| |
| pu1_ref_tmp_1 += (two_nt); |
| pu1_ref_tmp_1 -= 8; |
| pu1_ref_tmp_2 += (two_nt + 2); |
| pu1_ref_tmp_3 += (two_nt + 1); |
| |
| /* loops to store the first nt sets of values in the destination */ |
| |
| for(row = nt; row > 0; row -= 8) |
| { |
| for(col = (nt - 1); (col > 0) && (cond == 0); col -= 8) |
| { |
| pu1_src_tmp1 = vld1_u8(pu1_ref_tmp_1); |
| |
| sub_val = vsubl_u8(pu1_src_tmp1, dup_sub); |
| subsh_val = vshrq_n_s16(vreinterpretq_s16_u16(sub_val), 1); |
| addsat_val = vqaddq_s16(subsh_val, vreinterpretq_s16_u16(dup_add)); |
| round_val = vqmovun_s16(addsat_val); |
| |
| /* unrolling pu1_dst[row * dst_strd + col] = pu1_ref[two_nt + 1 + col]; here*/ |
| |
| pu1_src_tmp2 = vld1_u8(pu1_ref_tmp_2); |
| vext_t = vext_u8(round_val, pu1_src_tmp2, 7); |
| vst1_u8(pu1_dst_tmp_1, vext_t); |
| pu1_dst_tmp_1 += dst_strd; |
| |
| shift_64 = vshl_n_u64(vreinterpret_u64_u8(round_val), 8); |
| |
| vext_t = vext_u8(vreinterpret_u8_u64(shift_64), pu1_src_tmp2, 7); |
| vst1_u8(pu1_dst_tmp_1, vext_t); |
| pu1_dst_tmp_1 += dst_strd; |
| |
| shift_64 = vshl_n_u64(vreinterpret_u64_u8(round_val), 16); |
| vext_t = vext_u8(vreinterpret_u8_u64(shift_64), pu1_src_tmp2, 7); |
| vst1_u8(pu1_dst_tmp_1, vext_t); |
| pu1_dst_tmp_1 += dst_strd; |
| |
| shift_64 = vshl_n_u64(vreinterpret_u64_u8(round_val), 24); |
| vext_t = vext_u8(vreinterpret_u8_u64(shift_64), pu1_src_tmp2, 7); |
| vst1_u8(pu1_dst_tmp_1, vext_t); |
| pu1_dst_tmp_1 += dst_strd; |
| |
| shift_64 = vshl_n_u64(vreinterpret_u64_u8(round_val), 32); |
| vext_t = vext_u8(vreinterpret_u8_u64(shift_64), pu1_src_tmp2, 7); |
| vst1_u8(pu1_dst_tmp_1, vext_t); |
| pu1_dst_tmp_1 += dst_strd; |
| |
| shift_64 = vshl_n_u64(vreinterpret_u64_u8(round_val), 40); |
| vext_t = vext_u8(vreinterpret_u8_u64(shift_64), pu1_src_tmp2, 7); |
| vst1_u8(pu1_dst_tmp_1, vext_t); |
| pu1_dst_tmp_1 += dst_strd; |
| |
| shift_64 = vshl_n_u64(vreinterpret_u64_u8(round_val), 48); |
| vext_t = vext_u8(vreinterpret_u8_u64(shift_64), pu1_src_tmp2, 7); |
| vst1_u8(pu1_dst_tmp_1, vext_t); |
| pu1_dst_tmp_1 += dst_strd; |
| |
| shift_64 = vshl_n_u64(vreinterpret_u64_u8(round_val), 56); |
| vext_t = vext_u8(vreinterpret_u8_u64(shift_64), pu1_src_tmp2, 7); |
| vst1_u8(pu1_dst_tmp_1, vext_t); |
| pu1_dst_tmp_1 += dst_strd; |
| |
| pu1_ref_tmp_1 -= 8; |
| } |
| |
| /* loop to store next sets of eight values in the destination */ |
| |
| for(col_1 = nt - 7; (col_1 > 0) && (cond == 1); col_1 -= 8) |
| { |
| pu1_src_tmp2 = vld1_u8(pu1_ref_tmp_3); |
| |
| vst1_u8(pu1_dst_tmp_2, pu1_src_tmp2); |
| pu1_dst_tmp_2 += dst_strd; |
| |
| vst1_u8(pu1_dst_tmp_2, pu1_src_tmp2); |
| pu1_dst_tmp_2 += dst_strd; |
| |
| vst1_u8(pu1_dst_tmp_2, pu1_src_tmp2); |
| pu1_dst_tmp_2 += dst_strd; |
| |
| vst1_u8(pu1_dst_tmp_2, pu1_src_tmp2); |
| pu1_dst_tmp_2 += dst_strd; |
| |
| vst1_u8(pu1_dst_tmp_2, pu1_src_tmp2); |
| pu1_dst_tmp_2 += dst_strd; |
| |
| vst1_u8(pu1_dst_tmp_2, pu1_src_tmp2); |
| pu1_dst_tmp_2 += dst_strd; |
| |
| vst1_u8(pu1_dst_tmp_2, pu1_src_tmp2); |
| pu1_dst_tmp_2 += dst_strd; |
| |
| vst1_u8(pu1_dst_tmp_2, pu1_src_tmp2); |
| pu1_dst_tmp_2 += dst_strd; |
| } |
| pu1_ref_tmp_3 += 8; |
| pu1_dst_tmp_3 += 8; |
| pu1_dst_tmp_2 = pu1_dst_tmp_3; |
| cond = 1; |
| } |
| } |
| } |
| } |
| /* INTRA_PRED_LUMA_VER */ |
| |
| /** |
| ******************************************************************************* |
| * |
| * @brief |
| * Intra prediction interpolation filter for luma mode2. |
| * |
| * @par Description: |
| * Intraprediction for mode 2 (sw angle) with reference neighboring samples |
| * location pointed by 'pu1_ref' to the TU block location pointed by |
| * 'pu1_dst' |
| * |
| * @param[in] pu1_ref |
| * UWORD8 pointer to the reference (neighboring) samples |
| * |
| * @param[out] pu1_dst |
| * UWORD8 pointer to the destination |
| * |
| * @param[in] src_strd |
| * integer source stride |
| * |
| * @param[in] dst_strd |
| * integer destination stride |
| * |
| * @param[in] nt |
| * integer Transform Block size |
| * |
| * @param[in] mode |
| * integer intraprediction mode (unused here) |
| * |
| * @returns |
| * |
| * @remarks |
| * None |
| * |
| ******************************************************************************* |
| */ |
| |
| void ihevc_intra_pred_luma_mode2_neonintr(UWORD8 *pu1_ref, |
| WORD32 src_strd, |
| UWORD8 *pu1_dst, |
| WORD32 dst_strd, |
| WORD32 nt, |
| WORD32 mode) |
| { |
| |
| WORD32 row, col; |
| WORD32 two_nt; |
| UNUSED(src_strd); |
| UNUSED(mode); |
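| /* Mode 2 is the pure bottom-left (sw) diagonal; each row reads the left reference */ |
| /* starting one sample lower than the previous row: */ |
| /* pu1_dst[row * dst_strd + col] = pu1_ref[2 * nt - 2 - row - col] */ |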
| |
| /* rev_res is named to hold the reversed result value */ |
| /* Loops are unrolled by 4 and 8 considering that the input width is a multiple of either 4 or 8 */ |
| /* rows and columns are unrolled by 4 when the width is a multiple of 4 */ |
| |
| if(0 != (nt & 7)) |
| { |
| UWORD8 *pu1_ref_tmp = pu1_ref; |
| UWORD8 *pu1_dst_tmp = pu1_dst; |
| uint8x8_t pu1_src_val, rev_res; |
| uint64x1_t shift_res; |
| |
| for(col = nt; col > 0; col -= 4) |
| { |
| for(row = nt; row > 0; row -= 4) |
| { |
| /* unrolling all col & rows for pu1_dst[row + (col * dst_strd)] = pu1_ref[two_nt - col - idx - 1]; */ |
| |
| pu1_src_val = vld1_u8(pu1_ref_tmp); |
| shift_res = vshl_n_u64(vreinterpret_u64_u8(pu1_src_val), 8); |
| rev_res = vrev64_u8(vreinterpret_u8_u64(shift_res)); |
| |
| vst1_lane_u32((uint32_t *)pu1_dst_tmp, vreinterpret_u32_u8(rev_res), 0); |
| pu1_dst_tmp += dst_strd; |
| |
| shift_res = vshr_n_u64(vreinterpret_u64_u8(rev_res), 8); |
| vst1_lane_u32((uint32_t *)pu1_dst_tmp, vreinterpret_u32_u64(shift_res), 0); |
| pu1_dst_tmp += dst_strd; |
| |
| shift_res = vshr_n_u64(shift_res, 8); |
| vst1_lane_u32((uint32_t *)pu1_dst_tmp, vreinterpret_u32_u64(shift_res), 0); |
| pu1_dst_tmp += dst_strd; |
| |
| shift_res = vshr_n_u64(shift_res, 8); |
| vst1_lane_u32((uint32_t *)pu1_dst_tmp, vreinterpret_u32_u64(shift_res), 0); |
| pu1_dst_tmp += dst_strd; |
| } |
| } |
| } |
| |
| /* rev_val_second, rev_val_first hold the reversed loaded values so they are in the right order */ |
| /* shift_val shifts the reversed 2nd values to get the values we need */ |
| /* rows and columns are unrolled by 8 when the width is a multiple of 8 */ |
| |
| else |
| { |
| UWORD8 *pu1_ref_two_nt_minus2 = pu1_ref; |
| UWORD8 *pu1_dst_tmp = pu1_dst; |
| UWORD8 *pu1_dst_tmp_plus8 = pu1_dst; |
| |
| uint8x8_t pu1_src_val1, pu1_src_val2, vext_t, rev_val_second, rev_val_first; |
| uint64x1_t shift_val; |
| |
| two_nt = 2 * nt; |
| pu1_ref_two_nt_minus2 += (two_nt); |
| pu1_ref_two_nt_minus2 -= 8; |
| |
| for(col = nt; col > 0; col -= 8) |
| { |
| for(row = nt; row > 0; row -= 8) |
| { |
| pu1_src_val2 = vld1_u8(pu1_ref_two_nt_minus2); |
| rev_val_first = vrev64_u8(pu1_src_val2); |
| |
| pu1_ref_two_nt_minus2 -= 8; |
| pu1_src_val1 = vld1_u8(pu1_ref_two_nt_minus2); |
| rev_val_second = vrev64_u8(pu1_src_val1); |
| |
| vext_t = vext_u8(rev_val_first, rev_val_second, 1); |
| vst1_u8(pu1_dst_tmp, vext_t); |
| pu1_dst_tmp += dst_strd; |
| |
| shift_val = vshr_n_u64(vreinterpret_u64_u8(rev_val_second), 8); |
| vext_t = vext_u8(vext_t, vreinterpret_u8_u64(shift_val), 1); |
| vst1_u8(pu1_dst_tmp, vext_t); |
| pu1_dst_tmp += dst_strd; |
| |
| shift_val = vshr_n_u64(vreinterpret_u64_u8(rev_val_second), 16); |
| vext_t = vext_u8(vext_t, vreinterpret_u8_u64(shift_val), 1); |
| vst1_u8(pu1_dst_tmp, vext_t); |
| pu1_dst_tmp += dst_strd; |
| |
| shift_val = vshr_n_u64(vreinterpret_u64_u8(rev_val_second), 24); |
| vext_t = vext_u8(vext_t, vreinterpret_u8_u64(shift_val), 1); |
| vst1_u8(pu1_dst_tmp, vext_t); |
| pu1_dst_tmp += dst_strd; |
| |
| shift_val = vshr_n_u64(vreinterpret_u64_u8(rev_val_second), 32); |
| vext_t = vext_u8(vext_t, vreinterpret_u8_u64(shift_val), 1); |
| vst1_u8(pu1_dst_tmp, vext_t); |
| pu1_dst_tmp += dst_strd; |
| |
| shift_val = vshr_n_u64(vreinterpret_u64_u8(rev_val_second), 40); |
| vext_t = vext_u8(vext_t, vreinterpret_u8_u64(shift_val), 1); |
| vst1_u8(pu1_dst_tmp, vext_t); |
| pu1_dst_tmp += dst_strd; |
| |
| shift_val = vshr_n_u64(vreinterpret_u64_u8(rev_val_second), 48); |
| vext_t = vext_u8(vext_t, vreinterpret_u8_u64(shift_val), 1); |
| vst1_u8(pu1_dst_tmp, vext_t); |
| pu1_dst_tmp += dst_strd; |
| |
| shift_val = vshr_n_u64(vreinterpret_u64_u8(rev_val_second), 56); |
| vext_t = vext_u8(vext_t, vreinterpret_u8_u64(shift_val), 1); |
| vst1_u8(pu1_dst_tmp, vext_t); |
| pu1_dst_tmp += dst_strd; |
| } |
| pu1_dst_tmp_plus8 += 8; |
| pu1_dst_tmp = pu1_dst_tmp_plus8; |
| pu1_ref_two_nt_minus2 += (nt - 8); |
| } |
| } |
| } |
| /* INTRA_PRED_LUMA_MODE2 */ |
| |
| /** |
| ******************************************************************************* |
| * |
| * @brief |
| * Intra prediction interpolation filter for luma mode 18 & mode 34. |
| * |
| * @par Description: |
| * Intraprediction for mode 18 (nw angle) and mode 34 (ne angle) with reference neighboring |
| * samples location pointed by 'pu1_ref' to the TU block location pointed by |
| * 'pu1_dst' |
| * |
| * @param[in] pu1_ref |
| * UWORD8 pointer to the reference (neighboring) samples |
| * |
| * @param[out] pu1_dst |
| * UWORD8 pointer to the destination |
| * |
| * @param[in] src_strd |
| * integer source stride |
| * |
| * @param[in] dst_strd |
| * integer destination stride |
| * |
| * @param[in] nt |
| * integer Transform Block size |
| * |
| * @param[in] mode |
| * integer intraprediction mode (18 or 34) |
| * |
| * @returns |
| * |
| * @remarks |
| * None |
| * |
| ******************************************************************************* |
| */ |
| |
| void ihevc_intra_pred_luma_mode_18_34_neonintr(UWORD8 *pu1_ref, |
| WORD32 src_strd, |
| UWORD8 *pu1_dst, |
| WORD32 dst_strd, |
| WORD32 nt, |
| WORD32 mode) |
| { |
| |
| WORD32 row, col, idx; |
| WORD32 intraPredAngle = 32; |
| WORD32 two_nt; |
| UNUSED(src_strd); |
| two_nt = 2 * nt; |
| |
| UWORD8 *pu1_ref_tmp = pu1_ref; |
| UWORD8 *pu1_ref_tmp1 = pu1_ref; |
| UWORD8 *pu1_dst_tmp = pu1_dst; |
| UWORD8 *pu1_dst_tmp_plus8 = pu1_dst; |
| |
| uint8x8_t src_tmp_1st, src_tmp_2nd, vext1, vext2, vext3, vext4, vext5, vext6, vext7; |
| |
| /* src_tmp_1st, src_tmp_2nd are named to hold the 1st eight and the next eight values loaded from the source (pu1_ref) */ |
| /* vext1 - vext7 hold the vext results between the 2 loaded values and help dual issue */ |
| /* Loops are unrolled by 4 and 8 considering that the input width is a multiple of either 4 or 8 */ |
| /* rows and columns are unrolled by 8 when the width is a multiple of 8 */ |
| /* separate loops are maintained for mode 18 and mode 34 */ |
| |
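| /* For mode 34 the prediction angle is +32 and for mode 18 it is -32, so */ |
| /* the per-row offset is exactly +/-(row + 1) samples with a fractional */ |
| /* part of zero: every output row is a plain copy of reference samples */ |
| /* shifted by one, which is why the paths below only shuffle bytes and */ |
| /* never interpolate. */ |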
| /* Path for block sizes that are a multiple of 8 */ |
| if(0 == (nt & 7)) |
| { |
| if(mode == 34) |
| { |
| pu1_ref_tmp += (two_nt + 2); |
| |
| for(row = nt; row > 0; row -= 8) |
| { |
| for(col = nt; col > 0; col -= 8) |
| { |
| /* Loading 1st eight values */ |
| src_tmp_1st = vld1_u8(pu1_ref_tmp); |
| pu1_ref_tmp += 8; |
| |
| /* Loading next eight values */ |
| src_tmp_2nd = vld1_u8(pu1_ref_tmp); |
| |
| /* UNROLLED pu1_dst[col + (row * dst_strd)] = pu1_ref[two_nt + col + idx + 1] */ |
| vext1 = vext_u8(src_tmp_1st, src_tmp_2nd, 1); |
| vst1_u8(pu1_dst_tmp, src_tmp_1st); |
| pu1_dst_tmp += dst_strd; |
| |
| vext2 = vext_u8(src_tmp_1st, src_tmp_2nd, 2); |
| vst1_u8(pu1_dst_tmp, vext1); |
| pu1_dst_tmp += dst_strd; |
| |
| vext3 = vext_u8(src_tmp_1st, src_tmp_2nd, 3); |
| vst1_u8(pu1_dst_tmp, vext2); |
| pu1_dst_tmp += dst_strd; |
| |
| vext4 = vext_u8(src_tmp_1st, src_tmp_2nd, 4); |
| vst1_u8(pu1_dst_tmp, vext3); |
| pu1_dst_tmp += dst_strd; |
| |
| vext5 = vext_u8(src_tmp_1st, src_tmp_2nd, 5); |
| vst1_u8(pu1_dst_tmp, vext4); |
| pu1_dst_tmp += dst_strd; |
| |
| vext6 = vext_u8(src_tmp_1st, src_tmp_2nd, 6); |
| vst1_u8(pu1_dst_tmp, vext5); |
| pu1_dst_tmp += dst_strd; |
| |
| vext7 = vext_u8(src_tmp_1st, src_tmp_2nd, 7); |
| vst1_u8(pu1_dst_tmp, vext6); |
| pu1_dst_tmp += dst_strd; |
| |
| vst1_u8(pu1_dst_tmp, vext7); |
| pu1_dst_tmp += dst_strd; |
| } |
| |
| pu1_dst_tmp_plus8 += 8; |
| pu1_dst_tmp = pu1_dst_tmp_plus8; |
| pu1_ref_tmp -= (nt - 8); |
| } |
| } |
| else /* Loop for mode 18 */ |
| { |
| pu1_ref_tmp += (two_nt); |
| |
| for(row = nt; row > 0; row -= 8) |
| { |
| for(col = nt; col > 0; col -= 8) |
| { |
| /* Loading 1st eight values */ |
| src_tmp_1st = vld1_u8(pu1_ref_tmp); |
| pu1_ref_tmp -= 8; |
| |
| /* Loading next eight values */ |
| src_tmp_2nd = vld1_u8(pu1_ref_tmp); |
| |
| /* UNROLLED pu1_dst[col + (row * dst_strd)] = pu1_ref[two_nt + col + idx + 1] */ |
| vext1 = vext_u8(src_tmp_2nd, src_tmp_1st, 7); |
| vst1_u8(pu1_dst_tmp, src_tmp_1st); |
| pu1_dst_tmp += dst_strd; |
| |
| vext2 = vext_u8(src_tmp_2nd, src_tmp_1st, 6); |
| vst1_u8(pu1_dst_tmp, vext1); |
| pu1_dst_tmp += dst_strd; |
| |
| vext3 = vext_u8(src_tmp_2nd, src_tmp_1st, 5); |
| vst1_u8(pu1_dst_tmp, vext2); |
| pu1_dst_tmp += dst_strd; |
| |
| vext4 = vext_u8(src_tmp_2nd, src_tmp_1st, 4); |
| vst1_u8(pu1_dst_tmp, vext3); |
| pu1_dst_tmp += dst_strd; |
| |
| vext5 = vext_u8(src_tmp_2nd, src_tmp_1st, 3); |
| vst1_u8(pu1_dst_tmp, vext4); |
| pu1_dst_tmp += dst_strd; |
| |
| vext6 = vext_u8(src_tmp_2nd, src_tmp_1st, 2); |
| vst1_u8(pu1_dst_tmp, vext5); |
| pu1_dst_tmp += dst_strd; |
| |
| vext7 = vext_u8(src_tmp_2nd, src_tmp_1st, 1); |
| vst1_u8(pu1_dst_tmp, vext6); |
| pu1_dst_tmp += dst_strd; |
| |
| vst1_u8(pu1_dst_tmp, vext7); |
| pu1_dst_tmp += dst_strd; |
| } |
| pu1_dst_tmp_plus8 += 8; |
| pu1_dst_tmp = pu1_dst_tmp_plus8; |
| pu1_ref_tmp += (nt + 8); |
| } |
| } |
| } |
| |
| /* Rows and columns are unrolled by 4 when the width is a multiple of 4 */ |
| |
| else /* loop for multiples of 4 */ |
| { |
| uint8x8_t src_val1; |
| uint8x8_t src_val2; |
| |
| if(mode == 18) |
| intraPredAngle = -32; |
| else if(mode == 34) |
| intraPredAngle = 32; |
| |
| for(row = 0; row < nt; row += 2) |
| { |
| /* unrolling 2 rows */ |
| idx = ((row + 1) * intraPredAngle) >> 5; |
| pu1_ref_tmp = pu1_ref + two_nt + idx + 1; |
| src_val1 = vld1_u8(pu1_ref_tmp); |
| |
| idx = ((row + 2) * intraPredAngle) >> 5; |
| pu1_ref_tmp1 = pu1_ref + two_nt + idx + 1; |
| src_val2 = vld1_u8(pu1_ref_tmp1); |
| |
| /* unrolling 4 col */ |
| for(col = nt; col > 0; col -= 4) |
| { |
| pu1_dst_tmp = pu1_dst; |
| vst1_lane_u32((uint32_t *)pu1_dst_tmp, vreinterpret_u32_u8(src_val1), 0); |
| pu1_dst_tmp += dst_strd; |
| vst1_lane_u32((uint32_t *)pu1_dst_tmp, vreinterpret_u32_u8(src_val2), 0); |
| pu1_dst += 4; |
| } |
| pu1_dst += 2 * dst_strd - nt; |
| } |
| } |
| } |
| /* INTRA_PRED_LUMA_MODE_18_34 */ |
| |
| /** |
| ******************************************************************************* |
| * |
| * @brief |
| * Intra prediction interpolation filter for luma mode 3 to mode 9 |
| * |
| * @par Description: |
| * Intra prediction for modes 3 to 9 (positive angle, horizontal modes), using |
| * the neighboring reference samples pointed to by 'pu1_ref' to predict the |
| * TU block pointed to by 'pu1_dst' |
| * |
| * @param[in] pu1_ref |
| * UWORD8 pointer to the source reference samples |
| * |
| * @param[out] pu1_dst |
| * UWORD8 pointer to the destination |
| * |
| * @param[in] src_strd |
| * integer source stride |
| * |
| * @param[in] dst_strd |
| * integer destination stride |
| * |
| * @param[in] nt |
| * integer Transform Block size |
| * |
| * @param[in] mode |
| * integer intraprediction mode |
| * |
| * @returns |
| * |
| * @remarks |
| * None |
| * |
| ******************************************************************************* |
| */ |
| |
| |
| void ihevc_intra_pred_luma_mode_3_to_9_neonintr(UWORD8 *pu1_ref, |
| WORD32 src_strd, |
| UWORD8 *pu1_dst, |
| WORD32 dst_strd, |
| WORD32 nt, |
| WORD32 mode) |
| { |
| |
| WORD32 row, col; |
| WORD32 intra_pred_ang; |
| WORD32 pos, fract = 100, fract_prev; |
| UNUSED(src_strd); |
| if(0 == (nt & 7)) |
| { |
| |
| UWORD8 *pu1_ref_main_idx = pu1_ref; |
| UWORD8 *pu1_ref_main_idx_1 = pu1_ref; |
| |
| UWORD8 *pu1_dst_tmp1 = pu1_dst; |
| UWORD8 *pu1_dst_tmp2 = pu1_dst; |
| |
| WORD32 two_nt = 2 * nt; |
| |
| pu1_ref_main_idx += two_nt; |
| pu1_ref_main_idx_1 += two_nt - 1; |
| |
| uint8x8_t dup_const_fract, dup_const_32_fract, ref_main_idx, ref_main_idx_1; |
| uint8x8_t shift_res; |
| uint16x8_t mul_res1, mul_res2, add_res; |
| |
| /* Intra Pred Angle according to the mode */ |
| intra_pred_ang = gai4_ihevc_ang_table[mode]; |
| |
| pu1_ref_main_idx -= 8; |
| pu1_ref_main_idx_1 -= 8; |
| |
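| /* Each predicted sample is a two-tap blend of two adjacent reference */ |
| /* samples: dst = ((32 - fract) * s0 + fract * s1 + 16) >> 5, implemented */ |
| /* below with vmull_u8/vaddq_u16 and the rounding narrowing shift */ |
| /* vrshrn_n_u16.  The reference pointers step back one sample after each */ |
| /* column; that step is undone at the start of the next column when the */ |
| /* fractional offset wraps upwards (fract_prev < fract), i.e. when the */ |
| /* integer offset did not advance.  fract is seeded with 100 (> 31) so */ |
| /* that no adjustment is made before the first column. */ |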
| for(col = 0; col < nt; col++) |
| { |
| fract_prev = fract; |
| |
| pos = ((col + 1) * intra_pred_ang); |
| fract = pos & (31); |
| |
| if(fract_prev < fract) |
| { |
| pu1_ref_main_idx += 1; |
| pu1_ref_main_idx_1 += 1; |
| } |
| |
| dup_const_fract = vdup_n_u8((uint8_t)fract); |
| dup_const_32_fract = vdup_n_u8((uint8_t)(32 - fract)); |
| |
| for(row = nt; row > 0; row -= 8) |
| { |
| ref_main_idx = vld1_u8(pu1_ref_main_idx); |
| ref_main_idx_1 = vld1_u8(pu1_ref_main_idx_1); |
| |
| mul_res1 = vmull_u8(ref_main_idx, dup_const_32_fract); |
| mul_res2 = vmull_u8(ref_main_idx_1, dup_const_fract); |
| |
| add_res = vaddq_u16(mul_res1, mul_res2); |
| |
| shift_res = vrshrn_n_u16(add_res, 5); |
| |
| vst1_lane_u8(pu1_dst_tmp1, shift_res, 7); |
| pu1_dst_tmp1 += dst_strd; |
| |
| vst1_lane_u8(pu1_dst_tmp1, shift_res, 6); |
| pu1_dst_tmp1 += dst_strd; |
| |
| vst1_lane_u8(pu1_dst_tmp1, shift_res, 5); |
| pu1_dst_tmp1 += dst_strd; |
| |
| vst1_lane_u8(pu1_dst_tmp1, shift_res, 4); |
| pu1_dst_tmp1 += dst_strd; |
| |
| vst1_lane_u8(pu1_dst_tmp1, shift_res, 3); |
| pu1_dst_tmp1 += dst_strd; |
| |
| vst1_lane_u8(pu1_dst_tmp1, shift_res, 2); |
| pu1_dst_tmp1 += dst_strd; |
| |
| vst1_lane_u8(pu1_dst_tmp1, shift_res, 1); |
| pu1_dst_tmp1 += dst_strd; |
| |
| vst1_lane_u8(pu1_dst_tmp1, shift_res, 0); |
| pu1_dst_tmp1 += dst_strd; |
| |
| pu1_ref_main_idx -= 8; |
| pu1_ref_main_idx_1 -= 8; |
| |
| } |
| pu1_dst_tmp2 += 1; |
| pu1_dst_tmp1 = pu1_dst_tmp2; |
| |
| pu1_ref_main_idx += nt; |
| pu1_ref_main_idx_1 += nt; |
| |
| pu1_ref_main_idx -= 1; |
| pu1_ref_main_idx_1 -= 1; |
| |
| } |
| } |
| else |
| { |
| UWORD8 *pu1_ref_tmp1 = pu1_ref; |
| UWORD8 *pu1_ref_tmp2 = pu1_ref; |
| UWORD8 *pu1_dst_tmp1 = pu1_dst; |
| UWORD8 *pu1_dst_tmp2 = pu1_dst; |
| |
| pu1_ref_tmp1 += nt; |
| pu1_ref_tmp2 += (nt - 1); |
| |
| uint8x8_t dup_fract, dup_32_fract, shift_res; |
| uint16x8_t mul_res1, mul_res2, add_res; |
| uint32x2_t pu1_ref_val1, pu1_ref_val2; |
| |
| pu1_ref_val1 = vdup_n_u32(0); |
| pu1_ref_val2 = vdup_n_u32(0); |
| |
| /* Intra Pred Angle according to the mode */ |
| intra_pred_ang = gai4_ihevc_ang_table[mode]; |
| |
| |
| for(col = 0; col < nt; col++) |
| { |
| fract_prev = fract; |
| pos = ((col + 1) * intra_pred_ang); |
| fract = pos & (31); |
| if(fract_prev < fract) |
| { |
| pu1_ref_tmp1 += 1; |
| pu1_ref_tmp2 += 1; |
| } |
| dup_fract = vdup_n_u8((uint8_t)fract); |
| dup_32_fract = vdup_n_u8((uint8_t)(32 - fract)); |
| |
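| /* Same two-tap blend as the multiple-of-8 path above, processing one */ |
| /* 32-bit lane (4 samples) at a time when nt is a multiple of 4 only. */ |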
| for(row = nt; row > 0; row -= 4) |
| { |
| pu1_ref_val1 = vld1_lane_u32((uint32_t *)pu1_ref_tmp1, pu1_ref_val1, 0); |
| pu1_ref_val2 = vld1_lane_u32((uint32_t *)pu1_ref_tmp2, pu1_ref_val2, 0); |
| |
| mul_res1 = vmull_u8(vreinterpret_u8_u32(pu1_ref_val1), dup_32_fract); |
| mul_res2 = vmull_u8(vreinterpret_u8_u32(pu1_ref_val2), dup_fract); |
| |
| add_res = vaddq_u16(mul_res1, mul_res2); |
| |
| shift_res = vrshrn_n_u16(add_res, 5); |
| |
| vst1_lane_u8(pu1_dst_tmp1, shift_res, 3); |
| pu1_dst_tmp1 += dst_strd; |
| |
| vst1_lane_u8(pu1_dst_tmp1, shift_res, 2); |
| pu1_dst_tmp1 += dst_strd; |
| |
| vst1_lane_u8(pu1_dst_tmp1, shift_res, 1); |
| pu1_dst_tmp1 += dst_strd; |
| |
| vst1_lane_u8(pu1_dst_tmp1, shift_res, 0); |
| |
| } |
| pu1_ref_tmp1 -= 1; |
| pu1_ref_tmp2 -= 1; |
| |
| pu1_dst_tmp2 += 1; |
| pu1_dst_tmp1 = pu1_dst_tmp2; |
| |
| } |
| |
| |
| } |
| |
| } |
| |
| /** |
| ******************************************************************************* |
| * |
| * @brief |
| * Intra prediction interpolation filter for luma mode 11 to mode 17 |
| * |
| * @par Description: |
| * Intra prediction for modes 11 to 17 (negative angle, horizontal modes), |
| * using the neighboring reference samples pointed to by 'pu1_ref' to predict |
| * the TU block pointed to by 'pu1_dst' |
| * |
| * @param[in] pu1_ref |
| * UWORD8 pointer to the source reference samples |
| * |
| * @param[out] pu1_dst |
| * UWORD8 pointer to the destination |
| * |
| * @param[in] src_strd |
| * integer source stride |
| * |
| * @param[in] dst_strd |
| * integer destination stride |
| * |
| * @param[in] nt |
| * integer Transform Block size |
| * |
| * @param[in] mode |
| * integer intraprediction mode |
| * |
| * @returns |
| * |
| * @remarks |
| * None |
| * |
| ******************************************************************************* |
| */ |
| |
| |
| void ihevc_intra_pred_luma_mode_11_to_17_neonintr(UWORD8 *pu1_ref, |
| WORD32 src_strd, |
| UWORD8 *pu1_dst, |
| WORD32 dst_strd, |
| WORD32 nt, |
| WORD32 mode) |
| { |
| |
| WORD32 row, col, k; |
| WORD32 two_nt; |
| WORD32 intra_pred_ang, inv_ang, inv_ang_sum; |
| WORD32 pos, fract = 1000, fract_prev; |
| WORD32 ref_idx; |
| |
| UWORD8 *ref_main; |
| UWORD8 *ref_main_tmp; |
| |
| UWORD8 *pu1_ref_tmp1 = pu1_ref; |
| UWORD8 *pu1_ref_tmp2 = pu1_ref; |
| UWORD8 *pu1_dst_tmp1 = pu1_dst; |
| UWORD8 *pu1_dst_tmp2 = pu1_dst; |
| |
| UWORD8 ref_temp[2 * MAX_CU_SIZE + 1]; |
| |
| uint16x8_t mul_res1, mul_res2, add_res; |
| uint8x8_t dup_const_fract, dup_const_32_fract; |
| uint8x8_t ref_main_idx, ref_main_idx_1, shift_res; |
| uint8x8_t ref_left_t; |
| uint32x2_t ref_left_tmp; |
| UNUSED(src_strd); |
| ref_left_tmp = vdup_n_u32(0); |
| |
| inv_ang_sum = 128; |
| two_nt = 2 * nt; |
| |
| intra_pred_ang = gai4_ihevc_ang_table[mode]; |
| |
| inv_ang = gai4_ihevc_inv_ang_table[mode - 11]; |
| |
| pu1_ref_tmp1 += two_nt; |
| |
| ref_main = ref_temp + (nt - 1); |
| ref_main_tmp = ref_main; |
| |
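| /* ref_main[k] = pu1_ref[two_nt - k]: the top-left sample followed by the */ |
| /* left reference in top-to-bottom order, assembled into ref_temp so that */ |
| /* negative indices (filled below by projecting top samples with the */ |
| /* inverse angle) can be addressed through the same pointer. */ |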
| if(0 == (nt & 7)) |
| { |
| pu1_ref_tmp2 += (two_nt - 7); |
| |
| for(k = nt - 1; k >= 0; k -= 8) |
| { |
| |
| ref_left_t = vld1_u8(pu1_ref_tmp2); |
| |
| ref_left_t = vrev64_u8(ref_left_t); |
| vst1_u8(ref_main_tmp, ref_left_t); |
| ref_main_tmp += 8; |
| pu1_ref_tmp2 -= 8; |
| |
| } |
| |
| } |
| else |
| { |
| uint8x8_t rev_val; |
| pu1_ref_tmp2 += (two_nt - (nt - 1)); |
| |
| for(k = nt - 1; k >= 0; k -= 8) |
| { |
| |
| ref_left_tmp = vld1_lane_u32((uint32_t *)pu1_ref_tmp2, ref_left_tmp, 1); |
| |
| rev_val = vrev64_u8(vreinterpret_u8_u32(ref_left_tmp)); |
| vst1_lane_u32((uint32_t *)ref_main_tmp, vreinterpret_u32_u8(rev_val), 0); |
| |
| } |
| |
| } |
| |
| ref_main[nt] = pu1_ref[two_nt - nt]; |
| |
| /* For horizontal modes, (ref main = ref left) (ref side = ref above) */ |
| |
| ref_idx = (nt * intra_pred_ang) >> 5; |
| |
| /* SIMD optimization of this loop is possible using a look-up table */ |
| /* For negative angles, derive the main reference samples from the side */ |
| /* reference samples; refer to section 8.4.4.2.6 */ |
| for(k = -1; k > ref_idx; k--) |
| { |
| inv_ang_sum += inv_ang; |
| ref_main[k] = pu1_ref[two_nt + (inv_ang_sum >> 8)]; |
| } |
| |
| UWORD8 *ref_main_tmp1 = ref_main; |
| UWORD8 *ref_main_tmp2 = ref_main; |
| |
| ref_main_tmp2 += 1; |
| |
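| /* ref_main_tmp1 and ref_main_tmp2 point at consecutive samples of the */ |
| /* assembled main reference; each output is the same two-tap blend */ |
| /* ((32 - fract) * s0 + fract * s1 + 16) >> 5 used by modes 3 to 9, but */ |
| /* read from ref_temp.  For these negative angles the pointers step back */ |
| /* one sample whenever the fractional offset wraps upwards */ |
| /* (fract_prev < fract), i.e. whenever the integer offset decreases; */ |
| /* fract is seeded with 1000 so no step is taken before the first column. */ |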
| if(0 == (nt & 7)) |
| { |
| /* For angles other than 45 degrees, interpolate between 2 neighboring */ |
| /* samples, weighted by the fractional distance, to obtain each destination sample */ |
| for(col = 0; col < nt; col++) |
| { |
| |
| fract_prev = fract; |
| pos = ((col + 1) * intra_pred_ang); |
| fract = pos & (31); |
| |
| if(fract_prev < fract) |
| { |
| ref_main_tmp1 -= 1; |
| ref_main_tmp2 -= 1; |
| } |
| |
| dup_const_fract = vdup_n_u8((uint8_t)fract); |
| dup_const_32_fract = vdup_n_u8((uint8_t)(32 - fract)); |
| |
| // Do linear filtering |
| for(row = nt; row > 0; row -= 8) |
| { |
| ref_main_idx = vld1_u8(ref_main_tmp1); |
| |
| ref_main_idx_1 = vld1_u8(ref_main_tmp2); |
| |
| mul_res1 = vmull_u8(ref_main_idx, dup_const_32_fract); |
| mul_res2 = vmull_u8(ref_main_idx_1, dup_const_fract); |
| |
| add_res = vaddq_u16(mul_res1, mul_res2); |
| |
| shift_res = vrshrn_n_u16(add_res, 5); |
| |
| vst1_lane_u8(pu1_dst_tmp1, shift_res, 0); |
| pu1_dst_tmp1 += dst_strd; |
| |
| vst1_lane_u8(pu1_dst_tmp1, shift_res, 1); |
| pu1_dst_tmp1 += dst_strd; |
| |
| vst1_lane_u8(pu1_dst_tmp1, shift_res, 2); |
| pu1_dst_tmp1 += dst_strd; |
| |
| vst1_lane_u8(pu1_dst_tmp1, shift_res, 3); |
| pu1_dst_tmp1 += dst_strd; |
| |
| vst1_lane_u8(pu1_dst_tmp1, shift_res, 4); |
| pu1_dst_tmp1 += dst_strd; |
| |
| vst1_lane_u8(pu1_dst_tmp1, shift_res, 5); |
| pu1_dst_tmp1 += dst_strd; |
| |
| vst1_lane_u8(pu1_dst_tmp1, shift_res, 6); |
| pu1_dst_tmp1 += dst_strd; |
| |
| vst1_lane_u8(pu1_dst_tmp1, shift_res, 7); |
| pu1_dst_tmp1 += dst_strd; |
| |
| ref_main_tmp1 += 8; |
| ref_main_tmp2 += 8; |
| } |
| |
| ref_main_tmp1 -= nt; |
| ref_main_tmp2 -= nt; |
| |
| pu1_dst_tmp2 += 1; |
| pu1_dst_tmp1 = pu1_dst_tmp2; |
| } |
| } |
| else |
| { |
| uint32x2_t ref_main_idx1, ref_main_idx2; |
| |
| ref_main_idx1 = vdup_n_u32(0); |
| ref_main_idx2 = vdup_n_u32(0); |
| |
| for(col = 0; col < nt; col++) |
| { |
| fract_prev = fract; |
| pos = ((col + 1) * intra_pred_ang); |
| fract = pos & (31); |
| |
| if(fract_prev < fract) |
| { |
| ref_main_tmp1 -= 1; |
| ref_main_tmp2 -= 1; |
| } |
| |
| dup_const_fract = vdup_n_u8((uint8_t)fract); |
| dup_const_32_fract = vdup_n_u8((uint8_t)(32 - fract)); |
| |
| for(row = nt; row > 0; row -= 4) |
| { |
| |
| ref_main_idx1 = vld1_lane_u32((uint32_t *)ref_main_tmp1, ref_main_idx1, 0); |
| ref_main_idx2 = vld1_lane_u32((uint32_t *)ref_main_tmp2, ref_main_idx2, 0); |
| |
| mul_res1 = vmull_u8(vreinterpret_u8_u32(ref_main_idx1), dup_const_32_fract); |
| mul_res2 = vmull_u8(vreinterpret_u8_u32(ref_main_idx2), dup_const_fract); |
| |
| add_res = vaddq_u16(mul_res1, mul_res2); |
| |
| shift_res = vrshrn_n_u16(add_res, 5); |
| |
| vst1_lane_u8(pu1_dst_tmp1, shift_res, 0); |
| pu1_dst_tmp1 += dst_strd; |
| |
| vst1_lane_u8(pu1_dst_tmp1, shift_res, 1); |
| pu1_dst_tmp1 += dst_strd; |
| |
| vst1_lane_u8(pu1_dst_tmp1, shift_res, 2); |
| pu1_dst_tmp1 += dst_strd; |
| |
| vst1_lane_u8(pu1_dst_tmp1, shift_res, 3); |
| pu1_dst_tmp1 += dst_strd; |
| |
| } |
| |
| pu1_dst_tmp2 += 1; |
| pu1_dst_tmp1 = pu1_dst_tmp2; |
| |
| } |
| |
| } |
| } |
| |
| /** |
| ******************************************************************************* |
| * |
| * @brief |
| * Intra prediction interpolation filter for luma mode 19 to mode 25 |
| * |
| * @par Description: |
| * Intra prediction for modes 19 to 25 (negative angle, vertical modes), using |
| * the neighboring reference samples pointed to by 'pu1_ref' to predict the |
| * TU block pointed to by 'pu1_dst' |
| * |
| * @param[in] pu1_ref |
| * UWORD8 pointer to the source reference samples |
| * |
| * @param[out] pu1_dst |
| * UWORD8 pointer to the destination |
| * |
| * @param[in] src_strd |
| * integer source stride |
| * |
| * @param[in] dst_strd |
| * integer destination stride |
| * |
| * @param[in] nt |
| * integer Transform Block size |
| * |
| * @param[in] mode |
| * integer intraprediction mode |
| * |
| * @returns |
| * |
| * @remarks |
| * None |
| * |
| ******************************************************************************* |
| */ |
| |
| |
| void ihevc_intra_pred_luma_mode_19_to_25_neonintr(UWORD8 *pu1_ref, |
| WORD32 src_strd, |
| UWORD8 *pu1_dst, |
| WORD32 dst_strd, |
| WORD32 nt, |
| WORD32 mode) |
| { |
| |
| WORD32 row, col, k; |
| WORD32 two_nt, intra_pred_ang; |
| WORD32 inv_ang, inv_ang_sum, pos, fract = 1000, fract_prev; |
| WORD32 ref_idx; |
| UWORD8 *ref_main; |
| UWORD8 *ref_main_tmp; |
| UWORD8 ref_temp[(2 * MAX_CU_SIZE) + 1]; |
| |
| UWORD8 *pu1_ref_tmp1 = pu1_ref; |
| UWORD8 *pu1_ref_tmp2 = pu1_ref; |
| UWORD8 *pu1_dst_tmp1 = pu1_dst; |
| |
| uint16x8_t mul_res1, mul_res2, add_res; |
| uint8x8_t dup_const_fract, dup_const_32_fract; |
| uint8x8_t ref_main_idx, ref_main_idx_1, shift_res; |
| uint8x8_t ref_above_t; |
| uint32x2_t ref_above_tmp; |
| UNUSED(src_strd); |
| ref_above_tmp = vdup_n_u32(0); |
| |
| two_nt = 2 * nt; |
| intra_pred_ang = gai4_ihevc_ang_table[mode]; |
| inv_ang = gai4_ihevc_inv_ang_table[mode - 12]; |
| |
| /* Intermediate reference samples for negative angle modes */ |
| /* This has to be removed during optimization */ |
| pu1_ref_tmp1 += two_nt; |
| |
| |
| ref_main = ref_temp + (nt - 1); |
| ref_main_tmp = ref_main; |
| |
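| /* ref_main[k] = pu1_ref[two_nt + k]: the top-left sample followed by the */ |
| /* top reference copied straight into ref_temp, so that negative indices */ |
| /* (filled below by projecting left samples with the inverse angle) can */ |
| /* be addressed through the same pointer. */ |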
| if(0 == (nt & 7)) |
| { |
| pu1_ref_tmp2 += (two_nt - 7); |
| for(k = nt - 1; k >= 0; k -= 8) |
| { |
| |
| ref_above_t = vld1_u8(pu1_ref_tmp1); |
| vst1_u8(ref_main_tmp, ref_above_t); |
| ref_main_tmp += 8; |
| pu1_ref_tmp1 += 8; |
| |
| } |
| |
| } |
| else |
| { |
| pu1_ref_tmp2 += (two_nt - (nt - 1)); |
| |
| for(k = nt - 1; k >= 0; k -= 4) |
| { |
| |
| ref_above_tmp = vld1_lane_u32((uint32_t *)pu1_ref_tmp1, ref_above_tmp, 0); |
| vst1_lane_u32((uint32_t *)ref_main_tmp, ref_above_tmp, 0); |
| |
| } |
| |
| } |
| |
| ref_main[nt] = pu1_ref[two_nt + nt]; |
| |
| /* For vertical modes, (ref main = ref above) (ref side = ref left) */ |
| |
| ref_idx = (nt * intra_pred_ang) >> 5; |
| inv_ang_sum = 128; |
| |
| /* SIMD optimization of this loop is possible using a look-up table */ |
| /* For negative angles, derive the main reference samples from the side */ |
| /* reference samples; refer to section 8.4.4.2.6 */ |
| for(k = -1; k > ref_idx; k--) |
| { |
| inv_ang_sum += inv_ang; |
| ref_main[k] = pu1_ref[two_nt - (inv_ang_sum >> 8)]; |
| } |
| |
| UWORD8 *ref_main_tmp1 = ref_main; |
| UWORD8 *ref_main_tmp2 = ref_main; |
| |
| ref_main_tmp2 += 1; |
| |
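| /* Same two-tap blend and pointer-stepping scheme as modes 11 to 17, but */ |
| /* here fract is driven by the row index and whole rows of output are */ |
| /* stored contiguously (vst1_u8 or 32-bit lanes) instead of sample by */ |
| /* sample down a column. */ |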
| if(0 == (nt & 7)) |
| { |
| /* For angles other than 45 degrees, interpolate between 2 neighboring */ |
| /* samples, weighted by the fractional distance, to obtain each destination sample */ |
| for(row = 0; row < nt; row++) |
| { |
| |
| fract_prev = fract; |
| pos = ((row + 1) * intra_pred_ang); |
| fract = pos & (31); |
| |
| if(fract_prev < fract) |
| { |
| ref_main_tmp1 -= 1; |
| ref_main_tmp2 -= 1; |
| } |
| |
| dup_const_fract = vdup_n_u8((uint8_t)fract); |
| dup_const_32_fract = vdup_n_u8((uint8_t)(32 - fract)); |
| |
| // Do linear filtering |
| for(col = nt; col > 0; col -= 8) |
| { |
| ref_main_idx = vld1_u8(ref_main_tmp1); |
| |
| ref_main_idx_1 = vld1_u8(ref_main_tmp2); |
| |
| mul_res1 = vmull_u8(ref_main_idx, dup_const_32_fract); |
| mul_res2 = vmull_u8(ref_main_idx_1, dup_const_fract); |
| |
| add_res = vaddq_u16(mul_res1, mul_res2); |
| |
| shift_res = vrshrn_n_u16(add_res, 5); |
| |
| vst1_u8(pu1_dst_tmp1, shift_res); |
| pu1_dst_tmp1 += 8; |
| |
| ref_main_tmp1 += 8; |
| ref_main_tmp2 += 8; |
| } |
| |
| ref_main_tmp1 -= nt; |
| ref_main_tmp2 -= nt; |
| |
| pu1_dst_tmp1 += (dst_strd - nt); |
| } |
| } |
| else |
| { |
| uint32x2_t ref_main_idx1, ref_main_idx2; |
| |
| ref_main_idx1 = vdup_n_u32(0); |
| ref_main_idx2 = vdup_n_u32(0); |
| |
| for(row = 0; row < nt; row++) |
| { |
| fract_prev = fract; |
| pos = ((row + 1) * intra_pred_ang); |
| fract = pos & (31); |
| |
| if(fract_prev < fract) |
| { |
| ref_main_tmp1 -= 1; |
| ref_main_tmp2 -= 1; |
| } |
| |
| dup_const_fract = vdup_n_u8((uint8_t)fract); |
| dup_const_32_fract = vdup_n_u8((uint8_t)(32 - fract)); |
| |
| for(col = nt; col > 0; col -= 4) |
| { |
| |
| ref_main_idx1 = vld1_lane_u32((uint32_t *)ref_main_tmp1, ref_main_idx1, 0); |
| ref_main_idx2 = vld1_lane_u32((uint32_t *)ref_main_tmp2, ref_main_idx2, 0); |
| |
| mul_res1 = vmull_u8(vreinterpret_u8_u32(ref_main_idx1), dup_const_32_fract); |
| mul_res2 = vmull_u8(vreinterpret_u8_u32(ref_main_idx2), dup_const_fract); |
| |
| add_res = vaddq_u16(mul_res1, mul_res2); |
| |
| shift_res = vrshrn_n_u16(add_res, 5); |
| |
| vst1_lane_u32((uint32_t *)pu1_dst_tmp1, vreinterpret_u32_u8(shift_res), 0); |
| pu1_dst_tmp1 += 4; |
| |
| } |
| pu1_dst_tmp1 += (dst_strd - nt); |
| } |
| |
| } |
| |
| } |
| |
| /** |
| ******************************************************************************* |
| * |
| * @brief |
| * Intra prediction interpolation filter for luma mode 27 to mode 33 |
| * |
| * @par Description: |
| * Intra prediction for modes 27 to 33 (positive angle, vertical modes), using |
| * the neighboring reference samples pointed to by 'pu1_ref' to predict the |
| * TU block pointed to by 'pu1_dst' |
| * |
| * @param[in] pu1_ref |
| * UWORD8 pointer to the source reference samples |
| * |
| * @param[out] pu1_dst |
| * UWORD8 pointer to the destination |
| * |
| * @param[in] src_strd |
| * integer source stride |
| * |
| * @param[in] dst_strd |
| * integer destination stride |
| * |
| * @param[in] nt |
| * integer Transform Block size |
| * |
| * @param[in] mode |
| * integer intraprediction mode |
| * |
| * @returns |
| * |
| * @remarks |
| * None |
| * |
| ******************************************************************************* |
| */ |
| |
| |
| void ihevc_intra_pred_luma_mode_27_to_33_neonintr(UWORD8 *pu1_ref, |
| WORD32 src_strd, |
| UWORD8 *pu1_dst, |
| WORD32 dst_strd, |
| WORD32 nt, |
| WORD32 mode) |
| { |
| |
| WORD32 row, col; |
| WORD32 intra_pred_ang; |
| WORD32 pos, fract = 0, fract_prev; |
| |
| WORD32 two_nt = 2 * nt; |
| UNUSED(src_strd); |
| if(0 == (nt & 7)) |
| { |
| |
| UWORD8 *pu1_ref_main_idx = pu1_ref; |
| UWORD8 *pu1_ref_main_idx_1 = pu1_ref; |
| |
| UWORD8 *pu1_dst_tmp1 = pu1_dst; |
| pu1_ref_main_idx += (two_nt + 1); |
| pu1_ref_main_idx_1 += (two_nt + 2); |
| |
| uint8x8_t dup_const_fract, dup_const_32_fract, ref_main_idx, ref_main_idx_1; |
| uint8x8_t shift_res; |
| uint16x8_t mul_res1, mul_res2, add_res; |
| |
| /* Intra Pred Angle according to the mode */ |
| intra_pred_ang = gai4_ihevc_ang_table[mode]; |
| |
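| /* Positive vertical angles read directly from the top reference starting */ |
| /* at pu1_ref[two_nt + 1], so no intermediate ref_temp is needed.  Each */ |
| /* output is ((32 - fract) * s0 + fract * s1 + 16) >> 5; the reference */ |
| /* pointers advance by one whenever the fractional offset wraps downwards */ |
| /* (fract_prev > fract), i.e. whenever the integer offset increments, */ |
| /* which is sufficient because the angles for modes 27 to 33 are all */ |
| /* below 32 per row. */ |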
| for(row = 0; row < nt; row++) |
| { |
| fract_prev = fract; |
| |
| pos = ((row + 1) * intra_pred_ang); |
| fract = pos & (31); |
| |
| if(fract_prev > fract) |
| { |
| pu1_ref_main_idx += 1; |
| pu1_ref_main_idx_1 += 1; |
| } |
| |
| dup_const_fract = vdup_n_u8((uint8_t)fract); |
| dup_const_32_fract = vdup_n_u8((uint8_t)(32 - fract)); |
| |
| for(col = nt; col > 0; col -= 8) |
| { |
| ref_main_idx = vld1_u8(pu1_ref_main_idx); |
| ref_main_idx_1 = vld1_u8(pu1_ref_main_idx_1); |
| |
| mul_res1 = vmull_u8(ref_main_idx, dup_const_32_fract); |
| mul_res2 = vmull_u8(ref_main_idx_1, dup_const_fract); |
| |
| add_res = vaddq_u16(mul_res1, mul_res2); |
| |
| shift_res = vrshrn_n_u16(add_res, 5); |
| |
| vst1_u8(pu1_dst_tmp1, shift_res); |
| pu1_dst_tmp1 += 8; |
| |
| pu1_ref_main_idx += 8; |
| pu1_ref_main_idx_1 += 8; |
| } |
| |
| pu1_ref_main_idx -= nt; |
| pu1_ref_main_idx_1 -= nt; |
| |
| pu1_dst_tmp1 += (dst_strd - nt); |
| } |
| |
| } |
| else |
| { |
| UWORD8 *pu1_ref_tmp1 = pu1_ref; |
| UWORD8 *pu1_ref_tmp2 = pu1_ref; |
| UWORD8 *pu1_dst_tmp1 = pu1_dst; |
| |
| pu1_ref_tmp1 += (two_nt + 1); |
| pu1_ref_tmp2 += (two_nt + 2); |
| |
| uint8x8_t dup_fract, dup_32_fract, shift_res; |
| uint16x8_t mul_res1, mul_res2, add_res; |
| uint32x2_t pu1_ref_val1, pu1_ref_val2; |
| |
| pu1_ref_val1 = vdup_n_u32(0); |
| pu1_ref_val2 = vdup_n_u32(0); |
| |
| /* Intra Pred Angle according to the mode */ |
| intra_pred_ang = gai4_ihevc_ang_table[mode]; |
| |
| for(row = 0; row < nt; row++) |
| { |
| fract_prev = fract; |
| pos = ((row + 1) * intra_pred_ang); |
| fract = pos & (31); |
| if(fract_prev > fract) |
| { |
| pu1_ref_tmp1 += 1; |
| pu1_ref_tmp2 += 1; |
| } |
| dup_fract = vdup_n_u8((uint8_t)fract); |
| dup_32_fract = vdup_n_u8((uint8_t)(32 - fract)); |
| |
| for(col = nt; col > 0; col -= 4) |
| { |
| pu1_ref_val1 = vld1_lane_u32((uint32_t *)pu1_ref_tmp1, pu1_ref_val1, 0); |
| pu1_ref_val2 = vld1_lane_u32((uint32_t *)pu1_ref_tmp2, pu1_ref_val2, 0); |
| |
| mul_res1 = vmull_u8(vreinterpret_u8_u32(pu1_ref_val1), dup_32_fract); |
| mul_res2 = vmull_u8(vreinterpret_u8_u32(pu1_ref_val2), dup_fract); |
| |
| add_res = vaddq_u16(mul_res1, mul_res2); |
| |
| shift_res = vrshrn_n_u16(add_res, 5); |
| |
| vst1_lane_u32((uint32_t *)pu1_dst_tmp1, vreinterpret_u32_u8(shift_res), 0); |
| pu1_dst_tmp1 += 4; |
| |
| } |
| |
| pu1_dst_tmp1 += (dst_strd - nt); |
| |
| } |
| |
| |
| } |
| |
| } |