| /****************************************************************************** |
| * |
| * Copyright (C) 2018 The Android Open Source Project |
| * |
| * Licensed under the Apache License, Version 2.0 (the "License"); |
| * you may not use this file except in compliance with the License. |
| * You may obtain a copy of the License at: |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| * |
| ***************************************************************************** |
| * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore |
| */ |
| /** |
| ******************************************************************************* |
| * @file |
| * ihevce_decomp_pre_intra_pass_neon.c |
| * |
| * @brief |
| * Contains functions to perform input scaling |
| * |
| * @author |
| * Ittiam |
| * |
| * @par List of Functions: |
| * - ihevce_scaling_filter_mxn() |
| * - ihevce_scale_by_2_neon() |
| * |
| * @remarks |
| * None |
| * |
| ******************************************************************************** |
| */ |
| /*****************************************************************************/ |
| /* File Includes */ |
| /*****************************************************************************/ |
| /* System include files */ |
| #include <stdio.h> |
| #include <string.h> |
| #include <assert.h> |
| #include <arm_neon.h> |
| |
| /* User include files */ |
| #include "ihevc_typedefs.h" |
| #include "ihevc_macros.h" |
| #include "ihevc_platform_macros.h" |
| #include "itt_video_api.h" |
| #include "ihevc_defs.h" |
| #include "ihevc_cmn_utils_neon.h" |
| #include "ihevce_ipe_instr_set_router.h" |
| |
| /*****************************************************************************/ |
| /* Function Definitions */ |
| /*****************************************************************************/ |
| void ihevce_scaling_filter_mxn( |
| UWORD8 *pu1_src, |
| WORD32 src_strd, |
| UWORD8 *pu1_scrtch, |
| WORD32 scrtch_strd, |
| UWORD8 *pu1_dst, |
| WORD32 dst_strd, |
| WORD32 ht, |
| WORD32 wd) |
| { |
| #define FILT_TAP_Q 8 |
| #define N_TAPS 7 |
| const WORD16 i4_ftaps[N_TAPS] = { -18, 0, 80, 132, 80, 0, -18 }; |
| WORD32 i, j; |
| WORD32 tmp; |
| UWORD8 *pu1_src_tmp = pu1_src - 3 * src_strd; |
| UWORD8 *pu1_scrtch_tmp = pu1_scrtch; |
| |
| /* horizontal filtering */ |
| for(i = -3; i < ht + 2; i++) |
| { |
| for(j = 0; j < wd; j += 2) |
| { |
| tmp = (i4_ftaps[3] * pu1_src_tmp[j] + |
| i4_ftaps[2] * (pu1_src_tmp[j - 1] + pu1_src_tmp[j + 1]) + |
| i4_ftaps[1] * (pu1_src_tmp[j + 2] + pu1_src_tmp[j - 2]) + |
| i4_ftaps[0] * (pu1_src_tmp[j + 3] + pu1_src_tmp[j - 3]) + |
| (1 << (FILT_TAP_Q - 1))) >> |
| FILT_TAP_Q; |
| pu1_scrtch_tmp[j >> 1] = CLIP_U8(tmp); |
| } |
| pu1_scrtch_tmp += scrtch_strd; |
| pu1_src_tmp += src_strd; |
| } |
| /* vertical filtering */ |
| pu1_scrtch_tmp = pu1_scrtch + 3 * scrtch_strd; |
| for(i = 0; i < ht; i += 2) |
| { |
| for(j = 0; j < (wd >> 1); j++) |
| { |
| tmp = |
| (i4_ftaps[3] * pu1_scrtch_tmp[j] + |
| i4_ftaps[2] * (pu1_scrtch_tmp[j + scrtch_strd] + pu1_scrtch_tmp[j - scrtch_strd]) + |
| i4_ftaps[1] * |
| (pu1_scrtch_tmp[j + 2 * scrtch_strd] + pu1_scrtch_tmp[j - 2 * scrtch_strd]) + |
| i4_ftaps[0] * |
| (pu1_scrtch_tmp[j + 3 * scrtch_strd] + pu1_scrtch_tmp[j - 3 * scrtch_strd]) + |
| (1 << (FILT_TAP_Q - 1))) >> |
| FILT_TAP_Q; |
| pu1_dst[j] = CLIP_U8(tmp); |
| } |
| pu1_dst += dst_strd; |
| pu1_scrtch_tmp += (scrtch_strd << 1); |
| } |
| } |
| |
| void ihevce_scale_by_2_neon( |
| UWORD8 *pu1_src, |
| WORD32 src_strd, |
| UWORD8 *pu1_dst, |
| WORD32 dst_strd, |
| WORD32 wd, |
| WORD32 ht, |
| UWORD8 *pu1_wkg_mem, |
| WORD32 ht_offset, |
| WORD32 block_ht, |
| WORD32 wd_offset, |
| WORD32 block_wd, |
| FT_COPY_2D *pf_copy_2d) |
| { |
| #define MAX_BLK_SZ (MAX_CTB_SIZE + ((N_TAPS >> 1) << 1)) |
| UWORD8 au1_cpy[MAX_BLK_SZ * MAX_BLK_SZ]; |
| UWORD32 cpy_strd = MAX_BLK_SZ; |
| UWORD8 *pu1_cpy = au1_cpy + cpy_strd * (N_TAPS >> 1) + (N_TAPS >> 1); |
| |
| UWORD8 *pu1_in, *pu1_out; |
| WORD32 in_strd, wkg_mem_strd; |
| |
| WORD32 row_start, row_end; |
| WORD32 col_start, col_end; |
| WORD32 i, fun_select; |
| WORD32 ht_tmp, wd_tmp; |
| FT_SCALING_FILTER_BY_2 *ihevce_scaling_filters[2]; |
| |
| assert((wd & 1) == 0); |
| assert((ht & 1) == 0); |
| assert(block_wd <= MAX_CTB_SIZE); |
| assert(block_ht <= MAX_CTB_SIZE); |
| |
| /* function pointers for filtering different dimensions */ |
| ihevce_scaling_filters[0] = ihevce_scaling_filter_mxn; |
| ihevce_scaling_filters[1] = ihevce_scaling_filter_mxn_neon; |
| |
| /* handle boundary blks */ |
| col_start = (wd_offset < (N_TAPS >> 1)) ? 1 : 0; |
| row_start = (ht_offset < (N_TAPS >> 1)) ? 1 : 0; |
| col_end = ((wd_offset + block_wd) > (wd - (N_TAPS >> 1))) ? 1 : 0; |
| row_end = ((ht_offset + block_ht) > (ht - (N_TAPS >> 1))) ? 1 : 0; |
| if(col_end && (wd % block_wd != 0)) |
| { |
| block_wd = (wd % block_wd); |
| } |
| if(row_end && (ht % block_ht != 0)) |
| { |
| block_ht = (ht % block_ht); |
| } |
| |
| /* boundary blks needs to be padded, copy src to tmp buffer */ |
| if(col_start || col_end || row_end || row_start) |
| { |
| UWORD8 *pu1_src_tmp = pu1_src + wd_offset + ht_offset * src_strd; |
| |
| pu1_cpy -= (3 * (1 - col_start) + cpy_strd * 3 * (1 - row_start)); |
| pu1_src_tmp -= (3 * (1 - col_start) + src_strd * 3 * (1 - row_start)); |
| ht_tmp = block_ht + 3 * (1 - row_start) + 3 * (1 - row_end); |
| wd_tmp = block_wd + 3 * (1 - col_start) + 3 * (1 - col_end); |
| pf_copy_2d(pu1_cpy, cpy_strd, pu1_src_tmp, src_strd, wd_tmp, ht_tmp); |
| pu1_in = au1_cpy + cpy_strd * 3 + 3; |
| in_strd = cpy_strd; |
| } |
| else |
| { |
| pu1_in = pu1_src + wd_offset + ht_offset * src_strd; |
| in_strd = src_strd; |
| } |
| |
| /*top padding*/ |
| if(row_start) |
| { |
| UWORD8 *pu1_cpy_tmp = au1_cpy + cpy_strd * 3; |
| |
| pu1_cpy = au1_cpy + cpy_strd * (3 - 1); |
| memcpy(pu1_cpy, pu1_cpy_tmp, block_wd + 6); |
| pu1_cpy -= cpy_strd; |
| memcpy(pu1_cpy, pu1_cpy_tmp, block_wd + 6); |
| pu1_cpy -= cpy_strd; |
| memcpy(pu1_cpy, pu1_cpy_tmp, block_wd + 6); |
| } |
| |
| /*bottom padding*/ |
| if(row_end) |
| { |
| UWORD8 *pu1_cpy_tmp = au1_cpy + cpy_strd * 3 + (block_ht - 1) * cpy_strd; |
| |
| pu1_cpy = pu1_cpy_tmp + cpy_strd; |
| memcpy(pu1_cpy, pu1_cpy_tmp, block_wd + 6); |
| pu1_cpy += cpy_strd; |
| memcpy(pu1_cpy, pu1_cpy_tmp, block_wd + 6); |
| pu1_cpy += cpy_strd; |
| memcpy(pu1_cpy, pu1_cpy_tmp, block_wd + 6); |
| } |
| |
| /*left padding*/ |
| if(col_start) |
| { |
| UWORD8 *pu1_cpy_tmp = au1_cpy + 3; |
| |
| pu1_cpy = au1_cpy; |
| for(i = 0; i < block_ht + 6; i++) |
| { |
| pu1_cpy[0] = pu1_cpy[1] = pu1_cpy[2] = pu1_cpy_tmp[0]; |
| pu1_cpy += cpy_strd; |
| pu1_cpy_tmp += cpy_strd; |
| } |
| } |
| |
| /*right padding*/ |
| if(col_end) |
| { |
| UWORD8 *pu1_cpy_tmp = au1_cpy + 3 + block_wd - 1; |
| |
| pu1_cpy = au1_cpy + 3 + block_wd; |
| for(i = 0; i < block_ht + 6; i++) |
| { |
| pu1_cpy[0] = pu1_cpy[1] = pu1_cpy[2] = pu1_cpy_tmp[0]; |
| pu1_cpy += cpy_strd; |
| pu1_cpy_tmp += cpy_strd; |
| } |
| } |
| |
| wkg_mem_strd = block_wd >> 1; |
| pu1_out = pu1_dst + (wd_offset >> 1); |
| fun_select = (block_wd % 16 == 0); |
| ihevce_scaling_filters[fun_select]( |
| pu1_in, in_strd, pu1_wkg_mem, wkg_mem_strd, pu1_out, dst_strd, block_ht, block_wd); |
| |
| /* Left padding of 16 for 1st block of every row */ |
| if(wd_offset == 0) |
| { |
| UWORD8 u1_val; |
| WORD32 pad_wd = 16; |
| WORD32 pad_ht = block_ht >> 1; |
| UWORD8 *dst = pu1_dst; |
| |
| for(i = 0; i < pad_ht; i++) |
| { |
| u1_val = dst[0]; |
| memset(&dst[-pad_wd], u1_val, pad_wd); |
| dst += dst_strd; |
| } |
| } |
| |
| if(wd == wd_offset + block_wd) |
| { |
| /* Right padding of (16 + (CEIL16(wd/2))-wd/2) for last block of every row */ |
| /* Right padding is done only after processing of last block of that row is done*/ |
| UWORD8 u1_val; |
| WORD32 pad_wd = 16 + CEIL16((wd >> 1)) - (wd >> 1) + 4; |
| WORD32 pad_ht = block_ht >> 1; |
| UWORD8 *dst = pu1_dst + (wd >> 1) - 1; |
| |
| for(i = 0; i < pad_ht; i++) |
| { |
| u1_val = dst[0]; |
| memset(&dst[1], u1_val, pad_wd); |
| dst += dst_strd; |
| } |
| |
| if(ht_offset == 0) |
| { |
| /* Top padding of 16 is done for 1st row only after we reach end of that row */ |
| WORD32 pad_wd = dst_strd; |
| WORD32 pad_ht = 16; |
| UWORD8 *dst = pu1_dst - 16; |
| |
| for(i = 1; i <= pad_ht; i++) |
| { |
| memcpy(dst - (i * dst_strd), dst, pad_wd); |
| } |
| } |
| |
| /* Bottom padding of (16 + (CEIL16(ht/2)) - ht/2) is done only if we have |
| reached end of frame */ |
| if(ht - ht_offset - block_ht == 0) |
| { |
| WORD32 pad_wd = dst_strd; |
| WORD32 pad_ht = 16 + CEIL16((ht >> 1)) - (ht >> 1) + 4; |
| UWORD8 *dst = pu1_dst + (((block_ht >> 1) - 1) * dst_strd) - 16; |
| |
| for(i = 1; i <= pad_ht; i++) |
| memcpy(dst + (i * dst_strd), dst, pad_wd); |
| } |
| } |
| } |