| //****************************************************************************** |
| //* |
| //* Copyright (C) 2015 The Android Open Source Project |
| //* |
| //* Licensed under the Apache License, Version 2.0 (the "License"); |
| //* you may not use this file except in compliance with the License. |
| //* You may obtain a copy of the License at: |
| //* |
| //* http://www.apache.org/licenses/LICENSE-2.0 |
| //* |
| //* Unless required by applicable law or agreed to in writing, software |
| //* distributed under the License is distributed on an "AS IS" BASIS, |
| //* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| //* See the License for the specific language governing permissions and |
| //* limitations under the License. |
| //* |
| //***************************************************************************** |
| //* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore |
| //*/ |
| ///** |
| ///** |
| //******************************************************************************* |
| //* |
| //* @brief |
| //* Interprediction luma function for copy |
| //* |
| //* @par Description: |
| //* Copies the array of width 'wd' and height 'ht' from the location pointed |
| //* by 'src' to the location pointed by 'dst' |
| //* |
| //* @param[in] pu1_src |
| //* UWORD8 pointer to the source |
| //* |
| //* @param[out] pu1_dst |
| //* UWORD8 pointer to the destination |
| //* |
| //* @param[in] src_strd |
| //* integer source stride |
| //* |
| //* @param[in] dst_strd |
| //* integer destination stride |
| //* |
| //* |
| //* @param[in] ht |
| //* integer height of the array |
| //* |
| //* @param[in] wd |
| //* integer width of the array |
| //* |
| //* @returns |
| //* |
| //* @remarks |
| //* None |
| //* |
| //******************************************************************************* |
| //*/ |
| //void ih264_inter_pred_luma_copy ( |
| // UWORD8 *pu1_src, |
| // UWORD8 *pu1_dst, |
| // WORD32 src_strd, |
| // WORD32 dst_strd, |
| // WORD32 ht, |
| // WORD32 wd ) |
| |
| //**************Variables Vs Registers***************************************** |
| // x0 => *pu1_src |
| // x1 => *pu1_dst |
| // x2 => src_strd |
| // x3 => dst_strd |
| // x7 => ht |
| // x12 => wd |
| |
| // Everything below is emitted into the code (text) section, aligned to a |
| // 2^2 = 4 byte boundary. |
| .text |
| .p2align 2 |
| // Shared macro definitions. The push_v_regs / pop_v_regs macros used by the |
| // routines below are not defined in this file, so they presumably come from |
| // this include - confirm against ih264_neon_macros.s. |
| .include "ih264_neon_macros.s" |
| |
| |
| |
| .global ih264_inter_pred_luma_copy_av8 |
| |
| //------------------------------------------------------------------------------ |
| // void ih264_inter_pred_luma_copy_av8(UWORD8 *pu1_src, UWORD8 *pu1_dst, |
| // WORD32 src_strd, WORD32 dst_strd, |
| // WORD32 ht, WORD32 wd) |
| // |
| // Copies a block of 'wd' bytes by 'ht' rows from pu1_src to pu1_dst. |
| // |
| // In: x0 = pu1_src, x1 = pu1_dst, w2 = src_strd, w3 = dst_strd, |
| // w4 = ht, w5 = wd |
| // Work: x7 = rows remaining, x12 = wd, x4 = bytes remaining in the current |
| // 4-row band, x5/x6 = src/dst row cursors, x11 = wd - tile width |
| // NEON: v0 (4-wide path), v0-v3 (8-wide path), v0/v2/v4/v6 (16-wide path) |
| // |
| // The block is processed in bands of 4 rows; each band is walked left to |
| // right in tiles of 16, 8 or 4 bytes, chosen once from the low bits of wd. |
| // Because rows are always handled 4 at a time and columns one whole tile at |
| // a time, ht and wd are expected to be multiples of 4; other values are |
| // rounded up to the next multiple and copy past the requested block. |
| // Nothing is copied when ht <= 0 or wd <= 0. |
| //------------------------------------------------------------------------------ |
| ih264_inter_pred_luma_copy_av8: |
| |
| push_v_regs //macro from ih264_neon_macros.s (not visible here) |
| stp x19, x20, [sp, #-16]! //x19/x20 are not referenced below; save kept as in the original frame |
| |
| //The four integer arguments are 32-bit (WORD32). AAPCS64 leaves the upper |
| //32 bits of their registers unspecified, so sign-extend before they are |
| //used in 64-bit address arithmetic and 64-bit loop counters. |
| sxtw x2, w2 //x2 = (int64) src_strd |
| sxtw x3, w3 //x3 = (int64) dst_strd |
| sxtw x12, w5 //x12 = (int64) wd |
| sxtw x7, w4 //x7 = (int64) ht |
| cmp x7, #0 //nothing to do when ht <= 0 |
| ble end_loops |
| tst x12, #15 //wd multiple of 16 -> 16-byte tiles |
| beq core_loop_wd_16 |
| tst x12, #7 //wd multiple of 8 -> 8-byte tiles |
| beq core_loop_wd_8 |
| sub x11, x12, #4 //otherwise 4-byte tiles; x11 = wd - 4 (rewind amount per band) |
| |
| outer_loop_wd_4: |
| subs x4, x12, #0 //x4 = bytes left in this band; skip band when wd <= 0 |
| ble end_inner_loop_wd_4 |
| |
| inner_loop_wd_4: |
| ld1 {v0.s}[0], [x0] //row 0: load 4 bytes |
| add x5, x0, x2 //x5 = src cursor for row 1 of this tile |
| add x6, x1, x3 //x6 = dst cursor for row 1 of this tile |
| st1 {v0.s}[0], [x1] //row 0: store 4 bytes |
| ld1 {v0.s}[0], [x5], x2 //row 1: load, advance src cursor one row |
| add x0, x0, #4 //pu1_src += 4 (next tile) |
| st1 {v0.s}[0], [x6], x3 //row 1: store, advance dst cursor one row |
| ld1 {v0.s}[0], [x5], x2 //row 2: load |
| subs x4, x4, #4 //bytes left in band -= 4 (flags consumed by bgt below) |
| st1 {v0.s}[0], [x6], x3 //row 2: store |
| ld1 {v0.s}[0], [x5], x2 //row 3: load |
| add x1, x1, #4 //pu1_dst += 4 (next tile) |
| st1 {v0.s}[0], [x6], x3 //row 3: store |
| |
| bgt inner_loop_wd_4 |
| |
| end_inner_loop_wd_4: |
| subs x7, x7, #4 //ht -= 4 |
| sub x0, x5, x11 //x5 is 4 rows below the last tile start; rewind to column 0 |
| sub x1, x6, x11 //same for the destination |
| bgt outer_loop_wd_4 |
| |
| end_loops: |
| ldp x19, x20, [sp], #16 |
| pop_v_regs |
| ret |
| |
| |
| core_loop_wd_8: |
| sub x11, x12, #8 //x11 = wd - 8 (rewind amount per band) |
| |
| outer_loop_wd_8: |
| subs x4, x12, #0 //x4 = bytes left in this band; skip band when wd <= 0 |
| ble end_inner_loop_wd_8 |
| |
| inner_loop_wd_8: |
| add x5, x0, x2 //x5 = src cursor for row 1 of this tile |
| ld1 {v0.8b}, [x0], #8 //row 0: load 8 bytes, pu1_src += 8 |
| add x6, x1, x3 //x6 = dst cursor for row 1 of this tile |
| st1 {v0.8b}, [x1], #8 //row 0: store 8 bytes, pu1_dst += 8 |
| ld1 {v1.8b}, [x5], x2 //row 1: load |
| st1 {v1.8b}, [x6], x3 //row 1: store |
| subs x4, x4, #8 //bytes left in band -= 8 (flags consumed by bgt below) |
| ld1 {v2.8b}, [x5], x2 //row 2: load |
| st1 {v2.8b}, [x6], x3 //row 2: store |
| ld1 {v3.8b}, [x5], x2 //row 3: load |
| st1 {v3.8b}, [x6], x3 //row 3: store |
| bgt inner_loop_wd_8 |
| |
| end_inner_loop_wd_8: |
| subs x7, x7, #4 //ht -= 4 |
| sub x0, x5, x11 //x5 is 4 rows below the last tile start; rewind to column 0 |
| sub x1, x6, x11 //same for the destination |
| bgt outer_loop_wd_8 |
| |
| ldp x19, x20, [sp], #16 |
| pop_v_regs |
| ret |
| |
| core_loop_wd_16: |
| sub x11, x12, #16 //x11 = wd - 16 (rewind amount per band) |
| |
| outer_loop_wd_16: |
| subs x4, x12, #0 //x4 = bytes left in this band; skip band when wd <= 0 |
| ble end_inner_loop_wd_16 |
| |
| inner_loop_wd_16: |
| add x5, x0, x2 //x5 = src cursor for row 1 of this tile |
| ld1 { v0.16b}, [x0], #16 //row 0: load 16 bytes, pu1_src += 16 |
| add x6, x1, x3 //x6 = dst cursor for row 1 of this tile |
| st1 { v0.16b}, [x1], #16 //row 0: store 16 bytes, pu1_dst += 16 |
| ld1 { v2.16b}, [x5], x2 //row 1: load |
| st1 { v2.16b}, [x6], x3 //row 1: store |
| subs x4, x4, #16 //bytes left in band -= 16 (flags consumed by bgt below) |
| ld1 { v4.16b}, [x5], x2 //row 2: load |
| st1 { v4.16b}, [x6], x3 //row 2: store |
| ld1 { v6.16b}, [x5], x2 //row 3: load |
| st1 { v6.16b}, [x6], x3 //row 3: store |
| bgt inner_loop_wd_16 |
| |
| end_inner_loop_wd_16: |
| subs x7, x7, #4 //ht -= 4 |
| sub x0, x5, x11 //x5 is 4 rows below the last tile start; rewind to column 0 |
| sub x1, x6, x11 //same for the destination |
| bgt outer_loop_wd_16 |
| |
| |
| ldp x19, x20, [sp], #16 |
| pop_v_regs |
| ret |
| |
| |
| // /* |
| // ******************************************************************************** |
| // * |
| // * @brief This function copies a 4x4 block to destination |
| // * |
| // * @par Description: |
| // * Copies a 4x4 block to destination, where both src and dst are interleaved |
| // * |
| // * @param[in] pi2_src |
| // * Source |
| // * |
| // * @param[in] pu1_out |
| // * Output pointer |
| // * |
| // * @param[in] pred_strd, |
| // * Prediction buffer stride |
| // * |
| // * @param[in] out_strd |
| // * Output buffer stride |
| // * |
| // * @returns none |
| // * |
| // * @remarks |
| // * Currently wd and ht are not used, i.e. a 4x4 block is always copied |
| // * |
| // ******************************************************************************* |
| // */ |
| // void ih264_interleave_copy(WORD16 *pi2_src, |
| // UWORD8 *pu1_out, |
| // WORD32 pred_strd, |
| // WORD32 out_strd, |
| // WORD32 wd, |
| // WORD32 ht) |
| // Register Usage |
| // x0 : pi2_src |
| // x1 : pu1_out |
| // x2 : src_strd |
| // x3 : out_strd |
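| // Neon registers v2-v5, v18-v21 and v30 are used |
| // None of these are callee-saved under AAPCS64 (only the low halves of |
| // v8-v15 are), although the routine still brackets its body with |
| // push_v_regs / pop_v_regs |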
| |
| .global ih264_interleave_copy_av8 |
| |
| //------------------------------------------------------------------------------ |
| // In: x0 = pi2_src, x1 = pu1_out, w2 = pred_strd, w3 = out_strd |
| // (the wd / ht arguments in w4 / w5 are never read) |
| // |
| // Processes 4 rows of 8 bytes. For every row, the even-indexed bytes |
| // (0, 2, 4, 6) of the output row are replaced by the bytes at the same |
| // offsets in the source row; the odd-indexed output bytes are left as they |
| // were. Both strides are applied as byte offsets. |
| //------------------------------------------------------------------------------ |
| ih264_interleave_copy_av8: |
| push_v_regs //macro from ih264_neon_macros.s (not visible here) |
| |
| //pred_strd / out_strd are 32-bit (WORD32). AAPCS64 leaves the upper 32 bits |
| //of their registers unspecified, so sign-extend before using them as |
| //64-bit post-index offsets. |
| sxtw x2, w2 //x2 = (int64) pred_strd |
| sxtw x3, w3 //x3 = (int64) out_strd |
| |
| ld1 {v2.8b}, [x0], x2 //src row 0 -> low half of v2 |
| ld1 {v3.8b}, [x0], x2 //src row 1 |
| mov v2.d[1], v3.d[0] //v2 = src rows 0 and 1 |
| ld1 {v4.8b}, [x0], x2 //src row 2 -> low half of v4 |
| ld1 {v5.8b}, [x0], x2 //src row 3 |
| mov v4.d[1], v5.d[0] //v4 = src rows 2 and 3 |
| |
| mov x0, x1 //x0 is free now: keep the start of out for the stores below |
| |
| ld1 {v18.8b}, [x1], x3 //out row 0 -> low half of v18 |
| ld1 {v19.8b}, [x1], x3 //out row 1 |
| mov v18.d[1], v19.d[0] //v18 = out rows 0 and 1 |
| movi v30.8h, #0x00ff //mask: low byte of every 16-bit lane |
| ld1 {v20.8b}, [x1], x3 //out row 2 -> low half of v20 |
| ld1 {v21.8b}, [x1], x3 //out row 3 |
| mov v20.d[1], v21.d[0] //v20 = out rows 2 and 3 |
| |
| bit v18.16b, v2.16b , v30.16b //take src bits where the mask is set, keep out bits elsewhere |
| bit v20.16b, v4.16b , v30.16b |
| |
| st1 {v18.8b}, [x0], x3 //write back row 0 |
| st1 {v18.d}[1], [x0], x3 //write back row 1 |
| st1 {v20.8b}, [x0], x3 //write back row 2 |
| st1 {v20.d}[1], [x0], x3 //write back row 3 |
| |
| pop_v_regs |
| ret |
| |
| |