| @/****************************************************************************** |
| @ * |
| @ * Copyright (C) 2015 The Android Open Source Project |
| @ * |
| @ * Licensed under the Apache License, Version 2.0 (the "License"); |
| @ * you may not use this file except in compliance with the License. |
| @ * You may obtain a copy of the License at: |
| @ * |
| @ * http://www.apache.org/licenses/LICENSE-2.0 |
| @ * |
| @ * Unless required by applicable law or agreed to in writing, software |
| @ * distributed under the License is distributed on an "AS IS" BASIS, |
| @ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| @ * See the License for the specific language governing permissions and |
| @ * limitations under the License. |
| @ * |
| @ ***************************************************************************** |
| @ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore |
| @*/ |
| @** |
| @** |
| @******************************************************************************* |
| @* |
| @* @brief |
| @* Interprediction luma function for copy |
| @* |
| @* @par Description: |
| @* Copies the array of width 'wd' and height 'ht' from the location pointed |
| @* by 'src' to the location pointed by 'dst' |
| @* |
| @* @param[in] pu1_src |
| @* UWORD8 pointer to the source |
| @* |
| @* @param[out] pu1_dst |
| @* UWORD8 pointer to the destination |
| @* |
| @* @param[in] src_strd |
| @* integer source stride |
| @* |
| @* @param[in] dst_strd |
| @* integer destination stride |
| @* |
| @* |
| @* @param[in] ht |
| @* integer height of the array |
| @* |
| @* @param[in] wd |
| @* integer width of the array |
| @* |
| @* @returns |
| @* |
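| @* @remarks |
| @* Rows are copied four at a time and columns in units of 16, 8 or 4 bytes, |
| @* so ht and wd are expected to be multiples of 4 |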
| @* |
| @******************************************************************************* |
| @* |
| @void ih264_inter_pred_luma_copy ( |
| @ UWORD8 *pu1_src, |
| @ UWORD8 *pu1_dst, |
| @ WORD32 src_strd, |
| @ WORD32 dst_strd, |
| @ WORD32 ht, |
| @ WORD32 wd ) |
| |
| @**************Variables Vs Registers***************************************** |
| @ r0 => *pu1_src (start of the current band of four rows) |
| @ r1 => *pu1_dst (start of the current band of four rows) |
| @ r2 => src_strd |
| @ r3 => dst_strd |
| @ r4 => columns still to copy in the current band |
| @ r5 => source pointer stepping through rows 1-3 of the current column |
| @ r6 => destination pointer stepping through rows 1-3 of the current column |
| @ r7 => ht (rows still to copy) |
| @ r11 => wd minus the column step; rewinds r5/r6 to the next band start |
| @ r12 => wd |
| @ |
| @ The block is copied in bands of four rows. Within a band the column step |
| @ is 16 bytes when wd is a multiple of 16, else 8 bytes when wd is a |
| @ multiple of 8, else 4 bytes. Both loops run while their counter is > 0, |
| @ so a ht or wd that is not a multiple of the step is rounded up. |
| @ |
| @ Only d0-d7 (q0-q3) are written. These are caller-saved, so no NEON |
| @ registers are pushed. The core register save list is kept as r4-r12, lr. |
| |
| .text |
| .p2align 2 |
| |
| .global ih264_inter_pred_luma_copy_a9q |
| |
| ih264_inter_pred_luma_copy_a9q: |
|     stmfd sp!, {r4-r12, r14} @save core registers: 10 words = 40 bytes |
|     ldr r7, [sp, #40] @ht: first stacked argument, just above the saved registers |
|     ldr r12, [sp, #44] @wd: second stacked argument |
|     cmp r7, #0 |
|     ble end_loops @nothing to copy when ht <= 0 |
|     cmp r12, #0 |
|     ble end_loops @nothing to copy when wd <= 0 |
|     tst r12, #15 |
|     beq core_loop_wd_16 @wd is a multiple of 16 |
|     tst r12, #7 |
|     beq core_loop_wd_8 @wd is a multiple of 8 |
|     sub r11, r12, #4 @rewind amount for the 4-byte column path |
| |
| outer_loop_wd_4: |
|     mov r4, r12 @columns left in this band = wd (known > 0 here) |
| |
| inner_loop_wd_4: |
|     vld1.32 {d0[0]}, [r0] @load 4 bytes of row 0 |
|     add r5, r0, r2 @r5 = source row 1 of this column |
|     add r6, r1, r3 @r6 = destination row 1 of this column |
|     vst1.32 {d0[0]}, [r1] @store row 0 |
|     vld1.32 {d0[0]}, [r5], r2 @load row 1, r5 moves down one row |
|     add r0, r0, #4 @pu1_src += 4 (next column) |
|     vst1.32 {d0[0]}, [r6], r3 @store row 1, r6 moves down one row |
|     vld1.32 {d0[0]}, [r5], r2 @load row 2 |
|     subs r4, r4, #4 @columns left -= 4; flags are consumed by the bgt below |
|     vst1.32 {d0[0]}, [r6], r3 @store row 2 |
|     vld1.32 {d0[0]}, [r5], r2 @load row 3 |
|     add r1, r1, #4 @pu1_dst += 4 (next column) |
|     vst1.32 {d0[0]}, [r6], r3 @store row 3 |
| |
|     bgt inner_loop_wd_4 |
| |
| end_inner_loop_wd_4: |
|     subs r7, r7, #4 @ht -= 4 |
|     sub r0, r5, r11 @r5 is (wd - 4) bytes into row 4; step back to its start |
|     sub r1, r6, r11 @same for the destination |
|     bgt outer_loop_wd_4 |
| |
| end_loops: |
|     ldmfd sp!, {r4-r12, r15} @restore core registers and return (pc <- saved lr) |
| |
| |
| |
| core_loop_wd_8: |
|     sub r11, r12, #8 @rewind amount for the 8-byte column path |
| |
| outer_loop_wd_8: |
|     mov r4, r12 @columns left in this band = wd (known > 0 here) |
| |
| inner_loop_wd_8: |
|     add r5, r0, r2 @r5 = source row 1 of this column |
|     vld1.8 {d0}, [r0]! @load 8 bytes of row 0, pu1_src += 8 |
|     add r6, r1, r3 @r6 = destination row 1 of this column |
|     vst1.8 {d0}, [r1]! @store row 0, pu1_dst += 8 |
|     vld1.8 {d1}, [r5], r2 @load row 1 |
|     vst1.8 {d1}, [r6], r3 @store row 1 |
|     subs r4, r4, #8 @columns left -= 8; flags are consumed by the bgt below |
|     vld1.8 {d2}, [r5], r2 @load row 2 |
|     vst1.8 {d2}, [r6], r3 @store row 2 |
|     vld1.8 {d3}, [r5], r2 @load row 3 |
|     vst1.8 {d3}, [r6], r3 @store row 3 |
|     bgt inner_loop_wd_8 |
| |
| end_inner_loop_wd_8: |
|     subs r7, r7, #4 @ht -= 4 |
|     sub r0, r5, r11 @r5 is (wd - 8) bytes into row 4; step back to its start |
|     sub r1, r6, r11 @same for the destination |
|     bgt outer_loop_wd_8 |
| |
|     ldmfd sp!, {r4-r12, r15} @restore core registers and return (pc <- saved lr) |
| |
| core_loop_wd_16: |
|     sub r11, r12, #16 @rewind amount for the 16-byte column path |
| |
| outer_loop_wd_16: |
|     mov r4, r12 @columns left in this band = wd (known > 0 here) |
| |
| inner_loop_wd_16: |
|     add r5, r0, r2 @r5 = source row 1 of this column |
|     vld1.8 {q0}, [r0]! @load 16 bytes of row 0, pu1_src += 16 |
|     add r6, r1, r3 @r6 = destination row 1 of this column |
|     vst1.8 {q0}, [r1]! @store row 0, pu1_dst += 16 |
|     vld1.8 {q1}, [r5], r2 @load row 1 |
|     vst1.8 {q1}, [r6], r3 @store row 1 |
|     subs r4, r4, #16 @columns left -= 16; flags are consumed by the bgt below |
|     vld1.8 {q2}, [r5], r2 @load row 2 |
|     vst1.8 {q2}, [r6], r3 @store row 2 |
|     vld1.8 {q3}, [r5], r2 @load row 3 |
|     vst1.8 {q3}, [r6], r3 @store row 3 |
|     bgt inner_loop_wd_16 |
| |
| end_inner_loop_wd_16: |
|     subs r7, r7, #4 @ht -= 4 |
|     sub r0, r5, r11 @r5 is (wd - 16) bytes into row 4; step back to its start |
|     sub r1, r6, r11 @same for the destination |
|     bgt outer_loop_wd_16 |
| |
|     ldmfd sp!, {r4-r12, r15} @restore core registers and return (pc <- saved lr) |
| |
| |
| @ * |
| @ ******************************************************************************** |
| @ * |
| @ * @brief This function copies a 4x4 block to destination |
| @ * |
| @ * @par Description: |
| @ * Copies a 4x4 block to destination, where both src and dst are interleaved |
| @ * |
| @ * @param[in] pi2_src |
| @ * Source |
| @ * |
| @ * @param[in] pu1_out |
| @ * Output pointer |
| @ * |
| @ * @param[in] pred_strd, |
| @ * Prediction buffer stride |
| @ * |
| @ * @param[in] out_strd |
| @ * Output buffer stride |
| @ * |
| @ * @returns none |
| @ * |
| @ * @remarks |
| @ * Currently wd and ht are not used, i.e. a 4x4 block is always copied |
| @ * |
| @ ******************************************************************************* |
| @ * |
| @ void ih264_interleave_copy(WORD16 *pi2_src, |
| @ UWORD8 *pu1_out, |
| @ WORD32 pred_strd, |
| @ WORD32 out_strd, |
| @ WORD32 wd, |
| @ WORD32 ht) |
| @ Register Usage |
| @ r0 : pi2_src |
| @ r1 : pu1_out |
| @ r2 : pred_strd |
| @ r3 : out_strd |
| @ Neon registers d2-d5, d18-d21 and q15 (d30-d31) are used |
| @ No need for pushing arm and neon registers |
| |
| .global ih264_interleave_copy_a9 |
| ih264_interleave_copy_a9: |
| |
| @ Merges four 8-byte source rows into four 8-byte output rows: every |
| @ even-indexed byte of the output is replaced by the source byte at the |
| @ same position, every odd-indexed byte of the output is kept. |
| |
|     vmov.i16 q15, #0x00ff @select mask: 0xff in each even byte, 0x00 in each odd byte |
| |
| @ Source rows: q1 = d2:d3 (rows 0-1), q2 = d4:d5 (rows 2-3) |
|     vld1.8 {d2}, [r0], r2 |
|     vld1.8 {d3}, [r0], r2 |
|     vld1.8 {d4}, [r0], r2 |
|     vld1.8 {d5}, [r0], r2 |
| |
|     mov r0, r1 @source pointer is no longer needed; r0 becomes the store pointer |
| |
| @ Current output rows: q9 = d18:d19 (rows 0-1), q10 = d20:d21 (rows 2-3) |
|     vld1.8 {d18}, [r1], r3 |
|     vld1.8 {d19}, [r1], r3 |
|     vld1.8 {d20}, [r1], r3 |
|     vld1.8 {d21}, [r1], r3 |
| |
| @ Insert source bits into the output rows wherever the mask bit is 1 |
|     vbit q9, q1, q15 |
|     vbit q10, q2, q15 |
| |
| @ Write the merged rows back over the output block |
|     vst1.8 {d18}, [r0], r3 |
|     vst1.8 {d19}, [r0], r3 |
|     vst1.8 {d20}, [r0], r3 |
|     vst1.8 {d21}, [r0], r3 |
| |
|     bx lr |
| |
| |
| |