@ blob: c0b0568631cc9093928840aa7b179e17bce08c57
@/******************************************************************************
@ *
@ * Copyright (C) 2015 The Android Open Source Project
@ *
@ * Licensed under the Apache License, Version 2.0 (the "License");
@ * you may not use this file except in compliance with the License.
@ * You may obtain a copy of the License at:
@ *
@ * http://www.apache.org/licenses/LICENSE-2.0
@ *
@ * Unless required by applicable law or agreed to in writing, software
@ * distributed under the License is distributed on an "AS IS" BASIS,
@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@ * See the License for the specific language governing permissions and
@ * limitations under the License.
@ *
@ *****************************************************************************
@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
@*/
@**
@**
@*******************************************************************************
@*
@* @brief
@* Interprediction luma function for copy
@*
@* @par Description:
@* Copies the array of width 'wd' and height 'ht' from the location pointed
@* by 'src' to the location pointed by 'dst'
@*
@* @param[in] pu1_src
@* UWORD8 pointer to the source
@*
@* @param[out] pu1_dst
@* UWORD8 pointer to the destination
@*
@* @param[in] src_strd
@* integer source stride
@*
@* @param[in] dst_strd
@* integer destination stride
@*
@*
@* @param[in] ht
@* integer height of the array
@*
@* @param[in] wd
@* integer width of the array
@*
@* @returns
@*
@* @remarks
@* None
@*
@*******************************************************************************
@*
@void ih264_inter_pred_luma_copy (
@ UWORD8 *pu1_src,
@ UWORD8 *pu1_dst,
@ WORD32 src_strd,
@ WORD32 dst_strd,
@ WORD32 ht,
@ WORD32 wd )
@**************Variables Vs Registers*****************************************
@ r0 => *pu1_src
@ r1 => *pu1_dst
@ r2 => src_strd
@ r3 => dst_strd
@ r7 => ht
@ r12 => wd
.text
.p2align 2
.global ih264_inter_pred_luma_copy_a9q
@------------------------------------------------------------------------------
@ void ih264_inter_pred_luma_copy_a9q(UWORD8 *pu1_src, UWORD8 *pu1_dst,
@                                     WORD32 src_strd, WORD32 dst_strd,
@                                     WORD32 ht, WORD32 wd)
@
@ Syntax : GNU as, ARM (A32) + NEON
@ In     : r0 = pu1_src, r1 = pu1_dst, r2 = src_strd, r3 = dst_strd
@          ht and wd are the first two stack arguments
@ Roles  : r4  = columns still to copy in the current band of rows
@          r5  = source cursor for rows 1..3 of the current tile
@          r6  = destination cursor for rows 1..3 of the current tile
@          r7  = rows still to copy
@          r11 = wd - (tile width), used to rewind to the start of the band
@          r12 = wd
@          d0-d7 (q0-q3) = copy buffers
@ Saved  : r4-r7, r11 and lr only (24 bytes, keeps sp 8-byte aligned).
@          r12 and d0-d7 are caller-saved under AAPCS, and no callee-saved
@          NEON register (d8-d15) is touched, so nothing else is pushed.
@ Shape  : every outer iteration copies a band of 4 rows; every inner
@          iteration copies a tile of 4 rows x {4, 8, 16} bytes.
@          wd % 16 == 0 -> 16-byte tiles, else wd % 8 == 0 -> 8-byte tiles,
@          else 4-byte tiles. Loops run while the remaining count is > 0, so
@          ht is effectively rounded up to a multiple of 4 and, on the
@          4-byte path, wd is rounded up to a multiple of 4.
@------------------------------------------------------------------------------
ih264_inter_pred_luma_copy_a9q:
    stmfd         sp!, {r4-r7, r11, r14}    @ 6 regs = 24 bytes; stack args now start at sp + 24
    ldr           r12, [sp, #28]            @ r12 = wd (second stack argument)
    ldr           r7, [sp, #24]             @ r7  = ht (first stack argument)
    cmp           r7, #0
    ble           end_loops                 @ nothing to do when ht <= 0
    tst           r12, #15
    beq           core_loop_wd_16           @ wd is a multiple of 16
    tst           r12, #7
    beq           core_loop_wd_8            @ wd is a multiple of 8
    sub           r11, r12, #4              @ r11 = wd - 4 (band rewind amount)

outer_loop_wd_4:
    subs          r4, r12, #0               @ r4 = wd; flags tell whether wd <= 0
    ble           end_inner_loop_wd_4

inner_loop_wd_4:
    vld1.32       {d0[0]}, [r0]             @ row 0: load 4 bytes
    add           r5, r0, r2                @ r5 = src of row 1
    add           r6, r1, r3                @ r6 = dst of row 1
    vst1.32       {d0[0]}, [r1]             @ row 0: store 4 bytes
    vld1.32       {d0[0]}, [r5], r2         @ row 1: load, r5 -> row 2
    add           r0, r0, #4                @ next tile: pu1_src += 4
    vst1.32       {d0[0]}, [r6], r3         @ row 1: store, r6 -> row 2
    vld1.32       {d0[0]}, [r5], r2         @ row 2: load, r5 -> row 3
    subs          r4, r4, #4                @ columns left -= 4 (flags consumed by bgt below)
    vst1.32       {d0[0]}, [r6], r3         @ row 2: store, r6 -> row 3
    vld1.32       {d0[0]}, [r5], r2         @ row 3: load, r5 -> row 4 (next band)
    add           r1, r1, #4                @ next tile: pu1_dst += 4
    vst1.32       {d0[0]}, [r6], r3         @ row 3: store, r6 -> row 4 (next band)
    bgt           inner_loop_wd_4

end_inner_loop_wd_4:
    subs          r7, r7, #4                @ rows left -= 4
    sub           r0, r5, r11               @ rewind last tile's cursor to column 0 of the next band
    sub           r1, r6, r11               @ same for the destination
    bgt           outer_loop_wd_4

end_loops:
    ldmfd         sp!, {r4-r7, r11, r15}    @ restore and return (pc <- saved lr)

core_loop_wd_8:
    sub           r11, r12, #8              @ r11 = wd - 8 (band rewind amount)

outer_loop_wd_8:
    subs          r4, r12, #0               @ r4 = wd; flags tell whether wd <= 0
    ble           end_inner_loop_wd_8

inner_loop_wd_8:
    add           r5, r0, r2                @ r5 = src of row 1
    vld1.8        {d0}, [r0]!               @ row 0: load 8 bytes, pu1_src += 8
    add           r6, r1, r3                @ r6 = dst of row 1
    vst1.8        {d0}, [r1]!               @ row 0: store 8 bytes, pu1_dst += 8
    vld1.8        {d1}, [r5], r2            @ row 1
    vst1.8        {d1}, [r6], r3
    subs          r4, r4, #8                @ columns left -= 8 (flags consumed by bgt below)
    vld1.8        {d2}, [r5], r2            @ row 2
    vst1.8        {d2}, [r6], r3
    vld1.8        {d3}, [r5], r2            @ row 3, r5 -> row 4 (next band)
    vst1.8        {d3}, [r6], r3
    bgt           inner_loop_wd_8

end_inner_loop_wd_8:
    subs          r7, r7, #4                @ rows left -= 4
    sub           r0, r5, r11               @ rewind to column 0 of the next band
    sub           r1, r6, r11
    bgt           outer_loop_wd_8
    ldmfd         sp!, {r4-r7, r11, r15}    @ restore and return (pc <- saved lr)

core_loop_wd_16:
    sub           r11, r12, #16             @ r11 = wd - 16 (band rewind amount)

outer_loop_wd_16:
    subs          r4, r12, #0               @ r4 = wd; flags tell whether wd <= 0
    ble           end_inner_loop_wd_16

inner_loop_wd_16:
    add           r5, r0, r2                @ r5 = src of row 1
    vld1.8        {q0}, [r0]!               @ row 0: load 16 bytes, pu1_src += 16
    add           r6, r1, r3                @ r6 = dst of row 1
    vst1.8        {q0}, [r1]!               @ row 0: store 16 bytes, pu1_dst += 16
    vld1.8        {q1}, [r5], r2            @ row 1
    vst1.8        {q1}, [r6], r3
    subs          r4, r4, #16               @ columns left -= 16 (flags consumed by bgt below)
    vld1.8        {q2}, [r5], r2            @ row 2
    vst1.8        {q2}, [r6], r3
    vld1.8        {q3}, [r5], r2            @ row 3, r5 -> row 4 (next band)
    vst1.8        {q3}, [r6], r3
    bgt           inner_loop_wd_16

end_inner_loop_wd_16:
    subs          r7, r7, #4                @ rows left -= 4
    sub           r0, r5, r11               @ rewind to column 0 of the next band
    sub           r1, r6, r11
    bgt           outer_loop_wd_16
    ldmfd         sp!, {r4-r7, r11, r15}    @ restore and return (pc <- saved lr)
@ *
@ ********************************************************************************
@ *
@ * @brief This function copies a 4x4 block to destination
@ *
@ * @par Description:
@ * Copies a 4x4 block to destination, where both src and dst are interleaved
@ *
@ * @param[in] pi2_src
@ * Source
@ *
@ * @param[in,out] pu1_out
@ * Output pointer; the existing output is read, merged with the source and written back
@ *
@ * @param[in] pred_strd,
@ * Prediction buffer stride
@ *
@ * @param[in] out_strd
@ * Output buffer stride
@ *
@ * @returns none
@ *
@ * @remarks
@ * wd and ht are currently not used, i.e. a 4x4 block is always copied
@ *
@ *******************************************************************************
@ *
@ void ih264_interleave_copy(WORD16 *pi2_src,
@ UWORD8 *pu1_out,
@ WORD32 pred_strd,
@ WORD32 out_strd,
@ WORD32 wd,
@ WORD32 ht)
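@ Register Usage
@ r0 : pi2_src (reused as the store pointer once the source rows are loaded)
@ r1 : pu1_out
@ r2 : pred_strd (stride of the source rows)
@ r3 : out_strd
@ Neon registers d2-d5, d18-d21 and q15 (d30-d31) are used
@ No need for pushing arm and neon registers: only caller-saved registers are touched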
.global ih264_interleave_copy_a9
@ Merges four 8-byte source rows into four 8-byte output rows: within every
@ 16-bit lane the low byte is taken from the source and the high byte of the
@ existing output is kept. Leaf routine; returns with bx lr and pushes nothing.
ih264_interleave_copy_a9:
    vmov.i16      q15, #0x00ff              @ select mask: low byte of each halfword

    @ Four source rows -> q1 (d2, d3) and q2 (d4, d5)
    vld1.8        {d2}, [r0], r2
    vld1.8        {d3}, [r0], r2
    vld1.8        {d4}, [r0], r2
    vld1.8        {d5}, [r0], r2

    mov           r0, r1                    @ r0 becomes the store cursor; r1 keeps walking the loads

    @ Four current output rows -> q9 (d18, d19) and q10 (d20, d21)
    vld1.8        {d18}, [r1], r3
    vld1.8        {d19}, [r1], r3
    vld1.8        {d20}, [r1], r3
    vld1.8        {d21}, [r1], r3

    @ Insert source bits into the output rows wherever the mask is set
    vbit          q9, q1, q15
    vbit          q10, q2, q15

    @ Write the merged rows back over the output
    vst1.8        {d18}, [r0], r3
    vst1.8        {d19}, [r0], r3
    vst1.8        {d20}, [r0], r3
    vst1.8        {d21}, [r0], r3

    bx            lr