 @ common/arm/ih264_inter_pred_luma_copy_a9q.s - platform/external/libavc - Git at Google

 @/******************************************************************************
 @ *
 @ * Copyright (C) 2015 The Android Open Source Project
 @ *
 @ * Licensed under the Apache License, Version 2.0 (the "License");
 @ * you may not use this file except in compliance with the License.
 @ * You may obtain a copy of the License at:
 @ *
 @ * http://www.apache.org/licenses/LICENSE-2.0
 @ *
 @ * Unless required by applicable law or agreed to in writing, software
 @ * distributed under the License is distributed on an "AS IS" BASIS,
 @ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 @ * See the License for the specific language governing permissions and
 @ * limitations under the License.
 @ *
 @ *****************************************************************************
 @ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
 @*/
 @**
 @**
 @*******************************************************************************
 @*
 @* @brief
 @*     Interprediction luma function for copy
 @*
 @* @par Description:
 @*   Copies the array of width 'wd' and height 'ht' from the  location pointed
 @*   by 'src' to the location pointed by 'dst'
 @*
 @* @param[in] pu1_src
 @*  UWORD8 pointer to the source
 @*
 @* @param[out] pu1_dst
 @*  UWORD8 pointer to the destination
 @*
 @* @param[in] src_strd
 @*  integer source stride
 @*
 @* @param[in] dst_strd
 @*  integer destination stride
 @*
 @*
 @* @param[in] ht
 @*  integer height of the array
 @*
 @* @param[in] wd
 @*  integer width of the array
 @*
 @* @returns
 @*
 @* @remarks
 @*  None
 @*
 @*******************************************************************************
 @*
 @void ih264_inter_pred_luma_copy (
 @                            UWORD8 *pu1_src,
 @                            UWORD8 *pu1_dst,
 @                            WORD32 src_strd,
 @                            WORD32 dst_strd,
 @                            WORD32 ht,
 @                            WORD32 wd   )

 @**************Variables Vs Registers*****************************************
 @   r0 => *pu1_src
 @   r1 => *pu1_dst
 @   r2 =>  src_strd
 @   r3 =>  dst_strd
 @   r7 =>  ht
 @   r12 => wd

 .text
 .p2align 2

     .global ih264_inter_pred_luma_copy_a9q

 @-----------------------------------------------------------------------------
 @ void ih264_inter_pred_luma_copy_a9q(UWORD8 *pu1_src, UWORD8 *pu1_dst,
 @                                     WORD32 src_strd, WORD32 dst_strd,
 @                                     WORD32 ht, WORD32 wd)
 @ In:    r0 = pu1_src, r1 = pu1_dst, r2 = src_strd, r3 = dst_strd
 @        ht and wd are the first and second stack arguments
 @ Out:   nothing
 @ Clobb: r0, r1, r12, q0-q3 (d0-d7), flags
 @ Notes: Returns immediately when ht <= 0 or wd <= 0.
 @        Rows are copied in bands of four, so ht is effectively rounded up
 @        to a multiple of 4.  Columns are copied 16, 8 or 4 bytes at a time
 @        depending on the low bits of wd; the row advance assumes wd is a
 @        multiple of the chosen column size, i.e. a multiple of 4.
 @        Only q0-q3 are touched, so d8-d15 do not need to be preserved.
 @-----------------------------------------------------------------------------
 ih264_inter_pred_luma_copy_a9q:
     stmfd         sp!, {r4-r7, r11, r14} @ save the callee-saved registers used below (6 words = 24 bytes)
     ldr           r7, [sp, #24]         @ r7  = ht (first stack argument, just above the saved registers)
     ldr           r12, [sp, #28]        @ r12 = wd (second stack argument)
     cmp           r7, #0
     ble           end_loops             @ ht <= 0 : nothing to copy
     cmp           r12, #0
     ble           end_loops             @ wd <= 0 : nothing to copy
     tst           r12, #15
     beq           core_loop_wd_16       @ wd is a multiple of 16
     tst           r12, #7
     beq           core_loop_wd_8        @ wd is a multiple of 8
     sub           r11, r12, #4          @ r11 = wd - 4, offset of the last 4 byte column in a row

 outer_loop_wd_4:
     mov           r4, r12               @ r4 = bytes still to copy in this band of four rows (> 0 here)

 inner_loop_wd_4:
     vld1.32       {d0[0]}, [r0]         @ row 0: load 4 bytes
     add           r5, r0, r2            @ r5 = source pointer for row 1 of this column
     add           r6, r1, r3            @ r6 = destination pointer for row 1 of this column
     vst1.32       {d0[0]}, [r1]         @ row 0: store 4 bytes
     vld1.32       {d0[0]}, [r5], r2     @ row 1: load, r5 moves to row 2
     add           r0, r0, #4            @ pu1_src moves to the next 4 byte column
     vst1.32       {d0[0]}, [r6], r3     @ row 1: store, r6 moves to row 2
     vld1.32       {d0[0]}, [r5], r2     @ row 2: load, r5 moves to row 3
     subs          r4, r4, #4            @ 4 more bytes of width done; flags feed the bgt below
     vst1.32       {d0[0]}, [r6], r3     @ row 2: store, r6 moves to row 3
     vld1.32       {d0[0]}, [r5], r2     @ row 3: load, r5 ends 4 rows below this column
     add           r1, r1, #4            @ pu1_dst moves to the next 4 byte column
     vst1.32       {d0[0]}, [r6], r3     @ row 3: store, r6 ends 4 rows below this column

     bgt           inner_loop_wd_4       @ more columns left in this band

     subs          r7, r7, #4            @ four rows finished
     sub           r0, r5, r11           @ pu1_src = start of the next band (rewind from the last column)
     sub           r1, r6, r11           @ pu1_dst = start of the next band
     bgt           outer_loop_wd_4

 end_loops:
     ldmfd         sp!, {r4-r7, r11, r15} @ restore registers and return (pc <- saved lr)


 core_loop_wd_8:
     sub           r11, r12, #8          @ r11 = wd - 8, offset of the last 8 byte column in a row

 outer_loop_wd_8:
     mov           r4, r12               @ r4 = bytes still to copy in this band of four rows (> 0 here)

 inner_loop_wd_8:
     add           r5, r0, r2            @ r5 = source pointer for row 1 of this column
     vld1.8        {d0}, [r0]!           @ row 0: load 8 bytes, pu1_src moves to the next column
     add           r6, r1, r3            @ r6 = destination pointer for row 1 of this column
     vst1.8        {d0}, [r1]!           @ row 0: store 8 bytes, pu1_dst moves to the next column
     vld1.8        {d1}, [r5], r2        @ row 1: load
     vst1.8        {d1}, [r6], r3        @ row 1: store
     subs          r4, r4, #8            @ 8 more bytes of width done; flags feed the bgt below
     vld1.8        {d2}, [r5], r2        @ row 2: load
     vst1.8        {d2}, [r6], r3        @ row 2: store
     vld1.8        {d3}, [r5], r2        @ row 3: load, r5 ends 4 rows below this column
     vst1.8        {d3}, [r6], r3        @ row 3: store, r6 ends 4 rows below this column
     bgt           inner_loop_wd_8       @ more columns left in this band

     subs          r7, r7, #4            @ four rows finished
     sub           r0, r5, r11           @ pu1_src = start of the next band
     sub           r1, r6, r11           @ pu1_dst = start of the next band
     bgt           outer_loop_wd_8

     ldmfd         sp!, {r4-r7, r11, r15} @ restore registers and return (pc <- saved lr)

 core_loop_wd_16:
     sub           r11, r12, #16         @ r11 = wd - 16, offset of the last 16 byte column in a row

 outer_loop_wd_16:
     mov           r4, r12               @ r4 = bytes still to copy in this band of four rows (> 0 here)

 inner_loop_wd_16:
     add           r5, r0, r2            @ r5 = source pointer for row 1 of this column
     vld1.8        {q0}, [r0]!           @ row 0: load 16 bytes, pu1_src moves to the next column
     add           r6, r1, r3            @ r6 = destination pointer for row 1 of this column
     vst1.8        {q0}, [r1]!           @ row 0: store 16 bytes, pu1_dst moves to the next column
     vld1.8        {q1}, [r5], r2        @ row 1: load
     vst1.8        {q1}, [r6], r3        @ row 1: store
     subs          r4, r4, #16           @ 16 more bytes of width done; flags feed the bgt below
     vld1.8        {q2}, [r5], r2        @ row 2: load
     vst1.8        {q2}, [r6], r3        @ row 2: store
     vld1.8        {q3}, [r5], r2        @ row 3: load, r5 ends 4 rows below this column
     vst1.8        {q3}, [r6], r3        @ row 3: store, r6 ends 4 rows below this column
     bgt           inner_loop_wd_16      @ more columns left in this band

     subs          r7, r7, #4            @ four rows finished
     sub           r0, r5, r11           @ pu1_src = start of the next band
     sub           r1, r6, r11           @ pu1_dst = start of the next band
     bgt           outer_loop_wd_16

     ldmfd         sp!, {r4-r7, r11, r15} @ restore registers and return (pc <- saved lr)


 @ *
 @ ********************************************************************************
 @ *
 @ * @brief This function copies a 4x4 block to destination
 @ *
 @ * @par Description:
 @ * Copies a 4x4 block to destination, where both src and dst are interleaved
 @ *
 @ * @param[in] pi2_src
 @ *  Source
 @ *
 @ * @param[in] pu1_out
 @ *  Output pointer
 @ *
 @ * @param[in] pred_strd,
 @ *  Prediction buffer stride
 @ *
 @ * @param[in] out_strd
 @ *  output buffer stride
 @ *
 @ * @returns none
 @ *
 @ * @remarks none
 @ * Currently wd and ht are not used, i.e. a 4x4 block is always copied
 @ *
 @ *******************************************************************************
 @ *
 @ void ih264_interleave_copy(WORD16 *pi2_src,
 @                            UWORD8 *pu1_out,
 @                            WORD32 pred_strd,
 @                            WORD32 out_strd,
 @                            WORD32 wd,
 @                            WORD32 ht)
 @ Register Usage
 @ r0 : pi2_src
 @ r1 : pu1_out
 @ r2 : src_strd
 @ r3 : out_strd
 @ Neon registers d2-d5, d18-d21 and q15 (d30-d31) are used
 @ None of these are callee-saved, so no ARM or NEON registers need to be pushed

     .global ih264_interleave_copy_a9
 ih264_interleave_copy_a9:
     @ Merges four rows of 8 bytes from the source into the output buffer:
     @ bytes at even offsets are replaced by the source bytes, bytes at odd
     @ offsets keep the value already present in the output.
     @ Modifies r0, r1, q1, q2, q9, q10 and q15 only.

     vmov.i16      q15, #0x00ff          @ select mask: low byte of every 16-bit lane

     vld1.8        {d2}, [r0], r2        @ source row 0 -> low half of q1
     vld1.8        {d3}, [r0], r2        @ source row 1 -> high half of q1
     vld1.8        {d4}, [r0], r2        @ source row 2 -> low half of q2
     vld1.8        {d5}, [r0], r2        @ source row 3 -> high half of q2

     mov           r0, r1                @ source is fully read; r0 becomes the store pointer

     vld1.8        {d18}, [r1], r3       @ output row 0 -> low half of q9
     vld1.8        {d19}, [r1], r3       @ output row 1 -> high half of q9
     vld1.8        {d20}, [r1], r3       @ output row 2 -> low half of q10
     vld1.8        {d21}, [r1], r3       @ output row 3 -> high half of q10

     vbit          q9, q1, q15           @ rows 0-1: insert source bits where the mask is set
     vbit          q10, q2, q15          @ rows 2-3: insert source bits where the mask is set

     vst1.8        {d18}, [r0], r3       @ write back output row 0
     vst1.8        {d19}, [r0], r3       @ write back output row 1
     vst1.8        {d20}, [r0], r3       @ write back output row 2
     vst1.8        {d21}, [r0], r3       @ write back output row 3

     bx            lr
	@/******************************************************************************
	@ *
	@ * Copyright (C) 2015 The Android Open Source Project
	@ *
	@ * Licensed under the Apache License, Version 2.0 (the "License");
	@ * you may not use this file except in compliance with the License.
	@ * You may obtain a copy of the License at:
	@ *
	@ * http://www.apache.org/licenses/LICENSE-2.0
	@ *
	@ * Unless required by applicable law or agreed to in writing, software
	@ * distributed under the License is distributed on an "AS IS" BASIS,
	@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	@ * See the License for the specific language governing permissions and
	@ * limitations under the License.
	@ *
	@ *****************************************************************************
	@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
	@*/
	@**
	@**
	@*******************************************************************************
	@*
	@* @brief
	@* Interprediction luma function for copy
	@*
	@* @par Description:
	@* Copies the array of width 'wd' and height 'ht' from the location pointed
	@* by 'src' to the location pointed by 'dst'
	@*
	@* @param[in] pu1_src
	@* UWORD8 pointer to the source
	@*
	@* @param[out] pu1_dst
	@* UWORD8 pointer to the destination
	@*
	@* @param[in] src_strd
	@* integer source stride
	@*
	@* @param[in] dst_strd
	@* integer destination stride
	@*
	@*
	@* @param[in] ht
	@* integer height of the array
	@*
	@* @param[in] wd
	@* integer width of the array
	@*
	@* @returns
	@*
	@* @remarks
	@* None
	@*
	@*******************************************************************************
	@*
	@void ih264_inter_pred_luma_copy (
	@ UWORD8 *pu1_src,
	@ UWORD8 *pu1_dst,
	@ WORD32 src_strd,
	@ WORD32 dst_strd,
	@ WORD32 ht,
	@ WORD32 wd )

	@************Variables Vs Registers***************************************
	@ r0 => *pu1_src
	@ r1 => *pu1_dst
	@ r2 => src_strd
	@ r3 => dst_strd
	@ r7 => ht
	@ r12 => wd

	.text
	.p2align 2

	.global ih264_inter_pred_luma_copy_a9q

	@-----------------------------------------------------------------------------
	@ void ih264_inter_pred_luma_copy_a9q(UWORD8 *pu1_src, UWORD8 *pu1_dst,
	@                                     WORD32 src_strd, WORD32 dst_strd,
	@                                     WORD32 ht, WORD32 wd)
	@ In:    r0 = pu1_src, r1 = pu1_dst, r2 = src_strd, r3 = dst_strd
	@        ht and wd are the first and second stack arguments
	@ Out:   nothing
	@ Clobb: r0, r1, r12, q0-q3 (d0-d7), flags
	@ Notes: Returns immediately when ht <= 0 or wd <= 0.
	@        Rows are copied in bands of four, so ht is effectively rounded up
	@        to a multiple of 4.  Columns are copied 16, 8 or 4 bytes at a time
	@        depending on the low bits of wd; the row advance assumes wd is a
	@        multiple of the chosen column size, i.e. a multiple of 4.
	@        Only q0-q3 are touched, so d8-d15 do not need to be preserved.
	@-----------------------------------------------------------------------------
	ih264_inter_pred_luma_copy_a9q:
	    stmfd         sp!, {r4-r7, r11, r14} @ save the callee-saved registers used below (6 words = 24 bytes)
	    ldr           r7, [sp, #24]         @ r7  = ht (first stack argument, just above the saved registers)
	    ldr           r12, [sp, #28]        @ r12 = wd (second stack argument)
	    cmp           r7, #0
	    ble           end_loops             @ ht <= 0 : nothing to copy
	    cmp           r12, #0
	    ble           end_loops             @ wd <= 0 : nothing to copy
	    tst           r12, #15
	    beq           core_loop_wd_16       @ wd is a multiple of 16
	    tst           r12, #7
	    beq           core_loop_wd_8        @ wd is a multiple of 8
	    sub           r11, r12, #4          @ r11 = wd - 4, offset of the last 4 byte column in a row

	outer_loop_wd_4:
	    mov           r4, r12               @ r4 = bytes still to copy in this band of four rows (> 0 here)

	inner_loop_wd_4:
	    vld1.32       {d0[0]}, [r0]         @ row 0: load 4 bytes
	    add           r5, r0, r2            @ r5 = source pointer for row 1 of this column
	    add           r6, r1, r3            @ r6 = destination pointer for row 1 of this column
	    vst1.32       {d0[0]}, [r1]         @ row 0: store 4 bytes
	    vld1.32       {d0[0]}, [r5], r2     @ row 1: load, r5 moves to row 2
	    add           r0, r0, #4            @ pu1_src moves to the next 4 byte column
	    vst1.32       {d0[0]}, [r6], r3     @ row 1: store, r6 moves to row 2
	    vld1.32       {d0[0]}, [r5], r2     @ row 2: load, r5 moves to row 3
	    subs          r4, r4, #4            @ 4 more bytes of width done; flags feed the bgt below
	    vst1.32       {d0[0]}, [r6], r3     @ row 2: store, r6 moves to row 3
	    vld1.32       {d0[0]}, [r5], r2     @ row 3: load, r5 ends 4 rows below this column
	    add           r1, r1, #4            @ pu1_dst moves to the next 4 byte column
	    vst1.32       {d0[0]}, [r6], r3     @ row 3: store, r6 ends 4 rows below this column

	    bgt           inner_loop_wd_4       @ more columns left in this band

	    subs          r7, r7, #4            @ four rows finished
	    sub           r0, r5, r11           @ pu1_src = start of the next band (rewind from the last column)
	    sub           r1, r6, r11           @ pu1_dst = start of the next band
	    bgt           outer_loop_wd_4

	end_loops:
	    ldmfd         sp!, {r4-r7, r11, r15} @ restore registers and return (pc <- saved lr)


	core_loop_wd_8:
	    sub           r11, r12, #8          @ r11 = wd - 8, offset of the last 8 byte column in a row

	outer_loop_wd_8:
	    mov           r4, r12               @ r4 = bytes still to copy in this band of four rows (> 0 here)

	inner_loop_wd_8:
	    add           r5, r0, r2            @ r5 = source pointer for row 1 of this column
	    vld1.8        {d0}, [r0]!           @ row 0: load 8 bytes, pu1_src moves to the next column
	    add           r6, r1, r3            @ r6 = destination pointer for row 1 of this column
	    vst1.8        {d0}, [r1]!           @ row 0: store 8 bytes, pu1_dst moves to the next column
	    vld1.8        {d1}, [r5], r2        @ row 1: load
	    vst1.8        {d1}, [r6], r3        @ row 1: store
	    subs          r4, r4, #8            @ 8 more bytes of width done; flags feed the bgt below
	    vld1.8        {d2}, [r5], r2        @ row 2: load
	    vst1.8        {d2}, [r6], r3        @ row 2: store
	    vld1.8        {d3}, [r5], r2        @ row 3: load, r5 ends 4 rows below this column
	    vst1.8        {d3}, [r6], r3        @ row 3: store, r6 ends 4 rows below this column
	    bgt           inner_loop_wd_8       @ more columns left in this band

	    subs          r7, r7, #4            @ four rows finished
	    sub           r0, r5, r11           @ pu1_src = start of the next band
	    sub           r1, r6, r11           @ pu1_dst = start of the next band
	    bgt           outer_loop_wd_8

	    ldmfd         sp!, {r4-r7, r11, r15} @ restore registers and return (pc <- saved lr)

	core_loop_wd_16:
	    sub           r11, r12, #16         @ r11 = wd - 16, offset of the last 16 byte column in a row

	outer_loop_wd_16:
	    mov           r4, r12               @ r4 = bytes still to copy in this band of four rows (> 0 here)

	inner_loop_wd_16:
	    add           r5, r0, r2            @ r5 = source pointer for row 1 of this column
	    vld1.8        {q0}, [r0]!           @ row 0: load 16 bytes, pu1_src moves to the next column
	    add           r6, r1, r3            @ r6 = destination pointer for row 1 of this column
	    vst1.8        {q0}, [r1]!           @ row 0: store 16 bytes, pu1_dst moves to the next column
	    vld1.8        {q1}, [r5], r2        @ row 1: load
	    vst1.8        {q1}, [r6], r3        @ row 1: store
	    subs          r4, r4, #16           @ 16 more bytes of width done; flags feed the bgt below
	    vld1.8        {q2}, [r5], r2        @ row 2: load
	    vst1.8        {q2}, [r6], r3        @ row 2: store
	    vld1.8        {q3}, [r5], r2        @ row 3: load, r5 ends 4 rows below this column
	    vst1.8        {q3}, [r6], r3        @ row 3: store, r6 ends 4 rows below this column
	    bgt           inner_loop_wd_16      @ more columns left in this band

	    subs          r7, r7, #4            @ four rows finished
	    sub           r0, r5, r11           @ pu1_src = start of the next band
	    sub           r1, r6, r11           @ pu1_dst = start of the next band
	    bgt           outer_loop_wd_16

	    ldmfd         sp!, {r4-r7, r11, r15} @ restore registers and return (pc <- saved lr)


	@ *
	@ ********************************************************************************
	@ *
	@ * @brief This function copies a 4x4 block to destination
	@ *
	@ * @par Description:
	@ * Copies a 4x4 block to destination, where both src and dst are interleaved
	@ *
	@ * @param[in] pi2_src
	@ * Source
	@ *
	@ * @param[in] pu1_out
	@ * Output pointer
	@ *
	@ * @param[in] pred_strd,
	@ * Prediction buffer stride
	@ *
	@ * @param[in] out_strd
	@ * output buffer stride
	@ *
	@ * @returns none
	@ *
	@ * @remarks none
	@ * Currently wd and ht are not used, i.e. a 4x4 block is always copied
	@ *
	@ *******************************************************************************
	@ *
	@ void ih264_interleave_copy(WORD16 *pi2_src,
	@ UWORD8 *pu1_out,
	@ WORD32 pred_strd,
	@ WORD32 out_strd,
	@ WORD32 wd,
	@ WORD32 ht)
	@ Register Usage
	@ r0 : pi2_src
	@ r1 : pu1_out
	@ r2 : src_strd
	@ r3 : out_strd
	@ Neon registers d2-d5, d18-d21 and q15 (d30-d31) are used
	@ None of these are callee-saved, so no ARM or NEON registers need to be pushed

	.global ih264_interleave_copy_a9
	ih264_interleave_copy_a9:
	    @ Merges four rows of 8 bytes from the source into the output buffer:
	    @ bytes at even offsets are replaced by the source bytes, bytes at odd
	    @ offsets keep the value already present in the output.
	    @ Modifies r0, r1, q1, q2, q9, q10 and q15 only.

	    vmov.i16      q15, #0x00ff          @ select mask: low byte of every 16-bit lane

	    vld1.8        {d2}, [r0], r2        @ source row 0 -> low half of q1
	    vld1.8        {d3}, [r0], r2        @ source row 1 -> high half of q1
	    vld1.8        {d4}, [r0], r2        @ source row 2 -> low half of q2
	    vld1.8        {d5}, [r0], r2        @ source row 3 -> high half of q2

	    mov           r0, r1                @ source is fully read; r0 becomes the store pointer

	    vld1.8        {d18}, [r1], r3       @ output row 0 -> low half of q9
	    vld1.8        {d19}, [r1], r3       @ output row 1 -> high half of q9
	    vld1.8        {d20}, [r1], r3       @ output row 2 -> low half of q10
	    vld1.8        {d21}, [r1], r3       @ output row 3 -> high half of q10

	    vbit          q9, q1, q15           @ rows 0-1: insert source bits where the mask is set
	    vbit          q10, q2, q15          @ rows 2-3: insert source bits where the mask is set

	    vst1.8        {d18}, [r0], r3       @ write back output row 0
	    vst1.8        {d19}, [r0], r3       @ write back output row 1
	    vst1.8        {d20}, [r0], r3       @ write back output row 2
	    vst1.8        {d21}, [r0], r3       @ write back output row 3

	    bx            lr