// common/armv8/ih264_inter_pred_luma_copy_av8.s - platform/external/libavc - Git at Google

 //******************************************************************************
 //*
 //* Copyright (C) 2015 The Android Open Source Project
 //*
 //* Licensed under the Apache License, Version 2.0 (the "License");
 //* you may not use this file except in compliance with the License.
 //* You may obtain a copy of the License at:
 //*
 //* http://www.apache.org/licenses/LICENSE-2.0
 //*
 //* Unless required by applicable law or agreed to in writing, software
 //* distributed under the License is distributed on an "AS IS" BASIS,
 //* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 //* See the License for the specific language governing permissions and
 //* limitations under the License.
 //*
 //*****************************************************************************
 //* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
 //*/
 ///**
 ///**
 //*******************************************************************************
 //*
 //* @brief
 //*     Interprediction luma function for copy
 //*
 //* @par Description:
 //*   Copies the array of width 'wd' and height 'ht' from the  location pointed
 //*   by 'src' to the location pointed by 'dst'
 //*
 //* @param[in] pu1_src
 //*  UWORD8 pointer to the source
 //*
 //* @param[out] pu1_dst
 //*  UWORD8 pointer to the destination
 //*
 //* @param[in] src_strd
 //*  integer source stride
 //*
 //* @param[in] dst_strd
 //*  integer destination stride
 //*
 //*
 //* @param[in] ht
 //*  integer height of the array
 //*
 //* @param[in] wd
 //*  integer width of the array
 //*
 //* @returns
 //*
 //* @remarks
 //*  None
 //*
 //*******************************************************************************
 //*/
 //void ih264_inter_pred_luma_copy (
 //                            UWORD8 *pu1_src,
 //                            UWORD8 *pu1_dst,
 //                            WORD32 src_strd,
 //                            WORD32 dst_strd,
 //                            WORD32 ht,
 //                            WORD32 wd   )

 //**************Variables Vs Registers*****************************************
 //    x0 => *pu1_src
 //    x1 => *pu1_dst
 //    w2 =>  src_strd
 //    w3 =>  dst_strd
 //    w4 =>  ht
 //    w5 =>  wd

 .text
 .p2align 2
 .include "ih264_neon_macros.s"


     .global ih264_inter_pred_luma_copy_av8

 ih264_inter_pred_luma_copy_av8:
     // Copies a wd x ht block of bytes from pu1_src to pu1_dst.
     //
     // In:   x0 = pu1_src, x1 = pu1_dst, w2 = src_strd, w3 = dst_strd,
     //       w4 = ht, w5 = wd (the four 32-bit values are sign-extended below)
     // Uses: x4  = columns left in the current band of four rows
     //       x5  = source cursor stepping down the band
     //       x6  = destination cursor stepping down the band
     //       x7  = rows left
     //       x11 = wd minus the column step, used to rewind to column 0
     //       x12 = wd
     //       v0, v1, v2, v3, v4, v6 carry the bytes in flight
     //
     // The column step is 16 bytes when wd is a multiple of 16, 8 bytes when
     // it is a multiple of 8, and 4 bytes otherwise. Every pass moves four
     // rows and one full column step, so ht and wd are effectively rounded up
     // to the next multiple of 4. Returns without touching memory when
     // ht <= 0 or wd <= 0.
     //
     // No general-purpose callee-saved register is written, so none is
     // spilled. NOTE(review): push_v_regs / pop_v_regs come from
     // ih264_neon_macros.s, which is not visible here; they are kept as a
     // matched pair around the body.

     push_v_regs
     sxtw      x2, w2                    //src_strd -> 64-bit byte offset
     sxtw      x3, w3                    //dst_strd -> 64-bit byte offset
     sxtw      x7, w4                    //x7 = ht
     sxtw      x12, w5                   //x12 = wd

     cmp       x7, #0
     ble       end_loops                 //ht <= 0: nothing to copy
     cmp       x12, #0
     ble       end_loops                 //wd <= 0: nothing to copy
     tst       x12, #15
     beq       core_loop_wd_16           //wd is a multiple of 16
     tst       x12, #7
     beq       core_loop_wd_8            //wd is a multiple of 8
     sub       x11, x12, #4              //rewind distance for 4-byte columns

 outer_loop_wd_4:
     mov       x4, x12                   //columns left in this band = wd

 inner_loop_wd_4:
     ld1       {v0.s}[0], [x0]           //row 0: load 4 bytes
     add       x5, x0, x2                //x5 -> source row 1
     add       x6, x1, x3                //x6 -> destination row 1
     st1       {v0.s}[0], [x1]           //row 0: store
     ld1       {v0.s}[0], [x5], x2       //row 1: load, x5 -> row 2
     add       x0, x0, #4                //pu1_src += 4 (next column)
     st1       {v0.s}[0], [x6], x3       //row 1: store, x6 -> row 2
     ld1       {v0.s}[0], [x5], x2       //row 2: load, x5 -> row 3
     subs      x4, x4, #4                //columns left -= 4; flags feed the bgt below
     st1       {v0.s}[0], [x6], x3       //row 2: store, x6 -> row 3
     ld1       {v0.s}[0], [x5], x2       //row 3: load, x5 -> row 4
     add       x1, x1, #4                //pu1_dst += 4 (next column)
     st1       {v0.s}[0], [x6], x3       //row 3: store, x6 -> row 4
     bgt       inner_loop_wd_4

     subs      x7, x7, #4                //rows left -= 4
     sub       x0, x5, x11               //pu1_src = row 4, column 0
     sub       x1, x6, x11               //pu1_dst = row 4, column 0
     bgt       outer_loop_wd_4
     b         end_loops


 core_loop_wd_8:
     sub       x11, x12, #8              //rewind distance for 8-byte columns

 outer_loop_wd_8:
     mov       x4, x12                   //columns left in this band = wd

 inner_loop_wd_8:
     add       x5, x0, x2                //x5 -> source row 1 (taken before x0 advances)
     ld1       {v0.8b}, [x0], #8         //row 0: load 8 bytes, pu1_src += 8
     add       x6, x1, x3                //x6 -> destination row 1
     st1       {v0.8b}, [x1], #8         //row 0: store, pu1_dst += 8
     ld1       {v1.8b}, [x5], x2         //row 1: load
     st1       {v1.8b}, [x6], x3         //row 1: store
     subs      x4, x4, #8                //columns left -= 8; flags feed the bgt below
     ld1       {v2.8b}, [x5], x2         //row 2: load
     st1       {v2.8b}, [x6], x3         //row 2: store
     ld1       {v3.8b}, [x5], x2         //row 3: load, x5 -> row 4
     st1       {v3.8b}, [x6], x3         //row 3: store, x6 -> row 4
     bgt       inner_loop_wd_8

     subs      x7, x7, #4                //rows left -= 4
     sub       x0, x5, x11               //pu1_src = row 4, column 0
     sub       x1, x6, x11               //pu1_dst = row 4, column 0
     bgt       outer_loop_wd_8
     b         end_loops


 core_loop_wd_16:
     sub       x11, x12, #16             //rewind distance for 16-byte columns

 outer_loop_wd_16:
     mov       x4, x12                   //columns left in this band = wd

 inner_loop_wd_16:
     add       x5, x0, x2                //x5 -> source row 1 (taken before x0 advances)
     ld1       {v0.16b}, [x0], #16       //row 0: load 16 bytes, pu1_src += 16
     add       x6, x1, x3                //x6 -> destination row 1
     st1       {v0.16b}, [x1], #16       //row 0: store, pu1_dst += 16
     ld1       {v2.16b}, [x5], x2        //row 1: load
     st1       {v2.16b}, [x6], x3        //row 1: store
     subs      x4, x4, #16               //columns left -= 16; flags feed the bgt below
     ld1       {v4.16b}, [x5], x2        //row 2: load
     st1       {v4.16b}, [x6], x3        //row 2: store
     ld1       {v6.16b}, [x5], x2        //row 3: load, x5 -> row 4
     st1       {v6.16b}, [x6], x3        //row 3: store, x6 -> row 4
     bgt       inner_loop_wd_16

     subs      x7, x7, #4                //rows left -= 4
     sub       x0, x5, x11               //pu1_src = row 4, column 0
     sub       x1, x6, x11               //pu1_dst = row 4, column 0
     bgt       outer_loop_wd_16

 end_loops:
     pop_v_regs                          //single exit shared by all three width paths
     ret


 // /*
 // ********************************************************************************
 // *
 // * @brief This function copies a 4x4 block to destination
 // *
 // * @par Description:
 // * Copies a 4x4 block to destination, where both src and dst are interleaved
 // *
 // * @param[in] pi2_src
 // *  Source
 // *
 // * @param[in] pu1_out
 // *  Output pointer
 // *
 // * @param[in] pred_strd,
 // *  Prediction buffer stride
 // *
 // * @param[in] out_strd
 // *  Output buffer stride
 // *
 // * @returns none
 // *
 // * @remarks none
 // * Currently wd and ht are not used, i.e. a 4x4 block is always copied
 // *
 // *******************************************************************************
 // */
 // void ih264_interleave_copy(WORD16 *pi2_src,
 //                            UWORD8 *pu1_out,
 //                            WORD32 pred_strd,
 //                            WORD32 out_strd
 //                            WORD32 wd
 //                            WORD32 ht)
 // Register Usage
 // x0 : pi2_src
 // x1 : pu1_out
 // w2 : src_strd
 // w3 : out_strd
 // Neon registers d0-d7, d16-d30 are used
 // No need for pushing  arm and neon registers

     .global ih264_interleave_copy_av8
 ih264_interleave_copy_av8:
     // Merges a source block into the output block, four rows of 8 bytes each:
     // bytes at even offsets within a row are taken from the source, bytes at
     // odd offsets keep the value already present in the output.
     //
     // In:   x0 = pi2_src, x1 = pu1_out,
     //       w2 = source stride, w3 = output stride; both are sign-extended
     //       and added to the pointers as byte offsets. wd and ht are not read.
     // Uses: x4  = read cursor over the output rows
     //       v2  = source rows 0-1,  v4  = source rows 2-3
     //       v18 = output rows 0-1,  v20 = output rows 2-3
     //       v30 = byte-select mask
     //
     // All eight loads are issued before the first store.
     // NOTE(review): push_v_regs / pop_v_regs come from ih264_neon_macros.s,
     // which is not visible here; they are kept as a matched pair.

     push_v_regs
     sxtw      x2, w2                    //source stride -> 64-bit byte offset
     sxtw      x3, w3                    //output stride -> 64-bit byte offset
     mov       x4, x1                    //read the output through x4, write through x1
     movi      v30.8h, #0x00ff           //0xff in the low byte of every halfword

     ld1       {v2.8b}, [x0], x2         //source row 0 -> low half of v2
     ld1       {v2.d}[1], [x0], x2       //source row 1 -> high half of v2
     ld1       {v4.8b}, [x0], x2         //source row 2 -> low half of v4
     ld1       {v4.d}[1], [x0], x2       //source row 3 -> high half of v4

     ld1       {v18.8b}, [x4], x3        //output row 0 -> low half of v18
     ld1       {v18.d}[1], [x4], x3      //output row 1 -> high half of v18
     ld1       {v20.8b}, [x4], x3        //output row 2 -> low half of v20
     ld1       {v20.d}[1], [x4], x3      //output row 3 -> high half of v20

     bit       v18.16b, v2.16b, v30.16b  //insert source bytes where the mask is set
     bit       v20.16b, v4.16b, v30.16b

     st1       {v18.d}[0], [x1], x3      //write back row 0
     st1       {v18.d}[1], [x1], x3      //write back row 1
     st1       {v20.d}[0], [x1], x3      //write back row 2
     st1       {v20.d}[1], [x1], x3      //write back row 3

     pop_v_regs
     ret
	//******************************************************************************
	//*
	//* Copyright (C) 2015 The Android Open Source Project
	//*
	//* Licensed under the Apache License, Version 2.0 (the "License");
	//* you may not use this file except in compliance with the License.
	//* You may obtain a copy of the License at:
	//*
	//* http://www.apache.org/licenses/LICENSE-2.0
	//*
	//* Unless required by applicable law or agreed to in writing, software
	//* distributed under the License is distributed on an "AS IS" BASIS,
	//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	//* See the License for the specific language governing permissions and
	//* limitations under the License.
	//*
	//*****************************************************************************
	//* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
	//*/
	///**
	///**
	//*******************************************************************************
	//*
	//* @brief
	//* Interprediction luma function for copy
	//*
	//* @par Description:
	//* Copies the array of width 'wd' and height 'ht' from the location pointed
	//* by 'src' to the location pointed by 'dst'
	//*
	//* @param[in] pu1_src
	//* UWORD8 pointer to the source
	//*
	//* @param[out] pu1_dst
	//* UWORD8 pointer to the destination
	//*
	//* @param[in] src_strd
	//* integer source stride
	//*
	//* @param[in] dst_strd
	//* integer destination stride
	//*
	//*
	//* @param[in] ht
	//* integer height of the array
	//*
	//* @param[in] wd
	//* integer width of the array
	//*
	//* @returns
	//*
	//* @remarks
	//* None
	//*
	//*******************************************************************************
	//*/
	//void ih264_inter_pred_luma_copy (
	// UWORD8 *pu1_src,
	// UWORD8 *pu1_dst,
	// WORD32 src_strd,
	// WORD32 dst_strd,
	// WORD32 ht,
	// WORD32 wd )

	//************Variables Vs Registers***************************************
	// x0 => *pu1_src
	// x1 => *pu1_dst
	// w2 => src_strd
	// w3 => dst_strd
	// w4 => ht
	// w5 => wd

	.text
	.p2align 2
	.include "ih264_neon_macros.s"



	.global ih264_inter_pred_luma_copy_av8

	ih264_inter_pred_luma_copy_av8:
	    // NOTE(review): this entry label and the loop labels below are also
	    // defined by the earlier copy of this routine in this file. GNU as
	    // rejects a second definition of the same symbol, so one of the two
	    // copies has to be deleted before the file can assemble.
	    //
	    // Copies a wd x ht block of bytes from pu1_src to pu1_dst.
	    //
	    // In:   x0 = pu1_src, x1 = pu1_dst, w2 = src_strd, w3 = dst_strd,
	    //       w4 = ht, w5 = wd (the four 32-bit values are sign-extended below)
	    // Uses: x4  = columns left in the current band of four rows
	    //       x5  = source cursor stepping down the band
	    //       x6  = destination cursor stepping down the band
	    //       x7  = rows left
	    //       x11 = wd minus the column step, used to rewind to column 0
	    //       x12 = wd
	    //       v0, v1, v2, v3, v4, v6 carry the bytes in flight
	    //
	    // The column step is 16 bytes when wd is a multiple of 16, 8 bytes when
	    // it is a multiple of 8, and 4 bytes otherwise. Every pass moves four
	    // rows and one full column step, so ht and wd are effectively rounded up
	    // to the next multiple of 4. Returns without touching memory when
	    // ht <= 0 or wd <= 0.
	    //
	    // No general-purpose callee-saved register is written, so none is
	    // spilled. NOTE(review): push_v_regs / pop_v_regs come from
	    // ih264_neon_macros.s, which is not visible here; they are kept as a
	    // matched pair around the body.

	    push_v_regs
	    sxtw      x2, w2                    //src_strd -> 64-bit byte offset
	    sxtw      x3, w3                    //dst_strd -> 64-bit byte offset
	    sxtw      x7, w4                    //x7 = ht
	    sxtw      x12, w5                   //x12 = wd

	    cmp       x7, #0
	    ble       end_loops                 //ht <= 0: nothing to copy
	    cmp       x12, #0
	    ble       end_loops                 //wd <= 0: nothing to copy
	    tst       x12, #15
	    beq       core_loop_wd_16           //wd is a multiple of 16
	    tst       x12, #7
	    beq       core_loop_wd_8            //wd is a multiple of 8
	    sub       x11, x12, #4              //rewind distance for 4-byte columns

	outer_loop_wd_4:
	    mov       x4, x12                   //columns left in this band = wd

	inner_loop_wd_4:
	    ld1       {v0.s}[0], [x0]           //row 0: load 4 bytes
	    add       x5, x0, x2                //x5 -> source row 1
	    add       x6, x1, x3                //x6 -> destination row 1
	    st1       {v0.s}[0], [x1]           //row 0: store
	    ld1       {v0.s}[0], [x5], x2       //row 1: load, x5 -> row 2
	    add       x0, x0, #4                //pu1_src += 4 (next column)
	    st1       {v0.s}[0], [x6], x3       //row 1: store, x6 -> row 2
	    ld1       {v0.s}[0], [x5], x2       //row 2: load, x5 -> row 3
	    subs      x4, x4, #4                //columns left -= 4; flags feed the bgt below
	    st1       {v0.s}[0], [x6], x3       //row 2: store, x6 -> row 3
	    ld1       {v0.s}[0], [x5], x2       //row 3: load, x5 -> row 4
	    add       x1, x1, #4                //pu1_dst += 4 (next column)
	    st1       {v0.s}[0], [x6], x3       //row 3: store, x6 -> row 4
	    bgt       inner_loop_wd_4

	    subs      x7, x7, #4                //rows left -= 4
	    sub       x0, x5, x11               //pu1_src = row 4, column 0
	    sub       x1, x6, x11               //pu1_dst = row 4, column 0
	    bgt       outer_loop_wd_4
	    b         end_loops


	core_loop_wd_8:
	    sub       x11, x12, #8              //rewind distance for 8-byte columns

	outer_loop_wd_8:
	    mov       x4, x12                   //columns left in this band = wd

	inner_loop_wd_8:
	    add       x5, x0, x2                //x5 -> source row 1 (taken before x0 advances)
	    ld1       {v0.8b}, [x0], #8         //row 0: load 8 bytes, pu1_src += 8
	    add       x6, x1, x3                //x6 -> destination row 1
	    st1       {v0.8b}, [x1], #8         //row 0: store, pu1_dst += 8
	    ld1       {v1.8b}, [x5], x2         //row 1: load
	    st1       {v1.8b}, [x6], x3         //row 1: store
	    subs      x4, x4, #8                //columns left -= 8; flags feed the bgt below
	    ld1       {v2.8b}, [x5], x2         //row 2: load
	    st1       {v2.8b}, [x6], x3         //row 2: store
	    ld1       {v3.8b}, [x5], x2         //row 3: load, x5 -> row 4
	    st1       {v3.8b}, [x6], x3         //row 3: store, x6 -> row 4
	    bgt       inner_loop_wd_8

	    subs      x7, x7, #4                //rows left -= 4
	    sub       x0, x5, x11               //pu1_src = row 4, column 0
	    sub       x1, x6, x11               //pu1_dst = row 4, column 0
	    bgt       outer_loop_wd_8
	    b         end_loops


	core_loop_wd_16:
	    sub       x11, x12, #16             //rewind distance for 16-byte columns

	outer_loop_wd_16:
	    mov       x4, x12                   //columns left in this band = wd

	inner_loop_wd_16:
	    add       x5, x0, x2                //x5 -> source row 1 (taken before x0 advances)
	    ld1       {v0.16b}, [x0], #16       //row 0: load 16 bytes, pu1_src += 16
	    add       x6, x1, x3                //x6 -> destination row 1
	    st1       {v0.16b}, [x1], #16       //row 0: store, pu1_dst += 16
	    ld1       {v2.16b}, [x5], x2        //row 1: load
	    st1       {v2.16b}, [x6], x3        //row 1: store
	    subs      x4, x4, #16               //columns left -= 16; flags feed the bgt below
	    ld1       {v4.16b}, [x5], x2        //row 2: load
	    st1       {v4.16b}, [x6], x3        //row 2: store
	    ld1       {v6.16b}, [x5], x2        //row 3: load, x5 -> row 4
	    st1       {v6.16b}, [x6], x3        //row 3: store, x6 -> row 4
	    bgt       inner_loop_wd_16

	    subs      x7, x7, #4                //rows left -= 4
	    sub       x0, x5, x11               //pu1_src = row 4, column 0
	    sub       x1, x6, x11               //pu1_dst = row 4, column 0
	    bgt       outer_loop_wd_16

	end_loops:
	    pop_v_regs                          //single exit shared by all three width paths
	    ret


	// /*
	// ********************************************************************************
	// *
	// * @brief This function copies a 4x4 block to destination
	// *
	// * @par Description:
	// * Copies a 4x4 block to destination, where both src and dst are interleaved
	// *
	// * @param[in] pi2_src
	// * Source
	// *
	// * @param[in] pu1_out
	// * Output pointer
	// *
	// * @param[in] pred_strd,
	// * Prediction buffer stride
	// *
	// * @param[in] out_strd
	// * output buffer buffer Stride
	// *
	// * @returns none
	// *
	// * @remarks none
	// * Currently wd and ht are not used, i.e. a 4x4 block is always copied
	// *
	// *******************************************************************************
	// */
	// void ih264_interleave_copy(WORD16 *pi2_src,
	// UWORD8 *pu1_out,
	// WORD32 pred_strd,
	// WORD32 out_strd
	// WORD32 wd
	// WORD32 ht)
	// Register Usage
	// x0 : pi2_src
	// x1 : pu1_out
	// w2 : src_strd
	// w3 : out_strd
	// Neon registers d0-d7, d16-d30 are used
	// No need for pushing arm and neon registers

	.global ih264_interleave_copy_av8
	ih264_interleave_copy_av8:
	    // NOTE(review): this label is also defined by the earlier copy of this
	    // routine in this file. GNU as rejects a second definition of the same
	    // symbol, so one of the two copies has to be deleted before the file
	    // can assemble.
	    //
	    // Merges a source block into the output block, four rows of 8 bytes each:
	    // bytes at even offsets within a row are taken from the source, bytes at
	    // odd offsets keep the value already present in the output.
	    //
	    // In:   x0 = pi2_src, x1 = pu1_out,
	    //       w2 = source stride, w3 = output stride; both are sign-extended
	    //       and added to the pointers as byte offsets. wd and ht are not read.
	    // Uses: x4  = read cursor over the output rows
	    //       v2  = source rows 0-1,  v4  = source rows 2-3
	    //       v18 = output rows 0-1,  v20 = output rows 2-3
	    //       v30 = byte-select mask
	    //
	    // All eight loads are issued before the first store.
	    // NOTE(review): push_v_regs / pop_v_regs come from ih264_neon_macros.s,
	    // which is not visible here; they are kept as a matched pair.

	    push_v_regs
	    sxtw      x2, w2                    //source stride -> 64-bit byte offset
	    sxtw      x3, w3                    //output stride -> 64-bit byte offset
	    mov       x4, x1                    //read the output through x4, write through x1
	    movi      v30.8h, #0x00ff           //0xff in the low byte of every halfword

	    ld1       {v2.8b}, [x0], x2         //source row 0 -> low half of v2
	    ld1       {v2.d}[1], [x0], x2       //source row 1 -> high half of v2
	    ld1       {v4.8b}, [x0], x2         //source row 2 -> low half of v4
	    ld1       {v4.d}[1], [x0], x2       //source row 3 -> high half of v4

	    ld1       {v18.8b}, [x4], x3        //output row 0 -> low half of v18
	    ld1       {v18.d}[1], [x4], x3      //output row 1 -> high half of v18
	    ld1       {v20.8b}, [x4], x3        //output row 2 -> low half of v20
	    ld1       {v20.d}[1], [x4], x3      //output row 3 -> high half of v20

	    bit       v18.16b, v2.16b, v30.16b  //insert source bytes where the mask is set
	    bit       v20.16b, v4.16b, v30.16b

	    st1       {v18.d}[0], [x1], x3      //write back row 0
	    st1       {v18.d}[1], [x1], x3      //write back row 1
	    st1       {v20.d}[0], [x1], x3      //write back row 2
	    st1       {v20.d}[1], [x1], x3      //write back row 3

	    pop_v_regs
	    ret