// common/arm64/ihevc_intra_pred_chroma_dc.s - platform/external/libhevc - Git at Google

 ///*****************************************************************************
 //*
 //* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
 //*
 //* Licensed under the Apache License, Version 2.0 (the "License");
 //* you may not use this file except in compliance with the License.
 //* You may obtain a copy of the License at:
 //*
 //* http://www.apache.org/licenses/LICENSE-2.0
 //*
 //* Unless required by applicable law or agreed to in writing, software
 //* distributed under the License is distributed on an "AS IS" BASIS,
 //* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 //* See the License for the specific language governing permissions and
 //* limitations under the License.
 //*
 //*****************************************************************************/
 ///**
 //*******************************************************************************
 //* @file
 //*  ihevc_intra_pred_chroma_dc_neon.s
 //*
 //* @brief
 //*  contains function definitions for chroma intra prediction dc filtering.
 //*  functions are coded in neon assembly (armv8 / aarch64, gnu assembler
 //*  syntax)
 //*
 //* @author
 //*  yogeswaran rs
 //*
 //* @par list of functions:
 //*
 //*
 //* @remarks
 //*  none
 //*
 //*******************************************************************************
 //*/
 ///**
 //*******************************************************************************
 //*
 //* @brief
 //*    chroma intra prediction filter for dc input
 //*
 //* @par description:
 //*
 //* @param[in] pu1_ref
 //*  uword8 pointer to the source
 //*
 //* @param[out] pu1_dst
 //*  uword8 pointer to the destination
 //*
 //* @param[in] src_strd
 //*  integer source stride
 //*
 //* @param[in] dst_strd
 //*  integer destination stride
 //*
 //* @param[in] pi1_coeff
 //*  word8 pointer to the planar coefficients
 //*
 //* @param[in] nt
 //*  size of transform block
 //*
 //* @param[in] mode
 //*  type of filtering
 //*
 //* @returns
 //*
 //* @remarks
 //*  none
 //*
 //*******************************************************************************
 //*/

 //void ihevc_intra_pred_chroma_dc(uword8 *pu1_ref,
 //                                word32 src_strd,
 //                                uword8 *pu1_dst,
 //                                word32 dst_strd,
 //                                word32 nt,
 //                                word32 mode)
 //
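 //**************variables vs registers*****************************************
 //x0 => *pu1_ref
 //x1 => src_strd (not read; x1 is reused as scratch)
 //x2 => *pu1_dst
 //x3 => dst_strd
 //x4 => nt
 //x5 => mode (not read; x5 is overwritten before any use)

 //note: the code below takes nt from x4/w4 and loads no argument from the
 //stack; the former "stack contents from #40: nt, mode, pi1_coeff" list
 //appears to be left over from an earlier 32-bit version - TODO confirm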

 .text
 .align 4
 .include "ihevc_neon_macros.s"


 .globl ihevc_intra_pred_chroma_dc_av8

 .type ihevc_intra_pred_chroma_dc_av8, %function

 //-----------------------------------------------------------------------------
 // ihevc_intra_pred_chroma_dc_av8
 //
 // Purpose:
 //   Fills an nt x nt block of byte pairs at pu1_dst with a constant (DC)
 //   prediction. The two bytes of a pair are handled independently: all
 //   even-offset reference bytes feed one rounded average, all odd-offset
 //   reference bytes feed the other.
 //
 // In:
 //   x0 = pu1_ref   reference bytes. 2*nt bytes are summed starting at
 //                  pu1_ref + 2*nt and 2*nt more starting at
 //                  pu1_ref + 4*nt + 2 (the nt == 4 path loads 16 bytes from
 //                  each address but only sums the first 8)
 //   x1 = src_strd  not read
 //   x2 = pu1_dst   destination, rows are dst_strd bytes apart
 //   x3 = dst_strd  32-bit signed value, sign-extended below
 //   x4 = nt        32-bit signed value, sign-extended below; the code paths
 //                  cover nt == 4, nt == 8 and nt == 16
 //   x5 = mode      not read (x5 is used as scratch)
 //
 // Out:
 //   nt rows of 2*nt bytes written at pu1_dst; x0 is not modified
 //
 // Clobbers:
 //   x1, x3-x12, v0, v2, v3, v16-v18, v26-v31, flags
 //   x19/x20 are saved and restored here; push_v_regs/pop_v_regs are not
 //   defined in this file (presumably in the included ihevc_neon_macros.s)
 //-----------------------------------------------------------------------------
 ihevc_intra_pred_chroma_dc_av8:

     push_v_regs
     stp         x19, x20,[sp,#-16]!         //x20 is used as scratch below

     //dst_strd and nt are 32-bit ints: AAPCS64 does not require the caller to
     //extend them, so bits 63:32 of x3/x4 may hold anything on entry
     sxtw        x3, w3
     sxtw        x4, w4

     mov         x9, #0
     mov         v17.s[0], w9
     mov         v17.s[1], w9                //v17.d[0] = 0, sum of even-offset bytes

     clz         w5,w4                       //counts leading zeros

     add         x6, x0, x4,lsl #1           //x6 = pu1_ref + 2*nt
     mov         v18.s[0], w9
     mov         v18.s[1], w9                //v18.d[0] = 0, sum of odd-offset bytes
     sub         x20, x5, #32
     neg         x5, x20                     //x5 = 32 - clz(nt) (log2(nt) + 1 for a power of two)
     add         x7, x0, x4, lsl #2          //x7 = pu1_ref + 4*nt
     mov         x12,x5                      //x12 = right-shift applied to the sums
     add         x8, x7, #2                  //x8 = pu1_ref + 4*nt + 2

     cmp         x4, #4
     beq         dc_4                        //nt=4 has its own path


 add_loop:
     //ld2 de-interleaves: first register gets even-offset bytes, second odd
     ld2         {v30.8b, v31.8b}, [x6], #16 //16 bytes from pu1_ref + 2*nt
     lsl         x10,x4,#1                   //2nt

     uaddlp      v2.4h,  v30.8b
     subs        x10, x10,#0x10              //Z set when 2*nt == 16 (nt == 8)

     ld2         {v26.8b, v27.8b}, [x8],#16  //16 bytes from pu1_ref + 4*nt + 2

     uaddlp      v3.4h,  v31.8b
     uaddlp      v2.2s,  v2.4h
     uaddlp      v3.2s,  v3.4h

     uadalp      v17.1d,  v2.2s              //v17 += sum of the 8 bytes in v30

     uadalp      v18.1d,  v3.2s              //v18 += sum of the 8 bytes in v31

     uaddlp      v2.4h,  v26.8b
     uaddlp      v3.4h,  v27.8b

     uaddlp      v2.2s,  v2.4h
     uaddlp      v3.2s,  v3.4h

     uadalp      v17.1d,  v2.2s              //v17 += sum of the 8 bytes in v26
     uadalp      v18.1d,  v3.2s              //v18 += sum of the 8 bytes in v27

     beq         epil_add_loop               //flags are still those of the subs above

 core_loop_add:
     //straight-line code (no branch back): adds 16 more bytes from each address
     ld2         {v30.8b, v31.8b}, [x6],#16
     uaddlp      v28.4h,  v30.8b
     uaddlp      v3.4h,  v31.8b

     ld2         {v26.8b, v27.8b}, [x8],#16

     uaddlp      v3.2s,  v3.4h
     uaddlp      v29.2s,  v28.4h

     uadalp      v18.1d,  v3.2s
     uadalp      v17.1d,  v29.2s

     uaddlp      v3.4h,  v27.8b
     uaddlp      v28.4h,  v26.8b

     uaddlp      v3.2s,  v3.4h
     uaddlp      v29.2s,  v28.4h

     uadalp      v18.1d,  v3.2s
     uadalp      v17.1d,  v29.2s


 epil_add_loop:

     smov        x1, v18.s[0]                //x1  = odd-offset sum
     smov        x11, v17.s[0]               //x11 = even-offset sum

     add         x1,x1,x4                    //+ nt: rounding term for the shift
     add         x11,x11,x4

     lsr         x1,x1,x12
     lsr         x11,x11,x12

     dup         v17.8b,w1                   //v17 = odd-offset dc in every byte
     dup         v16.8b,w11                  //v16 = even-offset dc in every byte

 prologue_cpy_32:
     //x2, x5, x8, x10 point at four consecutive rows; each st2 writes 16
     //bytes with v16/v17 re-interleaved

     add         x5, x2, x3                  //row 1
     subs        x9, x4, #8                  //Z set when nt == 8
     lsl         x6, x3, #2                  //x6 = 4*dst_strd
     csel        x11, x6, x11,eq             //nt == 8: x11 = 4*dst_strd
     add         x8, x5, x3                  //row 2
     add         x10, x8, x3                 //row 3

     beq         epilogue_copy               //nt == 8: two groups of four rows suffice

     st2         {v16.8b, v17.8b}, [x2],#16  //rows 0-3, first 16 bytes
     sub         x6, x6, #16                 //x6 = 4*dst_strd - 16: mid-row to row start 4 rows down

     st2         {v16.8b, v17.8b}, [x5],#16
     st2         {v16.8b, v17.8b}, [x8],#16
     mov         x20,#16
     csel        x11, x20, x11,ne            //flags unchanged since the subs: x11 = 16 on this path
     st2         {v16.8b, v17.8b}, [x10],#16


     st2         {v16.8b, v17.8b}, [x2], x6  //rows 0-3, second 16 bytes
     st2         {v16.8b, v17.8b}, [x5], x6
     st2         {v16.8b, v17.8b}, [x8], x6
     st2         {v16.8b, v17.8b}, [x10], x6

 kernel_copy:
     st2         {v16.8b, v17.8b}, [x2],#16  //next four rows, first 16 bytes
     st2         {v16.8b, v17.8b}, [x5],#16
     st2         {v16.8b, v17.8b}, [x8],#16
     st2         {v16.8b, v17.8b}, [x10],#16

     st2         {v16.8b, v17.8b}, [x2], x6  //same rows, second 16 bytes
     st2         {v16.8b, v17.8b}, [x5], x6
     st2         {v16.8b, v17.8b}, [x8], x6
     st2         {v16.8b, v17.8b}, [x10], x6

     st2         {v16.8b, v17.8b}, [x2],#16  //next four rows, first 16 bytes
     st2         {v16.8b, v17.8b}, [x5],#16
     st2         {v16.8b, v17.8b}, [x8],#16
     st2         {v16.8b, v17.8b}, [x10],#16

     st2         {v16.8b, v17.8b}, [x2], x6  //same rows, second 16 bytes
     st2         {v16.8b, v17.8b}, [x5], x6
     st2         {v16.8b, v17.8b}, [x8], x6
     st2         {v16.8b, v17.8b}, [x10], x6

 epilogue_copy:
     //x11 = 4*dst_strd when nt == 8 (step to the next four rows),
     //x11 = 16 otherwise (step to the second half of the same rows)
     st2         {v16.8b, v17.8b}, [x2],x11
     st2         {v16.8b, v17.8b}, [x5],x11
     st2         {v16.8b, v17.8b}, [x8],x11
     st2         {v16.8b, v17.8b}, [x10],x11

     st2         {v16.8b, v17.8b}, [x2]
     st2         {v16.8b, v17.8b}, [x5]
     st2         {v16.8b, v17.8b}, [x8]
     st2         {v16.8b, v17.8b}, [x10]
     b           end_func

 dc_4:
     //each ld2 reads 16 bytes; shl #32 keeps only the low 4 bytes of a
     //de-interleaved register (moved to the upper half, lower half zero)
     ld2         {v30.8b, v31.8b},[x6]       //from pu1_ref + 2*nt
     shl         d3, d30,#32                 //even-offset bytes

     ld2         {v26.8b, v27.8b},[x8]       //from pu1_ref + 4*nt + 2
     shl         d2, d31,#32                 //odd-offset bytes

     uaddlp      v3.4h,  v3.8b
     uaddlp      v2.4h,  v2.8b
     uaddlp      v3.2s,  v3.4h
     uaddlp      v2.2s,  v2.4h
     uadalp      v17.1d,  v3.2s              //v17 += even-offset sum
     uadalp      v18.1d,  v2.2s              //v18 += odd-offset sum

     shl         d3, d26,#32
     shl         d2, d27,#32
     uaddlp      v3.4h,  v3.8b
     uaddlp      v2.4h,  v2.8b
     uaddlp      v3.2s,  v3.4h
     uaddlp      v2.2s,  v2.4h
     uadalp      v17.1d,  v3.2s
     uadalp      v18.1d,  v2.2s

     smov        x10, v17.s[0]               //x10 = even-offset sum
     smov        x11, v18.s[0]               //x11 = odd-offset sum

     add         x10,x10,x4                  //+ nt: rounding term for the shift
     add         x11,x11,x4
     lsr         x10,x10,x12
     lsr         x11,x11,x12
     orr         x10,x10,x11,lsl #8          //halfword: even dc in the low byte, odd dc in the high byte
     dup         v0.4h,w10                   //four identical byte pairs

     st1         {v0.8b},[x2],x3             //8 bytes per row, 4 rows
     st1         {v0.8b},[x2],x3
     st1         {v0.8b},[x2],x3
     st1         {v0.8b},[x2]

 end_func:
     ldp         x19, x20,[sp],#16
     pop_v_regs
     ret
	///*****************************************************************************
	//*
	//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
	//*
	//* Licensed under the Apache License, Version 2.0 (the "License");
	//* you may not use this file except in compliance with the License.
	//* You may obtain a copy of the License at:
	//*
	//* http://www.apache.org/licenses/LICENSE-2.0
	//*
	//* Unless required by applicable law or agreed to in writing, software
	//* distributed under the License is distributed on an "AS IS" BASIS,
	//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	//* See the License for the specific language governing permissions and
	//* limitations under the License.
	//*
	//*****************************************************************************/
	///**
	//*******************************************************************************
	//* @file
	//* ihevc_intra_pred_chroma_dc_neon.s
	//*
	//* @brief
	//* contains function definitions for chroma intra prediction dc filtering.
	//* functions are coded in neon assembly (armv8 / aarch64, gnu assembler
	//* syntax)
	//*
	//* @author
	//* yogeswaran rs
	//*
	//* @par list of functions:
	//*
	//*
	//* @remarks
	//* none
	//*
	//*******************************************************************************
	//*/
	///**
	//*******************************************************************************
	//*
	//* @brief
	//* chroma intra prediction filter for dc input
	//*
	//* @par description:
	//*
	//* @param[in] pu1_ref
	//* uword8 pointer to the source
	//*
	//* @param[out] pu1_dst
	//* uword8 pointer to the destination
	//*
	//* @param[in] src_strd
	//* integer source stride
	//*
	//* @param[in] dst_strd
	//* integer destination stride
	//*
	//* @param[in] pi1_coeff
	//* word8 pointer to the planar coefficients
	//*
	//* @param[in] nt
	//* size of transform block
	//*
	//* @param[in] mode
	//* type of filtering
	//*
	//* @returns
	//*
	//* @remarks
	//* none
	//*
	//*******************************************************************************
	//*/

	//void ihevc_intra_pred_chroma_dc(uword8 *pu1_ref,
	// word32 src_strd,
	// uword8 *pu1_dst,
	// word32 dst_strd,
	// word32 nt,
	// word32 mode)
	//
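	//************variables vs registers***************************************
	//x0 => *pu1_ref
	//x1 => src_strd (not read; x1 is reused as scratch)
	//x2 => *pu1_dst
	//x3 => dst_strd
	//x4 => nt
	//x5 => mode (not read; x5 is overwritten before any use)

	//note: the code below takes nt from x4/w4 and loads no argument from the
	//stack; the former "stack contents from #40: nt, mode, pi1_coeff" list
	//appears to be left over from an earlier 32-bit version - TODO confirm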

	.text
	.align 4
	.include "ihevc_neon_macros.s"



	.globl ihevc_intra_pred_chroma_dc_av8

	.type ihevc_intra_pred_chroma_dc_av8, %function

	//-----------------------------------------------------------------------------
	// ihevc_intra_pred_chroma_dc_av8
	//
	// NOTE(review): an identical definition of this symbol and of every label
	// below appears earlier in this file; assembling both copies together would
	// produce duplicate symbol definitions - confirm which copy is to be kept.
	//
	// Purpose:
	//   Fills an nt x nt block of byte pairs at pu1_dst with a constant (DC)
	//   prediction. The two bytes of a pair are handled independently: all
	//   even-offset reference bytes feed one rounded average, all odd-offset
	//   reference bytes feed the other.
	//
	// In:
	//   x0 = pu1_ref   reference bytes. 2*nt bytes are summed starting at
	//                  pu1_ref + 2*nt and 2*nt more starting at
	//                  pu1_ref + 4*nt + 2 (the nt == 4 path loads 16 bytes from
	//                  each address but only sums the first 8)
	//   x1 = src_strd  not read
	//   x2 = pu1_dst   destination, rows are dst_strd bytes apart
	//   x3 = dst_strd  32-bit signed value, sign-extended below
	//   x4 = nt        32-bit signed value, sign-extended below; the code paths
	//                  cover nt == 4, nt == 8 and nt == 16
	//   x5 = mode      not read (x5 is used as scratch)
	//
	// Out:
	//   nt rows of 2*nt bytes written at pu1_dst; x0 is not modified
	//
	// Clobbers:
	//   x1, x3-x12, v0, v2, v3, v16-v18, v26-v31, flags
	//   x19/x20 are saved and restored here; push_v_regs/pop_v_regs are not
	//   defined in this file (presumably in the included ihevc_neon_macros.s)
	//-----------------------------------------------------------------------------
	ihevc_intra_pred_chroma_dc_av8:

	    push_v_regs
	    stp         x19, x20,[sp,#-16]!         //x20 is used as scratch below

	    //dst_strd and nt are 32-bit ints: AAPCS64 does not require the caller to
	    //extend them, so bits 63:32 of x3/x4 may hold anything on entry
	    sxtw        x3, w3
	    sxtw        x4, w4

	    mov         x9, #0
	    mov         v17.s[0], w9
	    mov         v17.s[1], w9                //v17.d[0] = 0, sum of even-offset bytes

	    clz         w5,w4                       //counts leading zeros

	    add         x6, x0, x4,lsl #1           //x6 = pu1_ref + 2*nt
	    mov         v18.s[0], w9
	    mov         v18.s[1], w9                //v18.d[0] = 0, sum of odd-offset bytes
	    sub         x20, x5, #32
	    neg         x5, x20                     //x5 = 32 - clz(nt) (log2(nt) + 1 for a power of two)
	    add         x7, x0, x4, lsl #2          //x7 = pu1_ref + 4*nt
	    mov         x12,x5                      //x12 = right-shift applied to the sums
	    add         x8, x7, #2                  //x8 = pu1_ref + 4*nt + 2

	    cmp         x4, #4
	    beq         dc_4                        //nt=4 has its own path


	add_loop:
	    //ld2 de-interleaves: first register gets even-offset bytes, second odd
	    ld2         {v30.8b, v31.8b}, [x6], #16 //16 bytes from pu1_ref + 2*nt
	    lsl         x10,x4,#1                   //2nt

	    uaddlp      v2.4h,  v30.8b
	    subs        x10, x10,#0x10              //Z set when 2*nt == 16 (nt == 8)

	    ld2         {v26.8b, v27.8b}, [x8],#16  //16 bytes from pu1_ref + 4*nt + 2

	    uaddlp      v3.4h,  v31.8b
	    uaddlp      v2.2s,  v2.4h
	    uaddlp      v3.2s,  v3.4h

	    uadalp      v17.1d,  v2.2s              //v17 += sum of the 8 bytes in v30

	    uadalp      v18.1d,  v3.2s              //v18 += sum of the 8 bytes in v31

	    uaddlp      v2.4h,  v26.8b
	    uaddlp      v3.4h,  v27.8b

	    uaddlp      v2.2s,  v2.4h
	    uaddlp      v3.2s,  v3.4h

	    uadalp      v17.1d,  v2.2s              //v17 += sum of the 8 bytes in v26
	    uadalp      v18.1d,  v3.2s              //v18 += sum of the 8 bytes in v27

	    beq         epil_add_loop               //flags are still those of the subs above

	core_loop_add:
	    //straight-line code (no branch back): adds 16 more bytes from each address
	    ld2         {v30.8b, v31.8b}, [x6],#16
	    uaddlp      v28.4h,  v30.8b
	    uaddlp      v3.4h,  v31.8b

	    ld2         {v26.8b, v27.8b}, [x8],#16

	    uaddlp      v3.2s,  v3.4h
	    uaddlp      v29.2s,  v28.4h

	    uadalp      v18.1d,  v3.2s
	    uadalp      v17.1d,  v29.2s

	    uaddlp      v3.4h,  v27.8b
	    uaddlp      v28.4h,  v26.8b

	    uaddlp      v3.2s,  v3.4h
	    uaddlp      v29.2s,  v28.4h

	    uadalp      v18.1d,  v3.2s
	    uadalp      v17.1d,  v29.2s


	epil_add_loop:

	    smov        x1, v18.s[0]                //x1  = odd-offset sum
	    smov        x11, v17.s[0]               //x11 = even-offset sum

	    add         x1,x1,x4                    //+ nt: rounding term for the shift
	    add         x11,x11,x4

	    lsr         x1,x1,x12
	    lsr         x11,x11,x12

	    dup         v17.8b,w1                   //v17 = odd-offset dc in every byte
	    dup         v16.8b,w11                  //v16 = even-offset dc in every byte

	prologue_cpy_32:
	    //x2, x5, x8, x10 point at four consecutive rows; each st2 writes 16
	    //bytes with v16/v17 re-interleaved

	    add         x5, x2, x3                  //row 1
	    subs        x9, x4, #8                  //Z set when nt == 8
	    lsl         x6, x3, #2                  //x6 = 4*dst_strd
	    csel        x11, x6, x11,eq             //nt == 8: x11 = 4*dst_strd
	    add         x8, x5, x3                  //row 2
	    add         x10, x8, x3                 //row 3

	    beq         epilogue_copy               //nt == 8: two groups of four rows suffice

	    st2         {v16.8b, v17.8b}, [x2],#16  //rows 0-3, first 16 bytes
	    sub         x6, x6, #16                 //x6 = 4*dst_strd - 16: mid-row to row start 4 rows down

	    st2         {v16.8b, v17.8b}, [x5],#16
	    st2         {v16.8b, v17.8b}, [x8],#16
	    mov         x20,#16
	    csel        x11, x20, x11,ne            //flags unchanged since the subs: x11 = 16 on this path
	    st2         {v16.8b, v17.8b}, [x10],#16


	    st2         {v16.8b, v17.8b}, [x2], x6  //rows 0-3, second 16 bytes
	    st2         {v16.8b, v17.8b}, [x5], x6
	    st2         {v16.8b, v17.8b}, [x8], x6
	    st2         {v16.8b, v17.8b}, [x10], x6

	kernel_copy:
	    st2         {v16.8b, v17.8b}, [x2],#16  //next four rows, first 16 bytes
	    st2         {v16.8b, v17.8b}, [x5],#16
	    st2         {v16.8b, v17.8b}, [x8],#16
	    st2         {v16.8b, v17.8b}, [x10],#16

	    st2         {v16.8b, v17.8b}, [x2], x6  //same rows, second 16 bytes
	    st2         {v16.8b, v17.8b}, [x5], x6
	    st2         {v16.8b, v17.8b}, [x8], x6
	    st2         {v16.8b, v17.8b}, [x10], x6

	    st2         {v16.8b, v17.8b}, [x2],#16  //next four rows, first 16 bytes
	    st2         {v16.8b, v17.8b}, [x5],#16
	    st2         {v16.8b, v17.8b}, [x8],#16
	    st2         {v16.8b, v17.8b}, [x10],#16

	    st2         {v16.8b, v17.8b}, [x2], x6  //same rows, second 16 bytes
	    st2         {v16.8b, v17.8b}, [x5], x6
	    st2         {v16.8b, v17.8b}, [x8], x6
	    st2         {v16.8b, v17.8b}, [x10], x6

	epilogue_copy:
	    //x11 = 4*dst_strd when nt == 8 (step to the next four rows),
	    //x11 = 16 otherwise (step to the second half of the same rows)
	    st2         {v16.8b, v17.8b}, [x2],x11
	    st2         {v16.8b, v17.8b}, [x5],x11
	    st2         {v16.8b, v17.8b}, [x8],x11
	    st2         {v16.8b, v17.8b}, [x10],x11

	    st2         {v16.8b, v17.8b}, [x2]
	    st2         {v16.8b, v17.8b}, [x5]
	    st2         {v16.8b, v17.8b}, [x8]
	    st2         {v16.8b, v17.8b}, [x10]
	    b           end_func

	dc_4:
	    //each ld2 reads 16 bytes; shl #32 keeps only the low 4 bytes of a
	    //de-interleaved register (moved to the upper half, lower half zero)
	    ld2         {v30.8b, v31.8b},[x6]       //from pu1_ref + 2*nt
	    shl         d3, d30,#32                 //even-offset bytes

	    ld2         {v26.8b, v27.8b},[x8]       //from pu1_ref + 4*nt + 2
	    shl         d2, d31,#32                 //odd-offset bytes

	    uaddlp      v3.4h,  v3.8b
	    uaddlp      v2.4h,  v2.8b
	    uaddlp      v3.2s,  v3.4h
	    uaddlp      v2.2s,  v2.4h
	    uadalp      v17.1d,  v3.2s              //v17 += even-offset sum
	    uadalp      v18.1d,  v2.2s              //v18 += odd-offset sum

	    shl         d3, d26,#32
	    shl         d2, d27,#32
	    uaddlp      v3.4h,  v3.8b
	    uaddlp      v2.4h,  v2.8b
	    uaddlp      v3.2s,  v3.4h
	    uaddlp      v2.2s,  v2.4h
	    uadalp      v17.1d,  v3.2s
	    uadalp      v18.1d,  v2.2s

	    smov        x10, v17.s[0]               //x10 = even-offset sum
	    smov        x11, v18.s[0]               //x11 = odd-offset sum

	    add         x10,x10,x4                  //+ nt: rounding term for the shift
	    add         x11,x11,x4
	    lsr         x10,x10,x12
	    lsr         x11,x11,x12
	    orr         x10,x10,x11,lsl #8          //halfword: even dc in the low byte, odd dc in the high byte
	    dup         v0.4h,w10                   //four identical byte pairs

	    st1         {v0.8b},[x2],x3             //8 bytes per row, 4 rows
	    st1         {v0.8b},[x2],x3
	    st1         {v0.8b},[x2],x3
	    st1         {v0.8b},[x2]

	end_func:
	    ldp         x19, x20,[sp],#16
	    pop_v_regs
	    ret