common/arm64/ihevc_intra_pred_luma_horz.s - platform/external/libhevc - Git at Google

 ///*****************************************************************************
 //*
 //* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
 //*
 //* Licensed under the Apache License, Version 2.0 (the "License");
 //* you may not use this file except in compliance with the License.
 //* You may obtain a copy of the License at:
 //*
 //* http://www.apache.org/licenses/LICENSE-2.0
 //*
 //* Unless required by applicable law or agreed to in writing, software
 //* distributed under the License is distributed on an "AS IS" BASIS,
 //* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 //* See the License for the specific language governing permissions and
 //* limitations under the License.
 //*
 //*****************************************************************************/
 ///**
 //*******************************************************************************
 //* @file
 //*  ihevc_intra_pred_luma_horz_neon.s
 //*
 //* @brief
 //*  contains function definition for intra prediction  interpolation filters
 //*
 //*
 //* @author
 //*  parthiban v
 //*
 //* @par list of functions:
 //*  - ihevc_intra_pred_luma_horz()
 //*
 //* @remarks
 //*  none
 //*
 //*******************************************************************************
 //*/
 //
 ///**
 //*******************************************************************************
 //*
 //* @brief
 //*     intra prediction interpolation filter for horizontal luma variable.
 //*
 //* @par description:
 //*      horizontal intraprediction(mode 10) with.extern  samples location
 //*      pointed by 'pu1_ref' to the tu block  location pointed by 'pu1_dst'  refer
 //*      to section 8.4.4.2.6 in the standard (special case)
 //*
 //* @param[in] pu1_src
 //*  uword8 pointer to the source
 //*
 //* @param[out] pu1_dst
 //*  uword8 pointer to the destination
 //*
 //* @param[in] src_strd
 //*  integer source stride
 //*
 //* @param[in] dst_strd
 //*  integer destination stride
 //*
 //* @param[in] nt
 //*  integer transform block size
 //*
 //* @param[in] mode
 //*  integer intraprediction mode
 //*
 //* @returns
 //*
 //* @remarks
 //*  none
 //*
 //*******************************************************************************
 //*/
 //void ihevc_intra_pred_luma_horz(uword8 *pu1_ref,
 //                                word32 src_strd,
 //                                uword8 *pu1_dst,
 //                                word32 dst_strd,
 //                                word32 nt,
 //                                word32 mode)
 //**************variables vs registers*****************************************
 //x0 => *pu1_ref
 //x1 =>  src_strd
 //x2 => *pu1_dst
 //x3 =>  dst_strd

 .text
 .align 4
 .include "ihevc_neon_macros.s"


 .globl ihevc_intra_pred_luma_horz_av8

 .type ihevc_intra_pred_luma_horz_av8, %function

 ihevc_intra_pred_luma_horz_av8:

     // stmfd sp!, {x4-x12, x14}                //stack stores the values of the arguments

     stp         x19, x20,[sp,#-16]!

     //ldr          x5,[sp,#44]                        @loads mode

     lsl         x6,x4,#1                    //two_nt

     add         x12,x0,x6                   //*pu1_ref[two_nt]
     cmp         x4,#4                       //if nt == 4
     beq         core_loop_4

     cmp         x4,#8                       //if nt == 8
     beq         core_loop_8

     cmp         x4,#16                      //if nt == 16
     beq         core_loop_16
     sub         x12,x12,#16                 //move to 16th value pointer
     add         x9,x2,#16

 core_loop_32:
     ld1         { v0.16b},[x12]             //load 16 values. d1[7] will have the 1st value.

     dup         v2.16b, v0.b[15]            //duplicate the i value.

     dup         v4.16b, v0.b[14]            //duplicate the ii value.
     dup         v6.16b, v0.b[13]            //duplicate the iii value.
     st1         { v2.16b},[x2],x3           //store in 1st row 0-16 columns
     st1         { v2.16b},[x9],x3           //store in 1st row 16-32 columns

     dup         v1.16b, v0.b[12]
     st1         { v4.16b},[x2],x3
     st1         { v4.16b},[x9],x3

     dup         v2.16b, v0.b[11]
     st1         { v6.16b},[x2],x3
     st1         { v6.16b},[x9],x3

     dup         v4.16b, v0.b[10]
     st1         { v1.16b},[x2],x3
     st1         { v1.16b},[x9],x3

     dup         v6.16b, v0.b[9]
     st1         { v2.16b},[x2],x3
     st1         { v2.16b},[x9],x3

     dup         v1.16b, v0.b[8]
     st1         { v4.16b},[x2],x3
     st1         { v4.16b},[x9],x3

     dup         v2.16b, v0.b[7]
     st1         { v6.16b},[x2],x3
     st1         { v6.16b},[x9],x3

     dup         v4.16b, v0.b[6]
     st1         { v1.16b},[x2],x3
     st1         { v1.16b},[x9],x3

     dup         v6.16b, v0.b[5]
     st1         { v2.16b},[x2],x3
     st1         { v2.16b},[x9],x3

     dup         v1.16b, v0.b[4]
     st1         { v4.16b},[x2],x3
     st1         { v4.16b},[x9],x3

     dup         v2.16b, v0.b[3]
     st1         { v6.16b},[x2],x3
     st1         { v6.16b},[x9],x3

     dup         v4.16b, v0.b[2]
     st1         { v1.16b},[x2],x3
     st1         { v1.16b},[x9],x3

     dup         v6.16b, v0.b[1]
     st1         { v2.16b},[x2],x3
     st1         { v2.16b},[x9],x3
     sub         x12,x12,#16                 //move to 16th value pointer

     dup         v1.16b, v0.b[0]
     st1         { v4.16b},[x2],x3
     st1         { v4.16b},[x9],x3

     subs        x4,x4,#16                   //decrement the loop count by 16
     st1         { v6.16b},[x2],x3
     st1         { v6.16b},[x9],x3

     st1         { v1.16b},[x2],x3
     st1         { v1.16b},[x9],x3
     bgt         core_loop_32
     // ldmfd sp!,{x4-x12,x15}                  //reload the registers from sp
     ldp         x19, x20,[sp],#16

     ret
     b           end_func

 core_loop_16:
     ldrb        w14,[x12],#1                //pu1_ref[two_nt]
     sxtw        x14,w14
     ld1         { v30.8b},[x12],#8          //pu1_ref[two_nt + 1 + col]
     ld1         { v31.8b},[x12]             //pu1_ref[two_nt + 1 + col]
     sub         x12,x12,#8

     dup         v28.8b,w14
     sub         x12,x12,#17
     ld1         { v0.16b},[x12]
     dup         v26.8b, v0.b[15]
     uxtl        v26.8h, v26.8b

     dup         v2.16b, v0.b[14]
     usubl       v24.8h, v30.8b, v28.8b

     dup         v4.16b, v0.b[13]
     sshr        v24.8h, v24.8h,#1

     dup         v6.16b, v0.b[12]
     sqadd       v22.8h,  v26.8h ,  v24.8h

     dup         v1.16b, v0.b[11]
     sqxtun      v22.8b, v22.8h

     st1         {v22.8b},[x2],#8

     dup         v18.16b, v0.b[10]
     usubl       v24.8h, v31.8b, v28.8b

     dup         v19.16b, v0.b[9]
     sshr        v24.8h, v24.8h,#1

     dup         v20.16b, v0.b[8]
     sqadd       v22.8h,  v26.8h ,  v24.8h

     dup         v16.16b, v0.b[7]
     sqxtun      v22.8b, v22.8h

     st1         {v22.8b},[x2],x3
     sub         x2,x2,#8

     st1         { v2.16b},[x2],x3

     st1         { v4.16b},[x2],x3
     st1         { v6.16b},[x2],x3
     st1         { v1.16b},[x2],x3

     dup         v2.16b, v0.b[6]
     st1         { v18.16b},[x2],x3

     dup         v4.16b, v0.b[5]
     st1         { v19.16b},[x2],x3

     dup         v6.16b, v0.b[4]
     st1         { v20.16b},[x2],x3

     dup         v1.16b, v0.b[3]
     st1         { v16.16b},[x2],x3

     dup         v18.16b, v0.b[2]
     st1         { v2.16b},[x2],x3

     dup         v19.16b, v0.b[1]
     st1         { v4.16b},[x2],x3

     dup         v20.16b, v0.b[0]
     st1         { v6.16b},[x2],x3

     st1         { v1.16b},[x2],x3
     st1         { v18.16b},[x2],x3
     st1         { v19.16b},[x2],x3
     st1         { v20.16b},[x2],x3

     // ldmfd sp!,{x4-x12,x15}                  //reload the registers from sp
     ldp         x19, x20,[sp],#16

     ret
     b           end_func


 core_loop_8:
     ldrb        w14,[x12]                   //pu1_ref[two_nt]
     sxtw        x14,w14
     add         x12,x12,#1                  //pu1_ref[two_nt + 1]
     ld1         {v30.8b},[x12]              //pu1_ref[two_nt + 1 + col]

     sub         x12,x12,#9
     ld1         {v0.8b},[x12]
     dup         v26.8b, v0.b[7]
     dup         v28.8b,w14

     dup         v3.8b, v0.b[6]
     uxtl        v26.8h, v26.8b

     dup         v4.8b, v0.b[5]
     usubl       v24.8h, v30.8b, v28.8b

     dup         v5.8b, v0.b[4]
     sshr        v24.8h, v24.8h,#1

     dup         v6.8b, v0.b[3]
     sqadd       v22.8h,  v26.8h ,  v24.8h

     dup         v7.8b, v0.b[2]
     sqxtun      v22.8b, v22.8h

     st1         {v22.8b},[x2],x3
     st1         {v3.8b},[x2],x3

     dup         v1.8b, v0.b[1]
     st1         {v4.8b},[x2],x3
     st1         {v5.8b},[x2],x3

     dup         v17.8b, v0.b[0]
     st1         {v6.8b},[x2],x3
     st1         {v7.8b},[x2],x3

     st1         {v1.8b},[x2],x3
     st1         {v17.8b},[x2],x3
     // ldmfd sp!,{x4-x12,x15}                  //reload the registers from sp
     ldp         x19, x20,[sp],#16

     ret
     b           end_func


 core_loop_4:
     ldrb        w14,[x12]                   //pu1_ref[two_nt]
     sxtw        x14,w14
     add         x12,x12,#1                  //pu1_ref[two_nt + 1]
     ld1         {v30.8b},[x12]              //pu1_ref[two_nt + 1 + col]

     sub         x12,x12,#5
     ld1         {v0.8b},[x12]
     dup         v28.8b,w14
     dup         v26.8b, v0.b[3]
     uxtl        v26.8h, v26.8b

     dup         v3.8b, v0.b[2]
     usubl       v24.8h, v30.8b, v28.8b

     dup         v4.8b, v0.b[1]
     sshr        v24.8h, v24.8h,#1

     dup         v5.8b, v0.b[0]
     sqadd       v22.8h,  v26.8h ,  v24.8h

     sqxtun      v22.8b, v22.8h

     st1         {v22.s}[0],[x2],x3
     st1         {v3.s}[0],[x2],x3
     st1         {v4.s}[0],[x2],x3
     st1         {v5.s}[0],[x2],x3

     // ldmfd sp!,{x4-x12,x15}                  //reload the registers from sp
     ldp         x19, x20,[sp],#16

     ret
 end_func:
	///*****************************************************************************
	//*
	//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
	//*
	//* Licensed under the Apache License, Version 2.0 (the "License");
	//* you may not use this file except in compliance with the License.
	//* You may obtain a copy of the License at:
	//*
	//* http://www.apache.org/licenses/LICENSE-2.0
	//*
	//* Unless required by applicable law or agreed to in writing, software
	//* distributed under the License is distributed on an "AS IS" BASIS,
	//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	//* See the License for the specific language governing permissions and
	//* limitations under the License.
	//*
	//*****************************************************************************/
	///**
	//*******************************************************************************
	//* @file
	//* ihevc_intra_pred_luma_horz_neon.s
	//*
	//* @brief
	//* contains function definition for intra prediction interpolation filters
	//*
	//*
	//* @author
	//* parthiban v
	//*
	//* @par list of functions:
	//* - ihevc_intra_pred_luma_horz()
	//*
	//* @remarks
	//* none
	//*
	//*******************************************************************************
	//*/
	//
	///**
	//*******************************************************************************
	//*
	//* @brief
	//* intra prediction interpolation filter for horizontal luma variable.
	//*
	//* @par description:
	//* horizontal intraprediction(mode 10) with.extern samples location
	//* pointed by 'pu1_ref' to the tu block location pointed by 'pu1_dst' refer
	//* to section 8.4.4.2.6 in the standard (special case)
	//*
	//* @param[in] pu1_src
	//* uword8 pointer to the source
	//*
	//* @param[out] pu1_dst
	//* uword8 pointer to the destination
	//*
	//* @param[in] src_strd
	//* integer source stride
	//*
	//* @param[in] dst_strd
	//* integer destination stride
	//*
	//* @param[in] nt
	//* integer transform block size
	//*
	//* @param[in] mode
	//* integer intraprediction mode
	//*
	//* @returns
	//*
	//* @remarks
	//* none
	//*
	//*******************************************************************************
	//*/
	//void ihevc_intra_pred_luma_horz(uword8 *pu1_ref,
	// word32 src_strd,
	// uword8 *pu1_dst,
	// word32 dst_strd,
	// word32 nt,
	// word32 mode)
	//************variables vs registers***************************************
	//x0 => *pu1_ref
	//x1 => src_strd
	//x2 => *pu1_dst
	//x3 => dst_strd

	.text
	.align 4
	.include "ihevc_neon_macros.s"



	.globl ihevc_intra_pred_luma_horz_av8

	.type ihevc_intra_pred_luma_horz_av8, %function

	ihevc_intra_pred_luma_horz_av8:

	// stmfd sp!, {x4-x12, x14} //stack stores the values of the arguments

	stp x19, x20,[sp,#-16]!

	//ldr x5,[sp,#44] @loads mode

	lsl x6,x4,#1 //two_nt

	add x12,x0,x6 //*pu1_ref[two_nt]
	cmp x4,#4 //if nt == 4
	beq core_loop_4

	cmp x4,#8 //if nt == 8
	beq core_loop_8

	cmp x4,#16 //if nt == 16
	beq core_loop_16
	sub x12,x12,#16 //move to 16th value pointer
	add x9,x2,#16

	core_loop_32:
	ld1 { v0.16b},[x12] //load 16 values. d1[7] will have the 1st value.

	dup v2.16b, v0.b[15] //duplicate the i value.

	dup v4.16b, v0.b[14] //duplicate the ii value.
	dup v6.16b, v0.b[13] //duplicate the iii value.
	st1 { v2.16b},[x2],x3 //store in 1st row 0-16 columns
	st1 { v2.16b},[x9],x3 //store in 1st row 16-32 columns

	dup v1.16b, v0.b[12]
	st1 { v4.16b},[x2],x3
	st1 { v4.16b},[x9],x3

	dup v2.16b, v0.b[11]
	st1 { v6.16b},[x2],x3
	st1 { v6.16b},[x9],x3

	dup v4.16b, v0.b[10]
	st1 { v1.16b},[x2],x3
	st1 { v1.16b},[x9],x3

	dup v6.16b, v0.b[9]
	st1 { v2.16b},[x2],x3
	st1 { v2.16b},[x9],x3

	dup v1.16b, v0.b[8]
	st1 { v4.16b},[x2],x3
	st1 { v4.16b},[x9],x3

	dup v2.16b, v0.b[7]
	st1 { v6.16b},[x2],x3
	st1 { v6.16b},[x9],x3

	dup v4.16b, v0.b[6]
	st1 { v1.16b},[x2],x3
	st1 { v1.16b},[x9],x3

	dup v6.16b, v0.b[5]
	st1 { v2.16b},[x2],x3
	st1 { v2.16b},[x9],x3

	dup v1.16b, v0.b[4]
	st1 { v4.16b},[x2],x3
	st1 { v4.16b},[x9],x3

	dup v2.16b, v0.b[3]
	st1 { v6.16b},[x2],x3
	st1 { v6.16b},[x9],x3

	dup v4.16b, v0.b[2]
	st1 { v1.16b},[x2],x3
	st1 { v1.16b},[x9],x3

	dup v6.16b, v0.b[1]
	st1 { v2.16b},[x2],x3
	st1 { v2.16b},[x9],x3
	sub x12,x12,#16 //move to 16th value pointer

	dup v1.16b, v0.b[0]
	st1 { v4.16b},[x2],x3
	st1 { v4.16b},[x9],x3

	subs x4,x4,#16 //decrement the loop count by 16
	st1 { v6.16b},[x2],x3
	st1 { v6.16b},[x9],x3

	st1 { v1.16b},[x2],x3
	st1 { v1.16b},[x9],x3
	bgt core_loop_32
	// ldmfd sp!,{x4-x12,x15} //reload the registers from sp
	ldp x19, x20,[sp],#16

	ret
	b end_func

	core_loop_16:
	ldrb w14,[x12],#1 //pu1_ref[two_nt]
	sxtw x14,w14
	ld1 { v30.8b},[x12],#8 //pu1_ref[two_nt + 1 + col]
	ld1 { v31.8b},[x12] //pu1_ref[two_nt + 1 + col]
	sub x12,x12,#8

	dup v28.8b,w14
	sub x12,x12,#17
	ld1 { v0.16b},[x12]
	dup v26.8b, v0.b[15]
	uxtl v26.8h, v26.8b

	dup v2.16b, v0.b[14]
	usubl v24.8h, v30.8b, v28.8b

	dup v4.16b, v0.b[13]
	sshr v24.8h, v24.8h,#1

	dup v6.16b, v0.b[12]
	sqadd v22.8h, v26.8h , v24.8h

	dup v1.16b, v0.b[11]
	sqxtun v22.8b, v22.8h

	st1 {v22.8b},[x2],#8

	dup v18.16b, v0.b[10]
	usubl v24.8h, v31.8b, v28.8b

	dup v19.16b, v0.b[9]
	sshr v24.8h, v24.8h,#1

	dup v20.16b, v0.b[8]
	sqadd v22.8h, v26.8h , v24.8h

	dup v16.16b, v0.b[7]
	sqxtun v22.8b, v22.8h

	st1 {v22.8b},[x2],x3
	sub x2,x2,#8

	st1 { v2.16b},[x2],x3

	st1 { v4.16b},[x2],x3
	st1 { v6.16b},[x2],x3
	st1 { v1.16b},[x2],x3

	dup v2.16b, v0.b[6]
	st1 { v18.16b},[x2],x3

	dup v4.16b, v0.b[5]
	st1 { v19.16b},[x2],x3

	dup v6.16b, v0.b[4]
	st1 { v20.16b},[x2],x3

	dup v1.16b, v0.b[3]
	st1 { v16.16b},[x2],x3

	dup v18.16b, v0.b[2]
	st1 { v2.16b},[x2],x3

	dup v19.16b, v0.b[1]
	st1 { v4.16b},[x2],x3

	dup v20.16b, v0.b[0]
	st1 { v6.16b},[x2],x3

	st1 { v1.16b},[x2],x3
	st1 { v18.16b},[x2],x3
	st1 { v19.16b},[x2],x3
	st1 { v20.16b},[x2],x3

	// ldmfd sp!,{x4-x12,x15} //reload the registers from sp
	ldp x19, x20,[sp],#16

	ret
	b end_func


	core_loop_8:
	ldrb w14,[x12] //pu1_ref[two_nt]
	sxtw x14,w14
	add x12,x12,#1 //pu1_ref[two_nt + 1]
	ld1 {v30.8b},[x12] //pu1_ref[two_nt + 1 + col]

	sub x12,x12,#9
	ld1 {v0.8b},[x12]
	dup v26.8b, v0.b[7]
	dup v28.8b,w14

	dup v3.8b, v0.b[6]
	uxtl v26.8h, v26.8b

	dup v4.8b, v0.b[5]
	usubl v24.8h, v30.8b, v28.8b

	dup v5.8b, v0.b[4]
	sshr v24.8h, v24.8h,#1

	dup v6.8b, v0.b[3]
	sqadd v22.8h, v26.8h , v24.8h

	dup v7.8b, v0.b[2]
	sqxtun v22.8b, v22.8h

	st1 {v22.8b},[x2],x3
	st1 {v3.8b},[x2],x3

	dup v1.8b, v0.b[1]
	st1 {v4.8b},[x2],x3
	st1 {v5.8b},[x2],x3

	dup v17.8b, v0.b[0]
	st1 {v6.8b},[x2],x3
	st1 {v7.8b},[x2],x3

	st1 {v1.8b},[x2],x3
	st1 {v17.8b},[x2],x3
	// ldmfd sp!,{x4-x12,x15} //reload the registers from sp
	ldp x19, x20,[sp],#16

	ret
	b end_func


	core_loop_4:
	ldrb w14,[x12] //pu1_ref[two_nt]
	sxtw x14,w14
	add x12,x12,#1 //pu1_ref[two_nt + 1]
	ld1 {v30.8b},[x12] //pu1_ref[two_nt + 1 + col]

	sub x12,x12,#5
	ld1 {v0.8b},[x12]
	dup v28.8b,w14
	dup v26.8b, v0.b[3]
	uxtl v26.8h, v26.8b

	dup v3.8b, v0.b[2]
	usubl v24.8h, v30.8b, v28.8b

	dup v4.8b, v0.b[1]
	sshr v24.8h, v24.8h,#1

	dup v5.8b, v0.b[0]
	sqadd v22.8h, v26.8h , v24.8h

	sqxtun v22.8b, v22.8h

	st1 {v22.s}[0],[x2],x3
	st1 {v3.s}[0],[x2],x3
	st1 {v4.s}[0],[x2],x3
	st1 {v5.s}[0],[x2],x3

	// ldmfd sp!,{x4-x12,x15} //reload the registers from sp
	ldp x19, x20,[sp],#16

	ret
	end_func: