// common/arm64/ihevc_intra_pred_luma_mode_18_34.s - platform/external/libhevc - Git at Google

 ///*****************************************************************************
 //*
 //* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
 //*
 //* Licensed under the Apache License, Version 2.0 (the "License");
 //* you may not use this file except in compliance with the License.
 //* You may obtain a copy of the License at:
 //*
 //* http://www.apache.org/licenses/LICENSE-2.0
 //*
 //* Unless required by applicable law or agreed to in writing, software
 //* distributed under the License is distributed on an "AS IS" BASIS,
 //* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 //* See the License for the specific language governing permissions and
 //* limitations under the License.
 //*
 //*****************************************************************************/
 ///**
 //*******************************************************************************
 //* @file
 //*  ihevc_intra_pred_luma_mode_18_34_neon.s
 //*
 //* @brief
 //*  contains function definitions for luma intra prediction modes 18 and 34.
 //* functions are coded in neon assembly for aarch64 (gnu assembler syntax)
 //*
 //* @author
 //*  yogeswaran rs
 //*
 //* @par list of functions:
 //*
 //*
 //* @remarks
 //*  none
 //*
 //*******************************************************************************
 //*/
 ///**
 //*******************************************************************************
 //*
 //* @brief
 //*    luma intra prediction for modes 18 and 34 (reference samples are
 //*    copied along a diagonal, one byte of shift per row)
 //*
 //* @par description:
 //*
 //* @param[in] pu1_ref
 //*  uword8 pointer to the source
 //*
 //* @param[out] pu1_dst
 //*  uword8 pointer to the destination
 //*
 //* @param[in] src_strd
 //*  integer source stride
 //*
 //* @param[in] dst_strd
 //*  integer destination stride
 //*
 //* @param[in] pi1_coeff
 //*  word8 pointer to the planar coefficients
 //*
 //* @param[in] nt
 //*  size of transform block
 //*
 //* @param[in] mode
 //*  type of filtering
 //*
 //* @returns
 //*
 //* @remarks
 //*  none
 //*
 //*******************************************************************************
 //*/

 //void ihevc_intra_pred_luma_mode_18_34(uword8 *pu1_ref,
 //                                      word32 src_strd,
 //                                      uword8 *pu1_dst,
 //                                      word32 dst_strd,
 //                                      word32 nt,
 //                                      word32 mode)
 //
 //**************variables vs registers*****************************************
 //x0 => *pu1_ref
 //x1 => src_strd
 //x2 => *pu1_dst
 //x3 => dst_strd

 //x4 => nt
 //x5 => mode
 //(all arguments arrive in registers; nothing is read from the stack)

 .text
 .align 4
 .include "ihevc_neon_macros.s"


 //-----------------------------------------------------------------------------
 // void ihevc_intra_pred_luma_mode_18_34_av8(uword8 *pu1_ref, word32 src_strd,
 //                                           uword8 *pu1_dst, word32 dst_strd,
 //                                           word32 nt, word32 mode)
 //
 // abi:   aapcs64
 // in:    x0 = pu1_ref, x1 = src_strd (never read), x2 = pu1_dst,
 //        w3 = dst_strd, w4 = nt, w5 = mode
 // out:   nt rows of nt bytes written at pu1_dst, rows dst_strd bytes apart
 //
 // mode == 0x22 (34): row r is copied from pu1_ref + 2*nt + 2 + r
 // any other mode   : row r is copied from pu1_ref + 2*nt - r (mode 18 path)
 //
 // nt == 4 is handled by a dedicated 4-row path. every other size is walked
 // as 8x8 tiles, left to right and then top to bottom, so nt is presumably
 // a multiple of 8 - TODO confirm against the callers.
 //
 // clobbers: x0-x6, x8, x10-x12, x14, v0-v7, flags
 //           (x19/x20 are saved and restored; x20 is the scratch register)
 //-----------------------------------------------------------------------------
 .globl ihevc_intra_pred_luma_mode_18_34_av8

 .type ihevc_intra_pred_luma_mode_18_34_av8, %function

 ihevc_intra_pred_luma_mode_18_34_av8:

     // macro from ihevc_neon_macros.s. NOTE(review): only v0-v7 are written
     // in this routine - confirm whether this save/restore is still needed.
     push_v_regs
     stp         x19, x20,[sp,#-16]!         // x19 only pads the push to 16 bytes

     // dst_strd, nt and mode are word32: the caller does not guarantee the
     // upper halves of x3/x4/x5, so widen them before any 64-bit use
     sxtw        x3,w3                       // dst_strd (post-index store stride)
     sxtw        x4,w4                       // nt
     sxtw        x5,w5                       // mode

     cmp         x4,#4
     beq         mode2_4

     mov         x11,x4                      // x11 = source columns left in the tile row
     mov         x12,x4                      // x12 = destination columns left in the tile row
     sub         x14,x4,#8                   // x14 = nt - 8, rewind used at the end of a tile row

     add         x0,x0,x4,lsl #1             // x0 = pu1_ref + 2*nt

     cmp         x5,#0x22                    // eq <=> mode 34
     mov         x10,x2                      // x10 = running store pointer

     add         x0,x0,#2
     sub         x20,x0,#2
     csel        x0, x20, x0,ne              // mode 34: pu1_ref + 2*nt + 2, else pu1_ref + 2*nt
     mov         x6,#1
     cneg        x6, x6,ne                   // x6 = source step per row: +1 (mode 34) or -1
     mov         x8,x0                       // x8 = running load pointer

 prologue_cpy_32:

     // load the first 8x8 tile; x1 becomes 8 * (number of tiles still to load)
     ld1         {v0.8b},[x8],x6
     lsr         x1, x4, #3
     ld1         {v1.8b},[x8],x6
     mul         x1, x4, x1                  // nt * (nt / 8) = 8 * total tile count
     ld1         {v2.8b},[x8],x6
     ld1         {v3.8b},[x8],x6
     subs        x1,x1,#8                    // flags survive the loads below
     ld1         {v4.8b},[x8],x6
     ld1         {v5.8b},[x8],x6
     ld1         {v6.8b},[x8],x6

     ld1         {v7.8b},[x8],x6


     beq         epilogue_mode2              // single tile (nt == 8): just store it
     sub         x11,x11,#8                  // first tile of the row has been loaded

     cmp         x5,#0x22
     add         x20,x0,#8
     csel        x0, x20, x0,ne              // mode 18: next tile starts 8 bytes further on
     csel        x8, x0, x8,ne               // mode 34: x8 already sits on the next tile
     bne         kernel_mode18

 kernel_mode2:
     // mode 34: store the tile held in v0-v7 while loading the next one
     st1         {v0.8b},[x10],x3
     st1         {v1.8b},[x10],x3
     subs        x12,x12,#8                  // eq <=> the tile being stored ends a tile row
     st1         {v2.8b},[x10],x3
     add         x20,x2,#8
     csel        x2, x20, x2,ne              // same tile row: next tile is 8 bytes to the right
     st1         {v3.8b},[x10],x3

     ld1         {v0.8b},[x8],x6
     st1         {v4.8b},[x10],x3

     st1         {v5.8b},[x10],x3
     ld1         {v1.8b},[x8],x6
     st1         {v6.8b},[x10],x3
     ld1         {v2.8b},[x8],x6
     st1         {v7.8b},[x10],x3

     ld1         {v3.8b},[x8],x6
     sub         x20,x10,x14
     csel        x2, x20, x2,eq              // row done: x10 is 8 rows down, step back nt - 8 bytes
     ld1         {v4.8b},[x8],x6
     mov         x10,x2
     ld1         {v5.8b},[x8],x6
     csel        x12, x4, x12,eq             // row done: reload the destination column counter
     ld1         {v6.8b},[x8],x6
     subs        x11,x11,#8                  // eq <=> the tile just loaded ends a tile row

     ld1         {v7.8b},[x8],x6

     add         x20,x0,#8
     csel        x0, x20, x0,eq              // next tile row starts 8 bytes further in the reference
     csel        x11, x4, x11,eq
     csel        x8, x0, x8,eq               // otherwise x8 keeps running forward

     subs        x1, x1, #8                  // one more tile loaded

     bne         kernel_mode2

     b           epilogue_mode2

 kernel_mode18:
     // mode 18: same store/load interleave, the source walks backwards per row
     st1         {v0.8b},[x10],x3
     st1         {v1.8b},[x10],x3
     subs        x12,x12,#8                  // eq <=> the tile being stored ends a tile row
     st1         {v2.8b},[x10],x3
     add         x20,x2,#8
     csel        x2, x20, x2,ne              // same tile row: next tile is 8 bytes to the right
     st1         {v3.8b},[x10],x3

     ld1         {v0.8b},[x8],x6
     st1         {v4.8b},[x10],x3

     st1         {v5.8b},[x10],x3
     ld1         {v1.8b},[x8],x6

     st1         {v6.8b},[x10],x3
     ld1         {v2.8b},[x8],x6
     st1         {v7.8b},[x10],x3

     ld1         {v3.8b},[x8],x6
     sub         x20,x10,x14
     csel        x2, x20, x2,eq              // row done: x10 is 8 rows down, step back nt - 8 bytes
     ld1         {v4.8b},[x8],x6
     mov         x10,x2
     ld1         {v5.8b},[x8],x6
     csel        x12, x4, x12,eq             // row done: reload the destination column counter
     ld1         {v6.8b},[x8],x6
     subs        x11,x11,#8                  // eq <=> the tile just loaded ends a tile row
     ld1         {v7.8b},[x8],x6

     add         x20,x0,#8
     csel        x0, x20, x0,ne              // same tile row: source moves 8 bytes forward
     csel        x11, x4, x11,eq
     sub         x20,x8,x14                  // x8 is 8 bytes behind the tile start here
     csel        x0, x20, x0,eq              // row done: source moves back nt bytes in total
     subs        x1, x1, #8                  // one more tile loaded
     mov         x8,x0

     bne         kernel_mode18


 epilogue_mode2:

     // store the last tile; nothing is left to load (shared by both modes)
     st1         {v0.8b},[x10],x3
     st1         {v1.8b},[x10],x3
     st1         {v2.8b},[x10],x3
     st1         {v3.8b},[x10],x3
     st1         {v4.8b},[x10],x3
     st1         {v5.8b},[x10],x3
     st1         {v6.8b},[x10],x3
     st1         {v7.8b},[x10],x3

     b           end_func

 mode2_4:

     // nt == 4: four rows of four bytes, no tiling
     add         x0,x0,#10                   // pu1_ref + 2*nt + 2 with nt = 4
     cmp         x5,#0x22
     sub         x20,x0,#2
     csel        x0, x20, x0,ne              // other modes start at pu1_ref + 2*nt

     mov         x8,#1
     cneg        x8, x8,ne                   // x8 = source step per row: +1 (mode 34) or -1

     // each load fetches 8 bytes, only the low 4 are stored
     ld1         {v0.8b},[x0],x8
     st1         {v0.s}[0],[x2],x3

     ld1         {v0.8b},[x0],x8
     st1         {v0.s}[0],[x2],x3

     ld1         {v0.8b},[x0],x8
     st1         {v0.s}[0],[x2],x3

     ld1         {v0.8b},[x0],x8
     st1         {v0.s}[0],[x2],x3

 end_func:
     ldp         x19, x20,[sp],#16
     pop_v_regs
     ret
	///*****************************************************************************
	//*
	//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
	//*
	//* Licensed under the Apache License, Version 2.0 (the "License");
	//* you may not use this file except in compliance with the License.
	//* You may obtain a copy of the License at:
	//*
	//* http://www.apache.org/licenses/LICENSE-2.0
	//*
	//* Unless required by applicable law or agreed to in writing, software
	//* distributed under the License is distributed on an "AS IS" BASIS,
	//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	//* See the License for the specific language governing permissions and
	//* limitations under the License.
	//*
	//*****************************************************************************/
	///**
	//*******************************************************************************
	//* @file
	//* ihevc_intra_pred_luma_mode_18_34_neon.s
	//*
	//* @brief
	//* contains function definitions for luma intra prediction modes 18 and 34.
	//* functions are coded in neon assembly for aarch64 (gnu assembler syntax)
	//*
	//* @author
	//* yogeswaran rs
	//*
	//* @par list of functions:
	//*
	//*
	//* @remarks
	//* none
	//*
	//*******************************************************************************
	//*/
	///**
	//*******************************************************************************
	//*
	//* @brief
	//* luma intra prediction for modes 18 and 34 (reference samples are
	//* copied along a diagonal, one byte of shift per row)
	//*
	//* @par description:
	//*
	//* @param[in] pu1_ref
	//* uword8 pointer to the source
	//*
	//* @param[out] pu1_dst
	//* uword8 pointer to the destination
	//*
	//* @param[in] src_strd
	//* integer source stride
	//*
	//* @param[in] dst_strd
	//* integer destination stride
	//*
	//* @param[in] pi1_coeff
	//* word8 pointer to the planar coefficients
	//*
	//* @param[in] nt
	//* size of transform block
	//*
	//* @param[in] mode
	//* type of filtering
	//*
	//* @returns
	//*
	//* @remarks
	//* none
	//*
	//*******************************************************************************
	//*/

	//void ihevc_intra_pred_luma_mode_18_34(uword8 *pu1_ref,
	// word32 src_strd,
	// uword8 *pu1_dst,
	// word32 dst_strd,
	// word32 nt,
	// word32 mode)
	//
	//************variables vs registers***************************************
	//x0 => *pu1_ref
	//x1 => src_strd
	//x2 => *pu1_dst
	//x3 => dst_strd

	//x4 => nt
	//x5 => mode
	//(all arguments arrive in registers; nothing is read from the stack)

	.text
	.align 4
	.include "ihevc_neon_macros.s"



	//-----------------------------------------------------------------------------
	// void ihevc_intra_pred_luma_mode_18_34_av8(uword8 *pu1_ref, word32 src_strd,
	//                                           uword8 *pu1_dst, word32 dst_strd,
	//                                           word32 nt, word32 mode)
	//
	// abi:   aapcs64
	// in:    x0 = pu1_ref, x1 = src_strd (never read), x2 = pu1_dst,
	//        w3 = dst_strd, w4 = nt, w5 = mode
	// out:   nt rows of nt bytes written at pu1_dst, rows dst_strd bytes apart
	//
	// mode == 0x22 (34): row r is copied from pu1_ref + 2*nt + 2 + r
	// any other mode   : row r is copied from pu1_ref + 2*nt - r (mode 18 path)
	//
	// nt == 4 is handled by a dedicated 4-row path. every other size is walked
	// as 8x8 tiles, left to right and then top to bottom, so nt is presumably
	// a multiple of 8 - TODO confirm against the callers.
	//
	// clobbers: x0-x6, x8, x10-x12, x14, v0-v7, flags
	//           (x19/x20 are saved and restored; x20 is the scratch register)
	//-----------------------------------------------------------------------------
	.globl ihevc_intra_pred_luma_mode_18_34_av8

	.type ihevc_intra_pred_luma_mode_18_34_av8, %function

	ihevc_intra_pred_luma_mode_18_34_av8:

	// macro from ihevc_neon_macros.s. NOTE(review): only v0-v7 are written
	// in this routine - confirm whether this save/restore is still needed.
	push_v_regs
	stp x19, x20,[sp,#-16]!	// x19 only pads the push to 16 bytes

	// dst_strd, nt and mode are word32: the caller does not guarantee the
	// upper halves of x3/x4/x5, so widen them before any 64-bit use
	sxtw x3,w3	// dst_strd (post-index store stride)
	sxtw x4,w4	// nt
	sxtw x5,w5	// mode

	cmp x4,#4
	beq mode2_4

	mov x11,x4	// x11 = source columns left in the tile row
	mov x12,x4	// x12 = destination columns left in the tile row
	sub x14,x4,#8	// x14 = nt - 8, rewind used at the end of a tile row

	add x0,x0,x4,lsl #1	// x0 = pu1_ref + 2*nt

	cmp x5,#0x22	// eq <=> mode 34
	mov x10,x2	// x10 = running store pointer

	add x0,x0,#2
	sub x20,x0,#2
	csel x0, x20, x0,ne	// mode 34: pu1_ref + 2*nt + 2, else pu1_ref + 2*nt
	mov x6,#1
	cneg x6, x6,ne	// x6 = source step per row: +1 (mode 34) or -1
	mov x8,x0	// x8 = running load pointer

	prologue_cpy_32:

	// load the first 8x8 tile; x1 becomes 8 * (number of tiles still to load)
	ld1 {v0.8b},[x8],x6
	lsr x1, x4, #3
	ld1 {v1.8b},[x8],x6
	mul x1, x4, x1	// nt * (nt / 8) = 8 * total tile count
	ld1 {v2.8b},[x8],x6
	ld1 {v3.8b},[x8],x6
	subs x1,x1,#8	// flags survive the loads below
	ld1 {v4.8b},[x8],x6
	ld1 {v5.8b},[x8],x6
	ld1 {v6.8b},[x8],x6

	ld1 {v7.8b},[x8],x6


	beq epilogue_mode2	// single tile (nt == 8): just store it
	sub x11,x11,#8	// first tile of the row has been loaded

	cmp x5,#0x22
	add x20,x0,#8
	csel x0, x20, x0,ne	// mode 18: next tile starts 8 bytes further on
	csel x8, x0, x8,ne	// mode 34: x8 already sits on the next tile
	bne kernel_mode18

	kernel_mode2:
	// mode 34: store the tile held in v0-v7 while loading the next one
	st1 {v0.8b},[x10],x3
	st1 {v1.8b},[x10],x3
	subs x12,x12,#8	// eq <=> the tile being stored ends a tile row
	st1 {v2.8b},[x10],x3
	add x20,x2,#8
	csel x2, x20, x2,ne	// same tile row: next tile is 8 bytes to the right
	st1 {v3.8b},[x10],x3

	ld1 {v0.8b},[x8],x6
	st1 {v4.8b},[x10],x3

	st1 {v5.8b},[x10],x3
	ld1 {v1.8b},[x8],x6
	st1 {v6.8b},[x10],x3
	ld1 {v2.8b},[x8],x6
	st1 {v7.8b},[x10],x3

	ld1 {v3.8b},[x8],x6
	sub x20,x10,x14
	csel x2, x20, x2,eq	// row done: x10 is 8 rows down, step back nt - 8 bytes
	ld1 {v4.8b},[x8],x6
	mov x10,x2
	ld1 {v5.8b},[x8],x6
	csel x12, x4, x12,eq	// row done: reload the destination column counter
	ld1 {v6.8b},[x8],x6
	subs x11,x11,#8	// eq <=> the tile just loaded ends a tile row

	ld1 {v7.8b},[x8],x6

	add x20,x0,#8
	csel x0, x20, x0,eq	// next tile row starts 8 bytes further in the reference
	csel x11, x4, x11,eq
	csel x8, x0, x8,eq	// otherwise x8 keeps running forward

	subs x1, x1, #8	// one more tile loaded

	bne kernel_mode2

	b epilogue_mode2

	kernel_mode18:
	// mode 18: same store/load interleave, the source walks backwards per row
	st1 {v0.8b},[x10],x3
	st1 {v1.8b},[x10],x3
	subs x12,x12,#8	// eq <=> the tile being stored ends a tile row
	st1 {v2.8b},[x10],x3
	add x20,x2,#8
	csel x2, x20, x2,ne	// same tile row: next tile is 8 bytes to the right
	st1 {v3.8b},[x10],x3

	ld1 {v0.8b},[x8],x6
	st1 {v4.8b},[x10],x3

	st1 {v5.8b},[x10],x3
	ld1 {v1.8b},[x8],x6

	st1 {v6.8b},[x10],x3
	ld1 {v2.8b},[x8],x6
	st1 {v7.8b},[x10],x3

	ld1 {v3.8b},[x8],x6
	sub x20,x10,x14
	csel x2, x20, x2,eq	// row done: x10 is 8 rows down, step back nt - 8 bytes
	ld1 {v4.8b},[x8],x6
	mov x10,x2
	ld1 {v5.8b},[x8],x6
	csel x12, x4, x12,eq	// row done: reload the destination column counter
	ld1 {v6.8b},[x8],x6
	subs x11,x11,#8	// eq <=> the tile just loaded ends a tile row
	ld1 {v7.8b},[x8],x6

	add x20,x0,#8
	csel x0, x20, x0,ne	// same tile row: source moves 8 bytes forward
	csel x11, x4, x11,eq
	sub x20,x8,x14	// x8 is 8 bytes behind the tile start here
	csel x0, x20, x0,eq	// row done: source moves back nt bytes in total
	subs x1, x1, #8	// one more tile loaded
	mov x8,x0

	bne kernel_mode18


	epilogue_mode2:

	// store the last tile; nothing is left to load (shared by both modes)
	st1 {v0.8b},[x10],x3
	st1 {v1.8b},[x10],x3
	st1 {v2.8b},[x10],x3
	st1 {v3.8b},[x10],x3
	st1 {v4.8b},[x10],x3
	st1 {v5.8b},[x10],x3
	st1 {v6.8b},[x10],x3
	st1 {v7.8b},[x10],x3

	b end_func

	mode2_4:

	// nt == 4: four rows of four bytes, no tiling
	add x0,x0,#10	// pu1_ref + 2*nt + 2 with nt = 4
	cmp x5,#0x22
	sub x20,x0,#2
	csel x0, x20, x0,ne	// other modes start at pu1_ref + 2*nt

	mov x8,#1
	cneg x8, x8,ne	// x8 = source step per row: +1 (mode 34) or -1

	// each load fetches 8 bytes, only the low 4 are stored
	ld1 {v0.8b},[x0],x8
	st1 {v0.s}[0],[x2],x3

	ld1 {v0.8b},[x0],x8
	st1 {v0.s}[0],[x2],x3

	ld1 {v0.8b},[x0],x8
	st1 {v0.s}[0],[x2],x3

	ld1 {v0.8b},[x0],x8
	st1 {v0.s}[0],[x2],x3

	end_func:
	ldp x19, x20,[sp],#16
	pop_v_regs
	ret