 @ common/arm/ihevc_intra_pred_luma_horz.s - platform/external/libhevc - Git at Google

 @/*****************************************************************************
 @*
 @* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
 @*
 @* Licensed under the Apache License, Version 2.0 (the "License");
 @* you may not use this file except in compliance with the License.
 @* You may obtain a copy of the License at:
 @*
 @* http://www.apache.org/licenses/LICENSE-2.0
 @*
 @* Unless required by applicable law or agreed to in writing, software
 @* distributed under the License is distributed on an "AS IS" BASIS,
 @* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 @* See the License for the specific language governing permissions and
 @* limitations under the License.
 @*
 @*****************************************************************************/
 @/**
 @*******************************************************************************
 @* @file
 @*  ihevc_intra_pred_luma_horz_neon.s
 @*
 @* @brief
 @*  contains function definition for intra prediction  interpolation filters
 @*
 @*
 @* @author
 @*  parthiban v
 @*
 @* @par list of functions:
 @*  - ihevc_intra_pred_luma_horz()
 @*
 @* @remarks
 @*  none
 @*
 @*******************************************************************************
 @*/
 @
 @/**
 @*******************************************************************************
 @*
 @* @brief
 @*     intra prediction interpolation filter for horizontal luma variable.
 @*
 @* @par description:
 @*      horizontal intra prediction (mode 10): predicts the tu block pointed
 @*      to by 'pu1_dst' from the reference samples pointed to by 'pu1_ref'.  refer
 @*      to section 8.4.4.2.6 in the standard (special case)
 @*
 @* @param[in] pu1_ref
 @*  uword8 pointer to the reference samples
 @*
 @* @param[out] pu1_dst
 @*  uword8 pointer to the destination
 @*
 @* @param[in] src_strd
 @*  integer source stride
 @*
 @* @param[in] dst_strd
 @*  integer destination stride
 @*
 @* @param[in] nt
 @*  integer transform block size
 @*
 @* @param[in] mode
 @*  integer intraprediction mode
 @*
 @* @returns
 @*
 @* @remarks
 @*  none
 @*
 @*******************************************************************************
 @*/
 @void ihevc_intra_pred_luma_horz(uword8 *pu1_ref,
 @                                word32 src_strd,
 @                                uword8 *pu1_dst,
 @                                word32 dst_strd,
 @                                word32 nt,
 @                                word32 mode)
 @**************variables vs registers*****************************************
 @r0 => *pu1_ref
 @r1 =>  src_strd
 @r2 => *pu1_dst
 @r3 =>  dst_strd

 @ -----------------------------------------------------------------------------
 @ void ihevc_intra_pred_luma_horz_a9q(uword8 *pu1_ref, word32 src_strd,
 @                                     uword8 *pu1_dst, word32 dst_strd,
 @                                     word32 nt,       word32 mode)
 @
 @ target : 32-bit arm + neon, gnu as syntax, aapcs calling convention
 @
 @ in     : r0            = pu1_ref
 @          r1            = src_strd        (never read)
 @          r2            = pu1_dst
 @          r3            = dst_strd
 @          [sp] at entry = nt
 @          [sp,#4]       = mode            (never read)
 @
 @ out    : nt x nt bytes written at pu1_dst, rows dst_strd bytes apart.
 @          row 'row' is filled with pu1_ref[two_nt - 1 - row]  (two_nt = 2*nt).
 @          for nt == 4, 8 and 16 the first row is instead
 @              sat_u8( pu1_ref[two_nt - 1] +
 @                      ((pu1_ref[two_nt + 1 + col] - pu1_ref[two_nt]) >> 1) )
 @          any other nt falls through to the 32 column wide loop.
 @
 @ clobbers: r2, r12, flags, d0-d7, d16-d17, d22-d28, d30-d31
 @ preserved: r4-r11 (stmfd/ldmfd) and d8-d15 (vpush/vpop).  d8-d15 are
 @          callee-saved under the aapcs and q4-q7 are used as scratch below,
 @          so they must be saved and restored around the body.
 @ -----------------------------------------------------------------------------

 @ bytes between sp and the first stack argument once the prologue is done:
 @ 10 core registers (40 bytes) + d8-d15 (64 bytes)
 .equ nt_offset, 104

 .text
 .align 4


 .globl ihevc_intra_pred_luma_horz_a9q

 .type ihevc_intra_pred_luma_horz_a9q, %function

 ihevc_intra_pred_luma_horz_a9q:

     stmfd       sp!, {r4-r12, r14}          @save core registers and the return address
     vpush       {d8-d15}                    @save callee-saved neon registers (q4-q7 are scratch below)

     ldr         r4,[sp,#nt_offset]          @loads nt
     @ldr        r5,[sp,#nt_offset + 4]          @loads mode

     lsl         r6,r4,#1                    @two_nt

     add         r12,r0,r6                   @&pu1_ref[two_nt]
     cmp         r4,#4                       @if nt == 4
     beq         core_loop_4

     cmp         r4,#8                       @if nt == 8
     beq         core_loop_8

     cmp         r4,#16                      @if nt == 16
     beq         core_loop_16
     sub         r12,r12,#16                 @&pu1_ref[two_nt - 16], left samples of rows 0-15
     add         r9,r2,#16                   @r9 = second destination pointer, columns 16-31

 @ each pass writes 16 rows of 32 bytes; r4 counts the rows still to be written
 core_loop_32:
     vld1.8      {q0},[r12]                  @load 16 values. d1[7] will have the 1st value.

     vdup.8      q1,d1[7]                    @duplicate the i value.

     vdup.8      q2,d1[6]                    @duplicate the ii value.
     vdup.8      q3,d1[5]                    @duplicate the iii value.
     vst1.8      {q1},[r2],r3                @store in 1st row 0-16 columns
     vst1.8      {q1},[r9],r3                @store in 1st row 16-32 columns

     vdup.8      q4,d1[4]
     vst1.8      {q2},[r2],r3
     vst1.8      {q2},[r9],r3

     vdup.8      q1,d1[3]
     vst1.8      {q3},[r2],r3
     vst1.8      {q3},[r9],r3

     vdup.8      q2,d1[2]
     vst1.8      {q4},[r2],r3
     vst1.8      {q4},[r9],r3

     vdup.8      q3,d1[1]
     vst1.8      {q1},[r2],r3
     vst1.8      {q1},[r9],r3

     vdup.8      q4,d1[0]
     vst1.8      {q2},[r2],r3
     vst1.8      {q2},[r9],r3

     vdup.8      q1,d0[7]
     vst1.8      {q3},[r2],r3
     vst1.8      {q3},[r9],r3

     vdup.8      q2,d0[6]
     vst1.8      {q4},[r2],r3
     vst1.8      {q4},[r9],r3

     vdup.8      q3,d0[5]
     vst1.8      {q1},[r2],r3
     vst1.8      {q1},[r9],r3

     vdup.8      q4,d0[4]
     vst1.8      {q2},[r2],r3
     vst1.8      {q2},[r9],r3

     vdup.8      q1,d0[3]
     vst1.8      {q3},[r2],r3
     vst1.8      {q3},[r9],r3

     vdup.8      q2,d0[2]
     vst1.8      {q4},[r2],r3
     vst1.8      {q4},[r9],r3

     vdup.8      q3,d0[1]
     vst1.8      {q1},[r2],r3
     vst1.8      {q1},[r9],r3
     sub         r12,r12,#16                 @step back to the 16 left samples of the next 16 rows

     vdup.8      q4,d0[0]
     vst1.8      {q2},[r2],r3
     vst1.8      {q2},[r9],r3

     subs        r4,r4,#16                   @decrement the loop count by 16
     vst1.8      {q3},[r2],r3
     vst1.8      {q3},[r9],r3

     vst1.8      {q4},[r2],r3
     vst1.8      {q4},[r9],r3
     bgt         core_loop_32
     vpop        {d8-d15}                    @restore callee-saved neon registers
     ldmfd       sp!,{r4-r12,r15}            @restore core registers and return (pc <- saved lr)

 core_loop_16:
     ldrb        lr,[r12],#1                 @pu1_ref[two_nt]
     vld1.8      {q15},[r12]                 @pu1_ref[two_nt + 1 + col]

     vdup.8      d28,lr
     sub         r12,r12,#17                 @&pu1_ref[two_nt - 16]
     vld1.8      {q0},[r12]                  @16 left samples, d1[7] = pu1_ref[two_nt - 1]
     vdup.8      d26,d1[7]
     vmovl.u8    q13,d26                     @pu1_ref[two_nt - 1] widened to 16 bit

     vdup.8      q1,d1[6]
     vsubl.u8    q12,d30,d28                 @pu1_ref[two_nt + 1 + col] - pu1_ref[two_nt], col 0-7

     vdup.8      q2,d1[5]
     vshr.s16    q12,q12,#1

     vdup.8      q3,d1[4]
     vqadd.s16   q11,q13,q12

     vdup.8      q4,d1[3]
     vqmovun.s16 d22,q11                     @saturate to unsigned 8 bit

     vst1.8      {d22},[r2]!                 @row 0, columns 0-7 (r2 += 8)

     vdup.8      q5,d1[2]
     vsubl.u8    q12,d31,d28                 @same difference for col 8-15

     vdup.8      q6,d1[1]
     vshr.s16    q12,q12,#1

     vdup.8      q7,d1[0]
     vqadd.s16   q11,q13,q12

     vdup.8      q8,d0[7]
     vqmovun.s16 d22,q11

     vst1.8      {d22},[r2],r3               @row 0, columns 8-15
     sub         r2,r2,#8                    @undo the +8 above: r2 = start of row 1

     vst1.8      {q1},[r2],r3                @rows 1-15: one replicated left sample per row

     vst1.8      {q2},[r2],r3
     vst1.8      {q3},[r2],r3
     vst1.8      {q4},[r2],r3

     vdup.8      q1,d0[6]
     vst1.8      {q5},[r2],r3

     vdup.8      q2,d0[5]
     vst1.8      {q6},[r2],r3

     vdup.8      q3,d0[4]
     vst1.8      {q7},[r2],r3

     vdup.8      q4,d0[3]
     vst1.8      {q8},[r2],r3

     vdup.8      q5,d0[2]
     vst1.8      {q1},[r2],r3

     vdup.8      q6,d0[1]
     vst1.8      {q2},[r2],r3

     vdup.8      q7,d0[0]
     vst1.8      {q3},[r2],r3

     vst1.8      {q4},[r2],r3
     vst1.8      {q5},[r2],r3
     vst1.8      {q6},[r2],r3
     vst1.8      {q7},[r2],r3

     vpop        {d8-d15}                    @restore callee-saved neon registers
     ldmfd       sp!,{r4-r12,r15}            @restore core registers and return (pc <- saved lr)


 core_loop_8:
     ldrb        lr,[r12]                    @pu1_ref[two_nt]
     add         r12,r12,#1                  @pu1_ref[two_nt + 1]
     vld1.8      {d30},[r12]                 @pu1_ref[two_nt + 1 + col]

     sub         r12,r12,#9                  @&pu1_ref[two_nt - 8]
     vld1.8      {d0},[r12]                  @8 left samples, d0[7] = pu1_ref[two_nt - 1]
     vdup.8      d26,d0[7]
     vdup.8      d28,lr

     vdup.8      d3,d0[6]
     vmovl.u8    q13,d26                     @pu1_ref[two_nt - 1] widened to 16 bit

     vdup.8      d4,d0[5]
     vsubl.u8    q12,d30,d28                 @pu1_ref[two_nt + 1 + col] - pu1_ref[two_nt]

     vdup.8      d5,d0[4]
     vshr.s16    q12,q12,#1

     vdup.8      d6,d0[3]
     vqadd.s16   q11,q13,q12

     vdup.8      d7,d0[2]
     vqmovun.s16 d22,q11                     @saturate to unsigned 8 bit

     vst1.8      {d22},[r2],r3               @row 0
     vst1.8      {d3},[r2],r3                @rows 1-7: one replicated left sample per row

     vdup.8      d8,d0[1]
     vst1.8      {d4},[r2],r3
     vst1.8      {d5},[r2],r3

     vdup.8      d9,d0[0]
     vst1.8      {d6},[r2],r3
     vst1.8      {d7},[r2],r3

     vst1.8      {d8},[r2],r3
     vst1.8      {d9},[r2],r3
     vpop        {d8-d15}                    @restore callee-saved neon registers
     ldmfd       sp!,{r4-r12,r15}            @restore core registers and return (pc <- saved lr)


 core_loop_4:
     ldrb        lr,[r12]                    @pu1_ref[two_nt]
     add         r12,r12,#1                  @pu1_ref[two_nt + 1]
     vld1.8      {d30},[r12]                 @pu1_ref[two_nt + 1 + col] (8 bytes read, low 4 stored)

     sub         r12,r12,#5                  @&pu1_ref[two_nt - 4]
     vld1.8      {d0},[r12]                  @d0[3] = pu1_ref[two_nt - 1]
     vdup.8      d28,lr
     vdup.8      d26,d0[3]
     vmovl.u8    q13,d26                     @pu1_ref[two_nt - 1] widened to 16 bit

     vdup.8      d3,d0[2]
     vsubl.u8    q12,d30,d28                 @pu1_ref[two_nt + 1 + col] - pu1_ref[two_nt]

     vdup.8      d4,d0[1]
     vshr.s16    q12,q12,#1

     vdup.8      d5,d0[0]
     vqadd.s16   q11,q13,q12

     vqmovun.s16 d22,q11                     @saturate to unsigned 8 bit

     vst1.32     {d22[0]},[r2],r3            @row 0, 4 bytes
     vst1.32     {d3[0]},[r2],r3             @rows 1-3: one replicated left sample per row
     vst1.32     {d4[0]},[r2],r3
     vst1.32     {d5[0]},[r2],r3

     vpop        {d8-d15}                    @restore callee-saved neon registers
     ldmfd       sp!,{r4-r12,r15}            @restore core registers and return (pc <- saved lr)
 end_func:
	@/*****************************************************************************
	@*
	@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
	@*
	@* Licensed under the Apache License, Version 2.0 (the "License");
	@* you may not use this file except in compliance with the License.
	@* You may obtain a copy of the License at:
	@*
	@* http://www.apache.org/licenses/LICENSE-2.0
	@*
	@* Unless required by applicable law or agreed to in writing, software
	@* distributed under the License is distributed on an "AS IS" BASIS,
	@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	@* See the License for the specific language governing permissions and
	@* limitations under the License.
	@*
	@*****************************************************************************/
	@/**
	@*******************************************************************************
	@* @file
	@* ihevc_intra_pred_luma_horz_neon.s
	@*
	@* @brief
	@* contains function definition for intra prediction interpolation filters
	@*
	@*
	@* @author
	@* parthiban v
	@*
	@* @par list of functions:
	@* - ihevc_intra_pred_luma_horz()
	@*
	@* @remarks
	@* none
	@*
	@*******************************************************************************
	@*/
	@
	@/**
	@*******************************************************************************
	@*
	@* @brief
	@* intra prediction interpolation filter for horizontal luma variable.
	@*
	@* @par description:
	@* horizontal intra prediction (mode 10): predicts the tu block pointed
	@* to by 'pu1_dst' from the reference samples pointed to by 'pu1_ref'. refer
	@* to section 8.4.4.2.6 in the standard (special case)
	@*
	@* @param[in] pu1_ref
	@* uword8 pointer to the reference samples
	@*
	@* @param[out] pu1_dst
	@* uword8 pointer to the destination
	@*
	@* @param[in] src_strd
	@* integer source stride
	@*
	@* @param[in] dst_strd
	@* integer destination stride
	@*
	@* @param[in] nt
	@* integer transform block size
	@*
	@* @param[in] mode
	@* integer intraprediction mode
	@*
	@* @returns
	@*
	@* @remarks
	@* none
	@*
	@*******************************************************************************
	@*/
	@void ihevc_intra_pred_luma_horz(uword8 *pu1_ref,
	@ word32 src_strd,
	@ uword8 *pu1_dst,
	@ word32 dst_strd,
	@ word32 nt,
	@ word32 mode)
	@************variables vs registers***************************************
	@r0 => *pu1_ref
	@r1 => src_strd
	@r2 => *pu1_dst
	@r3 => dst_strd

	@ -----------------------------------------------------------------------------
	@ void ihevc_intra_pred_luma_horz_a9q(uword8 *pu1_ref, word32 src_strd,
	@                                     uword8 *pu1_dst, word32 dst_strd,
	@                                     word32 nt,       word32 mode)
	@
	@ NOTE(review): ihevc_intra_pred_luma_horz_a9q and the core_loop_* / end_func
	@ labels are also defined earlier in this file; the two definitions cannot be
	@ assembled together. presumably one copy is a paste/extraction duplicate -
	@ confirm which one to keep.
	@
	@ target : 32-bit arm + neon, gnu as syntax, aapcs calling convention
	@
	@ in     : r0            = pu1_ref
	@          r1            = src_strd        (never read)
	@          r2            = pu1_dst
	@          r3            = dst_strd
	@          [sp] at entry = nt
	@          [sp,#4]       = mode            (never read)
	@
	@ out    : nt x nt bytes written at pu1_dst, rows dst_strd bytes apart.
	@          row 'row' is filled with pu1_ref[two_nt - 1 - row]  (two_nt = 2*nt).
	@          for nt == 4, 8 and 16 the first row is instead
	@              sat_u8( pu1_ref[two_nt - 1] +
	@                      ((pu1_ref[two_nt + 1 + col] - pu1_ref[two_nt]) >> 1) )
	@          any other nt falls through to the 32 column wide loop.
	@
	@ clobbers: r2, r12, flags, d0-d7, d16-d17, d22-d28, d30-d31
	@ preserved: r4-r11 (stmfd/ldmfd) and d8-d15 (vpush/vpop).  d8-d15 are
	@          callee-saved under the aapcs and q4-q7 are used as scratch below,
	@          so they must be saved and restored around the body.
	@ -----------------------------------------------------------------------------

	@ bytes between sp and the first stack argument once the prologue is done:
	@ 10 core registers (40 bytes) + d8-d15 (64 bytes)
	.equ nt_offset, 104

	.text
	.align 4




	.globl ihevc_intra_pred_luma_horz_a9q

	.type ihevc_intra_pred_luma_horz_a9q, %function

	ihevc_intra_pred_luma_horz_a9q:

	    stmfd       sp!, {r4-r12, r14}          @save core registers and the return address
	    vpush       {d8-d15}                    @save callee-saved neon registers (q4-q7 are scratch below)

	    ldr         r4,[sp,#nt_offset]          @loads nt
	    @ldr        r5,[sp,#nt_offset + 4]      @loads mode

	    lsl         r6,r4,#1                    @two_nt

	    add         r12,r0,r6                   @&pu1_ref[two_nt]
	    cmp         r4,#4                       @if nt == 4
	    beq         core_loop_4

	    cmp         r4,#8                       @if nt == 8
	    beq         core_loop_8

	    cmp         r4,#16                      @if nt == 16
	    beq         core_loop_16
	    sub         r12,r12,#16                 @&pu1_ref[two_nt - 16], left samples of rows 0-15
	    add         r9,r2,#16                   @r9 = second destination pointer, columns 16-31

	@ each pass writes 16 rows of 32 bytes; r4 counts the rows still to be written
	core_loop_32:
	    vld1.8      {q0},[r12]                  @load 16 values. d1[7] will have the 1st value.

	    vdup.8      q1,d1[7]                    @duplicate the i value.

	    vdup.8      q2,d1[6]                    @duplicate the ii value.
	    vdup.8      q3,d1[5]                    @duplicate the iii value.
	    vst1.8      {q1},[r2],r3                @store in 1st row 0-16 columns
	    vst1.8      {q1},[r9],r3                @store in 1st row 16-32 columns

	    vdup.8      q4,d1[4]
	    vst1.8      {q2},[r2],r3
	    vst1.8      {q2},[r9],r3

	    vdup.8      q1,d1[3]
	    vst1.8      {q3},[r2],r3
	    vst1.8      {q3},[r9],r3

	    vdup.8      q2,d1[2]
	    vst1.8      {q4},[r2],r3
	    vst1.8      {q4},[r9],r3

	    vdup.8      q3,d1[1]
	    vst1.8      {q1},[r2],r3
	    vst1.8      {q1},[r9],r3

	    vdup.8      q4,d1[0]
	    vst1.8      {q2},[r2],r3
	    vst1.8      {q2},[r9],r3

	    vdup.8      q1,d0[7]
	    vst1.8      {q3},[r2],r3
	    vst1.8      {q3},[r9],r3

	    vdup.8      q2,d0[6]
	    vst1.8      {q4},[r2],r3
	    vst1.8      {q4},[r9],r3

	    vdup.8      q3,d0[5]
	    vst1.8      {q1},[r2],r3
	    vst1.8      {q1},[r9],r3

	    vdup.8      q4,d0[4]
	    vst1.8      {q2},[r2],r3
	    vst1.8      {q2},[r9],r3

	    vdup.8      q1,d0[3]
	    vst1.8      {q3},[r2],r3
	    vst1.8      {q3},[r9],r3

	    vdup.8      q2,d0[2]
	    vst1.8      {q4},[r2],r3
	    vst1.8      {q4},[r9],r3

	    vdup.8      q3,d0[1]
	    vst1.8      {q1},[r2],r3
	    vst1.8      {q1},[r9],r3
	    sub         r12,r12,#16                 @step back to the 16 left samples of the next 16 rows

	    vdup.8      q4,d0[0]
	    vst1.8      {q2},[r2],r3
	    vst1.8      {q2},[r9],r3

	    subs        r4,r4,#16                   @decrement the loop count by 16
	    vst1.8      {q3},[r2],r3
	    vst1.8      {q3},[r9],r3

	    vst1.8      {q4},[r2],r3
	    vst1.8      {q4},[r9],r3
	    bgt         core_loop_32
	    vpop        {d8-d15}                    @restore callee-saved neon registers
	    ldmfd       sp!,{r4-r12,r15}            @restore core registers and return (pc <- saved lr)

	core_loop_16:
	    ldrb        lr,[r12],#1                 @pu1_ref[two_nt]
	    vld1.8      {q15},[r12]                 @pu1_ref[two_nt + 1 + col]

	    vdup.8      d28,lr
	    sub         r12,r12,#17                 @&pu1_ref[two_nt - 16]
	    vld1.8      {q0},[r12]                  @16 left samples, d1[7] = pu1_ref[two_nt - 1]
	    vdup.8      d26,d1[7]
	    vmovl.u8    q13,d26                     @pu1_ref[two_nt - 1] widened to 16 bit

	    vdup.8      q1,d1[6]
	    vsubl.u8    q12,d30,d28                 @pu1_ref[two_nt + 1 + col] - pu1_ref[two_nt], col 0-7

	    vdup.8      q2,d1[5]
	    vshr.s16    q12,q12,#1

	    vdup.8      q3,d1[4]
	    vqadd.s16   q11,q13,q12

	    vdup.8      q4,d1[3]
	    vqmovun.s16 d22,q11                     @saturate to unsigned 8 bit

	    vst1.8      {d22},[r2]!                 @row 0, columns 0-7 (r2 += 8)

	    vdup.8      q5,d1[2]
	    vsubl.u8    q12,d31,d28                 @same difference for col 8-15

	    vdup.8      q6,d1[1]
	    vshr.s16    q12,q12,#1

	    vdup.8      q7,d1[0]
	    vqadd.s16   q11,q13,q12

	    vdup.8      q8,d0[7]
	    vqmovun.s16 d22,q11

	    vst1.8      {d22},[r2],r3               @row 0, columns 8-15
	    sub         r2,r2,#8                    @undo the +8 above: r2 = start of row 1

	    vst1.8      {q1},[r2],r3                @rows 1-15: one replicated left sample per row

	    vst1.8      {q2},[r2],r3
	    vst1.8      {q3},[r2],r3
	    vst1.8      {q4},[r2],r3

	    vdup.8      q1,d0[6]
	    vst1.8      {q5},[r2],r3

	    vdup.8      q2,d0[5]
	    vst1.8      {q6},[r2],r3

	    vdup.8      q3,d0[4]
	    vst1.8      {q7},[r2],r3

	    vdup.8      q4,d0[3]
	    vst1.8      {q8},[r2],r3

	    vdup.8      q5,d0[2]
	    vst1.8      {q1},[r2],r3

	    vdup.8      q6,d0[1]
	    vst1.8      {q2},[r2],r3

	    vdup.8      q7,d0[0]
	    vst1.8      {q3},[r2],r3

	    vst1.8      {q4},[r2],r3
	    vst1.8      {q5},[r2],r3
	    vst1.8      {q6},[r2],r3
	    vst1.8      {q7},[r2],r3

	    vpop        {d8-d15}                    @restore callee-saved neon registers
	    ldmfd       sp!,{r4-r12,r15}            @restore core registers and return (pc <- saved lr)


	core_loop_8:
	    ldrb        lr,[r12]                    @pu1_ref[two_nt]
	    add         r12,r12,#1                  @pu1_ref[two_nt + 1]
	    vld1.8      {d30},[r12]                 @pu1_ref[two_nt + 1 + col]

	    sub         r12,r12,#9                  @&pu1_ref[two_nt - 8]
	    vld1.8      {d0},[r12]                  @8 left samples, d0[7] = pu1_ref[two_nt - 1]
	    vdup.8      d26,d0[7]
	    vdup.8      d28,lr

	    vdup.8      d3,d0[6]
	    vmovl.u8    q13,d26                     @pu1_ref[two_nt - 1] widened to 16 bit

	    vdup.8      d4,d0[5]
	    vsubl.u8    q12,d30,d28                 @pu1_ref[two_nt + 1 + col] - pu1_ref[two_nt]

	    vdup.8      d5,d0[4]
	    vshr.s16    q12,q12,#1

	    vdup.8      d6,d0[3]
	    vqadd.s16   q11,q13,q12

	    vdup.8      d7,d0[2]
	    vqmovun.s16 d22,q11                     @saturate to unsigned 8 bit

	    vst1.8      {d22},[r2],r3               @row 0
	    vst1.8      {d3},[r2],r3                @rows 1-7: one replicated left sample per row

	    vdup.8      d8,d0[1]
	    vst1.8      {d4},[r2],r3
	    vst1.8      {d5},[r2],r3

	    vdup.8      d9,d0[0]
	    vst1.8      {d6},[r2],r3
	    vst1.8      {d7},[r2],r3

	    vst1.8      {d8},[r2],r3
	    vst1.8      {d9},[r2],r3
	    vpop        {d8-d15}                    @restore callee-saved neon registers
	    ldmfd       sp!,{r4-r12,r15}            @restore core registers and return (pc <- saved lr)


	core_loop_4:
	    ldrb        lr,[r12]                    @pu1_ref[two_nt]
	    add         r12,r12,#1                  @pu1_ref[two_nt + 1]
	    vld1.8      {d30},[r12]                 @pu1_ref[two_nt + 1 + col] (8 bytes read, low 4 stored)

	    sub         r12,r12,#5                  @&pu1_ref[two_nt - 4]
	    vld1.8      {d0},[r12]                  @d0[3] = pu1_ref[two_nt - 1]
	    vdup.8      d28,lr
	    vdup.8      d26,d0[3]
	    vmovl.u8    q13,d26                     @pu1_ref[two_nt - 1] widened to 16 bit

	    vdup.8      d3,d0[2]
	    vsubl.u8    q12,d30,d28                 @pu1_ref[two_nt + 1 + col] - pu1_ref[two_nt]

	    vdup.8      d4,d0[1]
	    vshr.s16    q12,q12,#1

	    vdup.8      d5,d0[0]
	    vqadd.s16   q11,q13,q12

	    vqmovun.s16 d22,q11                     @saturate to unsigned 8 bit

	    vst1.32     {d22[0]},[r2],r3            @row 0, 4 bytes
	    vst1.32     {d3[0]},[r2],r3             @rows 1-3: one replicated left sample per row
	    vst1.32     {d4[0]},[r2],r3
	    vst1.32     {d5[0]},[r2],r3

	    vpop        {d8-d15}                    @restore callee-saved neon registers
	    ldmfd       sp!,{r4-r12,r15}            @restore core registers and return (pc <- saved lr)
	end_func: