@ common/arm/ideint_cac_a9.s - platform/external/libmpeg2 - Git at Google

 @/******************************************************************************
 @ *
 @ * Copyright (C) 2015 The Android Open Source Project
 @ *
 @ * Licensed under the Apache License, Version 2.0 (the "License");
 @ * you may not use this file except in compliance with the License.
 @ * You may obtain a copy of the License at:
 @ *
 @ * http://www.apache.org/licenses/LICENSE-2.0
 @ *
 @ * Unless required by applicable law or agreed to in writing, software
 @ * distributed under the License is distributed on an "AS IS" BASIS,
 @ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 @ * See the License for the specific language governing permissions and
 @ * limitations under the License.
 @ *
 @ *****************************************************************************
 @ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
 @*/

 @******************************************************************************
 @*
 @* @brief
 @*  This file contains definitions of routines for the combing artifact check (CAC)
 @*
 @* @author
 @*  Ittiam
 @*
 @* @par List of Functions:
 @*  - ideint_cac_8x8_a9()
 @*
 @* @remarks
 @*  None
 @*
 @*******************************************************************************


 @******************************************************************************
 @*
 @*  @brief Calculates Combing Artifact
 @*
 @*  @par   Description
 @*   This function calculates the combing artifact check (CAC) for the given
 @*   two fields. Four rows of eight pixels are read from each field. The block
 @*   is evaluated as two sub-blocks (columns 0-3 and columns 4-7). For each
 @*   sub-block two measures are accumulated:
 @*     adj - differences between the top field and the bottom field
 @*     alt - differences between consecutive rows (row 0 vs row 1 and
 @*           row 2 vs row 3)
 @*   Each measure has a row-sum based part and a column-average based part.
 @*
 @* @param[in] pu1_top
 @*  UWORD8 pointer to top field (r0)
 @*
 @* @param[in] pu1_bot
 @*  UWORD8 pointer to bottom field (r1)
 @*
 @* @param[in] top_strd
 @*  Top field stride (r2)
 @*
 @* @param[in] bot_strd
 @*  Bottom field stride (r3)
 @*
 @* @returns
 @*  r0 = 1 if (alt + (alt >> 3) + 4) < adj for at least one of the two
 @*  sub-blocks, 0 otherwise
 @*
 @* @remarks
 @*  Leaf routine, no stack usage.
 @*  Modifies r0, r1, q0-q3, q8-q10, q12-q15 and the condition flags.
 @*  d8-d15 are callee-saved under the AAPCS and are deliberately not used.
 @*
 @******************************************************************************

     .global ideint_cac_8x8_a9

 ideint_cac_8x8_a9:

     @ No core registers beyond r0-r3 are used and nothing is called from
     @ here, so no stack frame is needed.

     @ Load first row of top
     vld1.u8     d28,    [r0],   r2

     @ Load first row of bottom
     vld1.u8     d29,    [r1],   r3

     @ Load second row of top
     vld1.u8     d30,    [r0],   r2

     @ Load second row of bottom
     vld1.u8     d31,    [r1],   r3


     @ Calculate row based adj and alt values
     @ Get row sums: each 8 pixel row is reduced to two 32 bit sums
     @ (pixels 0-3 and pixels 4-7)
     vpaddl.u8   q0,     q14

     vpaddl.u8   q1,     q15

     vpaddl.u16  q0,     q0

     vpaddl.u16  q1,     q1

     @ q0 = half-row sums of row 0 (d0: top, d1: bottom)
     @ q1 = half-row sums of row 1 (d2: top, d3: bottom)
     @ Pack q0 and q1 into a single register. A sum of four bytes is at most
     @ 4 * 255 = 1020, so it fits in 16 bits: q0 goes into the low halfwords
     @ and q1 into the high halfwords.

     vshl.u32    q8,     q1,     #16
     vorr.u32    q8,     q0,     q8
     @ q8 now contains 8 sums (d16: top rows 0 and 1, d17: bottom rows 0 and 1)

     @ Load third row of top
     vld1.u8     d24,    [r0],   r2

     @ Load third row of bottom
     vld1.u8     d25,    [r1],   r3

     @ Load fourth row of top
     vld1.u8     d26,    [r0],   r2

     @ Load fourth row of bottom
     vld1.u8     d27,    [r1],   r3

     @ Get row sums
     vpaddl.u8   q2,     q12

     vpaddl.u8   q3,     q13

     vpaddl.u16  q2,     q2

     vpaddl.u16  q3,     q3
     @ q2 = half-row sums of row 2 (d4: top, d5: bottom)
     @ q3 = half-row sums of row 3 (d6: top, d7: bottom)
     @ Pack q2 and q3 into a single register (sum does not exceed 16 bits)

     vshl.u32    q9,     q3,     #16
     vorr.u32    q9,     q2,     q9
     @ q9 now contains 8 sums (d18: top rows 2 and 3, d19: bottom rows 2 and 3)

     @ Compute absolute diff between top and bottom row sums
     vabd.u16    d16,    d16,    d17
     vabd.u16    d17,    d18,    d19

     @ RSUM_CSUM_THRESH
     vmov.u16    q9,     #20

     @ Eliminate values smaller than RSUM_CSUM_THRESH
     @ (vcge yields an all-ones mask for the lanes that are kept)
     vcge.u16    q10,    q8,     q9
     vand.u16    q10,    q8,     q10
     @ q10 now contains 8 absolute diff of sums above the threshold


     @ Compute adj
     vadd.u16    d20,    d20,    d21

     @ d20 has four adj values for two sub-blocks
     @ (lanes 0,1: columns 0-3, lanes 2,3: columns 4-7)

     @ Compute alt: |row 0 - row 1| + |row 2 - row 3| on the half-row sums,
     @ then add the top field and bottom field results together
     vabd.u32    q0,     q0,     q1
     vabd.u32    q2,     q2,     q3

     vadd.u32    q0,     q0,     q2
     vadd.u32    d21,    d0,     d1
     @ d21 has two values for two sub-blocks


     @ Calculate column based adj and alt values

     @ Rounded per-column average of the four rows
     @ (d0: top field, d1: bottom field)
     vrhadd.u8   q0,     q14,    q15
     vrhadd.u8   q1,     q12,    q13
     vrhadd.u8   q0,     q0,     q1

     vabd.u8     d0,     d0,     d1

     @ RSUM_CSUM_THRESH >> 2
     @ d2 is free here (q1 was consumed by the vrhadd above). A callee-saved
     @ register from d8-d15 must not be used without saving it.
     vmov.u8     d2,     #5

     @ Eliminate values smaller than RSUM_CSUM_THRESH >> 2
     vcge.u8     d1,     d0,     d2
     vand.u8     d0,     d0,     d1
     @ d0 now contains 8 absolute diff of column averages above the threshold


     @ Add neighbouring columns and scale by 4, since the values were
     @ averages over four rows rather than sums
     vpaddl.u8   d0,     d0
     vshl.u16    d0,     d0,     #2

     @ Add row based adj
     vadd.u16    d20,    d0,     d20

     vpaddl.u16  d20,    d20
     @ d20 now contains 2 adj values


     @ d0 = per-column rounded average of rows 0 and 2 of both fields
     vrhadd.u8   d0,     d28,    d29
     vrhadd.u8   d2,     d24,    d25
     vrhadd.u8   d0,     d0,     d2

     @ d1 = per-column rounded average of rows 1 and 3 of both fields
     vrhadd.u8   d1,     d30,    d31
     vrhadd.u8   d3,     d26,    d27
     vrhadd.u8   d1,     d1,     d3

     vabd.u8     d0,     d0,     d1
     vpaddl.u8   d0,     d0

     vshl.u16    d0,     d0,     #2
     vpaddl.u16  d0,     d0
     vadd.u32    d21,    d0,     d21


     @ d21 now contains 2 alt values

     @ SAD_BIAS_MULT_SHIFT: alt += alt >> 3
     vshr.u32    d0,     d21,    #3
     vadd.u32    d21,    d21,    d0

     @ SAD_BIAS_ADDITIVE >> 1: alt += 4
     vmov.u32    d0,     #4
     vadd.u32    d21,    d21,    d0

     @ Per sub-block mask: all ones where alt < adj
     vclt.u32    d0,     d21,    d20
     @ Add the two masks. The low word of the sum is non-zero if and only if
     @ at least one mask is set.
     vpaddl.u32  d0,     d0

     vmov.u32    r0,     d0[0]
     cmp         r0,     #0
     movne       r0,     #1
     bx          lr
	@/******************************************************************************
	@ *
	@ * Copyright (C) 2015 The Android Open Source Project
	@ *
	@ * Licensed under the Apache License, Version 2.0 (the "License");
	@ * you may not use this file except in compliance with the License.
	@ * You may obtain a copy of the License at:
	@ *
	@ * http://www.apache.org/licenses/LICENSE-2.0
	@ *
	@ * Unless required by applicable law or agreed to in writing, software
	@ * distributed under the License is distributed on an "AS IS" BASIS,
	@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	@ * See the License for the specific language governing permissions and
	@ * limitations under the License.
	@ *
	@ *****************************************************************************
	@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
	@*/

	@******************************************************************************
	@*
	@* @brief
	@* This file contains definitions of routines for the combing artifact check (CAC)
	@*
	@* @author
	@* Ittiam
	@*
	@* @par List of Functions:
	@* - ideint_cac_8x8_a9()
	@*
	@* @remarks
	@* None
	@*
	@*******************************************************************************


	@******************************************************************************
	@*
	@* @brief Calculates Combing Artifact
	@*
	@* @par Description
	@* This function calculates the combing artifact check (CAC) for the given
	@* two fields. Four rows of eight pixels are read from each field and the
	@* result is evaluated separately for columns 0-3 and columns 4-7.
	@*
	@* @param[in] pu1_top
	@* UWORD8 pointer to top field (r0)
	@*
	@* @param[in] pu1_bot
	@* UWORD8 pointer to bottom field (r1)
	@*
	@* @param[in] top_strd
	@* Top field stride (r2)
	@*
	@* @param[in] bot_strd
	@* Bottom field stride (r3)
	@*
	@* @returns
	@* r0 = 1 if (alt + (alt >> 3) + 4) < adj for at least one of the two
	@* sub-blocks, 0 otherwise
	@*
	@* @remarks
	@* Leaf routine, no stack usage.
	@* Modifies r0, r1, q0-q3, q8-q10, q12-q15 and the condition flags.
	@* NOTE(review): a definition with this same global label appears earlier
	@* in this file. GNU as rejects a symbol that is defined twice, so one of
	@* the two copies presumably has to be removed - confirm.
	@*
	@******************************************************************************

	.global ideint_cac_8x8_a9

	ideint_cac_8x8_a9:

	@ Only r0-r3 are used and nothing is called, so no registers are pushed.

	@ Rows 0 and 1 of both fields: q14 = {top0, bot0}, q15 = {top1, bot1}
	vld1.u8 d28, [r0], r2
	vld1.u8 d29, [r1], r3
	vld1.u8 d30, [r0], r2
	vld1.u8 d31, [r1], r3


	@ Row based adj and alt values
	@ Reduce every row to two 32 bit sums (pixels 0-3 and pixels 4-7)
	vpaddl.u8 q0, q14
	vpaddl.u8 q1, q15
	vpaddl.u16 q0, q0
	vpaddl.u16 q1, q1

	@ A sum of four bytes is at most 1020, so q0 (row 0) and q1 (row 1) can
	@ share one register as 16 bit lanes: q0 low halfwords, q1 high halfwords
	vshl.u32 q8, q1, #16
	vorr.u32 q8, q0, q8
	@ q8 now contains 8 sums (d16: top, d17: bottom)

	@ Rows 2 and 3 of both fields: q12 = {top2, bot2}, q13 = {top3, bot3}
	vld1.u8 d24, [r0], r2
	vld1.u8 d25, [r1], r3
	vld1.u8 d26, [r0], r2
	vld1.u8 d27, [r1], r3

	@ Same reduction for rows 2 and 3
	vpaddl.u8 q2, q12
	vpaddl.u8 q3, q13
	vpaddl.u16 q2, q2
	vpaddl.u16 q3, q3

	@ Pack q2 (row 2) and q3 (row 3) the same way
	vshl.u32 q9, q3, #16
	vorr.u32 q9, q2, q9
	@ q9 now contains 8 sums (d18: top, d19: bottom)

	@ Absolute diff between top and bottom row sums
	vabd.u16 d16, d16, d17
	vabd.u16 d17, d18, d19

	@ RSUM_CSUM_THRESH
	vmov.u16 q9, #20

	@ Zero every diff that is smaller than RSUM_CSUM_THRESH
	vcge.u16 q10, q8, q9
	vand.u16 q10, q8, q10
	@ q10 now contains 8 absolute diff of sums above the threshold


	@ Compute adj
	vadd.u16 d20, d20, d21
	@ d20 has four adj values for two sub-blocks

	@ Compute alt: |row 0 - row 1| + |row 2 - row 3|, top plus bottom
	vabd.u32 q0, q0, q1
	vabd.u32 q2, q2, q3
	vadd.u32 q0, q0, q2
	vadd.u32 d21, d0, d1
	@ d21 has two values for two sub-blocks


	@ Column based adj and alt values

	@ Rounded per-column average of the four rows (d0: top, d1: bottom)
	vrhadd.u8 q0, q14, q15
	vrhadd.u8 q1, q12, q13
	vrhadd.u8 q0, q0, q1

	vabd.u8 d0, d0, d1

	@ RSUM_CSUM_THRESH >> 2
	@ Held in d2, which is free at this point. d8-d15 are callee-saved under
	@ the AAPCS and must not be overwritten without being saved.
	vmov.u8 d2, #5

	@ Zero every diff that is smaller than RSUM_CSUM_THRESH >> 2
	vcge.u8 d1, d0, d2
	vand.u8 d0, d0, d1
	@ d0 now contains 8 absolute diff of column averages above the threshold


	@ Pair up columns and scale the averages over four rows back up by 4
	vpaddl.u8 d0, d0
	vshl.u16 d0, d0, #2

	@ Add row based adj
	vadd.u16 d20, d0, d20
	vpaddl.u16 d20, d20
	@ d20 now contains 2 adj values


	@ d0 = per-column rounded average of rows 0 and 2 of both fields
	vrhadd.u8 d0, d28, d29
	vrhadd.u8 d2, d24, d25
	vrhadd.u8 d0, d0, d2

	@ d1 = per-column rounded average of rows 1 and 3 of both fields
	vrhadd.u8 d1, d30, d31
	vrhadd.u8 d3, d26, d27
	vrhadd.u8 d1, d1, d3

	vabd.u8 d0, d0, d1
	vpaddl.u8 d0, d0
	vshl.u16 d0, d0, #2
	vpaddl.u16 d0, d0
	vadd.u32 d21, d0, d21
	@ d21 now contains 2 alt values

	@ SAD_BIAS_MULT_SHIFT: alt += alt >> 3
	vshr.u32 d0, d21, #3
	vadd.u32 d21, d21, d0

	@ SAD_BIAS_ADDITIVE >> 1: alt += 4
	vmov.u32 d0, #4
	vadd.u32 d21, d21, d0

	@ All-ones mask per sub-block where alt < adj. The low word of the sum of
	@ both masks is non-zero if and only if at least one mask is set.
	vclt.u32 d0, d21, d20
	vpaddl.u32 d0, d0

	vmov.u32 r0, d0[0]
	cmp r0, #0
	movne r0, #1
	bx lr