/* cpu_ref/rsCpuIntrinsics_neon_Convolve.S - platform/frameworks/rs - Git at Google */

 /*
  * Copyright (C) 2012 The Android Open Source Project
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
  * You may obtain a copy of the License at
  *
  *      http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */

 /*
         r0 = dst
         r1 = y0 base pointer
         r2 = y1 base pointer
         r3 = y2 base pointer
         [sp, #0] = coeffs       (stack argument, offset on entry)
         [sp, #4] = length / 2   (stack argument, offset on entry)
 */

 /*
  * ENTRY(f): switch to the .text section, export f as a global symbol of
  * type function, define the label f and open an unwind region (.fnstart).
  * NOTE(review): ".align 0" requests no extra alignment if the operand is
  * read as a power of two, as GNU as does on ARM - confirm this is intended.
  *
  * END(f): close the unwind region (.fnend) and set the symbol size of f to
  * the distance between the label f and the current location.
  */
 #define ENTRY(f) .text; .align 0; .globl f; .type f,#function; f: .fnstart
 #define END(f) .fnend; .size f, .-f;

 /*
  * rsdIntrinsicConvolve3x3_K
  *
  * 3x3 convolution over three input rows.  Every loop iteration stores
  * 8 output bytes (two pixels, going by the comments in this file).
  *
  * Arguments on entry:
  *         r0       = dst, advanced by 8 bytes per iteration
  *         r1       = y0 base pointer
  *         r2       = y1 base pointer
  *         r3       = y2 base pointer
  *         [sp, #0] = coeffs; 16 halfwords are loaded, the first 9 are used
  *         [sp, #4] = loop count (length / 2)
  *
  * Each iteration reads 16 bytes from every row but advances the row
  * pointers by only 8, so each row must be readable 8 bytes beyond the
  * last group that is processed.
  *
  * A count <= 0 (as a signed 32-bit value) returns without reading the
  * rows and without writing to dst.
  *
  * Register use:
  *         r4      = scratch, then loop counter
  *         r5      = #8, the per-iteration pointer stride
  *         q0-q1   = coefficients
  *         q2-q7   = source bytes widened to 16 bit
  *         q8-q9   = 32-bit accumulators
  *         q13-q15 = raw source bytes
  */
 ENTRY(rsdIntrinsicConvolve3x3_K)
         push            {r4-r8, r10, r11, lr}   /* 8 words  = 32 bytes */
         vpush           {q4-q7}                 /* 4 q-regs = 64 bytes */

         /* The stack arguments now sit 32+64 bytes above sp.  Get the
            coeffs pointer and load the coefficients in the q0, q1 NEON
            registers */
         ldr             r4, [sp, #32+64]
         vld1.16         {q0, q1}, [r4]

         /* Get count from the stack */
         ldr             r4, [sp, #36+64]

         /* The loop below tests the counter only after decrementing it, so
            a zero count would wrap around and run 2^32 iterations.  Skip
            the loop entirely when there is nothing to do. */
         cmp             r4, #0
         ble             2f

         /* Load the frequently used immediate in a register */
         mov             r5, #8

 1:
         /* Load 16 bytes per row and post-increase the address by r5=#8 */
         vld1.8          {q13}, [r1], r5
         vld1.8          {q14}, [r2], r5
         vld1.8          {q15}, [r3], r5

         /* Signal memory for data that will be used in the loop after the next */
         pld             [r1, r5]
         pld             [r2, r5]
         pld             [r3, r5]

         /* Promote the 8 bit channels to 16 bit */
         vmovl.u8        q2, d26
         vmovl.u8        q3, d27
         vmovl.u8        q4, d28
         vmovl.u8        q5, d29
         vmovl.u8        q6, d30
         vmovl.u8        q7, d31

 /*
         The widened source array is (one row of the window per line,
         four consecutive source bytes per d register)
         y0: d4,  d5,  d6,  d7
         y1: d8,  d9,  d10, d11
         y2: d12, d13, d14, d15
 */

         /* First output: columns 0..2 of each row times coefficients 0..8 */
         vmull.s16       q8, d4, d0[0]
         vmlal.s16       q8, d5, d0[1]
         vmlal.s16       q8, d6, d0[2]
         vmlal.s16       q8, d8, d0[3]
         vmlal.s16       q8, d9, d1[0]
         vmlal.s16       q8, d10, d1[1]
         vmlal.s16       q8, d12, d1[2]
         vmlal.s16       q8, d13, d1[3]
         vmlal.s16       q8, d14, d2[0]

         /* Second output: same coefficients, window moved one column right */
         vmull.s16       q9, d5, d0[0]
         vmlal.s16       q9, d6, d0[1]
         vmlal.s16       q9, d7, d0[2]
         vmlal.s16       q9, d9, d0[3]
         vmlal.s16       q9, d10, d1[0]
         vmlal.s16       q9, d11, d1[1]
         vmlal.s16       q9, d13, d1[2]
         vmlal.s16       q9, d14, d1[3]
         vmlal.s16       q9, d15, d2[0]

         /* Shift every 32 bit sum right by 8 (truncating) and narrow it to
            16 bit; both results end up side by side in q8 */
         vshrn.i32       d16, q8, #8
         vshrn.i32       d17, q9, #8

         /* Saturate to unsigned 8 bit and store the 8 result bytes */
         vqmovun.s16     d16, q8
         vst1.8          d16, [r0]!

         /* Are we done yet? */
         subs            r4, r4, #1
         bne             1b

 2:
         /* We're done, bye! */
         vpop            {q4-q7}
         pop             {r4-r8, r10, r11, lr}
         bx              lr
 END(rsdIntrinsicConvolve3x3_K)


 /* Convolve 5x5 */

 /*
  * rsdIntrinsicConvolve5x5_K
  *
  * 5x5 convolution over five input rows.  Every loop iteration stores
  * 8 output bytes (two pixels, going by the comments in this file).
  *
  * Arguments on entry:
  *         r0        = dst, advanced by 8 bytes per iteration
  *         r1        = y0 base pointer
  *         r2        = y1 base pointer
  *         r3        = y2 base pointer
  *         [sp, #0]  = y3 base pointer (loaded into r4)
  *         [sp, #4]  = y4 base pointer (loaded into r5)
  *         [sp, #8]  = coeffs; 28 halfwords are loaded, the first 25 are used
  *         [sp, #12] = loop count (kept in r6)
  *
  * Each iteration reads 24 bytes from every row but advances the row
  * pointers by only 8, so each row must be readable 16 bytes beyond the
  * last group that is processed.
  *
  * A count <= 0 (as a signed 32-bit value) returns without reading the
  * rows and without writing to dst.
  *
  * Register use:
  *         r6      = coeffs pointer, then loop counter
  *         r7      = #8, the per-iteration pointer stride
  *         d0-d6   = coefficients
  *         q4-q5   = 32-bit accumulators
  *         q9-q14  = source bytes widened to 16 bit (d24-d29 first hold
  *                   the raw bytes)
  *         q15     = 0x7f in every 32-bit lane
  */
 ENTRY(rsdIntrinsicConvolve5x5_K)
         push        {r4-r7, lr}                /* 5 words  = 20 bytes */
         vpush       {q4-q7}                    /* 4 q-regs = 64 bytes */

         /* The stack arguments now sit 20+64 bytes above sp */

         /* load y3 in r4 */
         ldr     r4, [sp, #20 + 64]

         /* load y4 in r5 */
         ldr     r5, [sp, #24 + 64]

         /* Load the coefficients pointer */
         ldr     r6, [sp, #28 + 64]

         /* Create the coefficients vector */
         vld1.16     {d0, d1, d2, d3}, [r6]!
         vld1.16     {d4, d5, d6}, [r6]

         /* Bias added to every sum before the final shift */
         vmov.u32  q15, #0x7f

         /* load the count */
         ldr     r6, [sp, #32 + 64]

         /* The loop below tests the counter only after decrementing it, so
            a zero count would wrap around and run 2^32 iterations.  Skip
            the loop entirely when there is nothing to do. */
         cmp     r6, #0
         ble     2f

         /* Load the frequently used immediate in a register */
         mov     r7, #8

 1:
         /* Load 24 bytes from each of the first two rows and post-increase
            the address by r7=#8 */
         vld1.8  {d24, d25, d26}, [r1], r7      @  y0 ( y - 2 )
         vld1.8  {d27, d28, d29}, [r2], r7      @  y1 ( y - 1 )

         /* Signal memory for data that will be used in the loop after the next */
         pld         [r1, r7]
         pld         [r2, r7]

         /* Promoting the 8bit channels to 16bit.
            q12-q14 are the same registers as d24-d29, which still hold raw
            bytes: each vmovl only overwrites bytes that were already
            widened, so the order of these six instructions must be kept. */
         vmovl.u8 q9,  d24
         vmovl.u8 q10, d25
         vmovl.u8 q11, d26
         vmovl.u8 q12, d27
         vmovl.u8 q13, d28
         vmovl.u8 q14, d29

 /*
         Widened source, four consecutive source bytes per d register:
         first row:   d18, d19, d20, d21, d22, d23
         second row:  d24, d25, d26, d27, d28, d29
 */
         /* First output: columns 0..4, coefficients 0..9 */
         vmull.s16 q4, d18, d0[0]
         vmlal.s16 q4, d19, d0[1]
         vmlal.s16 q4, d20, d0[2]
         vmlal.s16 q4, d21, d0[3]
         vmlal.s16 q4, d22, d1[0]

         vmlal.s16 q4, d24, d1[1]
         vmlal.s16 q4, d25, d1[2]
         vmlal.s16 q4, d26, d1[3]
         vmlal.s16 q4, d27, d2[0]
         vmlal.s16 q4, d28, d2[1]

         /* Second output: columns 1..5, same coefficients */
         vmull.s16 q5, d19, d0[0]
         vmlal.s16 q5, d20, d0[1]
         vmlal.s16 q5, d21, d0[2]
         vmlal.s16 q5, d22, d0[3]
         vmlal.s16 q5, d23, d1[0]

         vmlal.s16 q5, d25, d1[1]
         vmlal.s16 q5, d26, d1[2]
         vmlal.s16 q5, d27, d1[3]
         vmlal.s16 q5, d28, d2[0]
         vmlal.s16 q5, d29, d2[1]


         /* Next 2 rows */
         /* Load 24 bytes from each row and post-increase the address by r7=#8 */
         vld1.8  {d24, d25, d26}, [r3], r7      @  y2 ( y )
         vld1.8  {d27, d28, d29}, [r4], r7      @  y3 ( y + 1 )

         /* Signal memory for data that will be used in the loop after the next */
         pld         [r3, r7]
         pld         [r4, r7]

         /* Promoting the 8bit channels to 16bit (same ordering constraint
            as for the first two rows) */
         vmovl.u8 q9,  d24
         vmovl.u8 q10, d25
         vmovl.u8 q11, d26
         vmovl.u8 q12, d27
         vmovl.u8 q13, d28
         vmovl.u8 q14, d29

 /*
         Widened source:
         third row:   d18, d19, d20, d21, d22, d23
         fourth row:  d24, d25, d26, d27, d28, d29
 */
         /* First output: columns 0..4, coefficients 10..19 */
         vmlal.s16 q4, d18, d2[2]
         vmlal.s16 q4, d19, d2[3]
         vmlal.s16 q4, d20, d3[0]
         vmlal.s16 q4, d21, d3[1]
         vmlal.s16 q4, d22, d3[2]

         vmlal.s16 q4, d24, d3[3]
         vmlal.s16 q4, d25, d4[0]
         vmlal.s16 q4, d26, d4[1]
         vmlal.s16 q4, d27, d4[2]
         vmlal.s16 q4, d28, d4[3]

         /* Second output: columns 1..5, same coefficients */
         vmlal.s16 q5, d19, d2[2]
         vmlal.s16 q5, d20, d2[3]
         vmlal.s16 q5, d21, d3[0]
         vmlal.s16 q5, d22, d3[1]
         vmlal.s16 q5, d23, d3[2]

         vmlal.s16 q5, d25, d3[3]
         vmlal.s16 q5, d26, d4[0]
         vmlal.s16 q5, d27, d4[1]
         vmlal.s16 q5, d28, d4[2]
         vmlal.s16 q5, d29, d4[3]

         /* Last row */
         /* Load 24 bytes and post-increase the address by r7=#8 */
         vld1.8  {d24, d25, d26}, [r5], r7      @  y4 ( y + 2 )

         /* Signal memory for data that will be used in the loop after the next */
         pld         [r5, r7]

         /* Promoting the 8bit channels to 16bit */
         vmovl.u8 q9,  d24
         vmovl.u8 q10, d25
         vmovl.u8 q11, d26

 /*
         Widened source:
         fifth row:   d18, d19, d20, d21, d22, d23
 */

         /* First output: columns 0..4, coefficients 20..24 */
         vmlal.s16 q4, d18, d5[0]
         vmlal.s16 q4, d19, d5[1]
         vmlal.s16 q4, d20, d5[2]
         vmlal.s16 q4, d21, d5[3]
         vmlal.s16 q4, d22, d6[0]

         /* Second output: columns 1..5, same coefficients */
         vmlal.s16 q5, d19, d5[0]
         vmlal.s16 q5, d20, d5[1]
         vmlal.s16 q5, d21, d5[2]
         vmlal.s16 q5, d22, d5[3]
         vmlal.s16 q5, d23, d6[0]


         /* Add the 0x7f bias held in q15 to every sum.
            NOTE(review): vrshrn below is a rounding shift and adds 1 << 7
            of its own, so 0xff in total is added before the shift; confirm
            that this double bias is intended. */
         vadd.i32 q4, q4, q15
         vadd.i32 q5, q5, q15

 /*      Narrow it to a d-reg 32 -> 16 bit */
         vrshrn.i32 d8, q4, #8
         vrshrn.i32 d9, q5, #8


 /*      Pack 16 -> 8 bit, saturate, put two pixels into D reg */
         vqmovun.s16 d8, q4

         vst1.8 d8, [r0]!           @ store the output and increase the address of r0

         /* Are we done? */
         subs r6, r6, #1
         bne 1b

 2:
         /* Yup, bye */
         vpop        {q4-q7}
         pop         {r4-r7, lr}
         bx          lr

 END(rsdIntrinsicConvolve5x5_K)
	/*
	* Copyright (C) 2012 The Android Open Source Project
	*
	* Licensed under the Apache License, Version 2.0 (the "License");
	* you may not use this file except in compliance with the License.
	* You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/

	/*
	r0 = dst
	r1 = y0 base pointer
	r2 = y1 base pointer
	r3 = y2 base pointer
	[sp, #0] = coeffs       (stack argument, offset on entry)
	[sp, #4] = length / 2   (stack argument, offset on entry)
	*/

	/*
	* ENTRY(f): switch to the .text section, export f as a global symbol of
	* type function, define the label f and open an unwind region (.fnstart).
	* NOTE(review): ".align 0" requests no extra alignment if the operand is
	* read as a power of two, as GNU as does on ARM - confirm this is intended.
	*
	* END(f): close the unwind region (.fnend) and set the symbol size of f to
	* the distance between the label f and the current location.
	*/
	#define ENTRY(f) .text; .align 0; .globl f; .type f,#function; f: .fnstart
	#define END(f) .fnend; .size f, .-f;

	/*
	* rsdIntrinsicConvolve3x3_K
	*
	* 3x3 convolution over three input rows. Every loop iteration stores
	* 8 output bytes (two pixels, going by the comments in this file).
	*
	* Arguments on entry:
	* r0 = dst, advanced by 8 bytes per iteration
	* r1 = y0 base pointer
	* r2 = y1 base pointer
	* r3 = y2 base pointer
	* [sp, #0] = coeffs; 16 halfwords are loaded, the first 9 are used
	* [sp, #4] = loop count (length / 2)
	*
	* Each iteration reads 16 bytes from every row but advances the row
	* pointers by only 8, so each row must be readable 8 bytes beyond the
	* last group that is processed.
	*
	* A count <= 0 (as a signed 32-bit value) returns without reading the
	* rows and without writing to dst.
	*
	* Register use:
	* r4 = scratch, then loop counter
	* r5 = #8, the per-iteration pointer stride
	* q0-q1 = coefficients
	* q2-q7 = source bytes widened to 16 bit
	* q8-q9 = 32-bit accumulators
	* q13-q15 = raw source bytes
	*/
	ENTRY(rsdIntrinsicConvolve3x3_K)
	push {r4-r8, r10, r11, lr} /* 8 words = 32 bytes */
	vpush {q4-q7} /* 4 q-regs = 64 bytes */

	/* The stack arguments now sit 32+64 bytes above sp. Get the
	coeffs pointer and load the coefficients in the q0, q1 NEON
	registers */
	ldr r4, [sp, #32+64]
	vld1.16 {q0, q1}, [r4]

	/* Get count from the stack */
	ldr r4, [sp, #36+64]

	/* The loop below tests the counter only after decrementing it, so
	a zero count would wrap around and run 2^32 iterations. Skip
	the loop entirely when there is nothing to do. */
	cmp r4, #0
	ble 2f

	/* Load the frequently used immediate in a register */
	mov r5, #8

	1:
	/* Load 16 bytes per row and post-increase the address by r5=#8 */
	vld1.8 {q13}, [r1], r5
	vld1.8 {q14}, [r2], r5
	vld1.8 {q15}, [r3], r5

	/* Signal memory for data that will be used in the loop after the next */
	pld [r1, r5]
	pld [r2, r5]
	pld [r3, r5]

	/* Promote the 8 bit channels to 16 bit */
	vmovl.u8 q2, d26
	vmovl.u8 q3, d27
	vmovl.u8 q4, d28
	vmovl.u8 q5, d29
	vmovl.u8 q6, d30
	vmovl.u8 q7, d31

	/*
	The widened source array is (one row of the window per line,
	four consecutive source bytes per d register)
	y0: d4, d5, d6, d7
	y1: d8, d9, d10, d11
	y2: d12, d13, d14, d15
	*/

	/* First output: columns 0..2 of each row times coefficients 0..8 */
	vmull.s16 q8, d4, d0[0]
	vmlal.s16 q8, d5, d0[1]
	vmlal.s16 q8, d6, d0[2]
	vmlal.s16 q8, d8, d0[3]
	vmlal.s16 q8, d9, d1[0]
	vmlal.s16 q8, d10, d1[1]
	vmlal.s16 q8, d12, d1[2]
	vmlal.s16 q8, d13, d1[3]
	vmlal.s16 q8, d14, d2[0]

	/* Second output: same coefficients, window moved one column right */
	vmull.s16 q9, d5, d0[0]
	vmlal.s16 q9, d6, d0[1]
	vmlal.s16 q9, d7, d0[2]
	vmlal.s16 q9, d9, d0[3]
	vmlal.s16 q9, d10, d1[0]
	vmlal.s16 q9, d11, d1[1]
	vmlal.s16 q9, d13, d1[2]
	vmlal.s16 q9, d14, d1[3]
	vmlal.s16 q9, d15, d2[0]

	/* Shift every 32 bit sum right by 8 (truncating) and narrow it to
	16 bit; both results end up side by side in q8 */
	vshrn.i32 d16, q8, #8
	vshrn.i32 d17, q9, #8

	/* Saturate to unsigned 8 bit and store the 8 result bytes */
	vqmovun.s16 d16, q8
	vst1.8 d16, [r0]!

	/* Are we done yet? */
	subs r4, r4, #1
	bne 1b

	2:
	/* We're done, bye! */
	vpop {q4-q7}
	pop {r4-r8, r10, r11, lr}
	bx lr
	END(rsdIntrinsicConvolve3x3_K)


	/* Convolve 5x5 */

	/*
	* rsdIntrinsicConvolve5x5_K
	*
	* 5x5 convolution over five input rows. Every loop iteration stores
	* 8 output bytes (two pixels, going by the comments in this file).
	*
	* Arguments on entry:
	* r0 = dst, advanced by 8 bytes per iteration
	* r1 = y0 base pointer
	* r2 = y1 base pointer
	* r3 = y2 base pointer
	* [sp, #0] = y3 base pointer (loaded into r4)
	* [sp, #4] = y4 base pointer (loaded into r5)
	* [sp, #8] = coeffs; 28 halfwords are loaded, the first 25 are used
	* [sp, #12] = loop count (kept in r6)
	*
	* Each iteration reads 24 bytes from every row but advances the row
	* pointers by only 8, so each row must be readable 16 bytes beyond the
	* last group that is processed.
	*
	* A count <= 0 (as a signed 32-bit value) returns without reading the
	* rows and without writing to dst.
	*
	* Register use:
	* r6 = coeffs pointer, then loop counter
	* r7 = #8, the per-iteration pointer stride
	* d0-d6 = coefficients
	* q4-q5 = 32-bit accumulators
	* q9-q14 = source bytes widened to 16 bit (d24-d29 first hold the raw bytes)
	* q15 = 0x7f in every 32-bit lane
	*/
	ENTRY(rsdIntrinsicConvolve5x5_K)
	push {r4-r7, lr} /* 5 words = 20 bytes */
	vpush {q4-q7} /* 4 q-regs = 64 bytes */

	/* The stack arguments now sit 20+64 bytes above sp */

	/* load y3 in r4 */
	ldr r4, [sp, #20 + 64]

	/* load y4 in r5 */
	ldr r5, [sp, #24 + 64]

	/* Load the coefficients pointer */
	ldr r6, [sp, #28 + 64]

	/* Create the coefficients vector */
	vld1.16 {d0, d1, d2, d3}, [r6]!
	vld1.16 {d4, d5, d6}, [r6]

	/* Bias added to every sum before the final shift */
	vmov.u32 q15, #0x7f

	/* load the count */
	ldr r6, [sp, #32 + 64]

	/* The loop below tests the counter only after decrementing it, so
	a zero count would wrap around and run 2^32 iterations. Skip
	the loop entirely when there is nothing to do. */
	cmp r6, #0
	ble 2f

	/* Load the frequently used immediate in a register */
	mov r7, #8

	1:
	/* Load 24 bytes from each of the first two rows and post-increase
	the address by r7=#8 */
	vld1.8 {d24, d25, d26}, [r1], r7 @ y0 ( y - 2 )
	vld1.8 {d27, d28, d29}, [r2], r7 @ y1 ( y - 1 )

	/* Signal memory for data that will be used in the loop after the next */
	pld [r1, r7]
	pld [r2, r7]

	/* Promoting the 8bit channels to 16bit.
	q12-q14 are the same registers as d24-d29, which still hold raw
	bytes: each vmovl only overwrites bytes that were already
	widened, so the order of these six instructions must be kept. */
	vmovl.u8 q9, d24
	vmovl.u8 q10, d25
	vmovl.u8 q11, d26
	vmovl.u8 q12, d27
	vmovl.u8 q13, d28
	vmovl.u8 q14, d29

	/*
	Widened source, four consecutive source bytes per d register:
	first row: d18, d19, d20, d21, d22, d23
	second row: d24, d25, d26, d27, d28, d29
	*/
	/* First output: columns 0..4, coefficients 0..9 */
	vmull.s16 q4, d18, d0[0]
	vmlal.s16 q4, d19, d0[1]
	vmlal.s16 q4, d20, d0[2]
	vmlal.s16 q4, d21, d0[3]
	vmlal.s16 q4, d22, d1[0]

	vmlal.s16 q4, d24, d1[1]
	vmlal.s16 q4, d25, d1[2]
	vmlal.s16 q4, d26, d1[3]
	vmlal.s16 q4, d27, d2[0]
	vmlal.s16 q4, d28, d2[1]

	/* Second output: columns 1..5, same coefficients */
	vmull.s16 q5, d19, d0[0]
	vmlal.s16 q5, d20, d0[1]
	vmlal.s16 q5, d21, d0[2]
	vmlal.s16 q5, d22, d0[3]
	vmlal.s16 q5, d23, d1[0]

	vmlal.s16 q5, d25, d1[1]
	vmlal.s16 q5, d26, d1[2]
	vmlal.s16 q5, d27, d1[3]
	vmlal.s16 q5, d28, d2[0]
	vmlal.s16 q5, d29, d2[1]


	/* Next 2 rows */
	/* Load 24 bytes from each row and post-increase the address by r7=#8 */
	vld1.8 {d24, d25, d26}, [r3], r7 @ y2 ( y )
	vld1.8 {d27, d28, d29}, [r4], r7 @ y3 ( y + 1 )

	/* Signal memory for data that will be used in the loop after the next */
	pld [r3, r7]
	pld [r4, r7]

	/* Promoting the 8bit channels to 16bit (same ordering constraint
	as for the first two rows) */
	vmovl.u8 q9, d24
	vmovl.u8 q10, d25
	vmovl.u8 q11, d26
	vmovl.u8 q12, d27
	vmovl.u8 q13, d28
	vmovl.u8 q14, d29

	/*
	Widened source:
	third row: d18, d19, d20, d21, d22, d23
	fourth row: d24, d25, d26, d27, d28, d29
	*/
	/* First output: columns 0..4, coefficients 10..19 */
	vmlal.s16 q4, d18, d2[2]
	vmlal.s16 q4, d19, d2[3]
	vmlal.s16 q4, d20, d3[0]
	vmlal.s16 q4, d21, d3[1]
	vmlal.s16 q4, d22, d3[2]

	vmlal.s16 q4, d24, d3[3]
	vmlal.s16 q4, d25, d4[0]
	vmlal.s16 q4, d26, d4[1]
	vmlal.s16 q4, d27, d4[2]
	vmlal.s16 q4, d28, d4[3]

	/* Second output: columns 1..5, same coefficients */
	vmlal.s16 q5, d19, d2[2]
	vmlal.s16 q5, d20, d2[3]
	vmlal.s16 q5, d21, d3[0]
	vmlal.s16 q5, d22, d3[1]
	vmlal.s16 q5, d23, d3[2]

	vmlal.s16 q5, d25, d3[3]
	vmlal.s16 q5, d26, d4[0]
	vmlal.s16 q5, d27, d4[1]
	vmlal.s16 q5, d28, d4[2]
	vmlal.s16 q5, d29, d4[3]

	/* Last row */
	/* Load 24 bytes and post-increase the address by r7=#8 */
	vld1.8 {d24, d25, d26}, [r5], r7 @ y4 ( y + 2 )

	/* Signal memory for data that will be used in the loop after the next */
	pld [r5, r7]

	/* Promoting the 8bit channels to 16bit */
	vmovl.u8 q9, d24
	vmovl.u8 q10, d25
	vmovl.u8 q11, d26

	/*
	Widened source:
	fifth row: d18, d19, d20, d21, d22, d23
	*/

	/* First output: columns 0..4, coefficients 20..24 */
	vmlal.s16 q4, d18, d5[0]
	vmlal.s16 q4, d19, d5[1]
	vmlal.s16 q4, d20, d5[2]
	vmlal.s16 q4, d21, d5[3]
	vmlal.s16 q4, d22, d6[0]

	/* Second output: columns 1..5, same coefficients */
	vmlal.s16 q5, d19, d5[0]
	vmlal.s16 q5, d20, d5[1]
	vmlal.s16 q5, d21, d5[2]
	vmlal.s16 q5, d22, d5[3]
	vmlal.s16 q5, d23, d6[0]


	/* Add the 0x7f bias held in q15 to every sum.
	NOTE(review): vrshrn below is a rounding shift and adds 1 << 7
	of its own, so 0xff in total is added before the shift; confirm
	that this double bias is intended. */
	vadd.i32 q4, q4, q15
	vadd.i32 q5, q5, q15

	/* Narrow it to a d-reg 32 -> 16 bit */
	vrshrn.i32 d8, q4, #8
	vrshrn.i32 d9, q5, #8


	/* Pack 16 -> 8 bit, saturate, put two pixels into D reg */
	vqmovun.s16 d8, q4

	vst1.8 d8, [r0]! @ store the output and increase the address of r0

	/* Are we done? */
	subs r6, r6, #1
	bne 1b

	2:
	/* Yup, bye */
	vpop {q4-q7}
	pop {r4-r7, lr}
	bx lr

	END(rsdIntrinsicConvolve5x5_K)