blob: ee10884bf13be7c018c8e4e3c2d211f6ca60003e [file] [log] [blame]
/*
* Copyright (C) 2012 The Android Open Source Project
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
Arguments of rsdIntrinsicConvolve3x3_K on entry:
r0 = dst
r1 = y0 base pointer
r2 = y1 base pointer
r3 = y2 base pointer
[sp, #0] = coeffs
[sp, #4] = length / 2 (number of loop iterations)
*/
/*
 * ENTRY(f): switch to .text, align to a 4-byte boundary, export f, mark it
 *           as a function symbol, define its label and open its unwind
 *           table entry.
 * END(f):   close the unwind table entry and record the size of f.
 *
 * .p2align 2 spells out the 4-byte alignment that GNU as substitutes for
 * ".align 0" on ARM, so the result no longer relies on that special case.
 * %function is used instead of #function because cpp treats '#' inside a
 * function-like macro as the stringify operator.
 */
#define ENTRY(f) .text; .p2align 2; .globl f; .type f, %function; f: .fnstart
#define END(f) .fnend; .size f, .-f;
ENTRY(rsdIntrinsicConvolve3x3_K)
/*
 * 3x3 convolution of three byte rows with signed 16-bit coefficients.
 *
 * In:   r0       = dst, advanced by 8 bytes per iteration
 *       r1       = y0 row pointer
 *       r2       = y1 row pointer
 *       r3       = y2 row pointer
 *       [sp, #0] = pointer to the coefficients; 32 bytes are loaded and the
 *                  first nine halfwords are used, row by row:
 *                  c0..c2 -> y0, c3..c5 -> y1, c6..c8 -> y2
 *       [sp, #4] = iteration count; zero or negative returns immediately
 *
 * Each iteration loads 16 bytes from every row, advances the row pointers
 * by 8 and stores 8 bytes to dst (two 4-byte groups, called pixels in the
 * comments below).
 *
 * Saved/restored: r4, r5, q4-q7.
 * Also written:   r0-r3, q0-q3, q8, q9, q13-q15, flags.
 */
        push        {r4, r5}
        vpush       {q4-q7}

        /* The prologue moved sp down by 8 + 64 bytes, so the stack
           arguments now sit at [sp, #8+64] and [sp, #12+64]. */
        ldr         r4, [sp, #12+64]            /* r4 = iteration count */

        /* The loop is bottom-tested: without this check a count of zero
           would wrap around in the subs below and run far past the
           buffers. */
        cmp         r4, #0
        ble         2f

        /* Load the coefficients into q0, q1. */
        ldr         r5, [sp, #8+64]             /* r5 = coeffs */
        vld1.16     {q0, q1}, [r5]

        /* r5 now becomes the post-increment step for the row pointers. */
        mov         r5, #8
1:
        /* Load 16 bytes per row, then advance each row pointer by r5 = 8. */
        vld1.8      {q13}, [r1], r5
        vld1.8      {q14}, [r2], r5
        vld1.8      {q15}, [r3], r5

        /* Preload hints for the data 8 bytes past the updated pointers. */
        pld         [r1, r5]
        pld         [r2, r5]
        pld         [r3, r5]

        /* Widen every byte to 16 bits. */
        vmovl.u8    q2, d26
        vmovl.u8    q3, d27
        vmovl.u8    q4, d28
        vmovl.u8    q5, d29
        vmovl.u8    q6, d30
        vmovl.u8    q7, d31

        /*
           One D register now holds one widened pixel:
               y0:  d4,  d5,  d6,  d7
               y1:  d8,  d9,  d10, d11
               y2:  d12, d13, d14, d15
           The first output uses columns 0..2, the second columns 1..3.
        */

        /* First output pixel, accumulated in q8. */
        vmull.s16   q8, d4, d0[0]
        vmlal.s16   q8, d5, d0[1]
        vmlal.s16   q8, d6, d0[2]
        vmlal.s16   q8, d8, d0[3]
        vmlal.s16   q8, d9, d1[0]
        vmlal.s16   q8, d10, d1[1]
        vmlal.s16   q8, d12, d1[2]
        vmlal.s16   q8, d13, d1[3]
        vmlal.s16   q8, d14, d2[0]

        /* Second output pixel, accumulated in q9. */
        vmull.s16   q9, d5, d0[0]
        vmlal.s16   q9, d6, d0[1]
        vmlal.s16   q9, d7, d0[2]
        vmlal.s16   q9, d9, d0[3]
        vmlal.s16   q9, d10, d1[0]
        vmlal.s16   q9, d11, d1[1]
        vmlal.s16   q9, d13, d1[2]
        vmlal.s16   q9, d14, d1[3]
        vmlal.s16   q9, d15, d2[0]

        /* Drop the low 8 bits (truncating, no rounding) and narrow
           32 -> 16 bit.
           NOTE(review): vshrn does not saturate, so a sum outside the
           signed 16-bit range after the shift wraps - confirm the
           coefficient range makes that impossible. */
        vshrn.i32   d16, q8, #8
        vshrn.i32   d17, q9, #8

        /* Narrow 16 -> 8 bit with unsigned saturation and store both
           pixels. */
        vqmovun.s16 d16, q8
        vst1.8      d16, [r0]!

        /* Are we done yet? */
        subs        r4, r4, #1
        bne         1b
2:
        /* Restore the callee-saved registers and return. */
        vpop        {q4-q7}
        pop         {r4, r5}
        bx          lr
END(rsdIntrinsicConvolve3x3_K)
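/* Convolve 5x5 */
/*
Arguments of rsdIntrinsicConvolve5x5_K on entry:
r0 = dst
r1 = y0 base pointer
r2 = y1 base pointer
r3 = y2 base pointer
[sp, #0] = y3 base pointer (loaded into r4)
[sp, #4] = y4 base pointer (loaded into r5)
[sp, #8] = coeffs
[sp, #12] = length (number of loop iterations, kept in r6)
Inside the loop r7 holds the constant 8 used to advance the row pointers.
*/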
ENTRY(rsdIntrinsicConvolve5x5_K)
/*
 * 5x5 convolution of five byte rows with signed 16-bit coefficients.
 *
 * In:   r0        = dst, advanced by 8 bytes per iteration
 *       r1        = y0 row pointer
 *       r2        = y1 row pointer
 *       r3        = y2 row pointer
 *       [sp, #0]  = y3 row pointer
 *       [sp, #4]  = y4 row pointer
 *       [sp, #8]  = pointer to the coefficients; 56 bytes are loaded and the
 *                   first 25 halfwords are used, five per row from y0 to y4
 *       [sp, #12] = iteration count; zero or negative returns immediately
 *
 * Each iteration loads 24 bytes from every row, advances the row pointers
 * by 8 and stores 8 bytes to dst (two 4-byte groups, called pixels in the
 * comments below).
 *
 * Saved/restored: r4-r7, q4, q5.
 * Also written:   r0-r3, q0-q3, q9-q15, flags.
 */
        push        {r4-r7}
        vpush       {q4, q5}

        /* The prologue moved sp down by 16 + 32 = 48 bytes, so stack
           argument n now sits at [sp, #4*n + 48]. */
        ldr         r6, [sp, #12 + 48]          /* r6 = iteration count */

        /* The loop is bottom-tested: without this check a count of zero
           would wrap around in the subs below and run far past the
           buffers. */
        cmp         r6, #0
        ble         2f

        ldr         r4, [sp, #0 + 48]           /* r4 = y3 */
        ldr         r5, [sp, #4 + 48]           /* r5 = y4 */
        ldr         r7, [sp, #8 + 48]           /* r7 = coeffs */

        /* Coefficient vector: c0..c15 in d0-d3, c16..c24 in d4-d6[0]. */
        vld1.16     {d0, d1, d2, d3}, [r7]!
        vld1.16     {d4, d5, d6}, [r7]

        /* Bias added to both accumulators before the final shift. */
        vmov.u32    q15, #0x7f

        /* r7 now becomes the post-increment step for the row pointers. */
        mov         r7, #8
1:
        /* ---- Rows y0 and y1 ---- */
        /* Load 24 bytes per row, then advance the pointers by r7 = 8. */
        vld1.8      {d24, d25, d26}, [r1], r7   /* row y0 */
        vld1.8      {d27, d28, d29}, [r2], r7   /* row y1 */

        /* Preload hints for the data 8 bytes past the updated pointers. */
        pld         [r1, r7]
        pld         [r2, r7]

        /* Widen every byte to 16 bits. The order matters: q12-q14 overlap
           d24-d29, and each source D register is consumed before the
           Q register that covers it is written. */
        vmovl.u8    q9, d24
        vmovl.u8    q10, d25
        vmovl.u8    q11, d26
        vmovl.u8    q12, d27
        vmovl.u8    q13, d28
        vmovl.u8    q14, d29

        /*
           One D register now holds one widened pixel:
               y0:  d18, d19, d20, d21, d22, d23
               y1:  d24, d25, d26, d27, d28, d29
           The first output (q4) uses columns 0..4, the second (q5)
           columns 1..5.
        */
        vmull.s16   q4, d18, d0[0]
        vmlal.s16   q4, d19, d0[1]
        vmlal.s16   q4, d20, d0[2]
        vmlal.s16   q4, d21, d0[3]
        vmlal.s16   q4, d22, d1[0]

        vmlal.s16   q4, d24, d1[1]
        vmlal.s16   q4, d25, d1[2]
        vmlal.s16   q4, d26, d1[3]
        vmlal.s16   q4, d27, d2[0]
        vmlal.s16   q4, d28, d2[1]

        vmull.s16   q5, d19, d0[0]
        vmlal.s16   q5, d20, d0[1]
        vmlal.s16   q5, d21, d0[2]
        vmlal.s16   q5, d22, d0[3]
        vmlal.s16   q5, d23, d1[0]

        vmlal.s16   q5, d25, d1[1]
        vmlal.s16   q5, d26, d1[2]
        vmlal.s16   q5, d27, d1[3]
        vmlal.s16   q5, d28, d2[0]
        vmlal.s16   q5, d29, d2[1]

        /* ---- Rows y2 and y3 ---- */
        vld1.8      {d24, d25, d26}, [r3], r7   /* row y2 */
        vld1.8      {d27, d28, d29}, [r4], r7   /* row y3 */

        pld         [r3, r7]
        pld         [r4, r7]

        /* Widen to 16 bits, same register layout and ordering as above. */
        vmovl.u8    q9, d24
        vmovl.u8    q10, d25
        vmovl.u8    q11, d26
        vmovl.u8    q12, d27
        vmovl.u8    q13, d28
        vmovl.u8    q14, d29

        /*
               y2:  d18, d19, d20, d21, d22, d23
               y3:  d24, d25, d26, d27, d28, d29
        */
        vmlal.s16   q4, d18, d2[2]
        vmlal.s16   q4, d19, d2[3]
        vmlal.s16   q4, d20, d3[0]
        vmlal.s16   q4, d21, d3[1]
        vmlal.s16   q4, d22, d3[2]

        vmlal.s16   q4, d24, d3[3]
        vmlal.s16   q4, d25, d4[0]
        vmlal.s16   q4, d26, d4[1]
        vmlal.s16   q4, d27, d4[2]
        vmlal.s16   q4, d28, d4[3]

        vmlal.s16   q5, d19, d2[2]
        vmlal.s16   q5, d20, d2[3]
        vmlal.s16   q5, d21, d3[0]
        vmlal.s16   q5, d22, d3[1]
        vmlal.s16   q5, d23, d3[2]

        vmlal.s16   q5, d25, d3[3]
        vmlal.s16   q5, d26, d4[0]
        vmlal.s16   q5, d27, d4[1]
        vmlal.s16   q5, d28, d4[2]
        vmlal.s16   q5, d29, d4[3]

        /* ---- Row y4 ---- */
        vld1.8      {d24, d25, d26}, [r5], r7   /* row y4 */

        pld         [r5, r7]

        vmovl.u8    q9, d24
        vmovl.u8    q10, d25
        vmovl.u8    q11, d26

        /*
               y4:  d18, d19, d20, d21, d22, d23
        */
        vmlal.s16   q4, d18, d5[0]
        vmlal.s16   q4, d19, d5[1]
        vmlal.s16   q4, d20, d5[2]
        vmlal.s16   q4, d21, d5[3]
        vmlal.s16   q4, d22, d6[0]

        vmlal.s16   q5, d19, d5[0]
        vmlal.s16   q5, d20, d5[1]
        vmlal.s16   q5, d21, d5[2]
        vmlal.s16   q5, d22, d5[3]
        vmlal.s16   q5, d23, d6[0]

        /* Add the 0x7f bias, then shift right by 8 and narrow 32 -> 16 bit.
           NOTE(review): vrshrn is itself a rounding shift, so the bias is
           applied on top of that rounding, and the 3x3 routine in this file
           uses a plain truncating vshrn instead - confirm this difference
           is intended before changing either. */
        vadd.i32    q4, q4, q15
        vadd.i32    q5, q5, q15
        vrshrn.i32  d8, q4, #8
        vrshrn.i32  d9, q5, #8

        /* Narrow 16 -> 8 bit with unsigned saturation, giving two pixels in
           d8, then store them and advance dst. */
        vqmovun.s16 d8, q4
        vst1.8      d8, [r0]!

        /* Are we done? */
        subs        r6, r6, #1
        bne         1b
2:
        /* Restore the callee-saved registers and return. */
        vpop        {q4, q5}
        pop         {r4-r7}
        bx          lr
END(rsdIntrinsicConvolve5x5_K)