@ blob: 8f049f8cd5f912cf00b75db5bdc69b418028e06b [file] [log] [blame]
@/******************************************************************************
@ *
@ * Copyright (C) 2015 The Android Open Source Project
@ *
@ * Licensed under the Apache License, Version 2.0 (the "License");
@ * you may not use this file except in compliance with the License.
@ * You may obtain a copy of the License at:
@ *
@ * http://www.apache.org/licenses/LICENSE-2.0
@ *
@ * Unless required by applicable law or agreed to in writing, software
@ * distributed under the License is distributed on an "AS IS" BASIS,
@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@ * See the License for the specific language governing permissions and
@ * limitations under the License.
@ *
@ *****************************************************************************
@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
@*/
@**
@******************************************************************************
@* @file
@* ih264_inter_pred_luma_bilinear_a9q.s
@*
@* @brief
@* Contains function definitions for inter prediction interpolation.
@*
@* @author
@* Ittiam
@*
@* @par List of Functions:
@*
@* - ih264_inter_pred_luma_bilinear_a9q()
@*
@* @remarks
@* None
@*
@*******************************************************************************
@*
@* All the functions here are replicated from ih264_inter_pred_filters.c
@
@**
@**
@**
@ *******************************************************************************
@ * function:ih264_inter_pred_luma_bilinear
@ *
@* @brief
@* This routine applies the bilinear filter to the predictors.
@* The filtering operation is described in
@* sec 8.4.2.2.1 titled "Luma sample interpolation process"
@*
@* @par Description:
@\note
@* This function is called to obtain pixels lying at the following
@* locations: (1/4,1), (3/4,1), (1,1/4), (1,3/4), (1/4,1/2), (3/4,1/2), (1/2,1/4), (1/2,3/4), (3/4,1/4), (1/4,3/4), (3/4,3/4) and (1/4,1/4).
@* The function computes the rounded average, (a + b + 1) >> 1, of the co-located samples of the two input arrays.
@*
@*
@* @param[in] pu1_src1:
@* UWORD8 Pointer to the buffer containing the first input array.
@*
@* @param[in] pu1_src2:
@* UWORD8 Pointer to the buffer containing the second input array.
@*
@* @param[out] pu1_dst
@* UWORD8 pointer to the destination where the output of bilinear filter is stored.
@*
@* @param[in] src_strd1
@* Stride of the first input buffer
@*
@* @param[in] src_strd2
@* Stride of the second input buffer
@*
@* @param[in] dst_strd
@* integer destination stride of pu1_dst
@*
@* @param[in] ht
@* integer height of the array
@*
@* @param[in] wd
@* integer width of the array
@*
@* @returns
@*
@* @remarks
@* None
@*
@*******************************************************************************
@*
@void ih264_inter_pred_luma_bilinear(UWORD8 *pu1_src1,
@ UWORD8 *pu1_src2,
@ UWORD8 *pu1_dst,
@ WORD32 src_strd1,
@ WORD32 src_strd2,
@ WORD32 dst_strd,
@ WORD32 height,
@ WORD32 width)
@
@**************Variables Vs Registers*****************************************
@ r0 => *pu1_src1
@ r1 => *pu1_src2
@ r2 => *pu1_dst
@ r3 => src_strd1
@ r4 => src_strd2 (loaded from the stack)
@ r5 => dst_strd (loaded from the stack)
@ r6 => height (loaded from the stack)
@ r7 => width (loaded from the stack)
@
.text
.p2align 2
.global ih264_inter_pred_luma_bilinear_a9q
@ ------------------------------------------------------------------------------
@ ih264_inter_pred_luma_bilinear_a9q
@
@ Writes dst[y][x] = (src1[y][x] + src2[y][x] + 1) >> 1 for a block of 8-bit
@ samples: a widening add (vaddl.u8) followed by a rounding narrow by one bit
@ (vqrshrun.s16 #1).
@
@ In:    r0 = pu1_src1, r1 = pu1_src2, r2 = pu1_dst, r3 = src_strd1
@        Remaining arguments are read from the stack after the 104-byte
@        register save (40 bytes core + 64 bytes NEON):
@          [sp, #104] src_strd2 -> r4      [sp, #108] dst_strd -> r5
@          [sp, #112] ht        -> r6      [sp, #116] wd       -> r7
@ Out:   nothing in registers; the result is stored through r2.
@ Regs:  r4-r12, lr and d8-d15 are saved and restored here.
@        q0-q3 and q8-q15 are overwritten.
@
@ Dispatch, exactly as coded (no other validation of wd / ht is done):
@   wd == 4   -> loop_4  : 4 rows when ht == 4, otherwise 8 rows
@   wd == 8   -> loop_8  : 4 rows when ht == 4, 8 rows when ht == 8,
@                          otherwise 16 rows
@   otherwise -> loop_16 : 8 rows when ht == 8, otherwise 16 rows
@
@ Every path is fully unrolled. Loads, adds, narrows and stores belonging to
@ neighbouring rows are interleaved, so the instruction order below is
@ significant and must not be shuffled casually.
@ ------------------------------------------------------------------------------
ih264_inter_pred_luma_bilinear_a9q:
    stmfd           sp!, {r4-r12, r14}          @ save core registers (10 x 4 = 40 bytes)
    vstmdb          sp!, {d8-d15}               @ save d8-d15 (64 bytes); q4-q7 are used below
    ldr             r4, [sp, #104]              @ r4 = src_strd2
    ldr             r5, [sp, #108]              @ r5 = dst_strd
    ldr             r6, [sp, #112]              @ r6 = ht
    ldr             r7, [sp, #116]              @ r7 = wd
    cmp             r7, #4
    beq             loop_4                      @ wd == 4
    cmp             r7, #8
    beq             loop_8                      @ wd == 8
                                                @ any other wd falls through to the 16-wide path

@ ---- 16 bytes per row: one q register per source row, two d halves per add ----
loop_16:
    @ rows 0-3
    vld1.8          {q0}, [r0], r3              @ src1 row 0
    vld1.8          {q2}, [r1], r4              @ src2 row 0
    vld1.8          {q1}, [r0], r3              @ src1 row 1
    vaddl.u8        q10, d0, d4                 @ row 0, low half sum
    vld1.8          {q3}, [r1], r4              @ src2 row 1
    vaddl.u8        q11, d1, d5                 @ row 0, high half sum
    vld1.8          {q4}, [r0], r3              @ src1 row 2
    vaddl.u8        q12, d2, d6                 @ row 1, low half sum
    vld1.8          {q5}, [r0], r3              @ src1 row 3
    vaddl.u8        q13, d3, d7                 @ row 1, high half sum
    vld1.8          {q6}, [r1], r4              @ src2 row 2
    vaddl.u8        q8, d8, d12                 @ row 2, low half sum
    vld1.8          {q7}, [r1], r4              @ src2 row 3
    vaddl.u8        q9, d9, d13                 @ row 2, high half sum
    vqrshrun.s16    d28, q10, #1                @ q14 = averaged row 0
    vqrshrun.s16    d29, q11, #1
    vaddl.u8        q10, d10, d14               @ row 3, low half sum
    vqrshrun.s16    d30, q12, #1                @ q15 = averaged row 1
    vqrshrun.s16    d31, q13, #1
    vst1.8          {q14}, [r2], r5             @ dst row 0
    vaddl.u8        q11, d11, d15               @ row 3, high half sum
    vst1.8          {q15}, [r2], r5             @ dst row 1
    vqrshrun.s16    d28, q8, #1                 @ q14 = averaged row 2

    @ rows 4-7 (loads overlap the tail of rows 2-3)
    vld1.8          {q0}, [r0], r3              @ src1 row 4
    vqrshrun.s16    d29, q9, #1
    vld1.8          {q1}, [r0], r3              @ src1 row 5
    vqrshrun.s16    d30, q10, #1                @ q15 = averaged row 3
    vld1.8          {q2}, [r1], r4              @ src2 row 4
    vqrshrun.s16    d31, q11, #1
    vld1.8          {q3}, [r1], r4              @ src2 row 5
    vaddl.u8        q10, d0, d4                 @ row 4, low half sum
    vst1.8          {q14}, [r2], r5             @ dst row 2
    vaddl.u8        q13, d3, d7                 @ row 5, high half sum
    vst1.8          {q15}, [r2], r5             @ dst row 3
    vaddl.u8        q11, d1, d5                 @ row 4, high half sum
    vld1.8          {q4}, [r0], r3              @ src1 row 6
    vaddl.u8        q12, d2, d6                 @ row 5, low half sum
    vld1.8          {q5}, [r0], r3              @ src1 row 7
    vqrshrun.s16    d28, q10, #1                @ q14 = averaged row 4
    vld1.8          {q6}, [r1], r4              @ src2 row 6
    vqrshrun.s16    d29, q11, #1
    vld1.8          {q7}, [r1], r4              @ src2 row 7
    vaddl.u8        q8, d8, d12                 @ row 6, low half sum
    vaddl.u8        q9, d9, d13                 @ row 6, high half sum
    vaddl.u8        q10, d10, d14               @ row 7, low half sum
    vqrshrun.s16    d30, q12, #1                @ q15 = averaged row 5
    vqrshrun.s16    d31, q13, #1
    vst1.8          {q14}, [r2], r5             @ dst row 4
    vaddl.u8        q11, d11, d15               @ row 7, high half sum
    vst1.8          {q15}, [r2], r5             @ dst row 5
    vqrshrun.s16    d28, q8, #1                 @ q14 = averaged row 6
    vqrshrun.s16    d30, q10, #1                @ q15 = averaged row 7
    vqrshrun.s16    d29, q9, #1
    vqrshrun.s16    d31, q11, #1
    vst1.8          {q14}, [r2], r5             @ dst row 6
    cmp             r6, #8
    vst1.8          {q15}, [r2], r5             @ dst row 7 (NEON store leaves the flags intact)
    beq             end_func                    @ ht == 8: done

    @ rows 8-11
    @ Row 8 of src2 is fetched only after the ht == 8 exit above, so an
    @ 8-row call never reads beyond row 7 of src2.
    vld1.8          {q2}, [r1], r4              @ src2 row 8
    vld1.8          {q0}, [r0], r3              @ src1 row 8
    vaddl.u8        q10, d0, d4                 @ row 8, low half sum
    vld1.8          {q1}, [r0], r3              @ src1 row 9
    vaddl.u8        q11, d1, d5                 @ row 8, high half sum
    vld1.8          {q3}, [r1], r4              @ src2 row 9
    vqrshrun.s16    d28, q10, #1                @ q14 = averaged row 8
    vld1.8          {q4}, [r0], r3              @ src1 row 10
    vqrshrun.s16    d29, q11, #1
    vld1.8          {q5}, [r0], r3              @ src1 row 11
    vaddl.u8        q12, d2, d6                 @ row 9, low half sum
    vld1.8          {q6}, [r1], r4              @ src2 row 10
    vaddl.u8        q13, d3, d7                 @ row 9, high half sum
    vld1.8          {q7}, [r1], r4              @ src2 row 11
    vaddl.u8        q8, d8, d12                 @ row 10, low half sum
    vaddl.u8        q9, d9, d13                 @ row 10, high half sum
    vaddl.u8        q10, d10, d14               @ row 11, low half sum
    vqrshrun.s16    d30, q12, #1                @ q15 = averaged row 9
    vst1.8          {q14}, [r2], r5             @ dst row 8
    vqrshrun.s16    d31, q13, #1
    vst1.8          {q15}, [r2], r5             @ dst row 9
    vqrshrun.s16    d28, q8, #1                 @ q14 = averaged row 10

    @ rows 12-15 (loads overlap the tail of rows 10-11)
    vld1.8          {q0}, [r0], r3              @ src1 row 12
    vaddl.u8        q11, d11, d15               @ row 11, high half sum
    vld1.8          {q1}, [r0], r3              @ src1 row 13
    vqrshrun.s16    d29, q9, #1
    vld1.8          {q2}, [r1], r4              @ src2 row 12
    vqrshrun.s16    d30, q10, #1                @ q15 = averaged row 11
    vld1.8          {q3}, [r1], r4              @ src2 row 13
    vqrshrun.s16    d31, q11, #1
    vst1.8          {q14}, [r2], r5             @ dst row 10
    vaddl.u8        q10, d0, d4                 @ row 12, low half sum
    vst1.8          {q15}, [r2], r5             @ dst row 11
    vaddl.u8        q11, d1, d5                 @ row 12, high half sum
    vld1.8          {q4}, [r0], r3              @ src1 row 14
    vaddl.u8        q13, d3, d7                 @ row 13, high half sum
    vld1.8          {q5}, [r0], r3              @ src1 row 15
    vaddl.u8        q12, d2, d6                 @ row 13, low half sum
    vld1.8          {q6}, [r1], r4              @ src2 row 14
    vaddl.u8        q8, d8, d12                 @ row 14, low half sum
    vld1.8          {q7}, [r1], r4              @ src2 row 15
    vaddl.u8        q9, d9, d13                 @ row 14, high half sum
    vqrshrun.s16    d28, q10, #1                @ q14 = averaged row 12
    vqrshrun.s16    d29, q11, #1
    vaddl.u8        q10, d10, d14               @ row 15, low half sum
    vst1.8          {q14}, [r2], r5             @ dst row 12
    vqrshrun.s16    d30, q12, #1                @ q15 = averaged row 13
    vqrshrun.s16    d31, q13, #1
    vaddl.u8        q11, d11, d15               @ row 15, high half sum
    vst1.8          {q15}, [r2], r5             @ dst row 13
    vqrshrun.s16    d28, q8, #1                 @ q14 = averaged row 14
    vqrshrun.s16    d29, q9, #1
    vqrshrun.s16    d30, q10, #1                @ q15 = averaged row 15
    vst1.8          {q14}, [r2], r5             @ dst row 14
    vqrshrun.s16    d31, q11, #1
    vst1.8          {q15}, [r2], r5             @ dst row 15
    b               end_func

@ ---- 8 bytes per row: one d register per source row ----
loop_8:
    @ rows 0-3
    vld1.8          {d0}, [r0], r3              @ src1 row 0
    vld1.8          {d4}, [r1], r4              @ src2 row 0
    vld1.8          {d1}, [r0], r3              @ src1 row 1
    vaddl.u8        q10, d0, d4                 @ row 0 sum
    vld1.8          {d5}, [r1], r4              @ src2 row 1
    vld1.8          {d2}, [r0], r3              @ src1 row 2
    vqrshrun.s16    d28, q10, #1                @ d28 = averaged row 0
    vld1.8          {d6}, [r1], r4              @ src2 row 2
    vaddl.u8        q11, d1, d5                 @ row 1 sum
    vld1.8          {d3}, [r0], r3              @ src1 row 3
    vaddl.u8        q12, d2, d6                 @ row 2 sum
    vst1.8          {d28}, [r2], r5             @ dst row 0
    vqrshrun.s16    d29, q11, #1                @ d29 = averaged row 1
    vld1.8          {d7}, [r1], r4              @ src2 row 3
    vqrshrun.s16    d30, q12, #1                @ d30 = averaged row 2
    vst1.8          {d29}, [r2], r5             @ dst row 1
    vaddl.u8        q13, d3, d7                 @ row 3 sum
    vst1.8          {d30}, [r2], r5             @ dst row 2
    vqrshrun.s16    d31, q13, #1                @ d31 = averaged row 3
    cmp             r6, #4
    vst1.8          {d31}, [r2], r5             @ dst row 3
    beq             end_func                    @ ht == 4: done

    @ rows 4-7
    vld1.8          {d12}, [r1], r4             @ src2 row 4
    vld1.8          {d8}, [r0], r3              @ src1 row 4
    vld1.8          {d9}, [r0], r3              @ src1 row 5
    vaddl.u8        q8, d8, d12                 @ row 4 sum
    vld1.8          {d13}, [r1], r4             @ src2 row 5
    vld1.8          {d10}, [r0], r3             @ src1 row 6
    vaddl.u8        q9, d9, d13                 @ row 5 sum
    vld1.8          {d14}, [r1], r4             @ src2 row 6
    vqrshrun.s16    d28, q8, #1                 @ d28 = averaged row 4
    vld1.8          {d11}, [r0], r3             @ src1 row 7
    vqrshrun.s16    d29, q9, #1                 @ d29 = averaged row 5
    vst1.8          {d28}, [r2], r5             @ dst row 4
    vaddl.u8        q10, d10, d14               @ row 6 sum
    vst1.8          {d29}, [r2], r5             @ dst row 5
    vqrshrun.s16    d30, q10, #1                @ d30 = averaged row 6
    vld1.8          {d15}, [r1], r4             @ src2 row 7
    vaddl.u8        q11, d11, d15               @ row 7 sum
    vst1.8          {d30}, [r2], r5             @ dst row 6
    vqrshrun.s16    d31, q11, #1                @ d31 = averaged row 7
    cmp             r6, #8
    vst1.8          {d31}, [r2], r5             @ dst row 7
    beq             end_func                    @ ht == 8: done

    @ rows 8-15
    vld1.8          {d0}, [r0], r3              @ src1 row 8
    vld1.8          {d4}, [r1], r4              @ src2 row 8
    vld1.8          {d1}, [r0], r3              @ src1 row 9
    vaddl.u8        q10, d0, d4                 @ row 8 sum
    vld1.8          {d5}, [r1], r4              @ src2 row 9
    vld1.8          {d2}, [r0], r3              @ src1 row 10
    vaddl.u8        q11, d1, d5                 @ row 9 sum
    vld1.8          {d6}, [r1], r4              @ src2 row 10
    vqrshrun.s16    d28, q10, #1                @ d28 = averaged row 8
    vld1.8          {d3}, [r0], r3              @ src1 row 11
    vaddl.u8        q12, d2, d6                 @ row 10 sum
    vld1.8          {d7}, [r1], r4              @ src2 row 11
    vqrshrun.s16    d29, q11, #1                @ d29 = averaged row 9
    vld1.8          {d8}, [r0], r3              @ src1 row 12
    vaddl.u8        q13, d3, d7                 @ row 11 sum
    vst1.8          {d28}, [r2], r5             @ dst row 8
    vqrshrun.s16    d30, q12, #1                @ d30 = averaged row 10
    vld1.8          {d12}, [r1], r4             @ src2 row 12
    vqrshrun.s16    d31, q13, #1                @ d31 = averaged row 11
    vst1.8          {d29}, [r2], r5             @ dst row 9
    vaddl.u8        q8, d8, d12                 @ row 12 sum
    vld1.8          {d9}, [r0], r3              @ src1 row 13
    vqrshrun.s16    d28, q8, #1                 @ d28 = averaged row 12
    vld1.8          {d13}, [r1], r4             @ src2 row 13
    vld1.8          {d10}, [r0], r3             @ src1 row 14
    vaddl.u8        q9, d9, d13                 @ row 13 sum
    vld1.8          {d11}, [r0], r3             @ src1 row 15
    vld1.8          {d14}, [r1], r4             @ src2 row 14
    vqrshrun.s16    d29, q9, #1                 @ d29 = averaged row 13
    vld1.8          {d15}, [r1], r4             @ src2 row 15
    vaddl.u8        q10, d10, d14               @ row 14 sum
    vst1.8          {d30}, [r2], r5             @ dst row 10
    vaddl.u8        q11, d11, d15               @ row 15 sum
    vst1.8          {d31}, [r2], r5             @ dst row 11
    vqrshrun.s16    d30, q10, #1                @ d30 = averaged row 14
    vst1.8          {d28}, [r2], r5             @ dst row 12
    vqrshrun.s16    d31, q11, #1                @ d31 = averaged row 15
    vst1.8          {d29}, [r2], r5             @ dst row 13
    vst1.8          {d30}, [r2], r5             @ dst row 14
    vst1.8          {d31}, [r2], r5             @ dst row 15
    b               end_func

@ ---- 4 bytes per row: only 32-bit lane 0 of each d register is loaded and ----
@ ---- stored; the remaining lanes pass through the arithmetic but are     ----
@ ---- never written to memory.                                            ----
loop_4:
    @ rows 0-3
    vld1.32         d0[0], [r0], r3             @ src1 row 0
    vld1.32         d4[0], [r1], r4             @ src2 row 0
    vld1.32         d1[0], [r0], r3             @ src1 row 1
    vaddl.u8        q10, d0, d4                 @ row 0 sum
    vld1.32         d5[0], [r1], r4             @ src2 row 1
    vld1.32         d2[0], [r0], r3             @ src1 row 2
    vqrshrun.s16    d28, q10, #1                @ d28 = averaged row 0
    vld1.32         d6[0], [r1], r4             @ src2 row 2
    vaddl.u8        q11, d1, d5                 @ row 1 sum
    vld1.32         d3[0], [r0], r3             @ src1 row 3
    vaddl.u8        q12, d2, d6                 @ row 2 sum
    vst1.32         d28[0], [r2], r5            @ dst row 0
    vqrshrun.s16    d29, q11, #1                @ d29 = averaged row 1
    vld1.32         d7[0], [r1], r4             @ src2 row 3
    vqrshrun.s16    d30, q12, #1                @ d30 = averaged row 2
    vst1.32         d29[0], [r2], r5            @ dst row 1
    vaddl.u8        q13, d3, d7                 @ row 3 sum
    vst1.32         d30[0], [r2], r5            @ dst row 2
    vqrshrun.s16    d31, q13, #1                @ d31 = averaged row 3
    cmp             r6, #4
    vst1.32         d31[0], [r2], r5            @ dst row 3
    beq             end_func                    @ ht == 4: done

    @ rows 4-7
    vld1.32         d12[0], [r1], r4            @ src2 row 4
    vld1.32         d8[0], [r0], r3             @ src1 row 4
    vld1.32         d9[0], [r0], r3             @ src1 row 5
    vaddl.u8        q8, d8, d12                 @ row 4 sum
    vld1.32         d13[0], [r1], r4            @ src2 row 5
    vld1.32         d10[0], [r0], r3            @ src1 row 6
    vaddl.u8        q9, d9, d13                 @ row 5 sum
    vld1.32         d14[0], [r1], r4            @ src2 row 6
    vqrshrun.s16    d28, q8, #1                 @ d28 = averaged row 4
    vld1.32         d11[0], [r0], r3            @ src1 row 7
    vqrshrun.s16    d29, q9, #1                 @ d29 = averaged row 5
    vst1.32         d28[0], [r2], r5            @ dst row 4
    vaddl.u8        q10, d10, d14               @ row 6 sum
    vst1.32         d29[0], [r2], r5            @ dst row 5
    vqrshrun.s16    d30, q10, #1                @ d30 = averaged row 6
    vld1.32         d15[0], [r1], r4            @ src2 row 7
    vaddl.u8        q11, d11, d15               @ row 7 sum
    vst1.32         d30[0], [r2], r5            @ dst row 6
    vqrshrun.s16    d31, q11, #1                @ d31 = averaged row 7
    vst1.32         d31[0], [r2], r5            @ dst row 7
                                                @ falls through to the common exit

end_func:
    vldmia          sp!, {d8-d15}               @ restore the saved NEON registers
    ldmfd           sp!, {r4-r12, pc}           @ restore core registers; the saved lr is popped into pc