blob: 61370549f6729231f7e6876e7feceab443662c1f [file] [log] [blame]
@/******************************************************************************
@ *
@ * Copyright (C) 2015 The Android Open Source Project
@ *
@ * Licensed under the Apache License, Version 2.0 (the "License");
@ * you may not use this file except in compliance with the License.
@ * You may obtain a copy of the License at:
@ *
@ * http://www.apache.org/licenses/LICENSE-2.0
@ *
@ * Unless required by applicable law or agreed to in writing, software
@ * distributed under the License is distributed on an "AS IS" BASIS,
@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@ * See the License for the specific language governing permissions and
@ * limitations under the License.
@ *
@ *****************************************************************************
@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
@*/
.data
.p2align 2
@ Jump table used by the final prediction stage: entry n holds the address of
@ the code that materialises the predicted block for intra 4x4 mode n.
scratch_intrapred_luma_4x4_prediction:
    .long         ver                   @ mode 0
    .long         hor                   @ mode 1
    .long         d_c                   @ mode 2
    .long         dia_dl                @ mode 3
    .long         dia_dr                @ mode 4
    .long         ver_r                 @ mode 5
    .long         hor_d                 @ mode 6
    .long         ver_l                 @ mode 7
    .long         hor_u                 @ mode 8

.text
.p2align 2
@ PC-relative offset of the jump table. It is added to the PC value read at
@ label scrintra_4x4 (which is scrintra_4x4 + 8), hence the "- 8" correction.
scratch_intrapred_luma_4x4_prediction_addr1:
    .long         scratch_intrapred_luma_4x4_prediction - scrintra_4x4 - 8
@/**
@******************************************************************************
@*
@* @brief :Evaluate best intra 4x4 mode
@* and do the prediction.
@*
@* @par Description
@* This function evaluates the valid intra 4x4 modes, computes the cost
@* (SAD plus a lambda-weighted mode penalty) of each, and writes the block
@* predicted with the best mode to the destination buffer.
@*
@* @param[in] pu1_src
@* UWORD8 pointer to the source
@*
@* @param[in] pu1_ngbr_pels
@* UWORD8 pointer to neighbouring pels
@*
@* @param[out] pu1_dst
@* UWORD8 pointer to the destination
@*
@* @param[in] src_strd
@* integer source stride
@*
@* @param[in] dst_strd
@* integer destination stride
@*
@* @param[in] u4_n_avblty
@* availability of neighbouring pixels
@*
@* @param[out] u4_intra_mode
@* Pointer to the variable in which best mode is returned
@*
@* @param[out] pu4_sadmin
@* Pointer to the variable in which minimum cost is returned
@*
@* @param[in] u4_valid_intra_modes
@* Bitmask of the modes to evaluate (bit n set means mode n is valid)
@*
@* @param[in] u4_lambda
@* Lambda value for computing cost from SAD
@*
@* @param[in] u4_predictd_mode
@* Predicted mode for cost computation
@*
@*
@*
@* @return none
@*
@******************************************************************************
@*/
@void ih264e_evaluate_intra_4x4_modes(UWORD8 *pu1_src,
@ UWORD8 *pu1_ngbr_pels,
@ UWORD8 *pu1_dst,
@ UWORD32 src_strd,
@ UWORD32 dst_strd,
@ WORD32 u4_n_avblty,
@ UWORD32 *u4_intra_mode,
@ WORD32 *pu4_sadmin,
@ UWORD32 u4_valid_intra_modes,
@ UWORD32 u4_lambda,
@ UWORD32 u4_predictd_mode)
.global ih264e_evaluate_intra_4x4_modes_a9q
@------------------------------------------------------------------------------
@ ih264e_evaluate_intra_4x4_modes_a9q
@
@ For every mode enabled in u4_valid_intra_modes, builds the 4x4 prediction,
@ computes cost = SAD(src, pred) + penalty, keeps the cheapest mode, stores
@ the minimum cost and the best mode through the output pointers, and writes
@ the best prediction to pu1_dst.
@   penalty = u4_lambda      when mode == u4_predictd_mode
@           = u4_lambda * 4  otherwise
@
@ Arguments on entry:
@   r0            = pu1_src
@   r1            = pu1_ngbr_pels
@   r2            = pu1_dst
@   r3            = src_strd
@   [sp, #0]      = dst_strd
@   [sp, #4]      = u4_n_avblty
@   [sp, #8]      = u4_intra_mode        (output pointer)
@   [sp, #12]     = pu4_sadmin           (output pointer)
@   [sp, #16]     = u4_valid_intra_modes
@   [sp, #20]     = u4_lambda
@   [sp, #24]     = u4_predictd_mode
@ The prologue pushes 40 bytes of core registers and then 64 bytes of NEON
@ registers, so the stack arguments are at +40 before vpush and +104 after.
@
@ Register roles in the evaluation stage:
@   r0  = u4_lambda (reloaded once pu1_src is consumed)
@   r1  = u4_predictd_mode (reloaded once pu1_ngbr_pels is consumed)
@   r4  = DC value (kept for the final prediction)
@   r5  = u4_n_avblty          r8  = u4_valid_intra_modes
@   r6  = mode penalty / scratch   r9  = cost of the current mode
@   r10 = scratch (only the flags of the ands are used)
@   r11 = minimum cost so far      r12 = best mode so far
@   q0  = neighbour pels, q1 = neighbours rotated by one byte
@   d20 = source rows 0-1, d21 = source rows 2-3
@
@ NOTE(review): r12 is written only when some mode beats the initial minimum
@ of 1 << 30. If no valid mode does so, r12 is uninitialised when it is
@ stored and used as a jump-table index. Confirm callers always enable at
@ least one mode.
@------------------------------------------------------------------------------
ih264e_evaluate_intra_4x4_modes_a9q:

    stmfd         sp!, {r4-r12, r14}    @ save callee-saved core registers and lr
    ldr           r5, [sp, #44]         @ r5 = u4_n_avblty
    vpush         {d8-d15}              @ save callee-saved NEON registers

    @ Load 16 neighbour bytes, then overwrite byte 13 with ngbr[12] and
    @ byte 15 with ngbr[0].
    @ NOTE(review): presumably edge replication for the filters used by the
    @ diagonal modes - confirm against the C reference.
    vld1.32       {q0}, [r1]
    add           r4, r1, #12
    vld1.8        d1[5], [r4]
    vld1.8        d1[7], [r1]

    ldr           r8, [sp, #120]        @ r8 = u4_valid_intra_modes

    @ Load the 4x4 source block into q10 (one 32-bit row per lane) while
    @ preparing q1 and the initial minimum cost r11 = 1 << 30.
    vld1.32       {d20[0]}, [r0], r3    @ source row 0
    vext.8        q1, q0, q0, #1        @ q1[i] = q0[(i + 1) % 16]
    vld1.32       {d20[1]}, [r0], r3    @ source row 1
    mov           r11, #1
    vld1.32       {d21[0]}, [r0], r3    @ source row 2
    lsl           r11, r11, #30         @ r11 = initial minimum cost
    vld1.32       {d21[1]}, [r0], r3    @ source row 3

    ldr           r0, [sp, #124]        @ r0 = u4_lambda
    ldr           r1, [sp, #128]        @ r1 = u4_predictd_mode

@ Every mode below uses the same tail:
@   vabdl/vabal   -> 8 x u16 absolute differences, rows 0-1 then rows 2-3
@   vadd/vpaddl   -> horizontal reduction to a single sum in d28[0]
@   subs/lslne/moveq -> r6 = lambda if mode is the predicted one, else 4*lambda
@                    (NEON instructions in between leave the flags untouched)
@   subs/movgt    -> signed compare with the running minimum, update on win

@------------------------------ mode 0 ---------------------------------------
vert:
    ands          r10, r8, #01          @ mode 0 enabled?
    beq           horz
    vdup.32       q2, d2[1]             @ all rows = ngbr[5..8]
    vabdl.u8      q14, d4, d20
    vabal.u8      q14, d4, d21
    vadd.i16      d28, d29, d28
    subs          r6, r1, #0            @ predicted mode == 0 ?
    vpaddl.u16    d28, d28
    lslne         r6, r0, #2            @ no : penalty = 4 * lambda
    vpaddl.u32    d28, d28
    moveq         r6, r0                @ yes: penalty = lambda
    vmov.u32      r9, d28[0]            @ r9 = SAD
    add           r9, r6, r9            @ r9 = cost
    subs          r6, r11, r9
    movgt         r11, r9               @ new minimum
    movgt         r12, #0

@------------------------------ mode 1 ---------------------------------------
horz:
    ands          r10, r8, #02          @ mode 1 enabled?
    beq           dc
    @ Broadcast ngbr[0..3] and transpose so that each row is one replicated
    @ pel: q3 rows 0..3 = ngbr[3], ngbr[2], ngbr[1], ngbr[0].
    vdup.32       q3, d0[0]
    vmov.32       q4, q3
    vtrn.8        q3, q4
    vtrn.16       d7, d6
    vtrn.16       d9, d8
    vtrn.32       d9, d7
    vtrn.32       d8, d6
    vabdl.u8      q14, d6, d20
    subs          r6, r1, #1            @ predicted mode == 1 ?
    vabal.u8      q14, d7, d21
    vadd.i16      d28, d29, d28
    lslne         r6, r0, #2
    vpaddl.u16    d28, d28
    vpaddl.u32    d28, d28
    vmov.u32      r9, d28[0]
    moveq         r6, r0
    add           r9, r6, r9
    subs          r6, r11, r9
    movgt         r11, r9
    movgt         r12, #1

@------------------------------ mode 2 ---------------------------------------
dc:
    ands          r10, r8, #04          @ mode 2 enabled?
    beq           diags
    @ r4 = ngbr[0..3] + ngbr[5..8] summed.
    vext.8        q4, q0, q0, #5
    vaddl.u8      q4, d0, d8
    vpaddl.u16    d8, d8
    vpaddl.u32    d8, d8
    vmov.u32      r4, d8[0]
    @ Rounding and shift depend on availability bits 0 and 2:
    @   both set : (sum + 4) >> 3,  one set : (sum + 2) >> 2,  none : 128.
    @ NOTE(review): the sum always includes both edges; presumably the caller
    @ zeroes the pels of an unavailable edge - confirm.
    mov           r14, #1
    ands          r10, r5, #1
    addne         r4, r4, #2
    addne         r14, r14, #1
    ands          r10, r5, #4
    addne         r4, r4, #2
    addne         r14, r14, #1
    ands          r10, r5, #5
    moveq         r4, #128
    moveq         r14, #0
    subs          r6, r1, #2            @ predicted mode == 2 ?
    lsr           r4, r4, r14           @ r4 = DC value (flags preserved)
    vdup.8        q4, r4
    lslne         r6, r0, #2
    vabdl.u8      q14, d8, d20
    vabal.u8      q14, d9, d21
    vadd.i16      d28, d29, d28
    vpaddl.u16    d28, d28
    vpaddl.u32    d28, d28
    vmov.u32      r9, d28[0]
    moveq         r6, r0
    add           r9, r6, r9
    subs          r6, r11, r9
    movgt         r11, r9
    movgt         r12, #2

@-------------------- shared setup for modes 3..8 -----------------------------
diags:
    ands          r10, r8, #504         @ any of bits 3..8 set?
    beq           pred
    @ With n[] the neighbour bytes in q0 (indices modulo 16):
    @   q5[i] = (n[i] + n[i+1] + 1) >> 1             (FILT11)
    @   q6[i] = (n[i] + 2*n[i+1] + n[i+2] + 2) >> 2  (FILT121)
    vext.8        q5, q0, q0, #2        @ q5[i] = n[i+2] (temporary)
    vaddl.u8      q6, d0, d2            @ n[i]   + n[i+1], i = 0..7
    vaddl.u8      q7, d1, d3            @ n[i]   + n[i+1], i = 8..15
    vaddl.u8      q8, d10, d2           @ n[i+2] + n[i+1], i = 0..7
    vaddl.u8      q9, d11, d3           @ n[i+2] + n[i+1], i = 8..15
    vqrshrun.s16  d10, q6, #1           @ q5 = FILT11
    vqrshrun.s16  d11, q7, #1
    vadd.u16      q11, q6, q8
    vadd.u16      q12, q7, q9
    vqrshrun.s16  d12, q11, #2          @ q6 = FILT121
    vqrshrun.s16  d13, q12, #2
    @ d26 = mask selecting the low 32 bits of a d-register (d27 = 0).
    @ "vext.32 q14, q13, q13, #3" later turns it into a high-word mask in d28.
    mov           r14, #0
    vdup.32       q13 , r14
    mov           r14, #-1
    vmov.i32      d26[0], r14

@------------------------------ mode 3 ---------------------------------------
diag_dl:
    ands          r10, r8, #0x08        @ mode 3 enabled?
    beq           diag_dr
    @ q7 rows 0..3 = FILT121 windows starting at bytes 5, 6, 7, 8.
    vext.8        q15, q6, q6, #5
    vbit.32       d14, d30, d26         @ row 0
    vext.8        q15, q6, q6, #15
    vbit.32       d15, d31, d26         @ row 2
    vext.8        q15, q6, q6, #2
    vext.32       q14, q13, q13, #3     @ d28 = high-word mask
    vbit.32       d14, d30, d28         @ row 1
    vext.8        q15, q6, q6, #4
    vbit.32       d15, d30, d28         @ row 3
    vabdl.u8      q14, d14, d20
    subs          r6, r1, #3            @ predicted mode == 3 ?
    vabal.u8      q14, d15, d21
    vadd.i16      d28, d29, d28
    vpaddl.u16    d28, d28
    lslne         r6, r0, #2
    vpaddl.u32    d28, d28
    vmov.u32      r9, d28[0]
    moveq         r6, r0
    add           r9, r6, r9
    subs          r6, r11, r9
    movgt         r11, r9
    movgt         r12, #3

@------------------------------ mode 4 ---------------------------------------
diag_dr:
    ands          r10, r8, #16          @ mode 4 enabled?
    beq           vert_r
    @ q8 rows 0..3 = FILT121 windows starting at bytes 3, 2, 1, 0.
    vext.8        q15, q6, q6, #3
    vbit.32       d16, d30, d26         @ row 0
    vext.8        q15, q6, q6, #1
    vbit.32       d17, d30, d26         @ row 2
    vext.8        q15, q6, q6, #4
    vext.32       q14, q13, q13, #3     @ d28 = high-word mask
    vbit.32       d17, d31, d28         @ row 3
    vext.8        q15, q6, q6, #6
    vbit.32       d16, d31, d28         @ row 1
    vabdl.u8      q14, d16, d20
    subs          r6, r1, #4            @ predicted mode == 4 ?
    vabal.u8      q14, d17, d21
    vadd.i16      d28, d29, d28
    vpaddl.u16    d28, d28
    lslne         r6, r0, #2
    vpaddl.u32    d28, d28
    vmov.u32      r9, d28[0]
    moveq         r6, r0
    add           r9, r6, r9
    subs          r6, r11, r9
    movgt         r11, r9
    movgt         r12, #4

@------------------------------ mode 5 ---------------------------------------
vert_r:
    ands          r10, r8, #32          @ mode 5 enabled?
    beq           horz_d
    @ q9: rows 0 and 2 come from FILT11, rows 1 and 3 from FILT121; the first
    @ pel of rows 2 and 3 is then patched with single-byte masks.
    vext.8        q15, q5, q5, #4
    vbit.32       d18, d30, d26         @ row 0
    vext.8        q15, q5, q5, #3
    vbit.32       d19, d30, d26         @ row 2
    vext.32       q14, q13, q13, #3     @ d28 = high-word mask
    vext.8        q15, q6, q6, #15
    vbit.32       d18, d30, d28         @ row 1
    vext.8        q15, q6, q6, #14
    vbit.32       d19, d30, d28         @ row 3
    mov           r14, #0
    vdup.32       q14 , r14
    mov           r14, #0xff
    vmov.i8       d28[0], r14           @ d28 = mask for byte 0
    vext.8        q15, q6, q6, #2
    vbit.32       d19, d30, d28         @ first pel of row 2
    vext.32       q14, q14, q14, #3     @ d28 = mask for byte 4
    subs          r6, r1, #5            @ predicted mode == 5 ?
    vext.8        q15, q6, q6, #13
    vbit.32       d19, d30, d28         @ first pel of row 3
    lslne         r6, r0, #2
    vabdl.u8      q14, d18, d20
    vabal.u8      q14, d19, d21
    vadd.i16      d28, d29, d28
    vpaddl.u16    d28, d28
    vpaddl.u32    d28, d28
    vmov.u32      r9, d28[0]
    moveq         r6, r0
    add           r9, r6, r9
    subs          r6, r11, r9
    movgt         r11, r9
    movgt         r12, #5

@------------------------------ mode 6 ---------------------------------------
horz_d:
    @ Byte-interleave FILT11 and FILT121 into q1/q15. This runs before the
    @ mode check because the mode 8 code below also consumes q1.
    vmov.8        q1, q5
    vmov.8        q15, q6
    vzip.8        q1, q15
    ands          r10, r8, #64          @ mode 6 enabled?
    beq           vert_l
    @ q4 = prediction, assembled from FILT121, FILT11 and the interleaved q1.
    vext.8        q15, q6, q6, #2
    vbit.32       d8, d30, d26          @ low word of d8
    mov           r14, #0
    vdup.32       q14 , r14
    mov           r14, #0xff
    vmov.i8       d28[0], r14           @ d28 = mask for byte 0
    vext.8        q15, q5, q5, #3
    vbit.32       d8, d30, d28          @ patch byte 0 of d8
    vext.8        q15, q1, q1, #2
    vbit.32       d9, d30, d26          @ low word of d9
    vext.32       q14, q13, q13, #3     @ d28 = high-word mask
    vbit.32       d8, d2, d28           @ high word of d8
    subs          r6, r1, #6            @ predicted mode == 6 ?
    vext.8        q15, q1, q1, #12
    vbit.32       d9, d30, d28          @ high word of d9
    vabdl.u8      q14, d8, d20
    vabal.u8      q14, d9, d21
    vadd.i16      d28, d29, d28
    vpaddl.u16    d28, d28
    lslne         r6, r0, #2
    vpaddl.u32    d28, d28
    vmov.u32      r9, d28[0]
    moveq         r6, r0
    add           r9, r6, r9
    subs          r6, r11, r9
    movgt         r11, r9
    movgt         r12, #6

@------------------------------ mode 7 ---------------------------------------
vert_l:
    ands          r10, r8, #128         @ mode 7 enabled?
    beq           horz_u
    @ q12: low words (rows 0 and 2) come from FILT11, high words (rows 1 and
    @ 3) from FILT121.
    vext.8        q15, q5, q5, #5
    vbit.32       d24, d30, d26         @ row 0
    vext.8        q15, q15, q15, #1
    vbit.32       d25, d30, d26         @ row 2
    vext.8        q15, q6, q6, #1
    vext.32       q14, q13, q13, #3     @ d28 = high-word mask
    vbit.32       d24, d30, d28         @ row 1
    vext.8        q15, q15, q15, #1
    subs          r6, r1, #7            @ predicted mode == 7 ?
    vbit.32       d25, d30, d28         @ row 3
    vabdl.u8      q14, d24, d20
    vabal.u8      q14, d25, d21
    vadd.i16      d28, d29, d28
    vpaddl.u16    d28, d28
    lslne         r6, r0, #2
    vpaddl.u32    d28, d28
    vmov.u32      r9, d28[0]
    moveq         r6, r0
    add           r9, r6, r9
    subs          r6, r11, r9
    movgt         r11, r9
    movgt         r12, #7

@------------------------------ mode 8 ---------------------------------------
horz_u:
    ands          r10, r8, #256         @ mode 8 enabled?
    beq           pred
    @ q1 = prediction. It starts as ngbr[0] in every byte and is then filled
    @ from the byte-reversed interleaved FILT11/FILT121 data.
    vrev64.8      q5, q1                @ reverse the interleaved bytes
    vdup.8        q1, d0[0]             @ every pel = ngbr[0]
    vext.8        q6, q6, #7
    mov           r14, #0
    vdup.32       q14 , r14
    mov           r14, #0xff
    vmov.i8       d28[0], r14           @ d28 = mask for byte 0
    vbit.32       d11, d13, d28
    movw          r14, #0xffff
    vmov.i16      d28[0], r14           @ d28 = mask for bytes 0-1
    vext.8        q6, q5, q5, #7
    subs          r6, r1, #8            @ predicted mode == 8 ?
    vbit.32       d3, d12, d28          @ first two pels of row 2
    vext.8        q6, q5, q5, #3
    vbit.32       d2, d12, d26          @ row 0
    vext.32       q14, q13, q13, #3     @ d28 = high-word mask
    vext.8        q6, q5, q5, #1
    vbit.32       d2, d12, d28          @ row 1
    vabdl.u8      q14, d2, d20
    vabal.u8      q14, d3, d21
    vadd.i16      d28, d29, d28
    vpaddl.u16    d28, d28
    lslne         r6, r0, #2
    vpaddl.u32    d28, d28
    vmov.u32      r9, d28[0]
    moveq         r6, r0
    add           r9, r6, r9
    subs          r6, r11, r9
    movgt         r11, r9
    movgt         r12, #8

@------------------------- results and final prediction -----------------------
pred:
    ldr           r7, [sp, #116]        @ r7 = pu4_sadmin
    ldr           r6, [sp, #112]        @ r6 = u4_intra_mode
    str           r11, [r7]             @ *pu4_sadmin    = minimum cost
    str           r12, [r6]             @ *u4_intra_mode = best mode

    @ Position-independent jump-table lookup: the literal holds
    @ (table - scrintra_4x4 - 8) and pc reads as scrintra_4x4 + 8 here.
    ldr           r3, scratch_intrapred_luma_4x4_prediction_addr1
scrintra_4x4:
    add           r3, r3, pc            @ r3 = &table[0]
    lsl           r12, r12, #2          @ 4 bytes per entry
    add           r3, r3, r12
    ldr           r5, [r3]              @ r5 = handler of the best mode
    and           r5, r5, #0xfffffffe   @ clear bit 0 so bx stays in ARM state
    bx            r5

@ Each handler copies the winning prediction into q15. Modes 0 and 2 are
@ rebuilt from q0 / r4; the others reuse the register filled during evaluation.
ver:
    vext.8        q0, q0, q0, #1
    vdup.32       q15, d0[1]            @ all rows = ngbr[5..8]
    b             store
hor:
    vmov.32       q15, q3
    b             store
d_c:
    vdup.8        q15, r4               @ r4 still holds the DC value
    b             store
dia_dl:
    vmov.32       q15, q7
    b             store
dia_dr:
    vmov.32       q15, q8
    b             store
ver_r:
    vmov.32       q15, q9
    b             store
hor_d:
    vmov.32       q15, q4
    b             store
ver_l:
    vmov.32       q15, q12
    b             store
hor_u:
    vmov.32       q15, q1

store:
    ldr           r4, [sp, #104]        @ r4 = dst_strd
    vst1.32       {d30[0]}, [r2], r4    @ row 0
    vst1.32       {d30[1]}, [r2], r4    @ row 1
    vst1.32       {d31[0]}, [r2], r4    @ row 2
    vst1.32       {d31[1]}, [r2], r4    @ row 3

end_func:
    vpop          {d8-d15}
    ldmfd         sp!, {r4-r12, pc}     @ restore registers and return