blob: db9e34770f702cb9cdb82437f0e48c7fa8918871 [file] [log] [blame]
///*****************************************************************************
//*
//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
//*
//* Licensed under the Apache License, Version 2.0 (the "License");
//* you may not use this file except in compliance with the License.
//* You may obtain a copy of the License at:
//*
//* http://www.apache.org/licenses/LICENSE-2.0
//*
//* Unless required by applicable law or agreed to in writing, software
//* distributed under the License is distributed on an "AS IS" BASIS,
//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
//* See the License for the specific language governing permissions and
//* limitations under the License.
//*
//*****************************************************************************/
///*******************************************************************************
//* @file
//* ihevc_deblk_luma_horz.s
//*
//* @brief
//* contains the function definition for luma deblocking of a horizontal edge.
//* the function is coded in armv8 (aarch64) neon assembly, gnu assembler
//* syntax
//*
//* @author
//* anand s
//*
//* @par list of functions:
//* - ihevc_deblk_luma_horz_av8()
//*
//* @remarks
//* none
//*
//*******************************************************************************/
.text
.align 4
.extern gai4_ihevc_tc_table
.extern gai4_ihevc_beta_table
.globl ihevc_deblk_luma_horz_av8
.type ihevc_deblk_luma_horz_av8, %function
///**
//*******************************************************************************
//*
//* ihevc_deblk_luma_horz_av8
//*
//* Deblocks a 4-sample-wide luma segment lying on a horizontal edge. Rows
//* above the edge (the "p" side) are read at x0 - k * stride, rows below (the
//* "q" side) at x0 + k * stride; every access covers 4 consecutive bytes.
//*
//* Inputs (syntax: GNU as, AArch64):
//*   x0          source address (first sample of row 0, just below the edge)
//*   w1          source stride
//*   w2          bs (boundary strength)
//*   w3          quant_param_p
//*   w4          quant_param_q
//*   w5          beta_offset_div2
//*   w6          tc_offset_div2
//*   w7          filter flag for the p side
//*   [sp, #0]    filter flag for the q side (first stack argument, 32 bit)
//*
//* Outputs:
//*   None in registers. Up to three rows on each side of the edge are
//*   rewritten in place (4 bytes per row).
//*
//* Register use:
//*   d8-d15 and x19-x22 are saved on entry and restored before each ret.
//*   x2-x12, x14, v0-v7 and v16-v31 are used as scratch.
//*
//* In the comments below an index such as pu1_src[-3] denotes a row offset
//* relative to the edge, i.e. the sample at x0 - 3 * stride.
//*
//*******************************************************************************
//*/
ihevc_deblk_luma_horz_av8:
    // stmfd sp!, {x3-x12,x14}
    // All integer arguments are 32 bit wide, so the upper halves of the x
    // registers are not guaranteed by the caller. They are used below in
    // 64-bit address arithmetic and 64-bit compares, hence widen them first.
    sxtw x1,w1
    sxtw x2,w2
    sxtw x3,w3
    sxtw x4,w4
    sxtw x5,w5
    sxtw x6,w6
    stp d8,d9,[sp,#-16]! // Storing d9 using { sub sp,sp,#8; str d9,[sp] } is giving bus error.
    // d8 is used as dummy register and stored along with d9 using stp. d8 is not used in the function.
    stp d10,d11,[sp,#-16]!
    stp d12,d13,[sp,#-16]!
    stp d14,d15,[sp,#-16]!
    stp x19, x20,[sp,#-16]!
    stp x21, x22,[sp,#-16]!
    mov x21,x7 // x21 keeps flag p (x7 is reused below)
    ldr w22,[sp,#96] // flag q: first stack argument, located above the 6 * 16 bytes just pushed
    add x3,x3,x4
    add x3,x3,#1
    asr x3,x3,#1 // x3 = qp_luma
    add x7,x3,x5,lsl #1 // x7 = qp_luma + (beta_offset_div2 << 1)
    add x3,x3,x6,lsl #1 // x3 = qp_luma + (tc_offset_div2 << 1)
    cmp x7,#0x33
    mov x20,#0x33
    csel x7, x20, x7,gt // clamp to the upper bound 51
    bgt l1.1532
    cmp x7,#0x0
    mov x20,#0x0
    csel x7, x20, x7,lt // x7 has the beta_index value
l1.1532:
    // bic x2,x2,#1
    asr x2,x2,#1
    add x3,x3,x2,lsl #1 // add 2 * (bs >> 1)
    cmp x3,#0x35
    mov x20,#0x35
    csel x3, x20, x3,gt // clamp to the upper bound 53
    bgt l1.1564
    cmp x3,#0x0
    mov x20,#0x0
    csel x3, x20, x3,lt // x3 has the tc_index value
    // qp_luma = (quant_param_p + quant_param_q + 1) >> 1@
    // beta_indx = clip3(qp_luma + (beta_offset_div2 << 1), 0, 51)@
    // tc_indx = clip3(qp_luma + (2 * (bs >> 1)) + (tc_offset_div2 << 1), 0, 53)@
l1.1564:
    // both tables hold 32-bit entries and are addressed through the GOT
    adrp x2, :got:gai4_ihevc_beta_table
    ldr x2, [x2, #:got_lo12:gai4_ihevc_beta_table]
    adrp x4, :got:gai4_ihevc_tc_table
    ldr x4, [x4, #:got_lo12:gai4_ihevc_tc_table]
    ldr w5, [x2,x7,lsl #2] // beta
    ldr w6, [x4,x3,lsl #2] // tc
    cmp x6,#0
    beq l1.2404 // tc == 0 : nothing to filter
    movi v0.4h, #0x2 // multiplier 2 used by the strong filter
    lsl x7,x6,#1
    add x14,x1,x1,lsl #1
    neg x19,x14
    ldr w8, [x0,x19] // -3 value
    dup v1.8b,w7 // v1 = 2 * tc in every byte
    lsl x19,x1,#1
    neg x19,x19
    ldr w10, [x0,x19] //-2 value
    dup v23.2s,w8 // -3 value
    neg x19,x1
    ldr w11, [x0,x19] //-1 value
    dup v24.2s,w10 // -2 value
    and x8,x8,#0xff // keep only the sample of column 0
    ldr w12, [x0,#0] // 0 value
    dup v25.2s,w11 // -1 value
    and x10,x10,#0xff
    ldr w9, [x0,x1] // 1 value
    dup v26.2s,w12 // 0 value
    and x11,x11,#0xff
    lsl x19,x1,#1
    ldr w2, [x0,x19] // 2 value
    dup v27.2s,w9 // 1 value
    and x12,x12,#0xff
    dup v28.2s,w2 // 2 value
    and x9,x9,#0xff
    and x2,x2,#0xff
    add x12,x12,x2
    subs x9,x12,x9,lsl #1 // dq0 value is stored in x9
    csneg x9,x9,x9,pl // absolute value
    //dq0 = abs( pu1_src[2] - 2 * pu1_src[1] + pu1_src[0] )@
    add x8,x8,x11
    subs x8,x8,x10,lsl #1
    csneg x8,x8,x8,pl // dp0 value is stored in x8
    // dp0 = abs( pu1_src[-3] - 2 * pu1_src[-2] + pu1_src[-1] )@
    add x3,x1,x1,lsl #1
    add x14,x0,#3 // x14 points at column 3 of row 0
    neg x19,x3
    ldrb w2,[x14,x19] // -3 value
    lsl x19,x1,#1
    neg x19,x19
    ldrb w10,[x14,x19] // -2 value
    neg x19,x1
    ldrb w11,[x14,x19] // -1 value
    ldrb w12,[x14,#0] // 0 value
    ldrb w3,[x14,x1] // 1 value
    lsl x19,x1,#1
    ldrb w4,[x14,x19] // 2 value
    add x12,x12,x4
    subs x12,x12,x3,lsl #1 // dq3 value is stored in x12
    csneg x12,x12,x12,pl
    // dq3 = abs( q2 - 2 * q1 + q0 ) taken at column 3
    add x2,x2,x11
    subs x11,x2,x10,lsl #1
    csneg x11,x11,x11,pl // dp3 value is stored in x11
    // dp3 = abs( p2 - 2 * p1 + p0 ) taken at column 3
    add x3,x8,x9 // x3 has the d0 value
    add x4,x11,x12 // x4 has the d3 value
    // d0 = dp0 + dq0@
    // d3 = dp3 + dq3@
    add x14,x8,x11 // x14 has the value dp
    add x12,x12,x9 // x12 has the value dq
    // dp = dp0 + dp3@
    // dq = dq0 + dq3@
    add x11, x3, x4 // x11 has the value d
    // d = d0 + d3@
    cmp x11,x5
    bge l1.2404
    // if(d < beta)
    // registers which cannot be altered : x3,x4 x5,x6,x12,x14,x0,x1,x11
    // registers for use: x2,x7,x8,x9,x10,
    // Strong-filter decision for column 0. The vector instructions
    // interleaved here already compute strong-filter results for the q side.
    asr x10,x5,#2
    uqadd v30.8b, v26.8b , v1.8b
    cmp x10,x3,lsl #1
    uqsub v31.8b, v26.8b , v1.8b
    ble l1.1840 // 2 * d0 >= (beta >> 2)
    add x10,x1,x1,lsl #1
    uaddl v6.8h, v25.8b , v26.8b
    neg x19,x1
    ldr w2, [x0,x19,lsl #2] // has the -4 value
    neg x19, x1
    ldrb w7,[x0,x19] // has the -1 value
    dup v22.2s,w2 // -4 value
    uaddw v7.8h, v6.8h , v27.8b
    ldrb w3,[x0,#0] // x3 has the 0 value
    uqadd v16.8b, v27.8b , v1.8b
    and x2,x2,#0xff
    mul v12.8h, v7.8h, v0.h[0]
    ldr w8, [x0,x10] // has the 3 value
    uaddl v10.8h, v24.8b , v28.8b
    subs x2,x2,x7
    uqsub v17.8b, v27.8b , v1.8b
    dup v29.2s,w8 // 3 value
    and x8,x8,#0xff
    add v12.8h, v12.8h , v10.8h
    csneg x2,x2,x2,pl
    rshrn v20.8b, v12.8h,#3
    subs x8,x8,x3
    csneg x8,x8,x8,pl
    umin v18.8b, v20.8b , v30.8b
    add x8,x8,x2
    cmp x8,x5,asr #3
    bge l1.1840
    uaddw v14.8h, v7.8h , v28.8b
    subs x7,x3,x7
    umax v4.8b, v18.8b , v31.8b // v4 = strong-filtered row 0
    csneg x7,x7,x7,pl
    uqadd v30.8b, v28.8b , v1.8b
    mov x10,#5
    rshrn v21.8b, v14.8h,#2
    mul x10, x10, x6
    uqsub v31.8b, v28.8b , v1.8b
    add x10, x10,#1
    cmp x7,x10,asr #1
    umin v18.8b, v21.8b , v16.8b
    bge l1.1840
    // if( (2 * d3 < (beta >> 2)&& ( abs(pu1_src[3] - pu1_src[0]) + abs(pu1_src[-1] - pu1_src[-4]) < (beta >> 3) )
    // && abs(pu1_src[0] - pu1_src[-1]) < ( (5 * tc + 1) >> 1 ) )
    // Same decision repeated for column 3 (x4 = d3, samples read at x0 + 3).
    umax v5.8b, v18.8b , v17.8b // v5 = strong-filtered row 1
    asr x10,x5,#2
    uaddl v16.8h, v29.8b , v28.8b
    cmp x10,x4,lsl #1
    ble l1.1840
    add x10,x1,x1,lsl #1
    mul v16.8h, v16.8h, v0.h[0]
    add x4,x0,#3
    lsl x19,x1,#2
    neg x19,x19
    ldrb w2,[x4,x19] // -4 value
    add v16.8h, v16.8h , v14.8h
    neg x19,x1
    ldrb w7,[x4,x19] // -1 value
    rshrn v19.8b, v16.8h,#3
    ldrb w3,[x4,#0] // 0 value
    ldrb w8,[x4,x10] // 3 value
    // ubfx x7,x2,#24,#8 @ has the -1 value
    // and x2,#0xff @ has the -4 value
    // ubfx x8,x3,#24,#8 @ has the 3 value
    // and x3,#0xff @ x4 has the 0 value
    subs x8,x8,x3
    umin v18.8b, v19.8b , v30.8b
    csneg x8,x8,x8,pl
    uaddl v6.8h, v25.8b , v24.8b
    subs x2,x2,x7
    umax v3.8b, v18.8b , v31.8b // v3 = strong-filtered row 2
    csneg x2,x2,x2,pl
    uaddw v7.8h, v6.8h , v26.8b
    add x8,x8,x2
    uqadd v30.8b, v25.8b , v1.8b
    cmp x8,x5,asr #3
    uqsub v31.8b, v25.8b , v1.8b
    bge l1.1840
    mul v12.8h, v7.8h, v0.h[0]
    subs x7,x3,x7
    uqadd v16.8b, v24.8b , v1.8b
    csneg x7,x7,x7,pl
    uaddl v10.8h, v23.8b , v27.8b
    mov x10,#5
    uqsub v17.8b, v24.8b , v1.8b
    mul x10, x10, x6
    add v12.8h, v12.8h , v10.8h
    add x10, x10,#1
    rshrn v20.8b, v12.8h,#3
    cmp x7,x10,asr #1
    uaddw v14.8h, v7.8h , v23.8b
    bge l1.1840
    // Both columns passed: strong filtering is selected (de = 2).
    umin v18.8b, v20.8b , v30.8b
    mov x2,#2
    uqadd v30.8b, v23.8b , v1.8b
    mov w4,w21 // x4 = flag p
    umax v2.8b, v18.8b , v31.8b // v2 = strong-filtered row -1
    mov w5,w22 // x5 = flag q
    rshrn v21.8b, v14.8h,#2
    b end_dep_deq_decision_horz
    // x2 has the value of de
    // x6 has the value of tc
    // x5 has the value of beta
    // x14 has the value of dp
    // x12 has the value of dq
    // x0 has the value of source address
    // x1 has the src stride
l1.1840:
    // Strong filtering rejected: de = 1. Derive dep (x9) and deq (x10) by
    // comparing dp and dq against (beta + (beta >> 1)) >> 3.
    mov x2,#1
    mov x11,x5 // x11 = beta, x5 is reloaded with flag q
    mov w4,w21 // x4 = flag p
    mov w5,w22 // x5 = flag q
    cmp x6,#1
    mov x20,#0
    csel x9, x20, x9,eq
    mov x20,#0
    csel x10, x20, x10,eq
    beq end_dep_deq_decision_horz // tc == 1 : dep = deq = 0
    and x7,x4,x5
    cmp x7,#1
    beq both_flags_set_horz
    cmp x4,#0
    beq set_flag_dep_zero_horz
    // flag p set, flag q not set: deq = 0, dep from dp
    add x8,x11,x11,asr #1
    mov x10,#0
    asr x8,x8,#3
    cmp x8,x14
    mov x20,#1
    csel x9, x20, x9,gt
    mov x20,#0
    csel x9, x20, x9,le
    b end_dep_deq_decision_horz
set_flag_dep_zero_horz:
    // flag p not set: dep = 0, deq from dq
    add x8,x11,x11,asr #1
    mov x9,#0
    asr x8,x8,#3
    cmp x8,x12
    mov x20,#1
    csel x10, x20, x10,gt
    mov x20,#0
    csel x10, x20, x10,le
    b end_dep_deq_decision_horz
both_flags_set_horz:
    add x8,x11,x11,asr #1
    asr x8,x8,#3
    cmp x8,x14
    mov x20,#1
    csel x9, x20, x9,gt
    mov x20,#0
    csel x9, x20, x9,le
    cmp x8,x12
    mov x20,#1
    csel x10, x20, x10,gt
    mov x20,#0
    csel x10, x20, x10,le
end_dep_deq_decision_horz:
    //x0=source address
    //x1=stride
    // x2 =de
    // x4=flag p
    //x5= flag q
    //x6 =tc
    // x9 =dep
    // x10=deq
    // add x14,x1,x1,lsl #1
    // lsl x7,x6,#1
    // vdup.8 d1,x7
    // vmov.i16 d0,#0x2
    umin v18.8b, v21.8b , v16.8b
    cmp x2,#1
    uqsub v31.8b, v23.8b , v1.8b
    beq l1.2408 // de == 1 : take the non-strong path
    uaddl v7.8h, v23.8b , v22.8b
    cmp x5,#1
    // flag q not set: skip the q-side stores but still test flag p before
    // touching the p side
    bne strong_filtering_check_p_horz
strong_filtering_q:
    // store the filtered rows 0, 1 and 2
    mov x12,x0
    st1 {v4.s}[0],[x12],x1
    st1 {v5.s}[0],[x12],x1
    st1 {v3.s}[0],[x12]
strong_filtering_check_p_horz:
    cmp x4,#1
    bne l1.2404
strong_filtering_p:
    // finish the p-side computation and store rows -1, -2 and -3
    umax v5.8b, v18.8b , v17.8b
    mov x12,x0
    mul v7.8h, v7.8h, v0.h[0]
    sub x20,x1,#0
    neg x11, x20 // x11 = -stride
    add v16.8h, v7.8h , v14.8h
    add x12,x12,x11
    rshrn v19.8b, v16.8h,#3
    st1 {v2.s}[0],[x12],x11
    umin v18.8b, v19.8b , v30.8b
    st1 {v5.s}[0],[x12],x11
    umax v3.8b, v18.8b , v31.8b
    st1 {v3.s}[0],[x12]
l1.2404:
    // common exit: restore the callee-saved registers in reverse order
    // ldmfd sp!, {x3-x12,pc}
    ldp x21, x22,[sp],#16
    ldp x19, x20,[sp],#16
    ldp d14,d15,[sp],#16
    ldp d12,d13,[sp],#16
    ldp d10,d11,[sp],#16
    ldp d8,d9,[sp],#16 // Loading d9 using { ldr d9,[sp]; add sp,sp,#8 } is giving bus error.
    // d8 is used as dummy register and loaded along with d9 using ldp. d8 is not used in the function.
    ret
    // x4=flag p
    //x5= flag q
    //x6 =tc
    // x9 =dep
    // x10=deq
    // v22 -4 value
    // v23 -3 value
    // v24 -2 value
    // v25 -1 value
    // v26 0 value
    // v27 1 value
    // v28 2 value
    // v29 3 value
l1.2408:
    // Non-strong path (de == 1).
    movi v0.4h, #0x9
    usubl v10.8h, v26.8b , v25.8b
    mul v10.8h, v10.8h, v0.h[0]
    movi v0.4h, #0x3
    usubl v12.8h, v27.8b , v24.8b
    mul v12.8h, v12.8h, v0.h[0]
    dup v30.8b,w6 // duplicating the +tc value
    sub x20,x6,#0
    neg x12, x20
    dup v31.8b,w12 // duplicating the -tc value
    sub v10.8h, v10.8h , v12.8h
    srshr v10.8h, v10.8h,#4
    // delta = ( 9 * (pu1_src[0] - pu1_src[-1]) - 3 * (pu1_src[1] - pu1_src[-2]) + 8 ) >> 4@
    abs v7.8h, v10.8h
    xtn v9.8b, v7.8h
    // storing the absolute values of delta in v9
    sqxtn v10.8b, v10.8h
    // storing the saturated 8-bit values of delta in v10
    smin v11.8b, v10.8b , v30.8b
    smax v7.8b, v31.8b , v11.8b // v7 has the value delta = clip3(delta, -tc, tc)//
    uxtl v6.8h, v25.8b
    saddw v4.8h, v6.8h , v7.8b
    sqxtun v12.8b, v4.8h // v12 = clip_u8(pu1_src[-1] + delta)
    uxtl v6.8h, v26.8b
    ssubw v4.8h, v6.8h , v7.8b
    sqxtun v13.8b, v4.8h // v13 = clip_u8(pu1_src[0] - delta)
    mov x11,#0xa
    mul x12, x11, x6
    dup v2.8b,w12 // v2 has the 10*tc value
    mov v18.8b, v24.8b // default for row -2: unmodified
    dup v0.8b,w6
    sshr v0.8b,v0.8b,#1 // v0 = tc >> 1
    neg v1.8b, v0.8b // v1 = -(tc >> 1)
    cmp x4,#1
    bne l1.2724 // flag p not set: leave the p side untouched
    cmp x9,#1
    bne l1.2700 // dep not set: row -2 stays unmodified
    // v12 and v13 have the value temp_p0 and temp_q0
    uaddl v14.8h, v23.8b , v25.8b
    rshrn v14.8b, v14.8h,#1
    usubl v14.8h, v14.8b , v24.8b
    saddw v14.8h, v14.8h , v7.8b
    sqshrn v14.8b, v14.8h,#1
    smin v15.8b, v14.8b , v0.8b
    smax v14.8b, v1.8b , v15.8b
    // v14 has the delta p value
    uxtl v16.8h, v24.8b
    saddw v16.8h, v16.8h , v14.8b
    sqxtun v14.8b, v16.8h
    // v14 =tmp_p1 = clip_u8(pu1_src[-2 * src_strd] + delta_p)@
    cmhs v18.8b,v9.8b,v2.8b // lanes where abs(delta) >= 10 * tc
    bsl v18.8b,v24.8b,v14.8b // those lanes keep the original sample
l1.2700:
    mov x12,x0
    sub x20,x1,#0
    neg x11, x20 // x11 = -stride
    add x12,x12,x11
    cmhs v19.8b,v9.8b,v2.8b
    bsl v19.8b,v25.8b,v12.8b
    st1 {v19.s}[0],[x12],x11 // row -1
    st1 {v18.s}[0],[x12] // row -2
l1.2724:
    cmp x5,#1
    bne l1.2404 // flag q not set: done
    cmp x10,#1
    mov v18.8b, v27.8b // default for row 1: unmodified
    bne l1.2852 // deq not set: row 1 stays unmodified
    uaddl v14.8h, v26.8b , v28.8b
    rshrn v14.8b, v14.8h,#1
    usubl v14.8h, v14.8b , v27.8b
    ssubw v14.8h, v14.8h , v7.8b
    sqshrn v14.8b, v14.8h,#1
    smin v15.8b, v14.8b , v0.8b
    smax v14.8b, v1.8b , v15.8b
    // v14 has the delta q value
    uxtl v16.8h, v27.8b
    saddw v16.8h, v16.8h , v14.8b
    sqxtun v14.8b, v16.8h
    cmhs v18.8b,v9.8b,v2.8b
    bsl v18.8b,v27.8b,v14.8b
l1.2852:
    mov x12,x0
    cmhs v19.8b,v9.8b,v2.8b
    bsl v19.8b,v26.8b,v13.8b
    st1 {v19.s}[0],[x12],x1 // row 0
    st1 {v18.s}[0],[x12] // row 1
    // ldmfd sp!, {x3-x12,x15}
    ldp x21, x22,[sp],#16
    ldp x19, x20,[sp],#16
    ldp d14,d15,[sp],#16
    ldp d12,d13,[sp],#16
    ldp d10,d11,[sp],#16
    ldp d8,d9,[sp],#16 // Loading d9 using { ldr d9,[sp]; add sp,sp,#8 } is giving bus error.
    // d8 is used as dummy register and loaded along with d9 using ldp. d8 is not used in the function.
    ret