| ///***************************************************************************** |
| //* |
| //* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore |
| //* |
| //* Licensed under the Apache License, Version 2.0 (the "License"); |
| //* you may not use this file except in compliance with the License. |
| //* You may obtain a copy of the License at: |
| //* |
| //* http://www.apache.org/licenses/LICENSE-2.0 |
| //* |
| //* Unless required by applicable law or agreed to in writing, software |
| //* distributed under the License is distributed on an "AS IS" BASIS, |
| //* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| //* See the License for the specific language governing permissions and |
| //* limitations under the License. |
| //* |
| //*****************************************************************************/ |
| ///******************************************************************************* |
| //* @file |
| //* ihevc_deblk_luma_vert.s |
| //* |
| //* @brief |
| //* contains the function definition for luma deblocking across a horizontal |
| //* edge. the function is coded in armv8 (aarch64) neon assembly using gnu |
| //* assembler syntax |
| //* |
| //* @author |
| //* anand s |
| //* |
| //* @par list of functions: |
| //* - ihevc_deblk_luma_horz_av8() |
| //* |
| //* @remarks |
| //* none |
| //* |
| //*******************************************************************************/ |
| |
| // Code is emitted into the executable text section. |
| .text |
| .align 4 |
| |
| |
| // Lookup tables defined in another object file. The function below indexes |
| // both with 32-bit entries (ldr w, index lsl #2) and reaches them via the GOT. |
| .extern gai4_ihevc_tc_table |
| .extern gai4_ihevc_beta_table |
| // Exported entry point of this file. |
| .globl ihevc_deblk_luma_horz_av8 |
| |
| .type ihevc_deblk_luma_horz_av8, %function |
| |
| //------------------------------------------------------------------------------ |
| // ihevc_deblk_luma_horz_av8 |
| // |
| // Luma deblocking of one 4-pixel-wide segment of a horizontal edge. |
| // Rows -4..+3 (relative to x0, spaced by the stride) are read 4 bytes at a |
| // time; up to three rows on each side of the edge are rewritten. |
| // |
| // In: x0 = pu1_src, address of the first row below the edge (q0 row) |
| // w1 = source stride |
| // w2 = bs |
| // w3 = quant_param_p |
| // w4 = quant_param_q |
| // w5 = beta_offset_div2 |
| // w6 = tc_offset_div2 |
| // w7 = filter flag p (P rows are written only when this is 1) |
| // [sp] = filter flag q (Q rows are written only when this is 1) |
| // Out: none (pixels are updated in place) |
| // |
| // Returns without writing anything when tc == 0 or when d >= beta. |
| // x19-x22 and d8-d15 are saved on entry and restored on every exit path. |
| // x2-x12, x14 and v0-v7 / v16-v31 are used as scratch. |
| //------------------------------------------------------------------------------ |
| ihevc_deblk_luma_horz_av8: |
| // stmfd sp!, {x3-x12,x14} |
| // The upper halves of 32-bit argument registers are unspecified on entry |
| // (AAPCS64); widen every argument that is used in 64-bit arithmetic. |
| // NOTE(review): assumes stride, bs and both QPs are declared as 32-bit ints |
| // in the C prototype, like the two offsets - confirm against the header. |
| sxtw x1,w1 // src stride (used for address arithmetic) |
| sxtw x2,w2 // bs |
| sxtw x3,w3 // quant_param_p |
| sxtw x4,w4 // quant_param_q |
| sxtw x5,w5 // beta_offset_div2 |
| sxtw x6,w6 // tc_offset_div2 |
| stp d8,d9,[sp,#-16]! // Storing d9 using { sub sp,sp,#8; str d9,[sp] } is giving bus error. |
| // d8 is used as dummy register and stored along with d9 using stp. d8 is not used in the function. |
| stp d10,d11,[sp,#-16]! |
| stp d12,d13,[sp,#-16]! |
| stp d14,d15,[sp,#-16]! |
| stp x19, x20,[sp,#-16]! |
| stp x21, x22,[sp,#-16]! |
| |
| mov x21,x7 // x21 = filter flag p |
| ldr w22,[sp,#96] // x22 = filter flag q (first stack argument; 6 * 16 bytes were pushed above) |
| |
| // qp_luma = (quant_param_p + quant_param_q + 1) >> 1@ |
| // beta_indx = clip3(qp_luma + (beta_offset_div2 << 1), 0, 51)@ |
| // tc_indx = clip3(qp_luma + (2 * (bs >> 1)) + (tc_offset_div2 << 1), 0, 53)@ |
| add x3,x3,x4 |
| add x3,x3,#1 |
| asr x3,x3,#1 // x3 = qp_luma |
| add x7,x3,x5,lsl #1 // x7 = qp_luma + (beta_offset_div2 << 1) |
| add x3,x3,x6,lsl #1 // x3 = qp_luma + (tc_offset_div2 << 1) |
| cmp x7,#0x33 |
| mov x20,#0x33 |
| csel x7, x20, x7,gt |
| bgt l1.1532 |
| cmp x7,#0x0 |
| mov x20,#0x0 |
| csel x7, x20, x7,lt // x7 has the beta_index value |
| l1.1532: |
| // bic x2,x2,#1 |
| asr x2,x2,#1 // x2 = bs >> 1 |
| |
| add x3,x3,x2,lsl #1 // + 2 * (bs >> 1) |
| cmp x3,#0x35 |
| mov x20,#0x35 |
| csel x3, x20, x3,gt |
| bgt l1.1564 |
| cmp x3,#0x0 |
| mov x20,#0x0 |
| csel x3, x20, x3,lt // x3 has the tc_index value |
| |
| l1.1564: |
| adrp x2, :got:gai4_ihevc_beta_table |
| ldr x2, [x2, #:got_lo12:gai4_ihevc_beta_table] |
| |
| adrp x4, :got:gai4_ihevc_tc_table |
| ldr x4, [x4, #:got_lo12:gai4_ihevc_tc_table] |
| |
| ldr w5, [x2,x7,lsl #2] // beta |
| ldr w6, [x4,x3,lsl #2] // tc |
| |
| cmp x6,#0 |
| beq l1.2404 // tc == 0 : nothing to filter |
| |
| // Load 4 pixels from each of rows -3..+2 and replicate them into v23..v28. |
| // The low byte of each row (column 0) is kept in a scalar register for the |
| // dp0 / dq0 activity measures. |
| movi v0.4h, #0x2 // v0.h[0] = 2, multiplier for the strong filter taps |
| lsl x7,x6,#1 // x7 = 2 * tc |
| add x14,x1,x1,lsl #1 // x14 = 3 * stride |
| neg x19,x14 |
| ldr w8, [x0,x19] // -3 value |
| dup v1.8b,w7 // v1 = 2 * tc in every byte |
| lsl x19,x1,#1 |
| neg x19,x19 |
| ldr w10, [x0,x19] //-2 value |
| dup v23.2s,w8 // -3 value |
| neg x19,x1 |
| ldr w11, [x0,x19] //-1 value |
| dup v24.2s,w10 // -2 value |
| and x8,x8,#0xff |
| ldr w12, [x0,#0] // 0 value |
| dup v25.2s,w11 // -1 value |
| and x10,x10,#0xff |
| ldr w9, [x0,x1] // 1 value |
| dup v26.2s,w12 // 0 value |
| and x11,x11,#0xff |
| lsl x19,x1,#1 |
| ldr w2, [x0,x19] // 2 value |
| dup v27.2s,w9 // 1value |
| and x12,x12,#0xff |
| dup v28.2s,w2 // 2 value |
| and x9,x9,#0xff |
| and x2,x2,#0xff |
| |
| add x12,x12,x2 |
| subs x9,x12,x9,lsl #1 // dq0 value is stored in x9 |
| csneg x9,x9,x9,pl |
| // dq0 = abs( row[2] - 2 * row[1] + row[0] ) at column 0@ |
| |
| add x8,x8,x11 |
| subs x8,x8,x10,lsl #1 |
| csneg x8,x8,x8,pl // dp0 value is stored in x8 |
| // dp0 = abs( row[-3] - 2 * row[-2] + row[-1] ) at column 0@ |
| |
| // Same measures at column 3 (x14 = pu1_src + 3). |
| add x3,x1,x1,lsl #1 // x3 = 3 * stride |
| add x14,x0,#3 |
| |
| neg x19,x3 |
| ldrb w2,[x14,x19] // -3 value |
| lsl x19,x1,#1 |
| neg x19,x19 |
| ldrb w10,[x14,x19] // -2 value |
| neg x19,x1 |
| ldrb w11,[x14,x19] // -1 value |
| ldrb w12,[x14,#0] // 0 value |
| ldrb w3,[x14,x1] // 1 value |
| lsl x19,x1,#1 |
| ldrb w4,[x14,x19] // 2 value |
| |
| add x12,x12,x4 |
| subs x12,x12,x3,lsl #1 // dq3 value is stored in x12 |
| csneg x12,x12,x12,pl |
| // dq3 = abs( row[2] - 2 * row[1] + row[0] ) at column 3@ |
| |
| add x2,x2,x11 |
| subs x11,x2,x10,lsl #1 |
| csneg x11,x11,x11,pl // dp3 value is stored in x11 |
| // dp3 = abs( row[-3] - 2 * row[-2] + row[-1] ) at column 3@ |
| |
| add x3,x8,x9 // x3 has the d0 value |
| add x4,x11,x12 // x4 has the d3 value |
| // d0 = dp0 + dq0@ |
| // d3 = dp3 + dq3@ |
| |
| add x14,x8,x11 // x14 has the value dp |
| add x12,x12,x9 // x12 has the value dq |
| // dp = dp0 + dp3@ |
| // dq = dq0 + dq3@ |
| |
| add x11, x3, x4 // x11 has the value d |
| // d = d0 + d3@ |
| |
| cmp x11,x5 |
| bge l1.2404 // d >= beta : edge is left untouched |
| |
| // if(d < beta) |
| |
| // registers which must stay live here : x0,x1,x3 (d0),x4 (d3),x5 (beta),x6 (tc),x12 (dq),x14 (dp) |
| // registers free for use : x2,x7,x8,x9,x10 |
| |
| // Strong / weak decision. The scalar checks are interleaved with NEON work |
| // that speculatively computes the strong-filter outputs: |
| // v4, v5, v3 = filtered rows 0, 1, 2 (Q side) |
| // v2 = filtered row -1@ rows -2 and -3 are finished later |
| // Every result is clamped to [pixel - 2 * tc, pixel + 2 * tc]. |
| // Any failed check drops to the weak filter at l1.1840. |
| // |
| // column 0 : 2 * d0 < (beta >> 2) |
| // && abs(row[3] - row[0]) + abs(row[-1] - row[-4]) < (beta >> 3) |
| // && abs(row[0] - row[-1]) < ((5 * tc + 1) >> 1) |
| asr x10,x5,#2 |
| uqadd v30.8b, v26.8b , v1.8b |
| cmp x10,x3,lsl #1 |
| uqsub v31.8b, v26.8b , v1.8b |
| ble l1.1840 |
| add x10,x1,x1,lsl #1 // x10 = 3 * stride |
| uaddl v6.8h, v25.8b , v26.8b |
| neg x19,x1 |
| ldr w2, [x0,x19,lsl #2] // has the -4 value |
| neg x19, x1 |
| ldrb w7,[x0,x19] // has the -1 value |
| dup v22.2s,w2 // -4 value |
| uaddw v7.8h, v6.8h , v27.8b |
| ldrb w3,[x0,#0] // x3 has the 0 value |
| uqadd v16.8b, v27.8b , v1.8b |
| and x2,x2,#0xff |
| mul v12.8h, v7.8h, v0.h[0] |
| ldr w8, [x0,x10] // has the 3 value |
| uaddl v10.8h, v24.8b , v28.8b |
| subs x2,x2,x7 |
| uqsub v17.8b, v27.8b , v1.8b |
| dup v29.2s,w8 // 3 value |
| and x8,x8,#0xff |
| add v12.8h, v12.8h , v10.8h |
| csneg x2,x2,x2,pl |
| rshrn v20.8b, v12.8h,#3 |
| subs x8,x8,x3 |
| csneg x8,x8,x8,pl |
| umin v18.8b, v20.8b , v30.8b |
| add x8,x8,x2 |
| |
| cmp x8,x5,asr #3 |
| bge l1.1840 |
| uaddw v14.8h, v7.8h , v28.8b |
| subs x7,x3,x7 |
| umax v4.8b, v18.8b , v31.8b // v4 = filtered row 0 |
| csneg x7,x7,x7,pl |
| uqadd v30.8b, v28.8b , v1.8b |
| mov x10,#5 |
| rshrn v21.8b, v14.8h,#2 |
| mul x10, x10, x6 |
| uqsub v31.8b, v28.8b , v1.8b |
| add x10, x10,#1 |
| cmp x7,x10,asr #1 |
| umin v18.8b, v21.8b , v16.8b |
| bge l1.1840 |
| |
| // column 3 : the same three conditions, using d3 and the pixels at pu1_src + 3 |
| umax v5.8b, v18.8b , v17.8b // v5 = filtered row 1 |
| asr x10,x5,#2 |
| uaddl v16.8h, v29.8b , v28.8b |
| cmp x10,x4,lsl #1 |
| ble l1.1840 |
| |
| add x10,x1,x1,lsl #1 // x10 = 3 * stride |
| mul v16.8h, v16.8h, v0.h[0] |
| add x4,x0,#3 // x4 = address of column 3 (d3 is no longer needed) |
| |
| lsl x19,x1,#2 |
| neg x19,x19 |
| ldrb w2,[x4,x19] // -4 value |
| add v16.8h, v16.8h , v14.8h |
| neg x19,x1 |
| ldrb w7,[x4,x19] // -1 value |
| rshrn v19.8b, v16.8h,#3 |
| ldrb w3,[x4,#0] // 0 value |
| ldrb w8,[x4,x10] // 3 value |
| |
| subs x8,x8,x3 |
| umin v18.8b, v19.8b , v30.8b |
| csneg x8,x8,x8,pl |
| uaddl v6.8h, v25.8b , v24.8b |
| subs x2,x2,x7 |
| umax v3.8b, v18.8b , v31.8b // v3 = filtered row 2 |
| csneg x2,x2,x2,pl |
| uaddw v7.8h, v6.8h , v26.8b |
| add x8,x8,x2 |
| uqadd v30.8b, v25.8b , v1.8b |
| cmp x8,x5,asr #3 |
| uqsub v31.8b, v25.8b , v1.8b |
| bge l1.1840 |
| mul v12.8h, v7.8h, v0.h[0] |
| subs x7,x3,x7 |
| uqadd v16.8b, v24.8b , v1.8b |
| csneg x7,x7,x7,pl |
| uaddl v10.8h, v23.8b , v27.8b |
| mov x10,#5 |
| uqsub v17.8b, v24.8b , v1.8b |
| mul x10, x10, x6 |
| add v12.8h, v12.8h , v10.8h |
| add x10, x10,#1 |
| rshrn v20.8b, v12.8h,#3 |
| cmp x7,x10,asr #1 |
| uaddw v14.8h, v7.8h , v23.8b |
| bge l1.1840 |
| umin v18.8b, v20.8b , v30.8b |
| mov x2,#2 // de = 2 : strong filtering |
| uqadd v30.8b, v23.8b , v1.8b |
| mov w4,w21 // x4 = filter flag p |
| umax v2.8b, v18.8b , v31.8b // v2 = filtered row -1 |
| mov w5,w22 // x5 = filter flag q (beta is not needed any more) |
| rshrn v21.8b, v14.8h,#2 |
| b end_dep_deq_decision_horz |
| // x2 has the value of de |
| // x6 has the value of tc |
| // x5 has the value of beta |
| // x14 has the value of dp |
| // x12 has the value of dq |
| // x0 has the value of source address |
| // x1 has the src stride |
| |
| // Weak filtering (de = 1): derive dep / deq, the flags that enable the |
| // modification of row -2 / row 1. |
| // dep = (dp < ((beta + (beta >> 1)) >> 3))@ deq likewise with dq |
| // both are forced to 0 when tc == 1 |
| // Only the side(s) whose filter flag is set are evaluated, the other is zeroed. |
| l1.1840: |
| mov x2,#1 // de = 1 |
| |
| mov x11,x5 // x11 = beta |
| mov w4,w21 // x4 = filter flag p |
| mov w5,w22 // x5 = filter flag q |
| |
| cmp x6,#1 |
| mov x20,#0 |
| csel x9, x20, x9,eq |
| mov x20,#0 |
| csel x10, x20, x10,eq |
| beq end_dep_deq_decision_horz // tc == 1 : dep = deq = 0 |
| |
| and x7,x4,x5 |
| cmp x7,#1 |
| beq both_flags_set_horz |
| cmp x4,#0 |
| beq set_flag_dep_zero_horz |
| |
| // only flag p is set : deq = 0, dep from dp |
| add x8,x11,x11,asr #1 |
| mov x10,#0 |
| asr x8,x8,#3 |
| cmp x8,x14 |
| mov x20,#1 |
| csel x9, x20, x9,gt |
| mov x20,#0 |
| csel x9, x20, x9,le |
| b end_dep_deq_decision_horz |
| set_flag_dep_zero_horz: |
| // flag p is clear : dep = 0, deq from dq |
| add x8,x11,x11,asr #1 |
| mov x9,#0 |
| asr x8,x8,#3 |
| cmp x8,x12 |
| mov x20,#1 |
| csel x10, x20, x10,gt |
| mov x20,#0 |
| csel x10, x20, x10,le |
| b end_dep_deq_decision_horz |
| |
| both_flags_set_horz: |
| add x8,x11,x11,asr #1 |
| asr x8,x8,#3 |
| cmp x8,x14 |
| mov x20,#1 |
| csel x9, x20, x9,gt |
| mov x20,#0 |
| csel x9, x20, x9,le |
| cmp x8,x12 |
| mov x20,#1 |
| csel x10, x20, x10,gt |
| mov x20,#0 |
| csel x10, x20, x10,le |
| end_dep_deq_decision_horz: |
| |
| //x0=source address |
| //x1=stride |
| // x2 =de |
| // x4=flag p |
| //x5= flag q |
| //x6 =tc |
| // x9 =dep |
| // x10=deq |
| |
| umin v18.8b, v21.8b , v16.8b |
| cmp x2,#1 |
| uqsub v31.8b, v23.8b , v1.8b |
| beq l1.2408 // de == 1 : weak filter |
| uaddl v7.8h, v23.8b , v22.8b |
| cmp x5,#1 |
| |
| // Flag q clear: skip the Q stores, but still test flag p before the P side |
| // is written (the P rows must stay untouched when flag p is not set). |
| bne strong_filtering_check_p |
| |
| strong_filtering_q: |
| mov x12,x0 |
| st1 {v4.s}[0],[x12],x1 // row 0 |
| st1 {v5.s}[0],[x12],x1 // row 1 |
| st1 {v3.s}[0],[x12] // row 2 |
| strong_filtering_check_p: |
| cmp x4,#1 |
| bne l1.2404 |
| strong_filtering_p: |
| umax v5.8b, v18.8b , v17.8b // v5 = filtered row -2 |
| mov x12,x0 |
| mul v7.8h, v7.8h, v0.h[0] |
| sub x20,x1,#0 |
| neg x11, x20 // x11 = -stride |
| add v16.8h, v7.8h , v14.8h |
| add x12,x12,x11 |
| rshrn v19.8b, v16.8h,#3 |
| st1 {v2.s}[0],[x12],x11 // row -1 |
| umin v18.8b, v19.8b , v30.8b |
| st1 {v5.s}[0],[x12],x11 // row -2 |
| umax v3.8b, v18.8b , v31.8b // v3 = filtered row -3 |
| st1 {v3.s}[0],[x12] // row -3 |
| |
| // Common exit: restore callee-saved registers in reverse push order. |
| l1.2404: |
| // ldmfd sp!, {x3-x12,pc} |
| ldp x21, x22,[sp],#16 |
| ldp x19, x20,[sp],#16 |
| ldp d14,d15,[sp],#16 |
| ldp d12,d13,[sp],#16 |
| ldp d10,d11,[sp],#16 |
| ldp d8,d9,[sp],#16 // Loading d9 using { ldr d9,[sp]; add sp,sp,#8 } is giving bus error. |
| // d8 is used as dummy register and loaded along with d9 using ldp. d8 is not used in the function. |
| ret |
| |
| // Weak filter. State on entry: |
| // x4 = flag p, x5 = flag q, x6 = tc, x9 = dep, x10 = deq |
| // v23 = -3 value, v24 = -2 value, v25 = -1 value |
| // v26 = 0 value, v27 = 1 value, v28 = 2 value |
| |
| l1.2408: |
| |
| movi v0.4h, #0x9 |
| |
| usubl v10.8h, v26.8b , v25.8b |
| |
| mul v10.8h, v10.8h, v0.h[0] |
| |
| movi v0.4h, #0x3 |
| |
| usubl v12.8h, v27.8b , v24.8b |
| mul v12.8h, v12.8h, v0.h[0] |
| |
| dup v30.8b,w6 // duplicating the +tc value |
| |
| sub x20,x6,#0 |
| neg x12, x20 |
| dup v31.8b,w12 // duplicating the -tc value |
| |
| sub v10.8h, v10.8h , v12.8h |
| |
| srshr v10.8h, v10.8h,#4 |
| // delta = ( 9 * (row[0] - row[-1]) - 3 * (row[1] - row[-2]) + 8 ) >> 4@ |
| |
| abs v7.8h, v10.8h |
| xtn v9.8b, v7.8h |
| // storing the absolute values of delta in v9 |
| |
| sqxtn v10.8b, v10.8h |
| // storing the saturated 8-bit values of delta in v10 |
| |
| smin v11.8b, v10.8b , v30.8b |
| smax v7.8b, v31.8b , v11.8b // v7 has the value delta = clip3(delta, -tc, tc)// |
| |
| uxtl v6.8h, v25.8b |
| |
| saddw v4.8h, v6.8h , v7.8b |
| |
| sqxtun v12.8b, v4.8h // v12 = temp_p0 = clip_u8(row[-1] + delta) |
| uxtl v6.8h, v26.8b |
| ssubw v4.8h, v6.8h , v7.8b |
| sqxtun v13.8b, v4.8h // v13 = temp_q0 = clip_u8(row[0] - delta) |
| |
| mov x11,#0xa |
| mul x12, x11, x6 |
| dup v2.8b,w12 // v2 has the 10*tc value |
| mov v18.8b, v24.8b // default for row -2 : unchanged |
| dup v0.8b,w6 |
| sshr v0.8b,v0.8b,#1 // v0 = tc >> 1 |
| neg v1.8b, v0.8b // v1 = -(tc >> 1) |
| |
| cmp x4,#1 |
| bne l1.2724 // flag p clear : leave the P side alone |
| cmp x9,#1 |
| bne l1.2700 // dep clear : only row -1 is modified |
| |
| // v12 and v13 have the value temp_p0 and temp_q0 |
| // delta_p = clip3( (((row[-3] + row[-1] + 1) >> 1) - row[-2] + delta) >> 1, -(tc >> 1), tc >> 1 ) |
| uaddl v14.8h, v23.8b , v25.8b |
| rshrn v14.8b, v14.8h,#1 |
| usubl v14.8h, v14.8b , v24.8b |
| saddw v14.8h, v14.8h , v7.8b |
| sqshrn v14.8b, v14.8h,#1 |
| smin v15.8b, v14.8b , v0.8b |
| smax v14.8b, v1.8b , v15.8b |
| |
| // v14 has the delta p value |
| uxtl v16.8h, v24.8b |
| saddw v16.8h, v16.8h , v14.8b |
| sqxtun v14.8b, v16.8h |
| |
| // v14 = tmp_p1 = clip_u8(row[-2] + delta_p)@ |
| // keep the original pixel wherever abs(delta) >= 10 * tc |
| cmhs v18.8b,v9.8b,v2.8b |
| bsl v18.8b,v24.8b,v14.8b |
| |
| l1.2700: |
| mov x12,x0 |
| sub x20,x1,#0 |
| neg x11, x20 // x11 = -stride |
| add x12,x12,x11 |
| cmhs v19.8b,v9.8b,v2.8b // mask : abs(delta) >= 10 * tc |
| bsl v19.8b,v25.8b,v12.8b // row -1 : original where masked, temp_p0 elsewhere |
| st1 {v19.s}[0],[x12],x11 // row -1 |
| st1 {v18.s}[0],[x12] // row -2 |
| l1.2724: |
| cmp x5,#1 |
| bne l1.2404 // flag q clear : done |
| cmp x10,#1 |
| mov v18.8b, v27.8b // default for row 1 : unchanged |
| bne l1.2852 // deq clear : only row 0 is modified |
| |
| // delta_q = clip3( (((row[0] + row[2] + 1) >> 1) - row[1] - delta) >> 1, -(tc >> 1), tc >> 1 ) |
| uaddl v14.8h, v26.8b , v28.8b |
| rshrn v14.8b, v14.8h,#1 |
| usubl v14.8h, v14.8b , v27.8b |
| ssubw v14.8h, v14.8h , v7.8b |
| sqshrn v14.8b, v14.8h,#1 |
| smin v15.8b, v14.8b , v0.8b |
| smax v14.8b, v1.8b , v15.8b |
| // v14 has the delta q value |
| uxtl v16.8h, v27.8b |
| saddw v16.8h, v16.8h , v14.8b |
| sqxtun v14.8b, v16.8h |
| cmhs v18.8b,v9.8b,v2.8b |
| bsl v18.8b,v27.8b,v14.8b |
| l1.2852: |
| mov x12,x0 |
| cmhs v19.8b,v9.8b,v2.8b // mask : abs(delta) >= 10 * tc |
| bsl v19.8b,v26.8b,v13.8b // row 0 : original where masked, temp_q0 elsewhere |
| st1 {v19.s}[0],[x12],x1 // row 0 |
| st1 {v18.s}[0],[x12] // row 1 |
| // ldmfd sp!, {x3-x12,x15} |
| ldp x21, x22,[sp],#16 |
| ldp x19, x20,[sp],#16 |
| ldp d14,d15,[sp],#16 |
| ldp d12,d13,[sp],#16 |
| ldp d10,d11,[sp],#16 |
| ldp d8,d9,[sp],#16 // Loading d9 using { ldr d9,[sp]; add sp,sp,#8 } is giving bus error. |
| // d8 is used as dummy register and loaded along with d9 using ldp. d8 is not used in the function. |
| ret |
| |
| |