// common/arm64/ihevc_deblk_luma_vert.s - platform/external/libhevc - Git at Google

 ///*****************************************************************************
 //*
 //* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
 //*
 //* Licensed under the Apache License, Version 2.0 (the "License");
 //* you may not use this file except in compliance with the License.
 //* You may obtain a copy of the License at:
 //*
 //* http://www.apache.org/licenses/LICENSE-2.0
 //*
 //* Unless required by applicable law or agreed to in writing, software
 //* distributed under the License is distributed on an "AS IS" BASIS,
 //* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 //* See the License for the specific language governing permissions and
 //* limitations under the License.
 //*
 //*****************************************************************************/
 ///**
 ///*******************************************************************************
 //* //file
 //*  ihevc_deblk_luma_vert.s
 //*
 //* //brief
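 //*  contains the function definition for luma deblocking of a vertical edge.
 //* the function is coded in armv8 (aarch64) neon assembly
 //* (the original note said it can be compiled using rvct; the directives
 //* used in this file are gnu assembler directives)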
 //*
 //* //author
 //*  anand s
 //*
 //* //par list of functions:
 //*
 //*
 //* //remarks
 //*  none
 //*
 //*******************************************************************************/

 .text
 .align 4


 .extern gai4_ihevc_tc_table
 .extern gai4_ihevc_beta_table

 .globl ihevc_deblk_luma_vert_av8

 .type ihevc_deblk_luma_vert_av8, %function

 ihevc_deblk_luma_vert_av8:
 // Luma deblocking of a vertical edge over 4 rows (AArch64 NEON).
 //
 // Registers on entry, as consumed by the code in this file. Argument names
 // come from the pseudo-code comments further down; the C prototype is not
 // visible here -- TODO confirm against the header:
 //   x0     = pu1_src (pu1_src[0] is the first sample on the q side)
 //   w1     = src_strd, added to the row pointer after every row
 //   w2     = bs
 //   w3,w4  = quant_param_p, quant_param_q
 //   w5     = beta_offset_div2
 //   w6     = tc_offset_div2
 //   w7     = filter flag for the p side
 //   [sp]   = filter flag for the q side (loaded as a 32-bit word)
 //
 // All of these are later consumed through their 64-bit X views (index
 // clipping, post-indexed addressing, flag compares). AAPCS64 leaves the
 // upper 32 bits of a register that carries a 32-bit argument unspecified,
 // so each one is widened here before it is used.
 // NOTE(review): this assumes w1-w4 and w7 are 32-bit ints, like w5/w6 and
 // the stack flag already are in the original code -- confirm.
 //
 // This section also computes qp_luma and the clipped beta table index (x7).

     sxtw        x1,w1                       // stride is used as a 64-bit post-index
     sxtw        x2,w2
     sxtw        x3,w3
     sxtw        x4,w4
     sxtw        x5,w5
     sxtw        x6,w6
     stp         d8,d9,[sp,#-16]!
     stp         d10,d11,[sp,#-16]!
     stp         d12,d13,[sp,#-16]!
     stp         d14,d15,[sp,#-16]!
     stp         x19, x20,[sp,#-16]!
     stp         x21, x22,[sp,#-16]!
     mov         w21,w7                      // x21 = flag p, zero-extended like the ldr below
     ldr         w22,[sp,#96]                // x22 = flag q; 96 = the six 16-byte pairs pushed above
     add         x3,x3,x4
     add         x3,x3,#1
     asr         x3,x3,#1                    // x3 = qp_luma
     add         x7,x3,x5,lsl #1             // x7 = qp_luma + (beta_offset_div2 << 1)
     add         x3,x3,x6,lsl #1             // x3 = qp_luma + (tc_offset_div2 << 1)
     cmp         x7,#0x33
     mov         x20,#0x33
     csel        x7, x20, x7,gt              // clip to 51 from above
     bgt         l1.56
     cmp         x7,#0x0
     mov         x20,#0x0
     csel        x7, x20, x7,lt              // x7 has the beta_index value
 l1.56:
 // Finish the tc table index: add 2 * (bs >> 1) to the partial sum in x3 and
 // clip the result to [0, 53]. x2 is read as a 64-bit value here.

 //     bic      x2,x2,#1
     asr         x2,x2,#1                    // x2 = bs >> 1

     add         x3,x3,x2,lsl #1
     cmp         x3,#0x35
     mov         x20,#0x35
     csel        x3, x20, x3,gt              // clip to 53 from above
     bgt         l1.88
     cmp         x3,#0x0
     mov         x20,#0x0
     csel        x3, x20, x3,lt              // x3 has the tc_index value

 //    qp_luma = (quant_param_p + quant_param_q + 1) >> 1//
 //    beta_indx = clip3(qp_luma + (beta_offset_div2 << 1), 0, 51)//
 //    tc_indx = clip3(qp_luma + (2 * (bs >> 1)) + (tc_offset_div2 << 1), 0, 53)//

 l1.88:
 // Look up beta and tc, load the 4-row x 8-column window pu1_src[-4..3],
 // compute the activity measures d0/d3/dp/dq/d, and evaluate the strong
 // filter conditions on rows 0 and 3. Scalar decision code is interleaved
 // with NEON code that precomputes the strong-filter outputs.
     adrp        x2, :got:gai4_ihevc_beta_table
     ldr         x2, [x2, #:got_lo12:gai4_ihevc_beta_table]   // x2 = beta table address (from the GOT)

     movi        v18.8b, #0x2                // v18 = 2 in every byte
     adrp        x4, :got:gai4_ihevc_tc_table
     ldr         x4, [x4, #:got_lo12:gai4_ihevc_tc_table]     // x4 = tc table address (from the GOT)

     ldr         w5,[x2,x7,lsl #2]           // beta
     movi        v16.8h, #0x2                // v16 = 2 in every halfword (mla multiplier)
     ldr         w6,[x4,x3,lsl #2]           // tc
     lsl         x8,x6,#1                    // x8 = 2 * tc
     cmp         x6,#0
     dup         v19.8b,w8                   // v19 = 2 * tc in every byte
     sub         x7,x0,#4                    // x7 -> pu1_src[-4] of row 0
     movi        v23.8b, #0x3                // v23 = 3 in every byte
     beq         l1.964                      // tc == 0: leave without filtering


     sub         x19,x0,#3
     ld1         {v15.8b},[x7],x1            // row 0, pu1_src[-4..3]
     ldrb        w8,[x19]                    // -3 value
     ld1         {v1.8b},[x7],x1             // row 1
     ldrb        w10,[x19,#1]                //-2 value
     ld1         {v29.8b},[x7],x1            // row 2
     ldrb        w11,[x19,#2]                //-1 value
     ld1         {v0.8b},[x7]                // row 3
     ldrb        w12,[x0,#0]                 // 0 value
     ldrb        w9,[x0,#1]                  // 1 value
     trn1        v24.8b,v15.8b,v1.8b
     trn2        v1.8b,v15.8b,v1.8b
     ldrb        w2,[x0,#2]                  // 2 value
     trn1        v2.8b,v29.8b,v0.8b
     trn2        v0.8b,v29.8b,v0.8b
     add         x12,x12,x2
     subs        x9,x12,x9,lsl #1            // dq0 value is stored in x9
     csneg       x9,x9,x9,pl                 // absolute value
 //dq0 = abs( pu1_src[2] - 2 * pu1_src[1] + pu1_src[0] )//
     mov         v29.8b,v24.8b
     trn1        v24.4h,v29.4h,v2.4h
     trn2        v2.4h,v29.4h,v2.4h
     add         x8,x8,x11
     mov         v15.8b,v1.8b
     trn1        v1.4h,v15.4h,v0.4h
     trn2        v0.4h,v15.4h,v0.4h
     subs        x8,x8,x10,lsl #1
     csneg       x8,x8,x8,pl                 // dp0 value is stored in x8
 //  dp0 = abs( pu1_src[-3] - 2 * pu1_src[-2] + pu1_src[-1] )//

 // After the byte and halfword transposes each register holds two columns of
 // the window (4 rows each, low word | high word):
 //   v24 = [src[-4] | src[0]]    v1 = [src[-3] | src[1]]
 //   v2  = [src[-2] | src[2]]    v0 = [src[-1] | src[3]]
 // The dup instructions below give each column its own register:
 //   v6 = src[-3], v3 = src[-2], v2 = src[-1], v4 = src[0], v5 = src[1],
 //   v7 = src[2], v22 = src[3]

     add         x14,x1,x1,lsl #1
     add         x14,x0,x14                  // x14 -> pu1_src[0] of row 3

     sub         x19,x14,#3
     dup         v4.2s, v24.s[1]
     ldrb        w2,[x19]                    // -3 value
     dup         v7.2s, v2.s[1]
     ldrb        w10,[x19,#1]                // -2 value
     dup         v3.2s, v2.s[0]
     ldrb        w11,[x19,#2]                // -1 value
     dup         v5.2s, v1.s[1]
     ldrb        w12,[x14,#0]                // 0 value
     dup         v6.2s, v1.s[0]
     ldrb        w3,[x14,#1]                 // 1 value
     dup         v2.2s, v0.s[0]
     ldrb        w4,[x14,#2]                 // 2 value


     add         x12,x12,x4
     subs        x12,x12,x3,lsl #1           // dq3value is stored in x12
     csneg       x12,x12,x12,pl
 //    dq3 = abs( pu1_src[3 * src_strd + 2] - 2 * pu1_src[3 * src_strd + 1]+ pu1_src[3 * src_strd + 0] )//


     add         x2,x2,x11
     subs        x11,x2,x10,lsl #1
     csneg       x11,x11,x11,pl              // dp3 value is stored in x11
 //    dp3 = abs( pu1_src[3 * src_strd - 3] - 2 * pu1_src[3 * src_strd - 2]   + pu1_src[3 * src_strd - 1] )//


     add         x3,x8,x9                    // x3 has the d0 value
     add         x4,x11,x12                  // x4 has the d3 value


 //    d0 = dp0 + dq0//
 //    d3 = dp3 + dq3//

     add         x14,x8,x11                  // x14 has the value dp
     add         x12,x12,x9                  // x12 has the value  dq
 //    dp = dp0 + dp3//
 //   dq = dq0 + dq3//

     add         x11, x3, x4                 // x11 has the value d

 //   d = d0 + d3//


     cmp         x11,x5
     dup         v22.2s, v0.s[1]
     bge         l1.964                      // d >= beta: leave without filtering

 //    if(d < beta)


     // registers which cannot be altered : x3,x4 x5,x6,x12,x14,x0,x1,x11

     // registers for use: x2,x7,x8,x9,x10,
     uqsub       v30.8b,v7.8b,v19.8b         // v30 = src[2] - 2*tc (saturating)
     asr         x10,x5,#2
     uqadd       v31.8b,v7.8b,v19.8b         // v31 = src[2] + 2*tc (saturating)
     cmp         x10,x3,lsl #1
     uaddl       v0.8h,v5.8b,v4.8b           // v0 = src[1] + src[0]
     ble         l1.336                      // (beta >> 2) <= 2 * d0: weak filter

     sub         x19,x0,4
     ldrb        w2,[x19]                    // row 0, -4 value
     uaddw       v0.8h,  v0.8h ,  v2.8b      // v0 = src[1] + src[0] + src[-1]
     ldrb        w7,[x19,#3]                 // row 0, -1 value
     umull       v20.8h, v7.8b, v23.8b       // v20 = 3 * src[2]
     ldrb        w3,[x0,#0]                  // row 0, 0 value
     umlal       v20.8h, v22.8b, v18.8b      // v20 += 2 * src[3]
     ldrb        w8,[x0,#3]                  // row 0, 3 value
 //   ubfx   x7,x2,#24,#8           // has the -1 value
 //  and    x2,#0xff               // has the -4 value
 //  ubfx   x8,x3,#24,#8           // has the 3 value
 //  and    x3,#0xff               // x4 has the 0 value

     add         v20.8h,  v20.8h ,  v0.8h
     subs        x8,x8,x3
     rshrn       v22.8b,v20.8h,#3            // v22 = (2*src[3] + 3*src[2] + src[1] + src[0] + src[-1] + 4) >> 3
     csneg       x8,x8,x8,pl                 // x8 = abs(src[3] - src[0])
     subs        x2,x2,x7
     umin        v21.8b,  v22.8b ,  v31.8b
     csneg       x2,x2,x2,pl                 // x2 = abs(src[-4] - src[-1])
     umax        v22.8b,  v21.8b ,  v30.8b   // v22 = new src[2], clipped to src[2] +/- 2*tc
     add         x8,x8,x2
     uaddl       v20.8h,v7.8b,v3.8b          // v20 = src[2] + src[-2]
     cmp         x8,x5,asr #3
     mla         v20.8h, v0.8h, v16.8h       // v20 += 2 * (src[1] + src[0] + src[-1])
     bge         l1.336                      // sum of the two abs terms >= (beta >> 3): weak filter
     uaddw       v0.8h,  v0.8h ,  v7.8b      // v0 = src[-1] + src[0] + src[1] + src[2]
     subs        x7,x3,x7
     rshrn       v20.8b,v20.8h,#3            // v20 = new src[0] before clipping
     csneg       x7,x7,x7,pl                 // x7 = abs(src[0] - src[-1])
     rshrn       v0.8b,v0.8h,#2              // v0 = new src[1] before clipping
     mov         x10,#5
     uqadd       v30.8b,v5.8b,v19.8b         // v30 = src[1] + 2*tc
     mul         x10, x10, x6
     uqsub       v31.8b,v5.8b,v19.8b         // v31 = src[1] - 2*tc
     add         x10, x10,#1
     cmp         x7,x10,asr #1
     bge         l1.336                      // abs(src[0] - src[-1]) >= (5*tc + 1) >> 1: weak filter


 //        if( (2 * d3 < (beta >> 2)&& ( abs(pu1_src[3] - pu1_src[0]) + abs(pu1_src[-1] - pu1_src[-4])  < (beta >> 3) )
 //            && abs(pu1_src[0] - pu1_src[-1]) < ( (5 * tc + 1) >> 1 ) )

 // The same three conditions are now checked on row 3 (using d3).

     asr         x10,x5,#2
     uqsub       v25.8b,v4.8b,v19.8b         // v25 = src[0] - 2*tc
     cmp         x10,x4,lsl #1
     uqadd       v21.8b,v4.8b,v19.8b         // v21 = src[0] + 2*tc
     ble         l1.336
     umin        v26.8b,  v20.8b ,  v21.8b
     add         x4,x1,x1,lsl #1
     add         x4,x4,x0                    // x4 -> pu1_src[0] of row 3
     umax        v20.8b,  v26.8b ,  v25.8b   // v20 = new src[0], clipped
     sub         x19,x4,#4
     ldrb        w2,[x19]                    // row 3, -4 value
     umin        v19.8b,  v0.8b ,  v30.8b    // overwrites v19; 2*tc is rebuilt below
     ldrb        w7,[x19,#3]                 // row 3, -1 value
     umax        v21.8b,  v19.8b ,  v31.8b   // v21 = new src[1], clipped
     ldrb        w3,[x4,#0]                  // row 3, 0 value
     lsl         x10,x6,#1
     ldrb        w8,[x4,#3]                  // row 3, 3 value
 //   ubfx   x7,x2,#24,#8           // has the -1 value
 //  and    x2,#0xff               // has the -4 value
 //  ubfx   x8,x3,#24,#8           // has the 3 value
 //  and    x3,#0xff               // x4 has the 0 value
     uaddl       v0.8h,v2.8b,v3.8b           // v0 = src[-1] + src[-2]
     dup         v19.8b,w10                  // v19 = 2 * tc again
     subs        x8,x8,x3
     uaddw       v0.8h,  v0.8h ,  v4.8b      // v0 = src[-2] + src[-1] + src[0]
     csneg       x8,x8,x8,pl
     uqadd       v30.8b,v2.8b,v19.8b         // v30 = src[-1] + 2*tc
     subs        x2,x2,x7
     uqsub       v31.8b,v2.8b,v19.8b         // v31 = src[-1] - 2*tc
     csneg       x2,x2,x2,pl
     uaddl       v26.8h,v5.8b,v6.8b          // v26 = src[1] + src[-3]
     add         x8,x8,x2
     mla         v26.8h, v0.8h, v16.8h       // v26 += 2 * (src[-2] + src[-1] + src[0])
     cmp         x8,x5,asr #3
     bge         l1.336
     rshrn       v26.8b,v26.8h,#3            // v26 = new src[-1] before clipping
     subs        x7,x3,x7
     uqadd       v27.8b,v3.8b,v19.8b         // v27 = src[-2] + 2*tc
     csneg       x7,x7,x7,pl
     uqsub       v28.8b,v3.8b,v19.8b         // v28 = src[-2] - 2*tc
     mov         x10,#5
     umin        v16.8b,  v26.8b ,  v30.8b   // overwrites the constant in v16
     mul         x10, x10, x6
     add         x10, x10,#1
     cmp         x7,x10,asr #1
     umax        v26.8b,  v16.8b ,  v31.8b   // v26 = new src[-1], clipped
     bge         l1.336
     uqadd       v30.8b,v6.8b,v19.8b         // v30 = src[-3] + 2*tc

     mov         x2,#2                       // de = 2: strong filter
     mov         x4,x21                      // x4 = flag p saved at entry
     uqsub       v31.8b,v6.8b,v19.8b         // v31 = src[-3] - 2*tc
     mov         x5,x22                      // x5 = flag q saved at entry
     b           end_dep_deq_decision
 // State expected at l1.336 (which then sets x2 = de):
 // x6 has the value of tc
 // x5 has the value of beta
 // x14 has the value of dp
 // x12 has the value of dq
 // x0 has the value of source address
 // x1 has the src stride

 l1.336:
 // Weak-filter path: set de = 1, reload the two filter flags and decide dep
 // and deq (whether the second sample on each side is also modified).
     mov         x2,#1
 l1.424:
     mov         x11,x5                      // x11 = beta (x5 is reused for flag q)
     mov         x4,x21                      // x4 = flag p saved at entry
     mov         x5,x22                      // x5 = flag q saved at entry

     cmp         x6,#1
     mov         x20,#0
     csel        x9, x20, x9,eq              // tc == 1: dep = 0
     mov         x20,#0
     csel        x10, x20, x10,eq            // tc == 1: deq = 0
     beq         end_dep_deq_decision

     and         x7,x4,x5

     cmp         x7,#1
     beq         both_flags_set              // (flag p & flag q) == 1
     cmp         x4,#0
     beq         set_flag_dep_zero           // flag p == 0


 // Reached when flag p != 0 and the two flags do not AND to 1:
 // deq = 0, dep = (((beta + (beta >> 1)) >> 3) > dp) ? 1 : 0
     add         x8,x11,x11,asr #1
     mov         x10,#0
     asr         x8,x8,#3
     cmp         x8,x14
     mov         x20,#1
     csel        x9, x20, x9,gt
     mov         x20,#0
     csel        x9, x20, x9,le
     b           end_dep_deq_decision
 set_flag_dep_zero:
 // Reached when flag p (x4) is zero:
 // dep = 0, deq = (((beta + (beta >> 1)) >> 3) > dq) ? 1 : 0
 // x11 = beta, x12 = dq.

     add         x8,x11,x11,asr #1
     mov         x9,#0
     asr         x8,x8,#3
     cmp         x8,x12
     mov         x20,#1
     csel        x10, x20, x10,gt
     mov         x20,#0
     csel        x10, x20, x10,le
     b           end_dep_deq_decision

 both_flags_set:
 // Reached when (flag p & flag q) == 1. Both side decisions use the same
 // threshold x8 = (beta + (beta >> 1)) >> 3:
 //   dep (x9)  = (x8 > dp) ? 1 : 0     with dp in x14
 //   deq (x10) = (x8 > dq) ? 1 : 0     with dq in x12
 // Falls through into end_dep_deq_decision.
     add         x8,x11,x11,asr #1
     asr         x8,x8,#3
     cmp         x8,x14
     mov         x20,#1
     csel        x9, x20, x9,gt
     mov         x20,#0
     csel        x9, x20, x9,le
     cmp         x8,x12
     mov         x20,#1
     csel        x10, x20, x10,gt
     mov         x20,#0
     csel        x10, x20, x10,le
 end_dep_deq_decision:
 // Dispatch on de: anything other than 2 goes to the weak filter at l1.968.
 // For de == 2 and flag q != 0 this section stores the strong-filter results
 // for pu1_src[0], [1] and [2] on all four rows.

 //x0=source address
 //x1=stride
 // x2 =de
 // x4=flag p
 //x5= flag q
 //x6 =tc
 // x9 =dep
 // x10=deq
 //    b    l1.964


     cmp         x2,#2
 // x2 has the value of de
     bne         l1.968

     cmp         x5,#0
     beq         l1.780
 // x5 has the flag of q

     add         x3,x0,#2
     st1         {v22.b}[0],[x3],x1          // new pu1_src[2], rows 0..3, one byte per row

     st1         {v22.b}[1],[x3],x1

     st1         {v22.b}[2],[x3],x1

     st1         {v22.b}[3],[x3]
     add         x3,x0,x1
     mov         v29.8b,v20.8b
     trn1        v20.8b,v29.8b,v21.8b        // interleave new src[0] (v20) and src[1] (v21)
     trn2        v21.8b,v29.8b,v21.8b        // so each halfword is one row's {src[0], src[1]}

     st1         {v20.h}[0],[x0]             // row 0
     st1         {v21.h}[0],[x3],x1          // row 1
     st1         {v20.h}[1],[x3],x1          // row 2
     st1         {v21.h}[1],[x3]             // row 3

 l1.780:
 // Strong filter, p side. Skipped when flag p (x4) is zero. Expects, from the
 // strong-filter decision code:
 //   v0  = src[-2] + src[-1] + src[0] (halfwords)
 //   v26 = new src[-1], already clipped
 //   v27/v28 = src[-2] +/- 2*tc, v30/v31 = src[-3] +/- 2*tc
 // Falls through into the epilogue at l1.964.
     cmp         x4,#0
     beq         l1.964
     // x4 has the flag p


     dup         v7.2s, v24.s[0]             // v7 = src[-4] column
     sub         x3,x0,#1
     uaddw       v16.8h,  v0.8h ,  v6.8b     // v16 = src[-3] + src[-2] + src[-1] + src[0]
     add         x7,x3,x1
     rshrn       v2.8b,v16.8h,#2             // new src[-2] before clipping
     st1         {v26.b}[0],[x3]             // new src[-1], row 0
     sub         x0,x0,#3                    // x0 -> pu1_src[-3] of row 0
     umin        v16.8b,  v2.8b ,  v27.8b
     st1         {v26.b}[1],[x7],x1          // new src[-1], row 1
     umull       v2.8h, v6.8b, v23.8b        // v2 = 3 * src[-3]
     umlal       v2.8h, v7.8b, v18.8b        // v2 += 2 * src[-4]
     st1         {v26.b}[2],[x7],x1          // new src[-1], row 2
     umax        v5.8b,  v16.8b ,  v28.8b    // v5 = new src[-2], clipped
     st1         {v26.b}[3],[x7]             // new src[-1], row 3
     add         v0.8h,  v2.8h ,  v0.8h
     rshrn       v0.8b,v0.8h,#3              // new src[-3] before clipping


     umin        v1.8b,  v0.8b ,  v30.8b
     umax        v0.8b,  v1.8b ,  v31.8b     // v0 = new src[-3], clipped

     mov         v29.8b,v0.8b
     trn1        v0.8b,v29.8b,v5.8b          // pair {src[-3], src[-2]} per row
     trn2        v5.8b,v29.8b,v5.8b
     st1         {v0.h}[0],[x0],x1           // row 0
     st1         {v5.h}[0],[x0],x1           // row 1
     st1         {v0.h}[1],[x0],x1           // row 2
     st1         {v5.h}[1],[x0]              // row 3
 l1.964:
 // Common exit: restore the registers pushed at entry, in reverse order, and
 // return.
     ldp         x21, x22,[sp],#16
     ldp         x19, x20,[sp],#16
     ldp         d14,d15,[sp],#16
     ldp         d12,d13,[sp],#16
     ldp         d10,d11,[sp],#16
     ldp         d8,d9,[sp],#16
     ret

 l1.968:
 // Weak filter. Computes delta, its absolute value (v0), 10*tc (v1) and the
 // candidate values tmp_p0 (v22) / tmp_q0 (v23); then, if flag p is set and
 // dep == 1, the filtered pu1_src[-2].
 // Column registers: v6 = src[-3], v3 = src[-2], v2 = src[-1], v4 = src[0],
 // v5 = src[1], v7 = src[2].


     movi        v0.8h, #0x9
     neg         x11, x6                     // x11 = -tc
     cmp         x4,#0                       // flags are consumed by "beq l1.1272" below;
                                             // nothing in between writes the condition flags
     // checks for the flag p
     movi        v16.8h, #0x3
     movi        v24.8b, #0x1


     dup         v30.8b,w11                  // v30 = -tc in every byte
     and         x11,x6,#0xff
     dup         v31.8b,w11                  // v31 = tc in every byte

     usubl       v18.8h,v4.8b,v2.8b          // src[0] - src[-1]
     mul         v18.8h, v18.8h, v0.8h       // * 9
     usubl       v0.8h,v5.8b,v3.8b           // src[1] - src[-2]


     mul         v16.8h, v0.8h, v16.8h       // * 3
     sub         v16.8h,  v18.8h ,  v16.8h
     srshr       v16.8h,v16.8h,#4            // rounding shift supplies the "+ 8"
 //   delta = ( 9 * (pu1_src[0] - pu1_src[-1]) - 3 * (pu1_src[1] - pu1_src[-2]) + 8 ) >> 4//

     abs         v0.8h, v16.8h
     xtn         v0.8b,  v0.8h
     // storing the absolute values of delta in d0

     sqxtn       v16.8b,v16.8h
     // storing the clipped values of delta in d16

     movi        v1.8b, #0xa
     dup         v21.8b,w11
     mul         v1.8b, v1.8b, v21.8b
     // d1 stores the value (10 * tc)

 //if(abs(delta) < 10 * tc)

     smin        v18.8b,  v16.8b ,  v31.8b
     smax        v20.8b,  v18.8b ,  v30.8b   // v20 = clipped delta

 // delta = clip3(delta, -tc, tc)//
     sxtl        v16.8h, v20.8b
     uxtl        v18.8h, v2.8b
     add         v18.8h,  v18.8h ,  v16.8h

     sqxtun      v22.8b, v18.8h              // v22 = tmp_p0
     uxtl        v18.8h, v4.8b
     sub         v16.8h,  v18.8h ,  v16.8h
     sqxtun      v23.8b, v16.8h              // v23 = tmp_q0
 // tmp_p0 = clip_u8(pu1_src[-1] + delta)//
 //  tmp_q0 = clip_u8(pu1_src[0] - delta)//
     beq         l1.1272                     // flag p == 0: skip the p side


     cmp         x9,#1
     bne         l1.1212
 // checks for the flag dep

     asr         x3,x6,#1                    // x3 = tc >> 1


     uaddl       v16.8h,v6.8b,v2.8b          // src[-3] + src[-1]
     uaddw       v16.8h,  v16.8h ,  v24.8b   // + 1
     dup         v18.8b,w3                   // v18 = tc >> 1
     sub         x20,x3,#0
     neg         x3, x20
     dup         v19.8b,w3                   // v19 = -(tc >> 1)
     ushr        v16.8h,v16.8h,#1
     xtn         v16.8b,  v16.8h

     usubl       v16.8h,v16.8b,v3.8b         // - src[-2]
     saddw       v16.8h,  v16.8h ,  v20.8b   // + delta
     sshr        v16.8h,v16.8h,#1
     sqxtn       v16.8b,v16.8h

     smin        v17.8b,  v16.8b ,  v18.8b
     smax        v16.8b,  v19.8b ,  v17.8b   // clipped to [-(tc >> 1), tc >> 1]


     uxtl        v18.8h, v3.8b
     sxtl        v16.8h, v16.8b
     add         v16.8h,  v18.8h ,  v16.8h

     sqxtun      v16.8b, v16.8h              // v16 = filtered src[-2]
     mov         v30.8b,v3.8b                // keep the unfiltered src[-2]
     cmhs        v3.8b,v0.8b,v1.8b           // mask: abs(delta) >= 10 * tc


     bsl         v3.8b,v30.8b,v16.8b         // mask set -> unfiltered, clear -> filtered
 l1.1212:
 // Weak filter, p-side stores. Writes {pu1_src[-3], pu1_src[-2]} as one
 // halfword per row (v6 holds src[-3] unchanged, v3 holds src[-2]) and
 // pu1_src[-1] as one byte per row.
     dup         v16.8b,w11                  // v16 is overwritten by the cmhs below before it is read
     sub         x12,x0,#3
     sub         x3,x0,#1
 //     smul v16.8b, v16.8b, v1.8b
     mov         v29.8b,v6.8b
     trn1        v6.8b,v29.8b,v3.8b
     trn2        v3.8b,v29.8b,v3.8b
     st1         {v6.h}[0],[x12],x1          // row 0
     cmhs        v16.8b,v0.8b,v1.8b          // mask: abs(delta) >= 10 * tc
     st1         {v3.h}[0],[x12],x1          // row 1
     bsl         v16.8b,v2.8b,v22.8b         // mask set -> original src[-1], clear -> tmp_p0
     st1         {v16.b}[0],[x3],x1
     st1         {v16.b}[1],[x3],x1
     st1         {v6.h}[1],[x12],x1          // row 2
     st1         {v16.b}[2],[x3],x1
     st1         {v3.h}[1],[x12]             // row 3
     st1         {v16.b}[3],[x3]
 l1.1272:
 // Weak filter, q side. Leaves through the common exit when flag q (x5) is
 // zero. When deq == 1, computes the filtered pu1_src[1] into v5:
 //   ((((src[2] + src[0] + 1) >> 1) - src[1] - delta) >> 1), clipped to
 //   [-(tc >> 1), tc >> 1] and added to src[1]
 // where delta is the clipped value held in v20.
     cmp         x5,#0
     beq         l1.964
     // checks for the flag q
     cmp         x10,#1
     bne         l1.1412
     // checks for the flag deq
     mov         v2.8b,v7.8b                 // v2 = src[2]
     asr         x3,x6,#1                    // x3 = tc >> 1

     dup         v6.8b,w3                    // v6 = tc >> 1
     sub         x20,x3,#0
     neg         x3, x20
     dup         v16.8b,w3                   // v16 = -(tc >> 1)
     uaddl       v2.8h,v2.8b,v4.8b           // src[2] + src[0]
     uaddw       v2.8h,  v2.8h ,  v24.8b     // + 1
     ushr        v2.8h,v2.8h,#1
     xtn         v2.8b,  v2.8h

     usubl       v2.8h,v2.8b,v5.8b           // - src[1]
     ssubw       v2.8h,  v2.8h ,  v20.8b     // - delta
     sshr        v2.8h,v2.8h,#1
     sqxtn       v3.8b,v2.8h

     smin        v2.8b,  v3.8b ,  v6.8b
     smax        v3.8b,  v16.8b ,  v2.8b     // clipped to [-(tc >> 1), tc >> 1]
     //  dup  v6.8b,w2
     //   smul v6.8b, v6.8b, v1.8b


     uxtl        v16.8h, v5.8b
     sxtl        v2.8h, v3.8b
     add         v2.8h,  v16.8h ,  v2.8h
     sqxtun      v3.8b, v2.8h                // v3 = filtered src[1]
     mov         v30.8b,v5.8b                // keep the unfiltered src[1]
     cmhs        v5.8b,v0.8b,v1.8b           // mask: abs(delta) >= 10 * tc


     bsl         v5.8b,v30.8b,v3.8b          // mask set -> unfiltered, clear -> filtered
 l1.1412:
 // Weak filter, q-side stores, followed by the function epilogue.
 // v7 (pu1_src[2]) is not modified on this path, so the byte stores at
 // pu1_src[2] write the loaded values back.
     //  dup  v2.8b,w2
     add         x3,x0,#2
     add         x11,x3,x1
     //   smul v1.8b, v2.8b, v1.8b
     st1         {v7.b}[0],[x3]
     st1         {v7.b}[1],[x11],x1
     st1         {v7.b}[2],[x11],x1
     cmhs        v0.8b,v0.8b,v1.8b           // mask: abs(delta) >= 10 * tc
     st1         {v7.b}[3],[x11]
     bsl         v0.8b,v4.8b,v23.8b          // mask set -> original src[0], clear -> tmp_q0
     mov         v29.8b,v0.8b
     trn1        v0.8b,v29.8b,v5.8b          // pair {src[0], src[1]} per row
     trn2        v5.8b,v29.8b,v5.8b
     st1         {v0.h}[0],[x0],x1           // row 0
     st1         {v5.h}[0],[x0],x1           // row 1
     st1         {v0.h}[1],[x0],x1           // row 2
     st1         {v5.h}[1],[x0]              // row 3

     // restore the registers pushed at entry, in reverse order
     ldp         x21, x22,[sp],#16
     ldp         x19, x20,[sp],#16
     ldp         d14,d15,[sp],#16
     ldp         d12,d13,[sp],#16
     ldp         d10,d11,[sp],#16
     ldp         d8,d9,[sp],#16
     ret
	///*****************************************************************************
	//*
	//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
	//*
	//* Licensed under the Apache License, Version 2.0 (the "License");
	//* you may not use this file except in compliance with the License.
	//* You may obtain a copy of the License at:
	//*
	//* http://www.apache.org/licenses/LICENSE-2.0
	//*
	//* Unless required by applicable law or agreed to in writing, software
	//* distributed under the License is distributed on an "AS IS" BASIS,
	//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	//* See the License for the specific language governing permissions and
	//* limitations under the License.
	//*
	//*****************************************************************************/
	///**
	///*******************************************************************************
	//* //file
	//* ihevc_deblk_luma_vert.s
	//*
	//* //brief
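	//* contains the function definition for luma deblocking of a vertical edge.
	//* the function is coded in armv8 (aarch64) neon assembly
	//* (the original note said it can be compiled using rvct; the directives
	//* used in this file are gnu assembler directives)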
	//*
	//* //author
	//* anand s
	//*
	//* //par list of functions:
	//*
	//*
	//* //remarks
	//* none
	//*
	//*******************************************************************************/

	.text
	.align 4



	.extern gai4_ihevc_tc_table
	.extern gai4_ihevc_beta_table

	.globl ihevc_deblk_luma_vert_av8

	.type ihevc_deblk_luma_vert_av8, %function

	ihevc_deblk_luma_vert_av8:
	// Luma deblocking of a vertical edge over 4 rows (AArch64 NEON).
	//
	// Registers on entry, as consumed by the code in this file. Argument names
	// come from the pseudo-code comments further down; the C prototype is not
	// visible here -- TODO confirm against the header:
	//   x0     = pu1_src (pu1_src[0] is the first sample on the q side)
	//   w1     = src_strd, added to the row pointer after every row
	//   w2     = bs
	//   w3,w4  = quant_param_p, quant_param_q
	//   w5     = beta_offset_div2
	//   w6     = tc_offset_div2
	//   w7     = filter flag for the p side
	//   [sp]   = filter flag for the q side (loaded as a 32-bit word)
	//
	// All of these are later consumed through their 64-bit X views (index
	// clipping, post-indexed addressing, flag compares). AAPCS64 leaves the
	// upper 32 bits of a register that carries a 32-bit argument unspecified,
	// so each one is widened here before it is used.
	// NOTE(review): this assumes w1-w4 and w7 are 32-bit ints, like w5/w6 and
	// the stack flag already are in the original code -- confirm.
	//
	// This section also computes qp_luma and the clipped beta table index (x7).

	sxtw x1,w1 // stride is used as a 64-bit post-index
	sxtw x2,w2
	sxtw x3,w3
	sxtw x4,w4
	sxtw x5,w5
	sxtw x6,w6
	stp d8,d9,[sp,#-16]!
	stp d10,d11,[sp,#-16]!
	stp d12,d13,[sp,#-16]!
	stp d14,d15,[sp,#-16]!
	stp x19, x20,[sp,#-16]!
	stp x21, x22,[sp,#-16]!
	mov w21,w7 // x21 = flag p, zero-extended like the ldr below
	ldr w22,[sp,#96] // x22 = flag q; 96 = the six 16-byte pairs pushed above
	add x3,x3,x4
	add x3,x3,#1
	asr x3,x3,#1 // x3 = qp_luma
	add x7,x3,x5,lsl #1 // x7 = qp_luma + (beta_offset_div2 << 1)
	add x3,x3,x6,lsl #1 // x3 = qp_luma + (tc_offset_div2 << 1)
	cmp x7,#0x33
	mov x20,#0x33
	csel x7, x20, x7,gt // clip to 51 from above
	bgt l1.56
	cmp x7,#0x0
	mov x20,#0x0
	csel x7, x20, x7,lt // x7 has the beta_index value
	l1.56:
	// Finish the tc table index: add 2 * (bs >> 1) to the partial sum in x3 and
	// clip the result to [0, 53]. x2 is read as a 64-bit value here.

	// bic x2,x2,#1
	asr x2,x2,#1 // x2 = bs >> 1

	add x3,x3,x2,lsl #1
	cmp x3,#0x35
	mov x20,#0x35
	csel x3, x20, x3,gt // clip to 53 from above
	bgt l1.88
	cmp x3,#0x0
	mov x20,#0x0
	csel x3, x20, x3,lt // x3 has the tc_index value

	// qp_luma = (quant_param_p + quant_param_q + 1) >> 1//
	// beta_indx = clip3(qp_luma + (beta_offset_div2 << 1), 0, 51)//
	// tc_indx = clip3(qp_luma + (2 * (bs >> 1)) + (tc_offset_div2 << 1), 0, 53)//

	l1.88:
	// Look up beta and tc, load the 4-row x 8-column window pu1_src[-4..3],
	// compute the activity measures d0/d3/dp/dq/d, and evaluate the strong
	// filter conditions on rows 0 and 3. Scalar decision code is interleaved
	// with NEON code that precomputes the strong-filter outputs.
	adrp x2, :got:gai4_ihevc_beta_table
	ldr x2, [x2, #:got_lo12:gai4_ihevc_beta_table] // x2 = beta table address (from the GOT)

	movi v18.8b, #0x2 // v18 = 2 in every byte
	adrp x4, :got:gai4_ihevc_tc_table
	ldr x4, [x4, #:got_lo12:gai4_ihevc_tc_table] // x4 = tc table address (from the GOT)

	ldr w5,[x2,x7,lsl #2] // beta
	movi v16.8h, #0x2 // v16 = 2 in every halfword (mla multiplier)
	ldr w6,[x4,x3,lsl #2] // tc
	lsl x8,x6,#1 // x8 = 2 * tc
	cmp x6,#0
	dup v19.8b,w8 // v19 = 2 * tc in every byte
	sub x7,x0,#4 // x7 -> pu1_src[-4] of row 0
	movi v23.8b, #0x3 // v23 = 3 in every byte
	beq l1.964 // tc == 0: leave without filtering


	sub x19,x0,#3
	ld1 {v15.8b},[x7],x1 // row 0, pu1_src[-4..3]
	ldrb w8,[x19] // -3 value
	ld1 {v1.8b},[x7],x1 // row 1
	ldrb w10,[x19,#1] //-2 value
	ld1 {v29.8b},[x7],x1 // row 2
	ldrb w11,[x19,#2] //-1 value
	ld1 {v0.8b},[x7] // row 3
	ldrb w12,[x0,#0] // 0 value
	ldrb w9,[x0,#1] // 1 value
	trn1 v24.8b,v15.8b,v1.8b
	trn2 v1.8b,v15.8b,v1.8b
	ldrb w2,[x0,#2] // 2 value
	trn1 v2.8b,v29.8b,v0.8b
	trn2 v0.8b,v29.8b,v0.8b
	add x12,x12,x2
	subs x9,x12,x9,lsl #1 // dq0 value is stored in x9
	csneg x9,x9,x9,pl // absolute value
	//dq0 = abs( pu1_src[2] - 2 * pu1_src[1] + pu1_src[0] )//
	mov v29.8b,v24.8b
	trn1 v24.4h,v29.4h,v2.4h
	trn2 v2.4h,v29.4h,v2.4h
	add x8,x8,x11
	mov v15.8b,v1.8b
	trn1 v1.4h,v15.4h,v0.4h
	trn2 v0.4h,v15.4h,v0.4h
	subs x8,x8,x10,lsl #1
	csneg x8,x8,x8,pl // dp0 value is stored in x8
	// dp0 = abs( pu1_src[-3] - 2 * pu1_src[-2] + pu1_src[-1] )//

	// After the byte and halfword transposes each register holds two columns of
	// the window (4 rows each, low word | high word):
	//   v24 = [src[-4] | src[0]]    v1 = [src[-3] | src[1]]
	//   v2  = [src[-2] | src[2]]    v0 = [src[-1] | src[3]]
	// The dup instructions below give each column its own register:
	//   v6 = src[-3], v3 = src[-2], v2 = src[-1], v4 = src[0], v5 = src[1],
	//   v7 = src[2], v22 = src[3]

	add x14,x1,x1,lsl #1
	add x14,x0,x14 // x14 -> pu1_src[0] of row 3

	sub x19,x14,#3
	dup v4.2s, v24.s[1]
	ldrb w2,[x19] // -3 value
	dup v7.2s, v2.s[1]
	ldrb w10,[x19,#1] // -2 value
	dup v3.2s, v2.s[0]
	ldrb w11,[x19,#2] // -1 value
	dup v5.2s, v1.s[1]
	ldrb w12,[x14,#0] // 0 value
	dup v6.2s, v1.s[0]
	ldrb w3,[x14,#1] // 1 value
	dup v2.2s, v0.s[0]
	ldrb w4,[x14,#2] // 2 value


	add x12,x12,x4
	subs x12,x12,x3,lsl #1 // dq3value is stored in x12
	csneg x12,x12,x12,pl
	// dq3 = abs( pu1_src[3 * src_strd + 2] - 2 * pu1_src[3 * src_strd + 1]+ pu1_src[3 * src_strd + 0] )//


	add x2,x2,x11
	subs x11,x2,x10,lsl #1
	csneg x11,x11,x11,pl // dp3 value is stored in x11
	// dp3 = abs( pu1_src[3 * src_strd - 3] - 2 * pu1_src[3 * src_strd - 2] + pu1_src[3 * src_strd - 1] )//



	add x3,x8,x9 // x3 has the d0 value
	add x4,x11,x12 // x4 has the d3 value


	// d0 = dp0 + dq0//
	// d3 = dp3 + dq3//

	add x14,x8,x11 // x14 has the value dp
	add x12,x12,x9 // x12 has the value dq
	// dp = dp0 + dp3//
	// dq = dq0 + dq3//

	add x11, x3, x4 // x11 has the value d

	// d = d0 + d3//


	cmp x11,x5
	dup v22.2s, v0.s[1]
	bge l1.964 // d >= beta: leave without filtering

	// if(d < beta)


	// registers which cannot be altered : x3,x4 x5,x6,x12,x14,x0,x1,x11

	// registers for use: x2,x7,x8,x9,x10,
	uqsub v30.8b,v7.8b,v19.8b // v30 = src[2] - 2*tc (saturating)
	asr x10,x5,#2
	uqadd v31.8b,v7.8b,v19.8b // v31 = src[2] + 2*tc (saturating)
	cmp x10,x3,lsl #1
	uaddl v0.8h,v5.8b,v4.8b // v0 = src[1] + src[0]
	ble l1.336 // (beta >> 2) <= 2 * d0: weak filter

	sub x19,x0,4
	ldrb w2,[x19] // row 0, -4 value
	uaddw v0.8h, v0.8h , v2.8b // v0 = src[1] + src[0] + src[-1]
	ldrb w7,[x19,#3] // row 0, -1 value
	umull v20.8h, v7.8b, v23.8b // v20 = 3 * src[2]
	ldrb w3,[x0,#0] // row 0, 0 value
	umlal v20.8h, v22.8b, v18.8b // v20 += 2 * src[3]
	ldrb w8,[x0,#3] // row 0, 3 value
	// ubfx x7,x2,#24,#8 // has the -1 value
	// and x2,#0xff // has the -4 value
	// ubfx x8,x3,#24,#8 // has the 3 value
	// and x3,#0xff // x4 has the 0 value

	add v20.8h, v20.8h , v0.8h
	subs x8,x8,x3
	rshrn v22.8b,v20.8h,#3 // v22 = (2*src[3] + 3*src[2] + src[1] + src[0] + src[-1] + 4) >> 3
	csneg x8,x8,x8,pl // x8 = abs(src[3] - src[0])
	subs x2,x2,x7
	umin v21.8b, v22.8b , v31.8b
	csneg x2,x2,x2,pl // x2 = abs(src[-4] - src[-1])
	umax v22.8b, v21.8b , v30.8b // v22 = new src[2], clipped to src[2] +/- 2*tc
	add x8,x8,x2
	uaddl v20.8h,v7.8b,v3.8b // v20 = src[2] + src[-2]
	cmp x8,x5,asr #3
	mla v20.8h, v0.8h, v16.8h // v20 += 2 * (src[1] + src[0] + src[-1])
	bge l1.336 // sum of the two abs terms >= (beta >> 3): weak filter
	uaddw v0.8h, v0.8h , v7.8b // v0 = src[-1] + src[0] + src[1] + src[2]
	subs x7,x3,x7
	rshrn v20.8b,v20.8h,#3 // v20 = new src[0] before clipping
	csneg x7,x7,x7,pl // x7 = abs(src[0] - src[-1])
	rshrn v0.8b,v0.8h,#2 // v0 = new src[1] before clipping
	mov x10,#5
	uqadd v30.8b,v5.8b,v19.8b // v30 = src[1] + 2*tc
	mul x10, x10, x6
	uqsub v31.8b,v5.8b,v19.8b // v31 = src[1] - 2*tc
	add x10, x10,#1
	cmp x7,x10,asr #1
	bge l1.336 // abs(src[0] - src[-1]) >= (5*tc + 1) >> 1: weak filter


	// if( (2 * d3 < (beta >> 2)&& ( abs(pu1_src[3] - pu1_src[0]) + abs(pu1_src[-1] - pu1_src[-4]) < (beta >> 3) )
	// && abs(pu1_src[0] - pu1_src[-1]) < ( (5 * tc + 1) >> 1 ) )

	// The same three conditions are now checked on row 3 (using d3).

	asr x10,x5,#2
	uqsub v25.8b,v4.8b,v19.8b // v25 = src[0] - 2*tc
	cmp x10,x4,lsl #1
	uqadd v21.8b,v4.8b,v19.8b // v21 = src[0] + 2*tc
	ble l1.336
	umin v26.8b, v20.8b , v21.8b
	add x4,x1,x1,lsl #1
	add x4,x4,x0 // x4 -> pu1_src[0] of row 3
	umax v20.8b, v26.8b , v25.8b // v20 = new src[0], clipped
	sub x19,x4,#4
	ldrb w2,[x19] // row 3, -4 value
	umin v19.8b, v0.8b , v30.8b // overwrites v19; 2*tc is rebuilt below
	ldrb w7,[x19,#3] // row 3, -1 value
	umax v21.8b, v19.8b , v31.8b // v21 = new src[1], clipped
	ldrb w3,[x4,#0] // row 3, 0 value
	lsl x10,x6,#1
	ldrb w8,[x4,#3] // row 3, 3 value
	// ubfx x7,x2,#24,#8 // has the -1 value
	// and x2,#0xff // has the -4 value
	// ubfx x8,x3,#24,#8 // has the 3 value
	// and x3,#0xff // x4 has the 0 value
	uaddl v0.8h,v2.8b,v3.8b // v0 = src[-1] + src[-2]
	dup v19.8b,w10 // v19 = 2 * tc again
	subs x8,x8,x3
	uaddw v0.8h, v0.8h , v4.8b // v0 = src[-2] + src[-1] + src[0]
	csneg x8,x8,x8,pl
	uqadd v30.8b,v2.8b,v19.8b // v30 = src[-1] + 2*tc
	subs x2,x2,x7
	uqsub v31.8b,v2.8b,v19.8b // v31 = src[-1] - 2*tc
	csneg x2,x2,x2,pl
	uaddl v26.8h,v5.8b,v6.8b // v26 = src[1] + src[-3]
	add x8,x8,x2
	mla v26.8h, v0.8h, v16.8h // v26 += 2 * (src[-2] + src[-1] + src[0])
	cmp x8,x5,asr #3
	bge l1.336
	rshrn v26.8b,v26.8h,#3 // v26 = new src[-1] before clipping
	subs x7,x3,x7
	uqadd v27.8b,v3.8b,v19.8b // v27 = src[-2] + 2*tc
	csneg x7,x7,x7,pl
	uqsub v28.8b,v3.8b,v19.8b // v28 = src[-2] - 2*tc
	mov x10,#5
	umin v16.8b, v26.8b , v30.8b // overwrites the constant in v16
	mul x10, x10, x6
	add x10, x10,#1
	cmp x7,x10,asr #1
	umax v26.8b, v16.8b , v31.8b // v26 = new src[-1], clipped
	bge l1.336
	uqadd v30.8b,v6.8b,v19.8b // v30 = src[-3] + 2*tc

	mov x2,#2 // de = 2: strong filter
	mov x4,x21 // x4 = flag p saved at entry
	uqsub v31.8b,v6.8b,v19.8b // v31 = src[-3] - 2*tc
	mov x5,x22 // x5 = flag q saved at entry
	b end_dep_deq_decision
	// State expected at l1.336 (which then sets x2 = de):
	// x6 has the value of tc
	// x5 has the value of beta
	// x14 has the value of dp
	// x12 has the value of dq
	// x0 has the value of source address
	// x1 has the src stride

	l1.336:
	// Weak-filter path: set de = 1, reload the two filter flags and decide dep
	// and deq (whether the second sample on each side is also modified).
	mov x2,#1
	l1.424:
	mov x11,x5 // x11 = beta (x5 is reused for flag q)
	mov x4,x21 // x4 = flag p saved at entry
	mov x5,x22 // x5 = flag q saved at entry

	cmp x6,#1
	mov x20,#0
	csel x9, x20, x9,eq // tc == 1: dep = 0
	mov x20,#0
	csel x10, x20, x10,eq // tc == 1: deq = 0
	beq end_dep_deq_decision

	and x7,x4,x5

	cmp x7,#1
	beq both_flags_set // (flag p & flag q) == 1
	cmp x4,#0
	beq set_flag_dep_zero // flag p == 0


	// Reached when flag p != 0 and the two flags do not AND to 1:
	// deq = 0, dep = (((beta + (beta >> 1)) >> 3) > dp) ? 1 : 0
	add x8,x11,x11,asr #1
	mov x10,#0
	asr x8,x8,#3
	cmp x8,x14
	mov x20,#1
	csel x9, x20, x9,gt
	mov x20,#0
	csel x9, x20, x9,le
	b end_dep_deq_decision
	set_flag_dep_zero:
	// Reached when flag p (x4) is zero:
	// dep = 0, deq = (((beta + (beta >> 1)) >> 3) > dq) ? 1 : 0
	// x11 = beta, x12 = dq.

	add x8,x11,x11,asr #1
	mov x9,#0
	asr x8,x8,#3
	cmp x8,x12
	mov x20,#1
	csel x10, x20, x10,gt
	mov x20,#0
	csel x10, x20, x10,le
	b end_dep_deq_decision

	both_flags_set:
	// Reached when (flag p & flag q) == 1. Both side decisions use the same
	// threshold x8 = (beta + (beta >> 1)) >> 3:
	//   dep (x9)  = (x8 > dp) ? 1 : 0     with dp in x14
	//   deq (x10) = (x8 > dq) ? 1 : 0     with dq in x12
	// Falls through into end_dep_deq_decision.
	add x8,x11,x11,asr #1
	asr x8,x8,#3
	cmp x8,x14
	mov x20,#1
	csel x9, x20, x9,gt
	mov x20,#0
	csel x9, x20, x9,le
	cmp x8,x12
	mov x20,#1
	csel x10, x20, x10,gt
	mov x20,#0
	csel x10, x20, x10,le
	end_dep_deq_decision:
	// Dispatch on de: anything other than 2 goes to the weak filter at l1.968.
	// For de == 2 and flag q != 0 this section stores the strong-filter results
	// for pu1_src[0], [1] and [2] on all four rows.

	//x0=source address
	//x1=stride
	// x2 =de
	// x4=flag p
	//x5= flag q
	//x6 =tc
	// x9 =dep
	// x10=deq
	// b l1.964


	cmp x2,#2
	// x2 has the value of de
	bne l1.968

	cmp x5,#0
	beq l1.780
	// x5 has the flag of q

	add x3,x0,#2
	st1 {v22.b}[0],[x3],x1 // new pu1_src[2], rows 0..3, one byte per row

	st1 {v22.b}[1],[x3],x1

	st1 {v22.b}[2],[x3],x1

	st1 {v22.b}[3],[x3]
	add x3,x0,x1
	mov v29.8b,v20.8b
	trn1 v20.8b,v29.8b,v21.8b // interleave new src[0] (v20) and src[1] (v21)
	trn2 v21.8b,v29.8b,v21.8b // so each halfword is one row's {src[0], src[1]}

	st1 {v20.h}[0],[x0] // row 0
	st1 {v21.h}[0],[x3],x1 // row 1
	st1 {v20.h}[1],[x3],x1 // row 2
	st1 {v21.h}[1],[x3] // row 3


	l1.780:
	// Strong filter, p side. Skipped when flag p (x4) is zero. Expects, from the
	// strong-filter decision code:
	//   v0  = src[-2] + src[-1] + src[0] (halfwords)
	//   v26 = new src[-1], already clipped
	//   v27/v28 = src[-2] +/- 2*tc, v30/v31 = src[-3] +/- 2*tc
	// Falls through into the epilogue at l1.964.
	cmp x4,#0
	beq l1.964
	// x4 has the flag p


	dup v7.2s, v24.s[0] // v7 = src[-4] column
	sub x3,x0,#1
	uaddw v16.8h, v0.8h , v6.8b // v16 = src[-3] + src[-2] + src[-1] + src[0]
	add x7,x3,x1
	rshrn v2.8b,v16.8h,#2 // new src[-2] before clipping
	st1 {v26.b}[0],[x3] // new src[-1], row 0
	sub x0,x0,#3 // x0 -> pu1_src[-3] of row 0
	umin v16.8b, v2.8b , v27.8b
	st1 {v26.b}[1],[x7],x1 // new src[-1], row 1
	umull v2.8h, v6.8b, v23.8b // v2 = 3 * src[-3]
	umlal v2.8h, v7.8b, v18.8b // v2 += 2 * src[-4]
	st1 {v26.b}[2],[x7],x1 // new src[-1], row 2
	umax v5.8b, v16.8b , v28.8b // v5 = new src[-2], clipped
	st1 {v26.b}[3],[x7] // new src[-1], row 3
	add v0.8h, v2.8h , v0.8h
	rshrn v0.8b,v0.8h,#3 // new src[-3] before clipping


	umin v1.8b, v0.8b , v30.8b
	umax v0.8b, v1.8b , v31.8b // v0 = new src[-3], clipped

	mov v29.8b,v0.8b
	trn1 v0.8b,v29.8b,v5.8b // pair {src[-3], src[-2]} per row
	trn2 v5.8b,v29.8b,v5.8b
	st1 {v0.h}[0],[x0],x1 // row 0
	st1 {v5.h}[0],[x0],x1 // row 1
	st1 {v0.h}[1],[x0],x1 // row 2
	st1 {v5.h}[1],[x0] // row 3
	l1.964:
	// Common exit: restore the registers pushed at entry, in reverse order, and
	// return.
	ldp x21, x22,[sp],#16
	ldp x19, x20,[sp],#16
	ldp d14,d15,[sp],#16
	ldp d12,d13,[sp],#16
	ldp d10,d11,[sp],#16
	ldp d8,d9,[sp],#16
	ret

	l1.968:
	// Weak filter. Computes delta, its absolute value (v0), 10*tc (v1) and the
	// candidate values tmp_p0 (v22) / tmp_q0 (v23); then, if flag p is set and
	// dep == 1, the filtered pu1_src[-2].
	// Column registers: v6 = src[-3], v3 = src[-2], v2 = src[-1], v4 = src[0],
	// v5 = src[1], v7 = src[2].


	movi v0.8h, #0x9
	neg x11, x6 // x11 = -tc
	cmp x4,#0 // flags are consumed by "beq l1.1272" below; nothing in between writes them
	// checks for the flag p
	movi v16.8h, #0x3
	movi v24.8b, #0x1


	dup v30.8b,w11 // v30 = -tc in every byte
	and x11,x6,#0xff
	dup v31.8b,w11 // v31 = tc in every byte

	usubl v18.8h,v4.8b,v2.8b // src[0] - src[-1]
	mul v18.8h, v18.8h, v0.8h // * 9
	usubl v0.8h,v5.8b,v3.8b // src[1] - src[-2]



	mul v16.8h, v0.8h, v16.8h // * 3
	sub v16.8h, v18.8h , v16.8h
	srshr v16.8h,v16.8h,#4 // rounding shift supplies the "+ 8"
	// delta = ( 9 * (pu1_src[0] - pu1_src[-1]) - 3 * (pu1_src[1] - pu1_src[-2]) + 8 ) >> 4//

	abs v0.8h, v16.8h
	xtn v0.8b, v0.8h
	// storing the absolute values of delta in d0

	sqxtn v16.8b,v16.8h
	// storing the clipped values of delta in d16

	movi v1.8b, #0xa
	dup v21.8b,w11
	mul v1.8b, v1.8b, v21.8b
	// d1 stores the value (10 * tc)

	//if(abs(delta) < 10 * tc)

	smin v18.8b, v16.8b , v31.8b
	smax v20.8b, v18.8b , v30.8b // v20 = clipped delta

	// delta = clip3(delta, -tc, tc)//
	sxtl v16.8h, v20.8b
	uxtl v18.8h, v2.8b
	add v18.8h, v18.8h , v16.8h

	sqxtun v22.8b, v18.8h // v22 = tmp_p0
	uxtl v18.8h, v4.8b
	sub v16.8h, v18.8h , v16.8h
	sqxtun v23.8b, v16.8h // v23 = tmp_q0
	// tmp_p0 = clip_u8(pu1_src[-1] + delta)//
	// tmp_q0 = clip_u8(pu1_src[0] - delta)//
	beq l1.1272 // flag p == 0: skip the p side



	cmp x9,#1
	bne l1.1212
	// checks for the flag dep

	asr x3,x6,#1 // x3 = tc >> 1


	uaddl v16.8h,v6.8b,v2.8b // src[-3] + src[-1]
	uaddw v16.8h, v16.8h , v24.8b // + 1
	dup v18.8b,w3 // v18 = tc >> 1
	sub x20,x3,#0
	neg x3, x20
	dup v19.8b,w3 // v19 = -(tc >> 1)
	ushr v16.8h,v16.8h,#1
	xtn v16.8b, v16.8h

	usubl v16.8h,v16.8b,v3.8b // - src[-2]
	saddw v16.8h, v16.8h , v20.8b // + delta
	sshr v16.8h,v16.8h,#1
	sqxtn v16.8b,v16.8h

	smin v17.8b, v16.8b , v18.8b
	smax v16.8b, v19.8b , v17.8b // clipped to [-(tc >> 1), tc >> 1]




	uxtl v18.8h, v3.8b
	sxtl v16.8h, v16.8b
	add v16.8h, v18.8h , v16.8h

	sqxtun v16.8b, v16.8h // v16 = filtered src[-2]
	mov v30.8b,v3.8b // keep the unfiltered src[-2]
	cmhs v3.8b,v0.8b,v1.8b // mask: abs(delta) >= 10 * tc


	bsl v3.8b,v30.8b,v16.8b // mask set -> unfiltered, clear -> filtered
	l1.1212:
	// Weak filter, p-side stores. Writes {pu1_src[-3], pu1_src[-2]} as one
	// halfword per row (v6 holds src[-3] unchanged, v3 holds src[-2]) and
	// pu1_src[-1] as one byte per row.
	dup v16.8b,w11 // v16 is overwritten by the cmhs below before it is read
	sub x12,x0,#3
	sub x3,x0,#1
	// smul v16.8b, v16.8b, v1.8b
	mov v29.8b,v6.8b
	trn1 v6.8b,v29.8b,v3.8b
	trn2 v3.8b,v29.8b,v3.8b
	st1 {v6.h}[0],[x12],x1 // row 0
	cmhs v16.8b,v0.8b,v1.8b // mask: abs(delta) >= 10 * tc
	st1 {v3.h}[0],[x12],x1 // row 1
	bsl v16.8b,v2.8b,v22.8b // mask set -> original src[-1], clear -> tmp_p0
	st1 {v16.b}[0],[x3],x1
	st1 {v16.b}[1],[x3],x1
	st1 {v6.h}[1],[x12],x1 // row 2
	st1 {v16.b}[2],[x3],x1
	st1 {v3.h}[1],[x12] // row 3
	st1 {v16.b}[3],[x3]
	l1.1272:
	// Weak filter, q side. Leaves through the common exit when flag q (x5) is
	// zero. When deq == 1, computes the filtered pu1_src[1] into v5:
	//   ((((src[2] + src[0] + 1) >> 1) - src[1] - delta) >> 1), clipped to
	//   [-(tc >> 1), tc >> 1] and added to src[1]
	// where delta is the clipped value held in v20.
	cmp x5,#0
	beq l1.964
	// checks for the flag q
	cmp x10,#1
	bne l1.1412
	// checks for the flag deq
	mov v2.8b,v7.8b // v2 = src[2]
	asr x3,x6,#1 // x3 = tc >> 1

	dup v6.8b,w3 // v6 = tc >> 1
	sub x20,x3,#0
	neg x3, x20
	dup v16.8b,w3 // v16 = -(tc >> 1)
	uaddl v2.8h,v2.8b,v4.8b // src[2] + src[0]
	uaddw v2.8h, v2.8h , v24.8b // + 1
	ushr v2.8h,v2.8h,#1
	xtn v2.8b, v2.8h

	usubl v2.8h,v2.8b,v5.8b // - src[1]
	ssubw v2.8h, v2.8h , v20.8b // - delta
	sshr v2.8h,v2.8h,#1
	sqxtn v3.8b,v2.8h

	smin v2.8b, v3.8b , v6.8b
	smax v3.8b, v16.8b , v2.8b // clipped to [-(tc >> 1), tc >> 1]
	// dup v6.8b,w2
	// smul v6.8b, v6.8b, v1.8b



	uxtl v16.8h, v5.8b
	sxtl v2.8h, v3.8b
	add v2.8h, v16.8h , v2.8h
	sqxtun v3.8b, v2.8h // v3 = filtered src[1]
	mov v30.8b,v5.8b // keep the unfiltered src[1]
	cmhs v5.8b,v0.8b,v1.8b // mask: abs(delta) >= 10 * tc


	bsl v5.8b,v30.8b,v3.8b // mask set -> unfiltered, clear -> filtered
	l1.1412:
	// Weak filter, q-side stores, followed by the function epilogue.
	// v7 (pu1_src[2]) is not modified on this path, so the byte stores at
	// pu1_src[2] write the loaded values back.
	// dup v2.8b,w2
	add x3,x0,#2
	add x11,x3,x1
	// smul v1.8b, v2.8b, v1.8b
	st1 {v7.b}[0],[x3]
	st1 {v7.b}[1],[x11],x1
	st1 {v7.b}[2],[x11],x1
	cmhs v0.8b,v0.8b,v1.8b // mask: abs(delta) >= 10 * tc
	st1 {v7.b}[3],[x11]
	bsl v0.8b,v4.8b,v23.8b // mask set -> original src[0], clear -> tmp_q0
	mov v29.8b,v0.8b
	trn1 v0.8b,v29.8b,v5.8b // pair {src[0], src[1]} per row
	trn2 v5.8b,v29.8b,v5.8b
	st1 {v0.h}[0],[x0],x1 // row 0
	st1 {v5.h}[0],[x0],x1 // row 1
	st1 {v0.h}[1],[x0],x1 // row 2
	st1 {v5.h}[1],[x0] // row 3

	// restore the registers pushed at entry, in reverse order
	ldp x21, x22,[sp],#16
	ldp x19, x20,[sp],#16
	ldp d14,d15,[sp],#16
	ldp d12,d13,[sp],#16
	ldp d10,d11,[sp],#16
	ldp d8,d9,[sp],#16
	ret