//.include "ihevc_neon_macros.s"
.macro push_v_regs
stp x8, x9, [sp, #-16]!
stp x10, x11, [sp, #-16]!
stp x12, x13, [sp, #-16]!
stp x14, x15, [sp, #-16]!
stp x16, x17, [sp, #-16]!
stp x19, x20, [sp, #-16]!
stp x21, x22, [sp, #-16]!
stp x29, x30, [sp, #-16]!
.endm
.macro pop_v_regs
ldp x29, x30, [sp], #16
ldp x21, x22, [sp], #16
ldp x19, x20, [sp], #16
ldp x16, x17, [sp], #16
ldp x14, x15, [sp], #16
ldp x12, x13, [sp], #16
ldp x10, x11, [sp], #16
ldp x8, x9, [sp], #16
.endm
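
// Note: under AAPCS64, x8-x17 are caller-saved, so spilling them in push_v_regs
// is conservative rather than ABI-required; x19-x22 and x29/x30 are callee-saved
// and must be preserved, which these macros do.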

.text
.p2align 2
.global ixheaacd_scale_factor_process_armv8

ixheaacd_scale_factor_process_armv8:

push_v_regs

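// Argument/register usage, inferred from how the registers are used below
// (descriptive names only, not taken from a header):
//   x0 - x_invquant: 32-bit coefficient buffer, scaled in place
//   x1 - scale factor array (16-bit entries, one per band)
//   x2 - total number of bands
//   x3 - per-band width table (8-bit entries)
//   x4 - scale_table_ptr (copied to x9)
//   x5 - selects the shift bias used below (37 vs 34)
//   x6, x7 - seventh/eighth arguments: saved to x21/x22 but not referenced again here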
MOV x9, x4 // x9 = scale_table_ptr

MOV x21, x6 // saved but not used in this routine
MOV x22, x7
CMP x2, #0 // total bands to process

BGT lbl17 // at least one band: enter the loop

pop_v_regs // nothing to do for t_bands <= 0
ret
lbl17:
MOV x10, #0 // zero pattern stored for low scale factors
CMP x5, #2
BGT ADD_34
MOV x11, #0x25 // temp = 37
B TBANDS_LOOP
ADD_34:
MOV x11, #0x22 // temp = 34
// MOV x11, #0x25 // temp=37

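// Rough C-level sketch of the band loop below (illustrative only; variable
// names are descriptive and not taken from the reference C source):
//   for (band = 0; band < t_bands; band++) {
//       scale_factor = *scale_fact++;
//       width        = *offset++;
//       if (scale_factor < 24) {
//           zero the next 'width' coefficients;          // band effectively silent
//       } else {
//           shift       = temp - (scale_factor >> 2);    // temp = 37 or 34
//           scale_short = scale_table_ptr[scale_factor & 3];
//           each coefficient: x = (x * scale_short) >> 16, then shifted
//           according to the sign of 'shift' (see the three paths below);
//       }
//   }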
TBANDS_LOOP:
LDRSH x5, [x1], #2 // scale_factor = *Scfactor++;
LDRB w4, [x3], #1 // width = *offset++; (band width in coefficients)
sxtw x4, w4

CMP x5, #0x18 // if (scale_factor < 24)
BGE SCALE_FACTOR_GE_12 // branch if scale_factor >= 24 (the *_12 labels test against 24)

CMP x4, #0
BLE OFFSET_ZERO // empty band, nothing to clear

SCALE_FACTOR_LT_12: // zero out the band, four coefficients per iteration

STR x10, [x0], #8 // two 32-bit zeros
STR x10, [x0], #8 // two more
SUBS x4, x4, #4 // width -= 4
BGT SCALE_FACTOR_LT_12
B OFFSET_ZERO

SCALE_FACTOR_GE_12:

SUBS x6, x11, x5, ASR #2 // shift = temp - (scale_factor >> 2), temp = 37 or 34
AND x5, x5, #3 // scale_factor & 0x0003

//ADD x5,x9,x5,LSL #1 ; scale_table_ptr[(scale_factor & 0x0003)];
LDR w5, [x9, x5, LSL #2] // scale_short = scale_table_ptr[(scale_factor & 0x0003)];
sxtw x5, w5
AND w17, w5, #0x0000FFFF
sxth w17, w17 // w17 = sign-extended low 16 bits; SMULL + ASR #16 below replaces the AArch32 SMULWB
BLE SHIFT_LE_ZERO // shift <= 0 (condition flags still set by the SUBS above)

SUB x14, x6, #1 // shift - 1: the extra <<1 of the reference multiply is folded into the right shift

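// Positive-shift path: four coefficients per iteration. Each coefficient is
// multiplied by scale_short (SMULL + ASR #16 standing in for the AArch32 SMULWB),
// then shifted right by (shift - 1); the -1 accounts for the <<1 that the
// reference fixed-point multiply would otherwise apply.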
SHIFT_POSITIVE: // loop over sfbWidth (a multiple of 4), four coefficients per iteration
LDP w6, w7, [x0, #0] // temp1 = *x_invquant
LDP w19, w20, [x0, #8]

//SMULWB x6, x6, x5 // buffex1 = mult32x16in32(temp1, scale_short);
SMULL x6, w6, w17 // 64-bit products: coefficient * scale_short
SMULL x7, w7, w17
SMULL x19, w19, w17
SMULL x20, w20, w17

ASR x6, x6, #16 // >>16 completes mult32x16in32
ASR x7, x7, #16
ASR x19, x19, #16
ASR x20, x20, #16

ASR x6, x6, x14 // buffex1 = shx32(buffex1, shift);
ASR x7, x7, x14
ASR x19, x19, x14
ASR x20, x20, x14

stp w6, w7, [x0], #8 // *x_invquant++ = buffex1;
stp w19, w20, [x0], #8

SUBS x4, x4, #4 // width -= 4

BGT SHIFT_POSITIVE
B OFFSET_ZERO
SHIFT_LE_ZERO:

//RSBS x14, x6, #0 //-shift
NEGS x14, x6 // x14 = -shift
BGT SHIFT_NEGTIVE1 // -shift > 0, i.e. shift < 0

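// Zero-shift path: two coefficients per iteration. Multiply by scale_short,
// take the top 16 bits of the product, then double the result (the LSL #1
// below), apparently the same <<1 that the positive-shift path folds into
// its right shift.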
SHIFT_ZERO: // loop over sfbWidth (a multiple of 4), two coefficients per iteration
LDP w6, w7, [x0, #0] // temp1 = *x_invquant;

//SMULWB x6, x6, x5 // buffex1 = mult32x16in32(temp1, scale_short);
SMULL x6, w6, w17
SMULL x7, w7, w17

ASR x6, x6, #16
ASR x7, x7, #16

LSL x6, x6, #1 // <<1 of the reference fixed-point multiply
LSL x7, x7, #1

STP w6, w7, [x0], #8 // *x_invquant++ = buffex1;

SUBS x4, x4, #2 // width -= 2

BGT SHIFT_ZERO
B OFFSET_ZERO

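// Negative-shift path: each coefficient is pre-shifted left by (-shift - 1)
// before the multiply, and the LSL #2 after the >>16 appears to combine the
// <<1 of the reference fixed-point multiply with the shl32(buffer, 1) named
// in the comment below, i.e. roughly ((temp1 * scale_short) >> 16) << (1 - shift).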
SHIFT_NEGTIVE1:
SUB x14, x14, #1 // x14 = -shift - 1
SHIFT_NEGTIVE: // loop over sfbWidth (a multiple of 4), two coefficients per iteration

LDP w6, w7, [x0, #0] // temp1 = *x_invquant
LSL w6, w6, w14 // buffex1 = shl32(buffex1, shift-1);
LSL w7, w7, w14 // buffex1 = shl32(buffex1, shift-1);

//SMULWB x6, x6, x5 // buffex1 = mult32x16in32(temp1, scale_short);
SMULL x6, w6, w17
SMULL x7, w7, w17
ASR x6, x6, #16
ASR x7, x7, #16

LSL x6, x6, #2 // shl for fixmul_32x16b and shl32(buffer, 1)
LSL x7, x7, #2 // shl for fixmul_32x16b and shl32(buffer, 1)

STP w6, w7, [x0], #8 // *x_invquant++ = buffex1;

SUBS x4, x4, #2 // width -= 2

BGT SHIFT_NEGTIVE

OFFSET_ZERO:
SUBS x2, x2, #1 // next band
BGT TBANDS_LOOP

pop_v_regs
ret