// push_v_regs: save q8-q15, x8-x21 and x29/x30 in a 256-byte stack frame.
// The frame is reserved with a single SP adjustment and filled with
// offset-addressed stores. Slot layout (offsets from the new SP):
//   #0   x29,x30   #16  x20,x21   #32  x18,x19   #48  x16,x17
//   #64  x14,x15   #80  x12,x13   #96  x10,x11   #112 x8,x9
//   #128 q14,q15   #160 q12,q13   #192 q10,q11   #224 q8,q9
// SP moves by a multiple of 16, so its 16-byte alignment is unchanged.
| .macro push_v_regs |
| sub sp, sp, #256 |
| stp q8, q9, [sp, #224] |
| stp q10, q11, [sp, #192] |
| stp q12, q13, [sp, #160] |
| stp q14, q15, [sp, #128] |
| stp x8, x9, [sp, #112] |
| stp x10, x11, [sp, #96] |
| stp x12, x13, [sp, #80] |
| stp x14, x15, [sp, #64] |
| stp x16, x17, [sp, #48] |
| stp x18, x19, [sp, #32] |
| stp x20, x21, [sp, #16] |
| stp x29, x30, [sp] |
| .endm |
| |
// pop_v_regs: reload the registers stored by push_v_regs and release the
// 256-byte frame. Slots are read at fixed offsets from SP, then SP is
// restored with a single add. Slot layout (offsets from the current SP):
//   #0   x29,x30   #16  x20,x21   #32  x18,x19   #48  x16,x17
//   #64  x14,x15   #80  x12,x13   #96  x10,x11   #112 x8,x9
//   #128 q14,q15   #160 q12,q13   #192 q10,q11   #224 q8,q9
| .macro pop_v_regs |
| ldp x29, x30, [sp] |
| ldp x20, x21, [sp, #16] |
| ldp x18, x19, [sp, #32] |
| ldp x16, x17, [sp, #48] |
| ldp x14, x15, [sp, #64] |
| ldp x12, x13, [sp, #80] |
| ldp x10, x11, [sp, #96] |
| ldp x8, x9, [sp, #112] |
| ldp q14, q15, [sp, #128] |
| ldp q12, q13, [sp, #160] |
| ldp q10, q11, [sp, #192] |
| ldp q8, q9, [sp, #224] |
| add sp, sp, #256 |
| .endm |
| |
| .text |
| .p2align 2 |
| |
//-----------------------------------------------------------------------
// ixheaacd_cos_sin_mod_loop2
//
// AArch64 NEON routine that rewrites a buffer of 32-bit words in place.
// Register roles, as read from the code below:
//   In:  x0 = base of the 32-bit word buffer ("subband" in the inline
//             comments)
//        x1 = pointer to 16-bit values, consumed one pair (4 bytes) per
//             butterfly -- presumably a sin/cos twiddle table, TODO confirm
//        x2 = count "M" used by the inline comments
//   Working pointers:
//        x0  walks forwards  from the buffer start
//        x3  walks backwards from x0 + 8*M - 4   (&subband[2*M - 1])
//        x10 walks forwards  from x0 + 256 bytes
//        x11 walks backwards from x10 + 8*M - 4
//        NOTE(review): the +256 offset looks like the start of a second
//        half of the buffer -- confirm against the caller.
//        x8  = -4, register post-index stride for the backward stores
//        x4  = (M >> 1) - 1, loop counter
//   Vector lanes: lane 0 of each .2s/.2d operand carries the x0/x3 side,
//        lane 1 carries the x10/x11 side.
//   Out: no return value is set up; results are written to the buffer.
//   Saved/restored through push_v_regs / pop_v_regs: the code touches
//        x19, x20, v8, v10, v12 and v14, which are callee-saved on AArch64.
//   The LOOP1 body is entered by fall-through, so it always runs at least
//        once, and repeats while the decremented x4 is > 0.
//-----------------------------------------------------------------------
| .global ixheaacd_cos_sin_mod_loop2 |
| ixheaacd_cos_sin_mod_loop2: |
| |
| // STMFD sp!, {x4-x12, x14} |
| push_v_regs |
| //stp x19, x20,[sp,#-16]! |
| //VPUSH {D8-D15} |
| //generating load addresses |
// x3 = x0 + 8*M - 4, x10 = x0 + 256, x11 = x10 + 8*M - 4.
| ADD x3, x0, x2, LSL #3 //psubband1 = &subband[2 * M - 1]; |
| SUB x3, x3, #4 |
| ADD x10, x0, #256 |
| ADD x11, x10, x2, LSL #3 |
| SUB x11, x11, #4 |
// x8 is the -4 byte stride used by the "[xN], x8" post-indexed stores.
| MOV x8, #-4 |
// Clear v0/v1 so every lane other than h[0] is zero before the
// single-lane LD2 below fills h[0].
| MOV w19, #0 |
| DUP V0.4s, w19 |
| DUP V1.4s, w19 |
| |
| LDR w6, [x0] |
| sxtw x6, w6 |
// Loop counter: x4 = (M >> 1) - 1.
| ASR x4, x2, #1 //M_2 = ixheaacd_shx32(M, 1); |
| SUB x4, x4, #1 |
| |
| ASR x6, x6, #1 //*psubband = *psubband >> 1; |
// Capture the word at [x3] in v2.s[0] before that slot is overwritten below.
| LD1 {v2.s}[0], [x3] |
| |
| STR w6, [x0], #4 //psubband++; |
| sxtw x6, w6 |
// [x3] = -([x0] >> 1) using a plain (non-saturating) negate, then x3
// steps back one word.
| LDR w7, [x0] |
| sxtw x7, w7 |
| ASR x7, x7, #1 |
| sub x20, x7, #0 |
| neg x6, x20 |
| STR w6, [x3], #-4 |
| sxtw x6, w6 |
| LD1 {v3.s}[0], [x3] // im = *psubband1; |
| |
// Load one pair of 16-bit coefficients into v0.h[0] / v1.h[0] (x1 += 4),
// sign-extend them to 32 bits and copy lane 0 into both lanes of
// v0.2s / v1.2s.
| LD2 {v0.h, v1.h}[0], [x1], #4 |
| sxtl v0.4s, v0.4h |
| sxtl v1.4s, v1.4h |
| dup v0.2s, v0.s[0] |
| dup v1.2s, v1.s[0] |
| |
// Capture the word at [x11] in v2.s[1] before that slot is overwritten below.
| LD1 {v2.s}[1], [x11] //re = *psubband12; |
| |
| // LDR w6, [x10] |
| // sxtw x6,w6 |
| // ASR x7, x6, #1 |
| // MOV x9, #0 |
| // QSUB x7, x9, x7 |
// [x11] = saturating 0 - ([x10] >> 1); only lane 0 of v4 is stored.
| LD1 {v4.s}[0], [x10] |
| SSHR v4.2s, v4.2s, #1 |
| MOV x9, #0 |
| DUP v6.2s, w9 |
| SQSUB v4.2s, v6.2s, v4.2s |
| |
| ST1 {v4.s}[0], [x11] |
| // str X7, [X11] |
| SUB x11, x11, #4 |
| // sxtw x7,w7 |
| |
// Read the word at [x10 + 4], halve it, store it at [x10], then advance
// x10 by one word.
| LDR w6, [x10, #4] |
| sxtw x6, w6 |
| ASR x6, x6, #1 |
| STR w6, [x10], #4 |
| sxtw x6, w6 |
| |
| LD1 {v3.s}[1], [x11] |
| |
// First butterfly. v2 = {old [x3], old [x11]}, v3 = {[x3], [x11]} after
// the step back. Each SMULL yields two 64-bit products that are then
// arithmetically shifted right by 16:
//   v4 = v0*v2, v6 = v0*v3, v8 = v1*v2, v10 = v1*v3
| sMULL v4.2d, v0.2s, v2.2s //qsub 2nd |
| sshr v4.2d, v4.2d, #16 |
| sMULL v6.2d, v0.2s, v3.2s //add 2nd |
| sshr v6.2d, v6.2d, #16 |
| sMULL v8.2d, v1.2s, v2.2s //add 1st |
| sshr v8.2d, v8.2d, #16 |
| sMULL v10.2d, v1.2s, v3.2s //qsub 1st |
| sshr v10.2d, v10.2d, #16 |
| |
// v12 = v8 + v6 (plain add); v14 = v10 - v4 and v16 = v4 - v10 saturate
// on the 64-bit lanes.
| add v12.2d, v8.2d , v6.2d |
| SQSUB v14.2d, v10.2d , v4.2d |
| SQSUB v16.2d, v4.2d , v10.2d |
| |
| //shrn v12.2s, v12.2d,#32 |
| //shrn v14.2s, v14.2d,#32 |
| //shrn v16.2s, v16.2d,#32 |
| |
// Only the low 32 bits of each 64-bit result are stored: .s[0] is the low
// word of 64-bit lane 0, .s[2] is the low word of 64-bit lane 1.
| ST1 {v12.s}[0], [x3], x8 |
| |
| ST1 {v14.s}[0], [x0], #4 |
| |
// Saturating negate of v12 as four 32-bit lanes; the .s[2] lane is what
// gets stored next.
| SQNEG v12.4s, v12.4s |
| |
| |
| ST1 {v12.s}[2], [x10], #4 |
| |
| ST1 {v16.s}[2], [x11], x8 |
| |
// Main loop: each pass runs two butterflies. The first uses the
// coefficient pair already held in v0/v1, the second uses a pair freshly
// loaded from x1.
| LOOP1: |
// Load two consecutive words from each forward pointer, and keep the
// current [x3] / [x11] words in w5 / w6 for the second butterfly.
| LD1 {v2.2s}, [x0] |
| LD1 {v3.2s}, [x10] |
| LDR w5, [x3] //RE2 |
| sxtw x5, w5 |
| LDR w6, [x11] //RE3 |
| sxtw x6, w6 |
| //VTRN.32 D2, D3 |
// Transpose so that v2 = {[x0], [x10]} and v3 = {[x0 + 4], [x10 + 4]}.
| TRN1 v4.2s, v2.2s, v3.2s |
| TRN2 v3.2s, v2.2s, v3.2s |
| MOV v2.8b, v4.8b |
| |
| sMULL v4.2d, v0.2s, v2.2s //qsub 2nd |
| sshr v4.2d, v4.2d, #16 |
| sMULL v6.2d, v0.2s, v3.2s //add 2nd |
| sshr v6.2d, v6.2d, #16 |
| sMULL v8.2d, v1.2s, v2.2s //add 1st |
| sshr v8.2d, v8.2d, #16 |
| sMULL v10.2d, v1.2s, v3.2s //qsub 1st |
| sshr v10.2d, v10.2d, #16 |
| |
// v12 = v8 + v6; v14 = v4 - v10; v16 = v10 - v4 (operand order is the
// reverse of the pre-loop butterfly).
| add v12.2d, v8.2d , v6.2d |
| SQSUB v14.2d, v4.2d , v10.2d |
| SQSUB v16.2d, v10.2d , v4.2d |
| |
| //shrn v12.2s, v12.2d,#32 |
| //shrn v14.2s, v14.2d,#32 |
| //shrn v16.2s, v16.2d,#32 |
| |
// Lane-0 results go to [x0] (forward) and [x3] (backward); after the
// negate, lane-1 results go to [x11] (backward) and [x10] (forward).
| ST1 {v12.s}[0], [x0], #4 |
| ST1 {v14.s}[0], [x3], x8 |
| SQNEG v12.4s, v12.4s |
| |
| ST1 {v12.s}[2], [x11], x8 |
| ST1 {v16.s}[2], [x10], #4 |
| |
// Clear v0/v1 again, then fetch the next coefficient pair from x1.
| MOV w19, #0 |
| DUP V0.4s, w19 |
| DUP V1.4s, w19 |
| // second part |
| LD2 {v0.h, v1.h}[0], [x1], #4 |
| sxtl v0.4s, v0.4h |
| sxtl v1.4s, v1.4h |
| dup v0.2s, v0.s[0] |
| dup v1.2s, v1.s[0] |
| |
// v3 = {w5, w6}, the words saved at the top of this pass;
// v2 = {[x3], [x11]} read at the pointers' current, already stepped-back
// positions.
| mov v3.s[0], w5 |
| mov v3.s[1], w6 |
| LD1 {v2.s}[0], [x3] |
| LD1 {v2.s}[1], [x11] |
| |
| sMULL v4.2d, v0.2s, v2.2s //qsub 2nd |
| sshr v4.2d, v4.2d, #16 |
| sMULL v6.2d, v0.2s, v3.2s //add 2nd |
| sshr v6.2d, v6.2d, #16 |
| sMULL v8.2d, v1.2s, v2.2s //add 1st |
| sshr v8.2d, v8.2d, #16 |
| sMULL v10.2d, v1.2s, v3.2s //qsub 1st |
| sshr v10.2d, v10.2d, #16 |
| |
// Second butterfly combines the other product pairs:
// v12 = v4 + v10; v14 = v8 - v6; v16 = v6 - v8.
| add v12.2d, v4.2d , v10.2d |
| SQSUB v14.2d, v8.2d , v6.2d |
| SQSUB v16.2d, v6.2d , v8.2d |
| |
| //shrn v12.2s, v12.2d,#32 |
| //shrn v14.2s, v14.2d,#32 |
| //shrn v16.2s, v16.2d,#32 |
| |
| ST1 {v12.s}[0], [x3], x8 |
| ST1 {v14.s}[0], [x0], #4 |
| |
| SQNEG v12.4s, v12.4s |
| |
// Decrement the counter here; the two ST1 stores that follow do not
// modify the flags, so BGT still tests the result of this SUBS.
| subs x4, x4, #1 |
| ST1 {v12.s}[2], [x10], #4 |
| ST1 {v16.s}[2], [x11], x8 |
| |
| BGT LOOP1 |
| //VPOP {D8-D15} |
| // LDMFD sp!, {x4-x12, x15} |
| //ldp x19, x20,[sp],#16 |
// Restore the registers saved by push_v_regs and return.
| pop_v_regs |
| ret |