blob: a48a7051b9387c1876418f18f162c3a30f31ea37 [file] [log] [blame]
.macro push_v_regs
stp q8, q9, [sp, #-32]!
stp q10, q11, [sp, #-32]!
stp q12, q13, [sp, #-32]!
stp q14, q15, [sp, #-32]!
stp X8, X9, [sp, #-16]!
stp X10, X11, [sp, #-16]!
stp X12, X13, [sp, #-16]!
stp X14, X15, [sp, #-16]!
stp X16, X17, [sp, #-16]!
stp X18, X19, [sp, #-16]!
stp X20, X21, [sp, #-16]!
stp X29, X30, [sp, #-16]!
.endm
.macro pop_v_regs
ldp X29, X30, [sp], #16
ldp X20, X21, [sp], #16
ldp X18, X19, [sp], #16
ldp X16, X17, [sp], #16
ldp X14, X15, [sp], #16
ldp X12, X13, [sp], #16
ldp X10, X11, [sp], #16
ldp X8, X9, [sp], #16
ldp q14, q15, [sp], #32
ldp q12, q13, [sp], #32
ldp q10, q11, [sp], #32
ldp q8, q9, [sp], #32
.endm
.text
.p2align 2
.global ixheaacd_cos_sin_mod_loop2
ixheaacd_cos_sin_mod_loop2:
// STMFD sp!, {x4-x12, x14}
push_v_regs
//stp x19, x20,[sp,#-16]!
//VPUSH {D8-D15}
//generating load addresses
ADD x3, x0, x2, LSL #3 //psubband1 = &subband[2 * M - 1];
SUB x3, x3, #4
ADD x10, x0, #256
ADD x11, x10, x2, LSL #3
SUB x11, x11, #4
MOV x8, #-4
LDR w19, =0
DUP V0.4s, w19
DUP V1.4s, w19
LDR w6, [x0]
sxtw x6, w6
ASR x4, x2, #1 //M_2 = ixheaacd_shx32(M, 1);
SUB x4, x4, #1
ASR x6, x6, #1 //*psubband = *psubband >> 1;
LD1 {v2.s}[0], [x3]
STR w6, [x0], #4 //psubband++;
sxtw x6, w6
LDR w7, [x0]
sxtw x7, w7
ASR x7, x7, #1
sub x20, x7, #0
neg x6, x20
STR w6, [x3], #-4
sxtw x6, w6
LD1 {v3.s}[0], [x3] // im = *psubband1;
LD2 {v0.h, v1.h}[0], [x1], #4
sxtl v0.4s, v0.4h
sxtl v1.4s, v1.4h
dup v0.2s, v0.s[0]
dup v1.2s, v1.s[0]
LD1 {v2.s}[1], [x11] //re = *psubband12;
// LDR w6, [x10]
// sxtw x6,w6
// ASR x7, x6, #1
// MOV x9, #0
// QSUB x7, x9, x7
LD1 {v4.s}[0], [x10]
SSHR v4.2s, v4.2s, #1
MOV x9, #0
DUP v6.2s, w9
SQSUB v4.2s, v6.2s, v4.2s
ST1 {v4.s}[0], [x11]
// str X7, [X11]
SUB x11, x11, #4
// sxtw x7,w7
LDR w6, [x10, #4]
sxtw x6, w6
ASR x6, x6, #1
STR w6, [x10], #4
sxtw x6, w6
LD1 {v3.s}[1], [x11]
sMULL v4.2d, v0.2s, v2.2s //qsub 2nd
sshr v4.2d, v4.2d, #16
sMULL v6.2d, v0.2s, v3.2s //add 2nd
sshr v6.2d, v6.2d, #16
sMULL v8.2d, v1.2s, v2.2s //add 1st
sshr v8.2d, v8.2d, #16
sMULL v10.2d, v1.2s, v3.2s //qsub 1st
sshr v10.2d, v10.2d, #16
add v12.2d, v8.2d , v6.2d
SQSUB v14.2d, v10.2d , v4.2d
SQSUB v16.2d, v4.2d , v10.2d
//shrn v12.2s, v12.2d,#32
//shrn v14.2s, v14.2d,#32
//shrn v16.2s, v16.2d,#32
ST1 {v12.s}[0], [x3], x8
ST1 {v14.s}[0], [x0], #4
SQNEG v12.4s, v12.4s
ST1 {v12.s}[2], [x10], #4
ST1 {v16.s}[2], [x11], x8
LOOP1:
LD1 {v2.2s}, [x0]
LD1 {v3.2s}, [x10]
LDR w5, [x3] //RE2
sxtw x5, w5
LDR w6, [x11] //RE3
sxtw x6, w6
//VTRN.32 D2, D3
TRN1 v4.2s, v2.2s, v3.2s
TRN2 v3.2s, v2.2s, v3.2s
MOV v2.8b, v4.8b
sMULL v4.2d, v0.2s, v2.2s //qsub 2nd
sshr v4.2d, v4.2d, #16
sMULL v6.2d, v0.2s, v3.2s //add 2nd
sshr v6.2d, v6.2d, #16
sMULL v8.2d, v1.2s, v2.2s //add 1st
sshr v8.2d, v8.2d, #16
sMULL v10.2d, v1.2s, v3.2s //qsub 1st
sshr v10.2d, v10.2d, #16
add v12.2d, v8.2d , v6.2d
SQSUB v14.2d, v4.2d , v10.2d
SQSUB v16.2d, v10.2d , v4.2d
//shrn v12.2s, v12.2d,#32
//shrn v14.2s, v14.2d,#32
//shrn v16.2s, v16.2d,#32
ST1 {v12.s}[0], [x0], #4
ST1 {v14.s}[0], [x3], x8
SQNEG v12.4s, v12.4s
ST1 {v12.s}[2], [x11], x8
ST1 {v16.s}[2], [x10], #4
LDR w19, =0
DUP V0.4s, w19
DUP V1.4s, w19
// second part
LD2 {v0.h, v1.h}[0], [x1], #4
sxtl v0.4s, v0.4h
sxtl v1.4s, v1.4h
dup v0.2s, v0.s[0]
dup v1.2s, v1.s[0]
mov v3.s[0], w5
mov v3.s[1], w6
LD1 {v2.s}[0], [x3]
LD1 {v2.s}[1], [x11]
sMULL v4.2d, v0.2s, v2.2s //qsub 2nd
sshr v4.2d, v4.2d, #16
sMULL v6.2d, v0.2s, v3.2s //add 2nd
sshr v6.2d, v6.2d, #16
sMULL v8.2d, v1.2s, v2.2s //add 1st
sshr v8.2d, v8.2d, #16
sMULL v10.2d, v1.2s, v3.2s //qsub 1st
sshr v10.2d, v10.2d, #16
add v12.2d, v4.2d , v10.2d
SQSUB v14.2d, v8.2d , v6.2d
SQSUB v16.2d, v6.2d , v8.2d
//shrn v12.2s, v12.2d,#32
//shrn v14.2s, v14.2d,#32
//shrn v16.2s, v16.2d,#32
ST1 {v12.s}[0], [x3], x8
ST1 {v14.s}[0], [x0], #4
SQNEG v12.4s, v12.4s
subs x4, x4, #1
ST1 {v12.s}[2], [x10], #4
ST1 {v16.s}[2], [x11], x8
BGT LOOP1
//VPOP {D8-D15}
// LDMFD sp!, {x4-x12, x15}
//ldp x19, x20,[sp],#16
pop_v_regs
ret