// blob: ddd60f1b31bea09a2821cc91ac729019b19b07dd [file] [log] [blame]
//.include "ihevc_neon_macros.s"
.macro push_v_regs
    // Spill x8-x15 and x29/x30 into one 80-byte stack frame.
    // Despite the macro name, only integer registers are saved here.
    // Frame layout (offsets from the new sp), matched by pop_v_regs:
    //   +0  x29, x30
    //   +16 x14, x15
    //   +32 x12, x13
    //   +48 x10, x11
    //   +64 x8,  x9
    stp     x29, x30, [sp, #-80]!       // allocate the whole frame at once
    stp     x14, x15, [sp, #16]
    stp     x12, x13, [sp, #32]
    stp     x10, x11, [sp, #48]
    stp     x8,  x9,  [sp, #64]
.endm
.macro pop_v_regs
    // Reload x8-x15 and x29/x30 from the 80-byte frame written by
    // push_v_regs, then release the frame.
    // Frame layout (offsets from sp on entry to this macro):
    //   +0  x29, x30
    //   +16 x14, x15
    //   +32 x12, x13
    //   +48 x10, x11
    //   +64 x8,  x9
    ldp     x8,  x9,  [sp, #64]
    ldp     x10, x11, [sp, #48]
    ldp     x12, x13, [sp, #32]
    ldp     x14, x15, [sp, #16]
    ldp     x29, x30, [sp], #80         // last load also frees the frame
.endm
.text
.p2align 2
.global ixheaacd_postradixcompute4

//-----------------------------------------------------------------------
// ixheaacd_postradixcompute4
//
// Radix-4 add/sub butterfly pass, hard coded for an FFT length of 16
// (npoints is expected to always be 16).
//
// In:    x0 = output pointer (32-bit words)
//        x1 = input pointer  (32-bit words)
//        x2 = not read by this routine
//        w3 = npoints (32-bit value; only the low 32 bits are used)
// Out:   nothing returned in registers; results are stored through x0
// Clobb: x0, x1, x3-x7, NZCV flags
//        (x8-x15, x29, x30 are saved and restored via push_v_regs /
//        pop_v_regs)
// Stack: 80 bytes, used only for the register save area
//
// The loop runs twice. Each pass:
//   * reads 8 words from the x1 stream and 8 words from the x4 stream
//     (x4 starts npoints*2 bytes after x1),
//   * writes 16 words at byte offsets 0,4,32,36,64,68,96,100 and
//     8,12,40,44,72,76,104,108 relative to x0 at the start of the pass,
//   * advances x0 by 16 bytes and x1/x4 by 64 bytes in total.
//
// NOTE(review): every register this routine touches is caller-saved
// under AAPCS64, so the save/restore looks purely conservative. It is
// kept because callers are not visible from this file - confirm before
// removing.
//-----------------------------------------------------------------------
ixheaacd_postradixcompute4:
    push_v_regs

    // Second input stream: x4 = x1 + npoints * 2 bytes.
    // npoints is a 32-bit argument, so the upper half of x3 is not
    // guaranteed to be defined on entry; extend from w3 explicitly
    // instead of reading the full x3.
    ADD     x4, x1, w3, sxtw #1         // x1 -> x0 stream, x4 -> x2 stream
    MOV     w3, #2                      // loop counter: two passes

POSTRADIX4_START:
    // ---- first butterfly: inputs from the x1 stream -------------------
    LDP     w5, w6, [x1], #8            // x_0 : x_1
    LDP     w7, w8, [x1], #8            // x_2 : x_3
    LDP     w9, w10, [x1], #8           // x_4 : x_5
    LDP     w11, w12, [x1], #8          // x_6 : x_7

    ADD     w14, w5, w9                 // xh0_0 = x_0 + x_4
    SUB     w5, w5, w9                  // xl0_0 = x_0 - x_4
    ADD     w9, w6, w10                 // xh1_0 = x_1 + x_5
    SUB     w6, w6, w10                 // xl1_0 = x_1 - x_5
    ADD     w10, w7, w11                // xh0_1 = x_2 + x_6
    SUB     w7, w7, w11                 // xl0_1 = x_2 - x_6
    ADD     w11, w8, w12                // xh1_1 = x_3 + x_7
    SUB     w8, w8, w12                 // xl1_1 = x_3 - x_7

    ADD     w12, w14, w10               // n00 = xh0_0 + xh0_1
    SUB     w14, w14, w10               // n20 = xh0_0 - xh0_1
    ADD     w10, w9, w11                // n01 = xh1_0 + xh1_1
    SUB     w9, w9, w11                 // n21 = xh1_0 - xh1_1
    ADD     w11, w5, w8                 // n10 = xl0_0 + xl1_1
    SUB     w5, w5, w8                  // n30 = xl0_0 - xl1_1
    ADD     w8, w6, w7                  // n31 = xl1_0 + xl0_1
    SUB     w6, w6, w7                  // n11 = xl1_0 - xl0_1

    // Stores walk x0 through four output rows that are 32 bytes apart
    // (4 + 28 bytes per row step).
    STR     w12, [x0], #4               // y0[h2]     = n00
    STR     w10, [x0], #28              // y0[h2 + 1] = n01, x0 -> y1[h2]
    STR     w11, [x0], #4               // y1[h2]     = n10
    STR     w6, [x0], #28               // y1[h2 + 1] = n11, x0 -> y2[h2]
    STR     w14, [x0], #4               // y2[h2]     = n20
    STR     w9, [x0], #28               // y2[h2 + 1] = n21, x0 -> y3[h2]
    STR     w5, [x0], #4                // y3[h2]     = n30
    STR     w8, [x0]                    // y3[h2 + 1] = n31, x0 unchanged

    // ---- second butterfly: inputs from the x4 stream ------------------
    LDP     w5, w6, [x4], #8            // x_8 : x_9
    LDP     w7, w8, [x4], #8            // x_a : x_b
    LDP     w9, w10, [x4], #8           // x_c : x_d
    LDP     w11, w12, [x4], #8          // x_e : x_f

    // x0 moved 4*4 + 28*3 = 100 bytes above; step back 92 so it lands
    // 8 bytes (two words) past where this butterfly pair started.
    SUB     x0, x0, #92

    ADD     w14, w5, w9                 // xh0 = x_8 + x_c
    SUB     w5, w5, w9                  // xl0 = x_8 - x_c
    ADD     w9, w6, w10                 // xh1 = x_9 + x_d
    SUB     w6, w6, w10                 // xl1 = x_9 - x_d
    ADD     w10, w7, w11                // x_a + x_e
    SUB     w7, w7, w11                 // x_a - x_e
    ADD     w11, w8, w12                // x_b + x_f
    SUB     w8, w8, w12                 // x_b - x_f

    ADD     w12, w14, w10               // same combination pattern as the
    SUB     w14, w14, w10               // first butterfly (n00..n31)
    ADD     w10, w9, w11
    SUB     w9, w9, w11
    ADD     w11, w5, w8
    SUB     w5, w5, w8
    ADD     w8, w6, w7
    SUB     w6, w6, w7

    STR     w12, [x0], #4               // row 0, word 0
    STR     w10, [x0], #28              // row 0, word 1, x0 -> row 1
    STR     w11, [x0], #4               // row 1, word 0
    STR     w6, [x0], #28               // row 1, word 1, x0 -> row 2
    STR     w14, [x0], #4               // row 2, word 0
    STR     w9, [x0], #28               // row 2, word 1, x0 -> row 3
    STR     w5, [x0], #4                // row 3, word 0
    STR     w8, [x0]                    // row 3, word 1, x0 unchanged

    // ---- advance to the next pass ------------------------------------
    ADD     x1, x1, #32                 // skip the 8 words the x4 stream consumed
    ADD     x4, x4, #32                 // keep x4 the same distance ahead of x1
    SUB     x0, x0, #92                 // net +16 bytes of output per pass
    SUBS    w3, w3, #1
    BGT     POSTRADIX4_START

    pop_v_regs
    ret