.text
.p2align 2
.global ixheaacd_fft_15_ld_armv7
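@ 15-point FFT for the low-delay decoder, computed as a 5x3 decomposition:
@ three unrolled radix-5 butterflies (LOOP_FFT5) write 15 complex values
@ (30 words) into the fft3out scratch buffer, then five unrolled radix-3
@ butterflies (LOOP_FFT3) combine them and scatter the results through a
@ byte reordering table. Register usage at entry, as inferred from the
@ code below:
@   r0 = input buffer         r1 = output buffer (saved on the stack)
@   r2 = fft3out scratch      r3 = reordering table (saved on the stack)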
ixheaacd_fft_15_ld_armv7:
STMFD r13!, {r4 - r12, r14} @
STR r1 , [r13, #-4]! @
STR r3 , [r13, #-4]! @
MOV lr, r2 @ lr = fft3out pointer
MOV r12, #384 @
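@ r12 = 384-byte stride between the five inputs of one radix-5 butterfly;
@ the LDRDs below fetch complex pairs at word offsets 0, 96, 192, 288, 384.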
LOOP_FFT5:
LDRD r2, [r0] @ r2 = buf1a[0] and r3 = buf1a[1]
ADD r0, r0, r12
LDRD r4, [r0] @ r4 = buf1a[2] and r5 = buf1a[3]
ADD r0, r0, r12
LDRD r6, [r0] @ r6 = buf1a[4] and r7 = buf1a[5]
ADD r0, r0, r12
LDRD r8, [r0] @ r8 = buf1a[6] and r9 = buf1a[7]
ADD r0, r0, r12
LDRD r10, [r0] @ r10 = buf1a[8] and r11 = buf1a[9]
ADD r1, r4, r10 @ r1 = buf1a[2] + buf1a[8]
SUB r4, r4, r10 @ r4 = buf1a[2] - buf1a[8]@
MOVW r10, #0xB000
MOVT r10, #0x478E
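@ r10 = 0x478EB000 packs two 16-bit radix-5 constants: SMULWT uses the top
@ halfword (C54 = 0x478E), SMULWB the bottom (C55 = 0xB000).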
ADD r12, r6, r8 @ r3 = buf1a[4] + buf1a[6]
SUB r8, r6, r8 @ r2 = buf1a[4] - buf1a[6]
SUB r6, r1, r12 @ (r1 - r3)
SMULWT r6, r6, r10 @ t = mult32x16in32_shl((r1 - r3), C54)
ADD r1, r1, r12 @ r1 = r1 + r3@
ADD r2, r2, r1 @ temp1 = inp[0] + r1@
SMULWB r1, r1, r10 @ mult32_shl(r1, C55)
ADD r1, r2, r1, lsl #2 @ r1 = temp1 + ((mult32_shl(r1, C55)) << 1)@
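@ Scaling note: SMULWB/SMULWT compute (32x16 product) >> 16, and the C
@ reference's mult32_shl apparently includes a << 1 of its own, so
@ "mult32_shl(x, C) << 1" appears as LSL #2 here and a bare mult32_shl
@ as LSL #1.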
MOVW r10, #0x9D84
MOVT r10, #0x79BC
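@ r10 = 0x79BC9D84: C51 (0x79BC) in the top halfword for SMULWT,
@ C52 (0x9D84) in the bottom for SMULWB.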
STR r2, [lr], #4 @ *buf2++ = temp1@
SUB r12, r1, r6, LSL #1 @ r3 = r1 - t@
ADD r1, r1, r6, LSL #1 @ r1 = r1 + t@
ADD r2, r4, r8 @ (r4 + r2)
SMULWT r2, r2, r10 @ t = mult32_shl((r4 + r2), C51)@
@LSL r2, r2, #1
MOV r2, r2, LSL #1
SMULWB r4, r4, r10 @ mult32_shl(r4, C52)
MOVW r10, #0xD180
MOVT r10, #0xFFFF
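@ r10 = 0xFFFFD180: only the bottom halfword, C53 (0xD180), is used (SMULWB).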
ADD r4, r2, r4, LSL #2 @ r4 = t + (mult32_shl(r4, C52) << 1)@
SMULWB r8, r8, r10 @ mult32_shl(r2, C53)
ADD r2, r2, r8, LSL #1 @ r2 = t + mult32_shl(r2, C53)@
ADD r6, r5, r11 @ s1 = buf1a[3] + buf1a[9]
SUB r8, r5, r11 @ s4 = buf1a[3] - buf1a[9]
MOVW r10, #0xB000
MOVT r10, #0x478E
ADD r5, r7, r9 @ s3 = buf1a[5] + buf1a[7]@
SUB r7, r7, r9 @ s2 = buf1a[5] - buf1a[7]@
SUB r9, r6, r5 @ (s1 - s3)
SMULWT r9, r9, r10 @ t = mult32x16in32_shl((s1 - s3), C54)
ADD r6, r6, r5 @ s1 = s1 + s3@
ADD r3, r3, r6 @ temp2 = buf1a[1] + s1
SMULWB r6, r6, r10 @ mult32_shl(s1, C55)
ADD r6, r3, r6, lsl #2 @ s1 = temp2 + ((mult32_shl(s1, C55)) << 1)@
MOVW r10, #0x9D84
MOVT r10, #0x79BC
STR r3, [lr], #4 @ *buf2++ = temp2@
SUB r5, r6, r9, LSL #1 @ s3 = s1 - t@
ADD r6, r6, r9, LSL #1 @ s1 = s1 + t@
SUB r0, r0, #896 @ r0 = &inp[160]
ADD r11, r7, r8 @ (s4 + s2)
SMULWT r11, r11, r10 @ t = mult32_shl((s4 + s2), C51)@
@LSL r11, r11, #1 @
MOV r11, r11, LSL #1
SMULWB r8, r8, r10 @ mult32_shl(s4, C52)
MOVW r10, #0xD180
MOVT r10, #0xFFFF
ADD r8, r11, r8, LSL #2 @ s4 = t + (mult32_shl(s4, C52) << 1)@
SMULWB r7, r7, r10 @ mult32_shl(s2, C53)
ADD r7, r11, r7, LSL #1 @ s2 = t + mult32_shl(s2, C53)@
ADD r3, r1, r7 @ buf2[2] = r1 + s2
SUB r9, r6, r2 @ buf2[3] = s1 - r2
SUB r10, r12, r8 @ buf2[4] = r3 - s4
ADD r11, r5, r4 @ buf2[5] = s3 + r4
ADD r12, r12, r8 @ buf2[6] = r3 + s4
SUB r4, r5, r4 @ buf2[7] = s3 - r4
SUB r5, r1, r7 @ buf2[8] = r1 - s2
ADD r6, r6, r2 @ buf2[9] = s1 + r2
STMIA lr!, {r3, r9-r12} @
MOV r12, #384 @
MOVW r1, #0xFA00
MOVT r1, #0xFFFF
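@ r1 = 0xFFFFFA00 = -1536 bytes: this butterfly's stride pattern wraps
@ modulo the 480-word input, so the step before the last load (ADD r0,
@ r0, r1 below) goes backwards.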
STMIA lr!, {r4-r6} @
LDRD r2, [r0] @ r2 = buf1a[0] and r3 = buf1a[1]
ADD r0, r0, r12
LDRD r4, [r0] @ r4 = buf1a[2] and r5 = buf1a[3]
ADD r0, r0, r12
LDRD r6, [r0] @ r6 = buf1a[4] and r7 = buf1a[5]
ADD r0, r0, r12
LDRD r8, [r0] @ r8 = buf1a[6] and r9 = buf1a[7]
ADD r0, r0, r1
LDRD r10, [r0] @ r10 = buf1a[8] and r11 = buf1a[9]
ADD r0, r0, #1024 @ r0 = &inp[320]
ADD r1, r4, r10 @ r1 = buf1a[2] + buf1a[8]
SUB r4, r4, r10 @ r4 = buf1a[2] - buf1a[8]@
MOVW r10, #0xB000
MOVT r10, #0x478E
ADD r12, r6, r8 @ r3 = buf1a[4] + buf1a[6]
SUB r8, r6, r8 @ r2 = buf1a[4] - buf1a[6]
SUB r6, r1, r12 @ (r1 - r3)
SMULWT r6, r6, r10 @ t = mult32x16in32_shl((r1 - r3), C54)
ADD r1, r1, r12 @ r1 = r1 + r3@
ADD r2, r2, r1 @ temp1 = inp[0] + r1@
SMULWB r1, r1, r10 @ mult32_shl(r1, C55)
ADD r1, r2, r1, lsl #2 @ r1 = temp1 + ((mult32_shl(r1, C55)) << 1)@
MOVW r10, #0x9D84
MOVT r10, #0x79BC
STR r2, [lr], #4 @ *buf2++ = temp1@
SUB r12, r1, r6, LSL #1 @ r3 = r1 - t@
ADD r1, r1, r6, LSL #1 @ r1 = r1 + t@
ADD r2, r4, r8 @ (r4 + r2)
SMULWT r2, r2, r10 @ t = mult32_shl((r4 + r2), C51)@
@LSL r2, r2, #1
MOV r2, r2, LSL #1
SMULWB r4, r4, r10 @ mult32_shl(r4, C52)
MOVW r10, #0xD180
MOVT r10, #0xFFFF
ADD r4, r2, r4, LSL #2 @ r4 = t + (mult32_shl(r4, C52) << 1)@
SMULWB r8, r8, r10 @ mult32_shl(r2, C53)
ADD r2, r2, r8, LSL #1 @ r2 = t + mult32_shl(r2, C53)@
ADD r6, r5, r11 @ s1 = buf1a[3] + buf1a[9]
SUB r8, r5, r11 @ s4 = buf1a[3] - buf1a[9]
MOVW r10, #0xB000
MOVT r10, #0x478E
ADD r5, r7, r9 @ s3 = buf1a[5] + buf1a[7]@
SUB r7, r7, r9 @ s2 = buf1a[5] - buf1a[7]@
SUB r9, r6, r5 @ (s1 - s3)
SMULWT r9, r9, r10 @ t = mult32x16in32_shl((s1 - s3), C54)
ADD r6, r6, r5 @ s1 = s1 + s3@
ADD r3, r3, r6 @ temp2 = buf1a[1] + s1
SMULWB r6, r6, r10 @ mult32_shl(s1, C55)
ADD r6, r3, r6, lsl #2 @ s1 = temp2 + ((mult32_shl(s1, C55)) << 1)@
MOVW r10, #0x9D84
MOVT r10, #0x79BC
STR r3, [lr], #4 @ *buf2++ = temp2@
SUB r5, r6, r9, LSL #1 @ s3 = s1 - t@
ADD r6, r6, r9, LSL #1 @ s1 = s1 + t@
ADD r11, r7, r8 @ (s4 + s2)
SMULWT r11, r11, r10 @ t = mult32_shl((s4 + s2), C51)@
@LSL r11, r11, #1
MOV r11, r11, LSL #1
SMULWB r8, r8, r10 @ mult32_shl(s4, C52)
MOVW r10, #0xD180
MOVT r10, #0xFFFF
ADD r8, r11, r8, LSL #2 @ s4 = t + (mult32_shl(s4, C52) << 1)@
SMULWB r7, r7, r10 @ mult32_shl(s2, C53)
ADD r7, r11, r7, LSL #1 @ s2 = t + mult32_shl(s2, C53)@
ADD r3, r1, r7 @ buf2[2] = r1 + s2
SUB r9, r6, r2 @ buf2[3] = s1 - r2
SUB r10, r12, r8 @ buf2[4] = r3 - s4
ADD r11, r5, r4 @ buf2[5] = s3 + r4
ADD r12, r12, r8 @ buf2[6] = r3 + s4
SUB r4, r5, r4 @ buf2[7] = s3 - r4
SUB r5, r1, r7 @ buf2[8] = r1 - s2
ADD r6, r6, r2 @ buf2[9] = s1 + r2
MOVW r1, #0xFA00
MOVT r1, #0xFFFF
STMIA lr!, {r3, r9-r12}
MOV r12, #384 @
STMIA lr!, {r4-r6} @
LDRD r2, [r0] @ r2 = buf1a[0] and r3 = buf1a[1]
ADD r0, r0, r12
LDRD r4, [r0] @ r4 = buf1a[2] and r5 = buf1a[3]
ADD r0, r0, r1
LDRD r6, [r0] @ r6 = buf1a[4] and r7 = buf1a[5]
ADD r0, r0, r12
LDRD r8, [r0] @ r8 = buf1a[6] and r9 = buf1a[7]
ADD r0, r0, r12
LDRD r10, [r0] @ r10 = buf1a[8] and r11 = buf1a[9]
ADD r0, r0, r12
ADD r1, r4, r10 @ r1 = buf1a[2] + buf1a[8]
SUB r4, r4, r10 @ r4 = buf1a[2] - buf1a[8]@
MOVW r10, #0xB000
MOVT r10, #0x478E
ADD r12, r6, r8 @ r3 = buf1a[4] + buf1a[6]
SUB r8, r6, r8 @ r2 = buf1a[4] - buf1a[6]
SUB r6, r1, r12 @ (r1 - r3)
SMULWT r6, r6, r10 @ t = mult32x16in32_shl((r1 - r3), C54)
ADD r1, r1, r12 @ r1 = r1 + r3@
ADD r2, r2, r1 @ temp1 = inp[0] + r1@
SMULWB r1, r1, r10 @ mult32_shl(r1, C55)
ADD r1, r2, r1, lsl #2 @ r1 = temp1 + ((mult32_shl(r1, C55)) << 1)@
MOVW r10, #0x9D84
MOVT r10, #0x79BC
STR r2, [lr], #4 @ *buf2++ = temp1@
SUB r12, r1, r6, LSL #1 @ r3 = r1 - t@
ADD r1, r1, r6, LSL #1 @ r1 = r1 + t@
ADD r2, r4, r8 @ (r4 + r2)
SMULWT r2, r2, r10 @ t = mult32_shl((r4 + r2), C51)@
@LSL r2, r2, #1
MOV r2, r2, LSL #1
SMULWB r4, r4, r10 @ mult32_shl(r4, C52)
MOVW r10, #0xD180
MOVT r10, #0xFFFF
ADD r4, r2, r4, LSL #2 @ r4 = t + (mult32_shl(r4, C52) << 1)@
SMULWB r8, r8, r10 @ mult32_shl(r2, C53)
ADD r2, r2, r8, LSL #1 @ r2 = t + mult32_shl(r2, C53)@
ADD r6, r5, r11 @ s1 = buf1a[3] + buf1a[9]
SUB r8, r5, r11 @ s4 = buf1a[3] - buf1a[9]
MOVW r10, #0xB000
MOVT r10, #0x478E
ADD r5, r7, r9 @ s3 = buf1a[5] + buf1a[7]@
SUB r7, r7, r9 @ s2 = buf1a[5] - buf1a[7]@
SUB r9, r6, r5 @ (s1 - s3)
SMULWT r9, r9, r10 @ t = mult32x16in32_shl((s1 - s3), C54)
ADD r6, r6, r5 @ s1 = s1 + s3@
ADD r3, r3, r6 @ temp2 = buf1a[1] + s1
SMULWB r6, r6, r10 @ mult32_shl(s1, C55)
ADD r6, r3, r6, lsl #2 @ s1 = temp2 + ((mult32_shl(s1, C55)) << 1)@
MOVW r10, #0x9D84
MOVT r10, #0x79BC
STR r3, [lr], #4 @ *buf2++ = temp2@
SUB r5, r6, r9, LSL #1 @ s3 = s1 - t@
ADD r6, r6, r9, LSL #1 @ s1 = s1 + t@
ADD r11, r7, r8 @ (s4 + s2)
SMULWT r11, r11, r10 @ t = mult32_shl((s4 + s2), C51)@
@LSL r11, r11, #1 @
MOV r11, r11, LSL #1
SMULWB r8, r8, r10 @ mult32_shl(s4, C52)
MOVW r10, #0xD180
MOVT r10, #0xFFFF
ADD r8, r11, r8, LSL #2 @ s4 = t + (mult32_shl(s4, C52) << 1)@
SMULWB r7, r7, r10 @ mult32_shl(s2, C53)
ADD r7, r11, r7, LSL #1 @ s2 = t + mult32_shl(s2, C53)@
ADD r3, r1, r7 @ buf2[2] = r1 + s2
SUB r9, r6, r2 @ buf2[3] = s1 - r2
SUB r10, r12, r8 @ buf2[4] = r3 - s4
ADD r11, r5, r4 @ buf2[5] = s3 + r4
ADD r12, r12, r8 @ buf2[6] = r3 + s4
SUB r4, r5, r4 @ buf2[7] = s3 - r4
SUB r5, r1, r7 @ buf2[8] = r1 - s2
ADD r6, r6, r2 @ buf2[9] = s1 + r2
STMIA lr!, {r3, r9-r12}
STMIA lr!, {r4-r6} @
SUB lr, lr, #120 @ rewind lr to the start of fft3out (3 x 10 words written)
MOVW r12, #28378 @ sinmu = sin(2*pi/3) in Q15 (0x6EDA)
LDMFD r13!, {r10, r11} @ r10 = reordering table (r3 at entry), r11 = output base (r1 at entry)
LOOP_FFT3:
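@ Five unrolled radix-3 butterflies over the fft3out scratch. Each one
@ combines the complex samples at byte offsets 0, 40 and 80 (elements k,
@ k+10, k+20), then scatters its three complex results: LDRB fetches an
@ index from the reordering table (r10) and each STRD goes to the output
@ base (r11) plus index * 8 bytes.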
LDRD r0, [lr] @ r0 = fft3outptr[0] and r1 = fft3outptr[1]
LDRD r2, [lr, #40] @ r2 = fft3outptr[10] and r3 = fft3outptr[11]
LDRD r4, [lr, #80] @ r4 = fft3outptr[20] and r5 = fft3outptr[21]
ADD lr, lr, #8 @
ADD r6, r0, r2 @ X01r = add32(buf1[0], buf1[2])
ADD r7, r1, r3 @ X01i = add32(buf1[1], buf1[3])
ADD r8, r2, r4 @ add_r = add32(buf1[2], buf1[4])
ADD r9, r3, r5 @ add_i = add32(buf1[3], buf1[5])
SUB r2, r2, r4 @ sub_r = sub32(buf1[2], buf1[4])@
SUB r3, r3, r5 @ sub_i = sub32(buf1[3], buf1[5])@
@ASR r8, r8, #1 @ p1 = add_r >> 1@
MOV r8, r8, ASR #1
@ASR r9, r9, #1 @ p4 = add_i >> 1@
MOV r9, r9, ASR #1
SMULWB r3, r3, r12 @ p2 = mult32x16in32_shl(sub_i, sinmu)@
SMULWB r2, r2, r12 @ p3 = mult32x16in32_shl(sub_r, sinmu)@
SUB r0, r0, r8 @ temp = sub32(buf1a[0], p1)@
ADD r8, r1, r2, LSL #1 @ temp1 = add32(buf1a[1], p3)@
SUB r2, r1, r2, LSL #1 @ temp2 = sub32(buf1a[1], p3)@
ADD r4, r6, r4 @ add32(X01r, buf1a[4])@
ADD r5, r7, r5 @ add32(X01i, buf1a[5])@
ADD r6, r0, r3, LSL #1 @ add32(temp, p2)@
SUB r7, r2, r9 @ sub32(temp2, p4)@
SUB r9, r8, r9 @ sub32(temp1, p4)@
SUB r8, r0, r3, LSL #1 @ sub32(temp, p2)@
MOV r3, r11 @
LDRB r0, [r10], #1 @
LDRB r1, [r10], #1 @
LDRB r2, [r10], #1 @
ADD r0, r11, r0, lsl #3 @
ADD r1, r11, r1, lsl #3 @
ADD r2, r11, r2, lsl #3 @
STRD r4, [r0] @
STRD r6, [r1] @
STRD r8, [r2] @
LDRD r0, [lr] @ r0 = fft3outptr[0] and r1 = fft3outptr[1]
LDRD r2, [lr, #40] @ r2 = fft3outptr[10] and r3 = fft3outptr[11]
LDRD r4, [lr, #80] @ r4 = fft3outptr[20] and r5 = fft3outptr[21]
ADD lr, lr, #8 @
ADD r6, r0, r2 @ X01r = add32(buf1[0], buf1[2])
ADD r7, r1, r3 @ X01i = add32(buf1[1], buf1[3])
ADD r8, r2, r4 @ add_r = add32(buf1[2], buf1[4])
ADD r9, r3, r5 @ add_i = add32(buf1[3], buf1[5])
SUB r2, r2, r4 @ sub_r = sub32(buf1[2], buf1[4])@
SUB r3, r3, r5 @ sub_i = sub32(buf1[3], buf1[5])@
@ASR r8, r8, #1 @ p1 = add_r >> 1@
MOV r8, r8, ASR #1
@ASR r9, r9, #1 @ p4 = add_i >> 1@
MOV r9, r9, ASR #1
SMULWB r3, r3, r12 @ p2 = mult32x16in32_shl(sub_i, sinmu)@
SMULWB r2, r2, r12 @ p3 = mult32x16in32_shl(sub_r, sinmu)@
SUB r0, r0, r8 @ temp = sub32(buf1a[0], p1)@
ADD r8, r1, r2, LSL #1 @ temp1 = add32(buf1a[1], p3)@
SUB r2, r1, r2, LSL #1 @ temp2 = sub32(buf1a[1], p3)@
ADD r4, r6, r4 @ add32(X01r, buf1a[4])@
ADD r5, r7, r5 @ add32(X01i, buf1a[5])@
ADD r6, r0, r3, LSL #1 @ add32(temp, p2)@
SUB r7, r2, r9 @ sub32(temp2, p4)@
SUB r9, r8, r9 @ sub32(temp1, p4)@
SUB r8, r0, r3, LSL #1 @ sub32(temp, p2)@
LDRB r0, [r10], #1 @
LDRB r1, [r10], #1 @
LDRB r2, [r10], #1 @
ADD r0, r11, r0, lsl #3 @
ADD r1, r11, r1, lsl #3 @
ADD r2, r11, r2, lsl #3 @
STRD r4, [r0] @
STRD r6, [r1] @
STRD r8, [r2] @
LDRD r0, [lr] @ r0 = fft3outptr[0] and r1 = fft3outptr[1]
LDRD r2, [lr, #40] @ r2 = fft3outptr[10] and r3 = fft3outptr[11]
LDRD r4, [lr, #80] @ r4 = fft3outptr[20] and r5 = fft3outptr[21]
ADD lr, lr, #8 @
ADD r6, r0, r2 @ X01r = add32(buf1[0], buf1[2])
ADD r7, r1, r3 @ X01i = add32(buf1[1], buf1[3])
ADD r8, r2, r4 @ add_r = add32(buf1[2], buf1[4])
ADD r9, r3, r5 @ add_i = add32(buf1[3], buf1[5])
SUB r2, r2, r4 @ sub_r = sub32(buf1[2], buf1[4])@
SUB r3, r3, r5 @ sub_i = sub32(buf1[3], buf1[5])@
@ASR r8, r8, #1 @ p1 = add_r >> 1@
MOV r8, r8, ASR #1
@ASR r9, r9, #1 @ p4 = add_i >> 1@
MOV r9, r9, ASR #1
SMULWB r3, r3, r12 @ p2 = mult32x16in32_shl(sub_i, sinmu)@
SMULWB r2, r2, r12 @ p3 = mult32x16in32_shl(sub_r, sinmu)@
SUB r0, r0, r8 @ temp = sub32(buf1a[0], p1)@
ADD r8, r1, r2, LSL #1 @ temp1 = add32(buf1a[1], p3)@
SUB r2, r1, r2, LSL #1 @ temp2 = sub32(buf1a[1], p3)@
ADD r4, r6, r4 @ add32(X01r, buf1a[4])@
ADD r5, r7, r5 @ add32(X01i, buf1a[5])@
ADD r6, r0, r3, LSL #1 @ add32(temp, p2)@
SUB r7, r2, r9 @ sub32(temp2, p4)@
SUB r9, r8, r9 @ sub32(temp1, p4)@
SUB r8, r0, r3, LSL #1 @ sub32(temp, p2)@
LDRB r0, [r10], #1 @
LDRB r1, [r10], #1 @
LDRB r2, [r10], #1 @
ADD r0, r11, r0, lsl #3 @
ADD r1, r11, r1, lsl #3 @
ADD r2, r11, r2, lsl #3 @
STRD r4, [r0] @
STRD r6, [r1] @
STRD r8, [r2] @
LDRD r0, [lr] @ r0 = fft3outptr[0] and r1 = fft3outptr[1]
LDRD r2, [lr, #40] @ r2 = fft3outptr[10] and r3 = fft3outptr[11]
LDRD r4, [lr, #80] @ r4 = fft3outptr[20] and r5 = fft3outptr[21]
ADD lr, lr, #8 @
ADD r6, r0, r2 @ X01r = add32(buf1[0], buf1[2])
ADD r7, r1, r3 @ X01i = add32(buf1[1], buf1[3])
ADD r8, r2, r4 @ add_r = add32(buf1[2], buf1[4])
ADD r9, r3, r5 @ add_i = add32(buf1[3], buf1[5])
SUB r2, r2, r4 @ sub_r = sub32(buf1[2], buf1[4])@
SUB r3, r3, r5 @ sub_i = sub32(buf1[3], buf1[5])@
@ASR r8, r8, #1 @ p1 = add_r >> 1@
MOV r8, r8, ASR #1
@ASR r9, r9, #1 @ p4 = add_i >> 1@
MOV r9, r9, ASR #1
SMULWB r3, r3, r12 @ p2 = mult32x16in32_shl(sub_i, sinmu)@
SMULWB r2, r2, r12 @ p3 = mult32x16in32_shl(sub_r, sinmu)@
SUB r0, r0, r8 @ temp = sub32(buf1a[0], p1)@
ADD r8, r1, r2, LSL #1 @ temp1 = add32(buf1a[1], p3)@
SUB r2, r1, r2, LSL #1 @ temp2 = sub32(buf1a[1], p3)@
ADD r4, r6, r4 @ add32(X01r, buf1a[4])@
ADD r5, r7, r5 @ add32(X01i, buf1a[5])@
ADD r6, r0, r3, LSL #1 @ add32(temp, p2)@
SUB r7, r2, r9 @ sub32(temp2, p4)@
SUB r9, r8, r9 @ sub32(temp1, p4)@
SUB r8, r0, r3, LSL #1 @ sub32(temp, p2)@
LDRB r0, [r10], #1 @
LDRB r1, [r10], #1 @
LDRB r2, [r10], #1 @
ADD r0, r11, r0, lsl #3 @
ADD r1, r11, r1, lsl #3 @
ADD r2, r11, r2, lsl #3 @
STRD r4, [r0] @
STRD r6, [r1] @
STRD r8, [r2] @
LDRD r0, [lr] @ r0 = fft3outptr[0] and r1 = fft3outptr[1]
LDRD r2, [lr, #40] @ r2 = fft3outptr[10] and r3 = fft3outptr[11]
LDRD r4, [lr, #80] @ r4 = fft3outptr[20] and r5 = fft3outptr[21]
ADD r6, r0, r2 @ X01r = add32(buf1[0], buf1[2])
ADD r7, r1, r3 @ X01i = add32(buf1[1], buf1[3])
ADD r8, r2, r4 @ add_r = add32(buf1[2], buf1[4])
ADD r9, r3, r5 @ add_i = add32(buf1[3], buf1[5])
SUB r2, r2, r4 @ sub_r = sub32(buf1[2], buf1[4])@
SUB r3, r3, r5 @ sub_i = sub32(buf1[3], buf1[5])@
@ASR r8, r8, #1 @ p1 = add_r >> 1@
MOV r8, r8, ASR #1
@ASR r9, r9, #1 @ p4 = add_i >> 1@
MOV r9, r9, ASR #1
SMULWB r3, r3, r12 @ p2 = mult32x16in32_shl(sub_i, sinmu)@
SMULWB r2, r2, r12 @ p3 = mult32x16in32_shl(sub_r, sinmu)@
SUB r0, r0, r8 @ temp = sub32(buf1a[0], p1)@
ADD r8, r1, r2, LSL #1 @ temp1 = add32(buf1a[1], p3)@
SUB r2, r1, r2, LSL #1 @ temp2 = sub32(buf1a[1], p3)@
ADD r4, r6, r4 @ add32(X01r, buf1a[4])@
ADD r5, r7, r5 @ add32(X01i, buf1a[5])@
ADD r6, r0, r3, LSL #1 @ add32(temp, p2)@
SUB r7, r2, r9 @ sub32(temp2, p4)@
SUB r9, r8, r9 @ sub32(temp1, p4)@
SUB r8, r0, r3, LSL #1 @ sub32(temp, p2)@
LDRB r0, [r10], #1 @
LDRB r1, [r10], #1 @
LDRB r2, [r10], #1 @
ADD r0, r11, r0, lsl #3 @
ADD r1, r11, r1, lsl #3 @
ADD r2, r11, r2, lsl #3 @
STRD r4, [r0] @
STRD r6, [r1] @
STRD r8, [r2] @
LDMFD r13!, {r4 - r12, r15}