blob: 6713b28695cb975d532318d196c1587cb5f9506d [file] [log] [blame]
@
@ Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
@
@ Use of this source code is governed by a BSD-style license
@ that can be found in the LICENSE file in the root of the source
@ tree. An additional intellectual property rights grant can be found
@ in the file PATENTS. All contributing project authors may
@ be found in the AUTHORS file in the root of the source tree.
@
@ Reference code is in transform.c. The output is not bit-exact with the C
@ code because rounding differs between the C code and the ARM instructions,
@ but the quality of the assembly implementation is not worse.
#include "webrtc/modules/audio_coding/codecs/isac/fix/source/settings.h"
#include "webrtc/system_wrappers/interface/asm_defines.h"
@ Symbols exported from this file: the two transform entry points and three
@ of the Q14 lookup tables defined further down.
@ NOTE(review): GLOBAL_FUNCTION / GLOBAL_LABEL are presumably provided by
@ asm_defines.h (included above); confirm their exact expansion there.
GLOBAL_FUNCTION WebRtcIsacfix_Spec2TimeNeon
GLOBAL_FUNCTION WebRtcIsacfix_Time2SpecNeon
GLOBAL_LABEL WebRtcIsacfix_kSinTab1
GLOBAL_LABEL WebRtcIsacfix_kCosTab1
GLOBAL_LABEL WebRtcIsacfix_kSinTab2
@ void WebRtcIsacfix_Time2SpecNeon(int16_t* inre1Q9,
@                                  int16_t* inre2Q9,
@                                  int16_t* outreQ7,
@                                  int16_t* outimQ7);
@
@ Processing stages (each one is a loop below):
@   1. Time2Spec_TransformAndFindMax: rotate the two inputs with the
@      kCosTab1 / kSinTab1 tables, scale by 16921 (">> 26" net), keep the
@      32-bit results in two temporary arrays on the stack and track the
@      largest absolute value.
@   2. Time2Spec_PreFftShift: shift the temporaries by sh = cls(max) - 24
@      (rounding shift), narrow them to 16 bits and write them back over
@      inre1Q9 / inre2Q9.
@   3. Call WebRtcIsacfix_FftRadix16Fastest(inre1Q9, inre2Q9, -1).
@   4. Time2Spec_PostFftTransform: combine each group of four bins with the
@      mirrored group from the end of the arrays, apply the kSinTab2 based
@      twiddles, shift by (-sh - 23) and store 16-bit results to outreQ7 /
@      outimQ7 from both ends towards the middle.
@
@ In:    r0 = inre1Q9, r1 = inre2Q9, r2 = outreQ7, r3 = outimQ7.
@ Side effects: inre1Q9[] and inre2Q9[] are overwritten (stages 2 and 3).
@ NEON:  uses q0-q3 and q8-q15 only; q4-q7 are not touched.
@ Stack: 16 bytes for the spilled arguments followed by two int32 arrays of
@        FRAMESAMPLES / 2 elements each (tmpreQ16, tmpimQ16).
DEFINE_FUNCTION WebRtcIsacfix_Time2SpecNeon
.align 2
  @ r3 is pushed purely as padding: ten words plus the frame below keep SP
  @ 8-byte aligned at the call to the FFT routine, as AAPCS requires at
  @ public interfaces (this holds as long as FRAMESAMPLES is even).
  push {r3-r11, lr}
  sub sp, sp, #(16 + FRAMESAMPLES * 4)
  str r0, [sp]                @ inre1Q9
  str r1, [sp, #4]            @ inre2Q9
  str r2, [sp, #8]            @ outreQ7
  str r3, [sp, #12]           @ outimQ7

  add r4, sp, #16             @ tmpreQ16
  add r5, sp, #(16 + FRAMESAMPLES * 2)  @ tmpimQ16

  adr r9, WebRtcIsacfix_kCosTab1
#if defined(__APPLE__)
  mov r6, #:lower16:(WebRtcIsacfix_kSinTab1 - WebRtcIsacfix_kCosTab1)
#else
  mov r6, #(WebRtcIsacfix_kSinTab1 - WebRtcIsacfix_kCosTab1)
#endif
  add r10, r9, r6             @ WebRtcIsacfix_kSinTab1

  vmov.u32 q14, #0            @ Running maximum of |tmpreQ16[]|.
  vmov.u32 q15, #0            @ Running maximum of |tmpimQ16[]|.
  movw r6, #16921             @ 0.5 / sqrt(240) in Q19
  lsl r6, #5                  @ Together with vqdmulh, net effect is ">> 26".
  mov r8, #(FRAMESAMPLES / 2) @ loop counter, 8 samples per iteration
  vdup.s32 q11, r6

Time2Spec_TransformAndFindMax:
@ Use ">> 26", instead of ">> 7", ">> 16" and then ">> 3" as in the C code.
  subs r8, #8                 @ Flags are consumed by the bgt at the bottom.
  vld1.16 {q0}, [r9, :64]!    @ WebRtcIsacfix_kCosTab1[]
  vld1.16 {q2}, [r0]!         @ inre1Q9[]
  vmull.s16 q8, d0, d4        @ WebRtcIsacfix_kCosTab1[k] * inre1Q9[k]
  vld1.16 {q1}, [r10, :64]!   @ WebRtcIsacfix_kSinTab1[]
  vmull.s16 q9, d1, d5        @ WebRtcIsacfix_kCosTab1[k] * inre1Q9[k]
  vld1.16 {q3}, [r1]!         @ inre2Q9[]
  vmlal.s16 q8, d2, d6        @ += WebRtcIsacfix_kSinTab1[k] * inre2Q9[k]
  vmlal.s16 q9, d3, d7        @ += WebRtcIsacfix_kSinTab1[k] * inre2Q9[k]
  vmull.s16 q12, d0, d6       @ WebRtcIsacfix_kCosTab1[k] * inre2Q9[k]
  vmull.s16 q13, d1, d7       @ WebRtcIsacfix_kCosTab1[k] * inre2Q9[k]
  vmlsl.s16 q12, d2, d4       @ -= WebRtcIsacfix_kSinTab1[k] * inre1Q9[k]
  vmlsl.s16 q13, d3, d5       @ -= WebRtcIsacfix_kSinTab1[k] * inre1Q9[k]
  vqdmulh.s32 q0, q8, q11     @ xrQ16 * factQ19
  vqdmulh.s32 q1, q9, q11     @ xrQ16 * factQ19
  vqdmulh.s32 q2, q12, q11    @ xiQ16 * factQ19
  vqdmulh.s32 q3, q13, q11    @ xiQ16 * factQ19
  @ Find the absolute maximum in the vectors and store them.
  vabs.s32 q8, q0
  vabs.s32 q9, q1
  vabs.s32 q12, q2
  vst1.32 {q0, q1}, [r4]!     @ tmpreQ16[k]
  vabs.s32 q13, q3
  vmax.u32 q14, q8            @ Use u32 so the value 0x80000000 is not lost.
  vmax.u32 q15, q12
  vst1.32 {q2, q3}, [r5]!     @ tmpimQ16[k]
  vmax.u32 q15, q13           @ Maximum for tmpimQ16[].
  vmax.u32 q14, q9            @ Maximum for tmpreQ16[].
  bgt Time2Spec_TransformAndFindMax

  @ Reduce the per-lane maxima to a single value.
  vmax.u32 d28, d29
  vmax.u32 d30, d31
  vpmax.u32 d28, d28, d28     @ Both 32-bit words hold max |tmpreQ16[]|.
  vpmax.u32 d30, d30, d30     @ Both 32-bit words hold max |tmpimQ16[]|.
  vmax.s32 d30, d28, d30      @ Overall maximum of the two.
  ldr r4, [sp]                @ inre1Q9
  vcls.s32 d31, d30           @ sh = WebRtcSpl_NormW32(maximum);
  ldr r5, [sp, #4]            @ inre2Q9
  vmov.i32 d30, #24
  add r6, sp, #16             @ tmpreQ16
  vsub.s32 d31, d31, d30      @ sh = sh - 24;
  add r7, sp, #(16 + FRAMESAMPLES * 2)  @ tmpimQ16
  vdup.s32 q8, d31[0]         @ sh in every lane
  mov r8, #(FRAMESAMPLES / 2) @ loop counter, 16 samples per iteration

Time2Spec_PreFftShift:
@ vrshl shifts left for positive sh and right (with rounding) for negative sh.
  subs r8, #16
  vld1.32 {q0, q1}, [r6]!     @ tmpreQ16[]
  vrshl.s32 q0, q0, q8
  vld1.32 {q2, q3}, [r6]!     @ tmpreQ16[]
  vrshl.s32 q1, q1, q8
  vld1.32 {q10, q11}, [r7]!   @ tmpimQ16[]
  vrshl.s32 q2, q2, q8
  vld1.32 {q12, q13}, [r7]!   @ tmpimQ16[]
  vrshl.s32 q3, q3, q8
  vrshl.s32 q10, q10, q8
  vrshl.s32 q11, q11, q8
  vrshl.s32 q12, q12, q8
  vrshl.s32 q13, q13, q8
  vmovn.s32 d0, q0            @ Narrow to 16 bits (low halves are kept).
  vmovn.s32 d1, q1
  vmovn.s32 d2, q2
  vmovn.s32 d3, q3
  vmovn.s32 d4, q10
  vmovn.s32 d5, q11
  vmovn.s32 d6, q12
  vmovn.s32 d7, q13
  vst1.16 {q0, q1}, [r4]!     @ inre1Q9[] (input buffer is reused)
  vst1.16 {q2, q3}, [r5]!     @ inre2Q9[] (input buffer is reused)
  bgt Time2Spec_PreFftShift

  vmov.s32 r10, d16[0]        @ Keep sh in a callee-saved register.
  ldr r0, [sp]                @ inre1Q9
  ldr r1, [sp, #4]            @ inre2Q9
  mov r2, #-1
  CALL_FUNCTION WebRtcIsacfix_FftRadix16Fastest

  vdup.s32 q8, r10            @ sh
  mov r8, #(FRAMESAMPLES - 8) @ Byte offset of element FRAMESAMPLES / 2 - 4.
  ldr r2, [sp, #8]            @ outreQ7
  ldr r3, [sp, #12]           @ outimQ7
  add r11, r2, r8             @ &outreQ7[FRAMESAMPLES / 2 - 4]
  add r12, r3, r8             @ &outimQ7[FRAMESAMPLES / 2 - 4]
  ldr r6, [sp]                @ inre1Q9
  ldr r7, [sp, #4]            @ inre2Q9
  add r4, r6, r8              @ &inre1Q9[FRAMESAMPLES / 2 - 4]
  add r5, r7, r8              @ &inre2Q9[FRAMESAMPLES / 2 - 4]
  adr r10, WebRtcIsacfix_kSinTab2
  add r9, r10, #(120*2 - 8)   @ &WebRtcIsacfix_kSinTab2[120 - 4]
  vneg.s32 q15, q8            @ -sh
  vmov.i32 q0, #23
  vsub.s32 q15, q15, q0       @ -sh - 23
  mov r8, #(FRAMESAMPLES / 4) @ loop counter, decremented by 4 per iteration

  @ Pre-load variables; the loop loads the data for its next iteration.
  vld1.16 {d2}, [r4]          @ inre1Q9[FRAMESAMPLES / 2 - 4 - i]
  vld1.16 {d3}, [r5]          @ inre2Q9[FRAMESAMPLES / 2 - 4 - i]
  vld1.16 {d0}, [r6]!         @ inre1Q9
  vld1.16 {d1}, [r7]!         @ inre2Q9

Time2Spec_PostFftTransform:
@ By vshl, we effectively did "<< (-sh - 23)", instead of "<< (-sh)",
@ ">> 14" and then ">> 9" as in the C code.
  @ kCosTab2[] is not stored; it is rebuilt from kSinTab2[] read from the
  @ end of the table, negated and reversed.
  vld1.16 {d6}, [r9, :64]     @ kCosTab2[] source (tail of kSinTab2[])
  vneg.s16 d6, d6
  vld1.16 {d7}, [r10, :64]!   @ WebRtcIsacfix_kSinTab2[]
  vrev64.16 q1, q1            @ Reverse samples in 2nd half of xrQ16[].
  vqadd.s16 d4, d0, d2        @ xrQ16
  vqsub.s16 d5, d1, d3        @ xiQ16
  vrev64.16 d6, d6
  sub r9, #8                  @ Update pointer for kCosTab2[].
  sub r4, #8                  @ Update pointer for inre1Q9[].
  sub r5, #8                  @ Update pointer for inre2Q9[].
  subs r8, #4                 @ Update loop counter (flags used by bgt below).
  vqadd.s16 d1, d1, d3        @ yrQ16
  vqsub.s16 d0, d2, d0        @ yiQ16
  vmull.s16 q12, d6, d4       @ kCosTab2[k] * xrQ16
  vmlsl.s16 q12, d7, d5       @ -= WebRtcIsacfix_kSinTab2[k] * xiQ16
  vmull.s16 q13, d7, d4       @ WebRtcIsacfix_kSinTab2[k] * xrQ16
  vmlal.s16 q13, d6, d5       @ += kCosTab2[k] * xiQ16
  vmull.s16 q9, d7, d1        @ WebRtcIsacfix_kSinTab2[k] * yrQ16
  vmlal.s16 q9, d6, d0        @ += kCosTab2[k] * yiQ16
  vmull.s16 q10, d7, d0       @ WebRtcIsacfix_kSinTab2[k] * yiQ16
  vmlsl.s16 q10, d6, d1       @ -= kCosTab2[k] * yrQ16
  vshl.s32 q12, q12, q15
  vshl.s32 q13, q13, q15
  vshl.s32 q9, q9, q15
  vshl.s32 q10, q10, q15
  vneg.s32 q8, q9
  vld1.16 {d0}, [r6]!         @ inre1Q9 (for the next iteration)
  vmovn.s32 d24, q12
  vld1.16 {d1}, [r7]!         @ inre2Q9 (for the next iteration)
  vmovn.s32 d25, q13
  vld1.16 {d2}, [r4]          @ inre1Q9[FRAMESAMPLES / 2 - 4 - i]
  vmovn.s32 d5, q10
  vld1.16 {d3}, [r5]          @ inre2Q9[FRAMESAMPLES / 2 - 4 - i]
  vmovn.s32 d4, q8
  vst1.16 {d24}, [r2]!        @ outreQ7[k]
  vrev64.16 q2, q2            @ Reverse the order of the samples.
  vst1.16 {d25}, [r3]!        @ outimQ7[k]
  vst1.16 {d4}, [r11]         @ outreQ7[FRAMESAMPLES / 2 - 1 - k]
  vst1.16 {d5}, [r12]         @ outimQ7[FRAMESAMPLES / 2 - 1 - k]
  sub r11, #8                 @ Update pointer for outreQ7[].
  sub r12, #8                 @ Update pointer for outimQ7[].
  bgt Time2Spec_PostFftTransform

  add sp, sp, #(16 + FRAMESAMPLES * 4)
  pop {r3-r11, pc}            @ r3 receives the padding word; it is caller-saved.
@ NOTE: with GNU as on ARM the .align operand is a power-of-two exponent, so
@ this requests 256-byte alignment. The ":64" hinted vld1 loads that read
@ this table need at least 8-byte alignment.
.align 8
@ Cosine table 1 in Q14
@ 240 entries (30 rows of 8), falling from 16384 through 0 to -16383.
@ NOTE(review): by inspection the values match round(16384 * cos(pi * k / 240))
@ for k = 0..239 - confirm against the C reference in transform.c.
WebRtcIsacfix_kCosTab1:
_WebRtcIsacfix_kCosTab1: @ Label for iOS
.short 16384, 16383, 16378, 16371, 16362, 16349, 16333, 16315
.short 16294, 16270, 16244, 16214, 16182, 16147, 16110, 16069
.short 16026, 15980, 15931, 15880, 15826, 15769, 15709, 15647
.short 15582, 15515, 15444, 15371, 15296, 15218, 15137, 15053
.short 14968, 14879, 14788, 14694, 14598, 14500, 14399, 14295
.short 14189, 14081, 13970, 13856, 13741, 13623, 13502, 13380
.short 13255, 13128, 12998, 12867, 12733, 12597, 12458, 12318
.short 12176, 12031, 11885, 11736, 11585, 11433, 11278, 11121
.short 10963, 10803, 10641, 10477, 10311, 10143, 9974, 9803
.short 9630, 9456, 9280, 9102, 8923, 8743, 8561, 8377
.short 8192, 8006, 7818, 7629, 7438, 7246, 7053, 6859
.short 6664, 6467, 6270, 6071, 5872, 5671, 5469, 5266
.short 5063, 4859, 4653, 4447, 4240, 4033, 3825, 3616
.short 3406, 3196, 2986, 2775, 2563, 2351, 2139, 1926
.short 1713, 1499, 1285, 1072, 857, 643, 429, 214
.short 0, -214, -429, -643, -857, -1072, -1285, -1499
.short -1713, -1926, -2139, -2351, -2563, -2775, -2986, -3196
.short -3406, -3616, -3825, -4033, -4240, -4447, -4653, -4859
.short -5063, -5266, -5469, -5671, -5872, -6071, -6270, -6467
.short -6664, -6859, -7053, -7246, -7438, -7629, -7818, -8006
.short -8192, -8377, -8561, -8743, -8923, -9102, -9280, -9456
.short -9630, -9803, -9974, -10143, -10311, -10477, -10641, -10803
.short -10963, -11121, -11278, -11433, -11585, -11736, -11885, -12031
.short -12176, -12318, -12458, -12597, -12733, -12867, -12998, -13128
.short -13255, -13380, -13502, -13623, -13741, -13856, -13970, -14081
.short -14189, -14295, -14399, -14500, -14598, -14694, -14788, -14879
.short -14968, -15053, -15137, -15218, -15296, -15371, -15444, -15515
.short -15582, -15647, -15709, -15769, -15826, -15880, -15931, -15980
.short -16026, -16069, -16110, -16147, -16182, -16214, -16244, -16270
.short -16294, -16315, -16333, -16349, -16362, -16371, -16378, -16383
@ NOTE: with GNU as on ARM the .align operand is a power-of-two exponent, so
@ this requests 256-byte alignment. The ":64" hinted vld1 loads that read
@ this table need at least 8-byte alignment.
.align 8
@ Sine table 2 in Q14
@ 120 entries (15 rows of 8). The sign alternates from entry to entry and the
@ magnitude falls from 16384 to 107.
@ NOTE(review): by inspection the magnitudes match
@ round(16384 * cos((2 * k + 1) * pi / 480)) for k = 0..119 - confirm against
@ the C reference in transform.c.
WebRtcIsacfix_kSinTab2:
_WebRtcIsacfix_kSinTab2: @ Label for iOS
.short 16384, -16381, 16375, -16367, 16356, -16342, 16325, -16305
.short 16283, -16257, 16229, -16199, 16165, -16129, 16090, -16048
.short 16003, -15956, 15906, -15853, 15798, -15739, 15679, -15615
.short 15549, -15480, 15408, -15334, 15257, -15178, 15095, -15011
.short 14924, -14834, 14741, -14647, 14549, -14449, 14347, -14242
.short 14135, -14025, 13913, -13799, 13682, -13563, 13441, -13318
.short 13192, -13063, 12933, -12800, 12665, -12528, 12389, -12247
.short 12104, -11958, 11810, -11661, 11509, -11356, 11200, -11042
.short 10883, -10722, 10559, -10394, 10227, -10059, 9889, -9717
.short 9543, -9368, 9191, -9013, 8833, -8652, 8469, -8285
.short 8099, -7912, 7723, -7534, 7342, -7150, 6957, -6762
.short 6566, -6369, 6171, -5971, 5771, -5570, 5368, -5165
.short 4961, -4756, 4550, -4344, 4137, -3929, 3720, -3511
.short 3301, -3091, 2880, -2669, 2457, -2245, 2032, -1819
.short 1606, -1392, 1179, -965, 750, -536, 322, -107
@ Table kCosTab2 was removed since its data is redundant with kSinTab2.
@ The transform routines in this file rebuild kCosTab2[] on the fly by
@ reading kSinTab2[] from its end, negating the values and reversing their
@ order.
@ NOTE: with GNU as on ARM the .align operand is a power-of-two exponent, so
@ this requests 256-byte alignment. The ":64" hinted vld1 loads that read
@ this table need at least 8-byte alignment.
.align 8
@ Sine table 1 in Q14
@ 240 entries (30 rows of 8), rising from 0 to 16384 at entry 120 and falling
@ back to 214.
@ NOTE(review): by inspection the values match round(16384 * sin(pi * k / 240))
@ for k = 0..239 - confirm against the C reference in transform.c.
@ The code in this file reaches the two tables from one another through the
@ assemble-time difference (WebRtcIsacfix_kSinTab1 - WebRtcIsacfix_kCosTab1),
@ so the relative placement of the tables matters.
WebRtcIsacfix_kSinTab1:
_WebRtcIsacfix_kSinTab1: @ Label for iOS
.short 0, 214, 429, 643, 857, 1072, 1285, 1499
.short 1713, 1926, 2139, 2351, 2563, 2775, 2986, 3196
.short 3406, 3616, 3825, 4033, 4240, 4447, 4653, 4859
.short 5063, 5266, 5469, 5671, 5872, 6071, 6270, 6467
.short 6664, 6859, 7053, 7246, 7438, 7629, 7818, 8006
.short 8192, 8377, 8561, 8743, 8923, 9102, 9280, 9456
.short 9630, 9803, 9974, 10143, 10311, 10477, 10641, 10803
.short 10963, 11121, 11278, 11433, 11585, 11736, 11885, 12031
.short 12176, 12318, 12458, 12597, 12733, 12867, 12998, 13128
.short 13255, 13380, 13502, 13623, 13741, 13856, 13970, 14081
.short 14189, 14295, 14399, 14500, 14598, 14694, 14788, 14879
.short 14968, 15053, 15137, 15218, 15296, 15371, 15444, 15515
.short 15582, 15647, 15709, 15769, 15826, 15880, 15931, 15980
.short 16026, 16069, 16110, 16147, 16182, 16214, 16244, 16270
.short 16294, 16315, 16333, 16349, 16362, 16371, 16378, 16383
.short 16384, 16383, 16378, 16371, 16362, 16349, 16333, 16315
.short 16294, 16270, 16244, 16214, 16182, 16147, 16110, 16069
.short 16026, 15980, 15931, 15880, 15826, 15769, 15709, 15647
.short 15582, 15515, 15444, 15371, 15296, 15218, 15137, 15053
.short 14968, 14879, 14788, 14694, 14598, 14500, 14399, 14295
.short 14189, 14081, 13970, 13856, 13741, 13623, 13502, 13380
.short 13255, 13128, 12998, 12867, 12733, 12597, 12458, 12318
.short 12176, 12031, 11885, 11736, 11585, 11433, 11278, 11121
.short 10963, 10803, 10641, 10477, 10311, 10143, 9974, 9803
.short 9630, 9456, 9280, 9102, 8923, 8743, 8561, 8377
.short 8192, 8006, 7818, 7629, 7438, 7246, 7053, 6859
.short 6664, 6467, 6270, 6071, 5872, 5671, 5469, 5266
.short 5063, 4859, 4653, 4447, 4240, 4033, 3825, 3616
.short 3406, 3196, 2986, 2775, 2563, 2351, 2139, 1926
.short 1713, 1499, 1285, 1072, 857, 643, 429, 214
@ void WebRtcIsacfix_Spec2TimeNeon(int16_t *inreQ7,
@                                  int16_t *inimQ7,
@                                  int32_t *outre1Q16,
@                                  int32_t *outre2Q16);
@
@ Processing stages (each one is a loop below):
@   1. TransformAndFindMax: combine eight bins from the front of the inputs
@      with the mirrored eight bins from the end, using kSinTab2 and the
@      kCosTab2 values rebuilt from it; store 32-bit results to outre1Q16 /
@      outre2Q16 from both ends and track the largest absolute value.
@   2. PreFftShift: shift by sh = cls(max) - 24 (rounding shift), narrow to
@      16 bits and write the result over inreQ7 / inimQ7.
@   3. Call WebRtcIsacfix_FftRadix16Fastest(inreQ7, inimQ7, 1).
@   4. PostFftShiftDivide: widen to 32 bits, shift by -sh and multiply by
@      273 (">> 16" net); store to outre1Q16 / outre2Q16.
@   5. DemodulateAndSeparate: rotate with kCosTab1 / kSinTab1, scale by
@      31727 (">> 25" net) and store the final 32-bit outputs in place.
@
@ In:    r0 = inreQ7, r1 = inimQ7, r2 = outre1Q16, r3 = outre2Q16.
@ Side effects: inreQ7[] and inimQ7[] are overwritten (stages 2 and 3).
@ NEON:  q4-q7 are used in stage 1 and are saved / restored around it with
@        vpush / vpop; all other stages use q0-q3 and q8-q15 only.
@ Stack: 16 bytes for the spilled arguments.
DEFINE_FUNCTION WebRtcIsacfix_Spec2TimeNeon
.align 2
  @ r3 is pushed purely as padding: ten words plus the 16-byte frame keep SP
  @ 8-byte aligned at the call to the FFT routine, as AAPCS requires at
  @ public interfaces.
  push {r3-r11, lr}
  sub sp, sp, #16
  str r0, [sp]                @ inreQ7
  str r1, [sp, #4]            @ inimQ7
  str r2, [sp, #8]            @ outre1Q16
  str r3, [sp, #12]           @ outre2Q16

  mov r8, #(FRAMESAMPLES - 16) @ Byte offset of int16 element FRAMESAMPLES / 2 - 8.
  add r12, r0, r8             @ &inreQ7[FRAMESAMPLES / 2 - 8]
  add r11, r1, r8             @ &inimQ7[FRAMESAMPLES / 2 - 8]
  add r4, r2, r8, lsl #1      @ &outre1Q16[FRAMESAMPLES / 2 - 8]
  add r6, r3, r8, lsl #1      @ &outre2Q16[FRAMESAMPLES / 2 - 8]
  mov r8, #(FRAMESAMPLES / 2) @ loop counter, 16 samples per iteration
  adr r10, WebRtcIsacfix_kSinTab2
  add r9, r10, #(120*2 - 16)  @ &WebRtcIsacfix_kSinTab2[120 - 8]
  vpush {q4-q7}               @ Callee-saved NEON registers used below.
  mov r5, #-32                @ Post-index step for the backward int32 stores.
  mov r7, #-16                @ Post-index step for the backward int16 loads.
  vmov.u32 q6, #0             @ Running maximum of |outre1Q16[]|.
  vmov.u32 q7, #0             @ Running maximum of |outre2Q16[]|.

TransformAndFindMax:
@ Use ">> 5", instead of "<< 9" and then ">> 14" as in the C code.
@ Bit-exact.
  subs r8, #16                @ Flags are consumed by the bgt at the bottom.
  @ kCosTab2[] is rebuilt from the tail of kSinTab2[]: negate, then fully
  @ reverse the eight lanes (vrev64 on each half followed by vswp).
  vld1.16 {q0}, [r9, :64]     @ kCosTab2[] source (tail of kSinTab2[])
  sub r9, #16
  vld1.16 {q2}, [r0]!         @ inreQ7[]
  vneg.s16 q0, q0
  vld1.16 {q3}, [r1]!         @ inimQ7[]
  vrev64.16 d0, d0
  vrev64.16 d1, d1
  vld1.16 {q1}, [r10, :64]!   @ WebRtcIsacfix_kSinTab2[]
  vswp d0, d1
  vmull.s16 q8, d2, d6        @ WebRtcIsacfix_kSinTab2[k] * inimQ7[k]
  vmull.s16 q9, d3, d7        @ WebRtcIsacfix_kSinTab2[k] * inimQ7[k]
  vmlal.s16 q8, d0, d4        @ += kCosTab2[k] * inreQ7[k]
  vmlal.s16 q9, d1, d5        @ += kCosTab2[k] * inreQ7[k]
  vmull.s16 q12, d0, d6       @ kCosTab2[k] * inimQ7[k]
  vmull.s16 q13, d1, d7       @ kCosTab2[k] * inimQ7[k]
  vmlsl.s16 q12, d2, d4       @ -= WebRtcIsacfix_kSinTab2[k] * inreQ7[k]
  vmlsl.s16 q13, d3, d5       @ -= WebRtcIsacfix_kSinTab2[k] * inreQ7[k]
  vld1.16 {q2}, [r11], r7     @ inimQ7[FRAMESAMPLES / 2 - 8 - i]
  vld1.16 {q3}, [r12], r7     @ inreQ7[FRAMESAMPLES / 2 - 8 - i]
  vrev64.16 q2, q2            @ Reverse the order of the samples
  vrev64.16 q3, q3            @ Reverse the order of the samples
  @ After the in-half reversal, the high half (d5 / d7) pairs with table
  @ entries k..k+3 and the low half (d4 / d6) with entries k+4..k+7.
  vmull.s16 q14, d2, d5       @ WebRtcIsacfix_kSinTab2[k] * inimQ7[FRAMESAMPLES / 2 - 1 - k]
  vmull.s16 q15, d3, d4       @ WebRtcIsacfix_kSinTab2[k] * inimQ7[FRAMESAMPLES / 2 - 1 - k]
  vmlsl.s16 q14, d0, d7       @ q14 -= kCosTab2[k] * inreQ7[FRAMESAMPLES / 2 - 1 - k]
  vmlsl.s16 q15, d1, d6       @ q15 -= kCosTab2[k] * inreQ7[FRAMESAMPLES / 2 - 1 - k]
  vmull.s16 q10, d0, d5       @ kCosTab2[k] * inimQ7[FRAMESAMPLES / 2 - 1 - k]
  vmull.s16 q11, d1, d4       @ kCosTab2[k] * inimQ7[FRAMESAMPLES / 2 - 1 - k]
  vmlal.s16 q10, d2, d7       @ q10 += WebRtcIsacfix_kSinTab2[k] * inreQ7[FRAMESAMPLES / 2 - 1 - k]
  vmlal.s16 q11, d3, d6       @ q11 += WebRtcIsacfix_kSinTab2[k] * inreQ7[FRAMESAMPLES / 2 - 1 - k]
  vshr.s32 q8, q8, #5         @ xrQ16
  vshr.s32 q9, q9, #5         @ xrQ16
  vshr.s32 q12, q12, #5       @ xiQ16
  vshr.s32 q13, q13, #5       @ xiQ16
  vshr.s32 q14, q14, #5       @ yiQ16
  vshr.s32 q15, q15, #5       @ yiQ16
  vneg.s32 q10, q10           @ Negate before the shift below.
  vneg.s32 q11, q11
  @ xrQ16 - yiQ16
  vsub.s32 q0, q8, q14
  vsub.s32 q1, q9, q15
  vshr.s32 q10, q10, #5       @ yrQ16
  vshr.s32 q11, q11, #5       @ yrQ16
  @ xrQ16 + yiQ16 (q2 / q3 are crossed on purpose for the reversed store)
  vadd.s32 q3, q8, q14
  vadd.s32 q2, q9, q15
  @ yrQ16 + xiQ16
  vadd.s32 q4, q10, q12
  vadd.s32 q5, q11, q13
  @ yrQ16 - xiQ16 (q8 / q9 are crossed on purpose for the reversed store)
  vsub.s32 q8, q11, q13
  vsub.s32 q9, q10, q12
  @ Reverse the order of the samples
  vrev64.32 q2, q2
  vrev64.32 q3, q3
  vrev64.32 q8, q8
  vrev64.32 q9, q9
  vswp d4, d5
  vswp d6, d7
  vst1.32 {q0, q1}, [r2]!     @ outre1Q16[k]
  vswp d16, d17
  vswp d18, d19
  vst1.32 {q2, q3}, [r4], r5  @ outre1Q16[FRAMESAMPLES / 2 - 1 - k]
  @ Find the absolute maximum in the vectors and store them in q6 and q7.
  vabs.s32 q10, q0
  vabs.s32 q14, q4
  vabs.s32 q11, q1
  vabs.s32 q15, q5
  vabs.s32 q12, q2
  vmax.u32 q6, q10            @ Use u32 so the value 0x80000000 is not lost.
  vmax.u32 q7, q14            @ Maximum for outre2Q16[].
  vabs.s32 q0, q8
  vmax.u32 q6, q11            @ Maximum for outre1Q16[].
  vmax.u32 q7, q15
  vabs.s32 q13, q3
  vmax.u32 q6, q12
  vmax.u32 q7, q0
  vabs.s32 q1, q9
  vst1.32 {q4, q5}, [r3]!     @ outre2Q16[k]
  vst1.32 {q8, q9}, [r6], r5  @ outre2Q16[FRAMESAMPLES / 2 - 1 - k]
  vmax.u32 q6, q13
  vmax.u32 q7, q1
  bgt TransformAndFindMax

  @ Table pointers for the last stage; r10 / r11 are callee-saved and so
  @ survive the FFT call below.
  adr r10, WebRtcIsacfix_kSinTab1
#if defined(__APPLE__)
  mov r2, #:lower16:(WebRtcIsacfix_kSinTab1 - WebRtcIsacfix_kCosTab1)
#else
  mov r2, #(WebRtcIsacfix_kSinTab1 - WebRtcIsacfix_kCosTab1)
#endif
  sub r11, r10, r2            @ WebRtcIsacfix_kCosTab1

  @ Reduce the per-lane maxima to a single value.
  vmax.u32 d12, d13
  vmax.u32 d14, d15
  vpmax.u32 d12, d12, d12     @ Both 32-bit words hold max |outre1Q16[]|.
  vpmax.u32 d14, d14, d14     @ Both 32-bit words hold max |outre2Q16[]|.
  vmax.s32 d0, d12, d14       @ Overall maximum of the two.
  vpop {q4-q7}                @ q4-q7 are not used past this point.
  ldr r4, [sp]                @ inreQ7
  vcls.s32 d1, d0             @ sh = WebRtcSpl_NormW32(maximum);
  ldr r5, [sp, #4]            @ inimQ7
  vmov.i32 d0, #24            @ sh = sh-24;
  ldr r6, [sp, #8]            @ outre1Q16
  vsub.s32 d1, d1, d0
  ldr r7, [sp, #12]           @ outre2Q16
  vdup.s32 q8, d1[0]          @ sh in every lane
  mov r8, #(FRAMESAMPLES / 2) @ loop counter, 16 samples per iteration

PreFftShift:
@ vrshl shifts left for positive sh and right (with rounding) for negative sh.
  subs r8, #16
  vld1.32 {q0, q1}, [r6]!     @ outre1Q16[]
  vld1.32 {q2, q3}, [r6]!     @ outre1Q16[]
  vrshl.s32 q0, q0, q8
  vrshl.s32 q1, q1, q8
  vrshl.s32 q2, q2, q8
  vrshl.s32 q3, q3, q8
  vld1.32 {q10, q11}, [r7]!   @ outre2Q16[]
  vld1.32 {q12, q13}, [r7]!   @ outre2Q16[]
  vrshl.s32 q10, q10, q8
  vrshl.s32 q11, q11, q8
  vrshl.s32 q12, q12, q8
  vrshl.s32 q13, q13, q8
  vmovn.s32 d0, q0            @ Narrow to 16 bits (low halves are kept).
  vmovn.s32 d1, q1
  vmovn.s32 d2, q2
  vmovn.s32 d3, q3
  vmovn.s32 d4, q10
  vmovn.s32 d5, q11
  vmovn.s32 d6, q12
  vmovn.s32 d7, q13
  vst1.16 {q0, q1}, [r4]!     @ inreQ7[] (input buffer is reused)
  vst1.16 {q2, q3}, [r5]!     @ inimQ7[] (input buffer is reused)
  bgt PreFftShift

  vmov.s32 r8, d16[0]         @ Keep sh in a callee-saved register.
  ldr r0, [sp]                @ inreQ7
  ldr r1, [sp, #4]            @ inimQ7
  mov r2, #1
  CALL_FUNCTION WebRtcIsacfix_FftRadix16Fastest

  vdup.s32 q8, r8             @ sh
  mov r9, r11                 @ WebRtcIsacfix_kCosTab1
  ldr r4, [sp]                @ inreQ7
  ldr r5, [sp, #4]            @ inimQ7
  ldr r6, [sp, #8]            @ outre1Q16
  ldr r7, [sp, #12]           @ outre2Q16
  mov r8, #(FRAMESAMPLES / 2) @ loop counter, 16 samples per iteration
  vneg.s32 q15, q8            @ -sh
  movw r0, #273
  lsl r0, #15                 @ Together with vqdmulh, net effect is ">> 16".
  vdup.s32 q14, r0

PostFftShiftDivide:
  subs r8, #16
  vld1.16 {q0, q1}, [r4]!     @ inreQ7
  vmovl.s16 q10, d0
  vmovl.s16 q11, d1
  vld1.16 {q2, q3}, [r5]!     @ inimQ7
  vmovl.s16 q8, d2
  vmovl.s16 q9, d3
  vshl.s32 q10, q10, q15
  vshl.s32 q11, q11, q15
  vshl.s32 q8, q8, q15
  vshl.s32 q9, q9, q15
  @ WEBRTC_SPL_MUL_16_32_RSFT16(273, outre1Q16[k])
  vqdmulh.s32 q10, q10, q14
  vqdmulh.s32 q11, q11, q14
  vqdmulh.s32 q8, q8, q14
  vqdmulh.s32 q9, q9, q14
  vmovl.s16 q0, d4
  vmovl.s16 q1, d5
  vmovl.s16 q2, d6
  vmovl.s16 q3, d7
  vshl.s32 q0, q0, q15
  vshl.s32 q1, q1, q15
  vshl.s32 q2, q2, q15
  vshl.s32 q3, q3, q15
  @ WEBRTC_SPL_MUL_16_32_RSFT16(273, outre2Q16[k])
  vqdmulh.s32 q0, q0, q14
  vqdmulh.s32 q1, q1, q14
  vst1.32 {q10, q11}, [r6]!   @ outre1Q16[]
  vqdmulh.s32 q2, q2, q14
  vqdmulh.s32 q3, q3, q14
  vst1.32 {q8, q9}, [r6]!     @ outre1Q16[]
  vst1.32 {q0, q1}, [r7]!     @ outre2Q16[]
  vst1.32 {q2, q3}, [r7]!     @ outre2Q16[]
  bgt PostFftShiftDivide

  mov r8, #(FRAMESAMPLES / 2) @ loop counter, 8 samples per iteration
  ldr r2, [sp, #8]            @ outre1Q16
  ldr r3, [sp, #12]           @ outre2Q16
  movw r0, #31727
  lsl r0, #16                 @ With vqdmulh and vrshrn, net effect is ">> 25".

DemodulateAndSeparate:
  subs r8, #8
  vld1.16 {q0}, [r9, :64]!    @ WebRtcIsacfix_kCosTab1[]
  vmovl.s16 q10, d0           @ WebRtcIsacfix_kCosTab1[]
  vld1.16 {q1}, [r10, :64]!   @ WebRtcIsacfix_kSinTab1[]
  vmovl.s16 q11, d1           @ WebRtcIsacfix_kCosTab1[]
  vld1.32 {q2, q3}, [r2]      @ outre1Q16
  vmovl.s16 q12, d2           @ WebRtcIsacfix_kSinTab1[]
  vld1.32 {q14, q15}, [r3]    @ outre2Q16
  vmovl.s16 q13, d3           @ WebRtcIsacfix_kSinTab1[]
  vmull.s32 q0, d20, d4       @ WebRtcIsacfix_kCosTab1[k] * outre1Q16[k]
  vmull.s32 q1, d21, d5       @ WebRtcIsacfix_kCosTab1[k] * outre1Q16[k]
  vmull.s32 q8, d22, d6       @ WebRtcIsacfix_kCosTab1[k] * outre1Q16[k]
  vmull.s32 q9, d23, d7       @ WebRtcIsacfix_kCosTab1[k] * outre1Q16[k]
  vmlsl.s32 q0, d24, d28      @ -= WebRtcIsacfix_kSinTab1[k] * outre2Q16[k]
  vmlsl.s32 q1, d25, d29      @ -= WebRtcIsacfix_kSinTab1[k] * outre2Q16[k]
  vmlsl.s32 q8, d26, d30      @ -= WebRtcIsacfix_kSinTab1[k] * outre2Q16[k]
  vmlsl.s32 q9, d27, d31      @ -= WebRtcIsacfix_kSinTab1[k] * outre2Q16[k]
  vrshrn.s64 d0, q0, #10      @ xrQ16
  vrshrn.s64 d1, q1, #10      @ xrQ16
  vrshrn.s64 d2, q8, #10      @ xrQ16
  vrshrn.s64 d3, q9, #10      @ xrQ16
  vmull.s32 q8, d20, d28      @ WebRtcIsacfix_kCosTab1[k] * outre2Q16[k]
  vmull.s32 q9, d21, d29      @ WebRtcIsacfix_kCosTab1[k] * outre2Q16[k]
  vmull.s32 q14, d22, d30     @ WebRtcIsacfix_kCosTab1[k] * outre2Q16[k]
  vmull.s32 q15, d23, d31     @ WebRtcIsacfix_kCosTab1[k] * outre2Q16[k]
  vmlal.s32 q8, d24, d4       @ += WebRtcIsacfix_kSinTab1[k] * outre1Q16[k]
  vmlal.s32 q9, d25, d5       @ += WebRtcIsacfix_kSinTab1[k] * outre1Q16[k]
  vmlal.s32 q14, d26, d6      @ += WebRtcIsacfix_kSinTab1[k] * outre1Q16[k]
  vmlal.s32 q15, d27, d7      @ += WebRtcIsacfix_kSinTab1[k] * outre1Q16[k]
  @ q11 held widened cosine values above, so the scale factor has to be
  @ re-broadcast on every iteration.
  vdup.s32 q11, r0            @ generic -> Neon does not cost extra cycles.
  vrshrn.s64 d24, q8, #10     @ xiQ16
  vrshrn.s64 d25, q9, #10     @ xiQ16
  vqdmulh.s32 q0, q0, q11
  vrshrn.s64 d26, q14, #10    @ xiQ16
  vrshrn.s64 d27, q15, #10    @ xiQ16
  @ WEBRTC_SPL_MUL_16_32_RSFT11(factQ11, xrQ16)
  @ WEBRTC_SPL_MUL_16_32_RSFT11(factQ11, xiQ16)
  vqdmulh.s32 q1, q1, q11
  vqdmulh.s32 q2, q12, q11
  vqdmulh.s32 q3, q13, q11
  @ The results are 32-bit values, so they are stored with the same element
  @ size that was used to load them.
  vst1.32 {q0, q1}, [r2]!     @ outre1Q16[]
  vst1.32 {q2, q3}, [r3]!     @ outre2Q16[]
  bgt DemodulateAndSeparate

  add sp, sp, #16
  pop {r3-r11, pc}            @ r3 receives the padding word; it is caller-saved.