Enable building common_audio with NEON on ARM64
Verified that common_audio_neon and common_audio_unittests build on both
Android ARMv7 and Android ARM64, and that the common_audio_unittests tests
pass on both Android ARMv7 and Android ARM64.
BUG=4002
R=andrew@webrtc.org, jridges@masque.com, kjellander@webrtc.org
Change-Id: I8e0722f356db8cca6fc8232f00ae1e898a086f5a
Review URL: https://webrtc-codereview.appspot.com/40629004
Patch from Zhongwei Yao <zhongwei.yao@arm.com>.
Cr-Commit-Position: refs/heads/master@{#8620}
git-svn-id: http://webrtc.googlecode.com/svn/trunk@8620 4adac7df-926f-26a2-2b94-8c16560cd09d
diff --git a/webrtc/common_audio/BUILD.gn b/webrtc/common_audio/BUILD.gn
index b3e3ff5..c9c3c5b 100644
--- a/webrtc/common_audio/BUILD.gn
+++ b/webrtc/common_audio/BUILD.gn
@@ -124,6 +124,10 @@
}
}
+ if (current_cpu == "arm64") {
+ deps += [ ":common_audio_neon" ]
+ }
+
if (current_cpu == "mipsel") {
sources += [
"signal_processing/include/spl_inl_mips.h",
@@ -194,30 +198,23 @@
}
}
-if (rtc_build_armv7_neon) {
+if (rtc_build_armv7_neon || current_cpu == "arm64") {
source_set("common_audio_neon") {
sources = [
"fir_filter_neon.cc",
"resampler/sinc_resampler_neon.cc",
- "signal_processing/cross_correlation_neon.S",
- "signal_processing/downsample_fast_neon.S",
- "signal_processing/min_max_operations_neon.S",
+ "signal_processing/cross_correlation_neon.c",
+ "signal_processing/downsample_fast_neon.c",
+ "signal_processing/min_max_operations_neon.c",
]
configs += [ "..:common_config" ]
public_configs = [ "..:common_inherited_config" ]
-
- # Enable compilation for the ARM v7 Neon instruction set. This is needed
- # since //build/config/arm.gni only enables Neon for iOS, not Android.
- # This provides the same functionality as webrtc/build/arm_neon.gypi.
- # TODO(kjellander): Investigate if this can be moved into webrtc.gni or
- # //build/config/arm.gni instead, to reduce code duplication.
- # Remove the -mfpu=vfpv3-d16 cflag.
- configs -= [ "//build/config/compiler:compiler_arm_fpu" ]
- cflags = [
- "-mfpu=neon",
- ]
+ if (!arm_use_neon) {
+ configs -= [ "//build/config/compiler:compiler_arm_fpu" ]
+ cflags = [ "-mfpu=neon" ]
+ }
# Disable LTO in audio_processing_neon target due to compiler bug.
if (rtc_use_lto) {
diff --git a/webrtc/common_audio/common_audio.gyp b/webrtc/common_audio/common_audio.gyp
index f74a5b1..f7bc3a8 100644
--- a/webrtc/common_audio/common_audio.gyp
+++ b/webrtc/common_audio/common_audio.gyp
@@ -146,6 +146,9 @@
}],
], # conditions
}],
+ ['target_arch=="arm64"', {
+ 'dependencies': ['common_audio_neon',],
+ }],
['target_arch=="mipsel" and mips_arch_variant!="r6" and android_webview_build==0', {
'sources': [
'signal_processing/include/spl_inl_mips.h',
@@ -194,7 +197,7 @@
},
], # targets
}],
- ['target_arch=="arm" and arm_version>=7', {
+ ['target_arch=="arm" and arm_version>=7 or target_arch=="arm64"', {
'targets': [
{
'target_name': 'common_audio_neon',
@@ -203,9 +206,9 @@
'sources': [
'fir_filter_neon.cc',
'resampler/sinc_resampler_neon.cc',
- 'signal_processing/cross_correlation_neon.S',
- 'signal_processing/downsample_fast_neon.S',
- 'signal_processing/min_max_operations_neon.S',
+ 'signal_processing/cross_correlation_neon.c',
+ 'signal_processing/downsample_fast_neon.c',
+ 'signal_processing/min_max_operations_neon.c',
],
'conditions': [
# Disable LTO in common_audio_neon target due to compiler bug
diff --git a/webrtc/common_audio/resampler/sinc_resampler.h b/webrtc/common_audio/resampler/sinc_resampler.h
index 4428359..be84a99 100644
--- a/webrtc/common_audio/resampler/sinc_resampler.h
+++ b/webrtc/common_audio/resampler/sinc_resampler.h
@@ -107,7 +107,7 @@
static float Convolve_SSE(const float* input_ptr, const float* k1,
const float* k2,
double kernel_interpolation_factor);
-#elif defined(WEBRTC_ARCH_ARM_V7)
+#elif defined(WEBRTC_ARCH_ARM_V7) || defined(WEBRTC_ARCH_ARM64_NEON)
static float Convolve_NEON(const float* input_ptr, const float* k1,
const float* k2,
double kernel_interpolation_factor);
diff --git a/webrtc/common_audio/signal_processing/cross_correlation_neon.S b/webrtc/common_audio/signal_processing/cross_correlation_neon.S
deleted file mode 100644
index 15b25b8..0000000
--- a/webrtc/common_audio/signal_processing/cross_correlation_neon.S
+++ /dev/null
@@ -1,159 +0,0 @@
-@
-@ Copyright (c) 2011 The WebRTC project authors. All Rights Reserved.
-@
-@ Use of this source code is governed by a BSD-style license
-@ that can be found in the LICENSE file in the root of the source
-@ tree. An additional intellectual property rights grant can be found
-@ in the file PATENTS. All contributing project authors may
-@ be found in the AUTHORS file in the root of the source tree.
-@
-
-@ cross_correlation_neon.s
-@ This file contains the function WebRtcSpl_CrossCorrelationNeon(),
-@ optimized for ARM Neon platform.
-@
-@ Reference Ccode at end of this file.
-@ Output is bit-exact with the reference C code, but not with the generic
-@ C code in file cross_correlation.c, due to reduction of shift operations
-@ from using Neon registers.
-
-@ Register usage:
-@
-@ r0: *cross_correlation (function argument)
-@ r1: *seq1 (function argument)
-@ r2: *seq2 (function argument)
-@ r3: dim_seq (function argument); then, total iteration of LOOP_DIM_SEQ
-@ r4: counter for LOOP_DIM_CROSS_CORRELATION
-@ r5: seq2_ptr
-@ r6: seq1_ptr
-@ r7: Total iteration of LOOP_DIM_SEQ_RESIDUAL
-@ r8, r9, r10, r11, r12: scratch
-
-#include "webrtc/system_wrappers/interface/asm_defines.h"
-
-GLOBAL_FUNCTION WebRtcSpl_CrossCorrelationNeon
-.align 2
-DEFINE_FUNCTION WebRtcSpl_CrossCorrelationNeon
- push {r4-r11}
-
- @ Put the shift value (-right_shifts) into a Neon register.
- ldrsh r10, [sp, #36]
- rsb r10, r10, #0
- mov r8, r10, asr #31
- vmov d16, r10, r8
-
- @ Initialize loop counters.
- and r7, r3, #7 @ inner_loop_len2 = dim_seq % 8;
- asr r3, r3, #3 @ inner_loop_len1 = dim_seq / 8;
- ldrsh r4, [sp, #32] @ dim_cross_correlation
-
-LOOP_DIM_CROSS_CORRELATION:
- vmov.i32 q9, #0
- vmov.i32 q14, #0
- movs r8, r3 @ inner_loop_len1
- mov r6, r1 @ seq1_ptr
- mov r5, r2 @ seq2_ptr
- ble POST_LOOP_DIM_SEQ
-
-LOOP_DIM_SEQ:
- vld1.16 {d20, d21}, [r6]! @ seq1_ptr
- vld1.16 {d22, d23}, [r5]! @ seq2_ptr
- subs r8, r8, #1
- vmull.s16 q12, d20, d22
- vmull.s16 q13, d21, d23
- vpadal.s32 q9, q12
- vpadal.s32 q14, q13
- bgt LOOP_DIM_SEQ
-
-POST_LOOP_DIM_SEQ:
- movs r10, r7 @ Loop counter
- mov r12, #0
- mov r8, #0
- ble POST_LOOP_DIM_SEQ_RESIDUAL
-
-LOOP_DIM_SEQ_RESIDUAL:
- ldrh r11, [r6], #2
- ldrh r9, [r5], #2
- smulbb r11, r11, r9
- adds r8, r8, r11
- adc r12, r12, r11, asr #31
- subs r10, #1
- bgt LOOP_DIM_SEQ_RESIDUAL
-
-POST_LOOP_DIM_SEQ_RESIDUAL: @ Sum the results up and do the shift.
- vadd.i64 d18, d19
- vadd.i64 d28, d29
- vadd.i64 d18, d28
- vmov.32 d17[0], r8
- vmov.32 d17[1], r12
- vadd.i64 d17, d18
- vshl.s64 d17, d16
- vst1.32 d17[0], [r0]! @ Store the output
-
- ldr r8, [sp, #40] @ step_seq2
- add r2, r8, lsl #1 @ prepare for seq2_ptr(r5) in the next loop.
-
- subs r4, #1
- bgt LOOP_DIM_CROSS_CORRELATION
-
- pop {r4-r11}
- bx lr
-
-@ TODO(kma): Place this piece of reference code into a C code file.
-@ void WebRtcSpl_CrossCorrelationNeon(int32_t* cross_correlation,
-@ int16_t* seq1,
-@ int16_t* seq2,
-@ int16_t dim_seq,
-@ int16_t dim_cross_correlation,
-@ int16_t right_shifts,
-@ int16_t step_seq2) {
-@ int i = 0;
-@ int j = 0;
-@ int inner_loop_len1 = dim_seq >> 3;
-@ int inner_loop_len2 = dim_seq - (inner_loop_len1 << 3);
-@
-@ assert(dim_cross_correlation > 0);
-@ assert(dim_seq > 0);
-@
-@ for (i = 0; i < dim_cross_correlation; i++) {
-@ int16_t *seq1_ptr = seq1;
-@ int16_t *seq2_ptr = seq2 + (step_seq2 * i);
-@ int64_t sum = 0;
-@
-@ for (j = inner_loop_len1; j > 0; j -= 1) {
-@ sum += WEBRTC_SPL_MUL_16_16(*seq1_ptr, *seq2_ptr);
-@ seq1_ptr++;
-@ seq2_ptr++;
-@ sum += WEBRTC_SPL_MUL_16_16(*seq1_ptr, *seq2_ptr);
-@ seq1_ptr++;
-@ seq2_ptr++;
-@ sum += WEBRTC_SPL_MUL_16_16(*seq1_ptr, *seq2_ptr);
-@ seq1_ptr++;
-@ seq2_ptr++;
-@ sum += WEBRTC_SPL_MUL_16_16(*seq1_ptr, *seq2_ptr);
-@ seq1_ptr++;
-@ seq2_ptr++;
-@ sum += WEBRTC_SPL_MUL_16_16(*seq1_ptr, *seq2_ptr);
-@ seq1_ptr++;
-@ seq2_ptr++;
-@ sum += WEBRTC_SPL_MUL_16_16(*seq1_ptr, *seq2_ptr);
-@ seq1_ptr++;
-@ seq2_ptr++;
-@ sum += WEBRTC_SPL_MUL_16_16(*seq1_ptr, *seq2_ptr);
-@ seq1_ptr++;
-@ seq2_ptr++;
-@ sum += WEBRTC_SPL_MUL_16_16(*seq1_ptr, *seq2_ptr);
-@ seq1_ptr++;
-@ seq2_ptr++;
-@ }
-@
-@ // Calculate the rest of the samples.
-@ for (j = inner_loop_len2; j > 0; j -= 1) {
-@ sum += WEBRTC_SPL_MUL_16_16(*seq1_ptr, *seq2_ptr);
-@ seq1_ptr++;
-@ seq2_ptr++;
-@ }
-@
-@ *cross_correlation++ = (int32_t)(sum >> right_shifts);
-@ }
-@ }
diff --git a/webrtc/common_audio/signal_processing/downsample_fast_neon.S b/webrtc/common_audio/signal_processing/downsample_fast_neon.S
deleted file mode 100644
index 4e348ec..0000000
--- a/webrtc/common_audio/signal_processing/downsample_fast_neon.S
+++ /dev/null
@@ -1,215 +0,0 @@
-@
-@ Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
-@
-@ Use of this source code is governed by a BSD-style license
-@ that can be found in the LICENSE file in the root of the source
-@ tree. An additional intellectual property rights grant can be found
-@ in the file PATENTS. All contributing project authors may
-@ be found in the AUTHORS file in the root of the source tree.
-@
-
-@ This file contains the function WebRtcSpl_DownsampleFastNeon(), optimized for
-@ ARM Neon platform. The description header can be found in
-@ signal_processing_library.h
-@
-@ The reference C code is in file downsample_fast.c. Bit-exact.
-
-#include "webrtc/system_wrappers/interface/asm_defines.h"
-
-GLOBAL_FUNCTION WebRtcSpl_DownsampleFastNeon
-.align 2
-DEFINE_FUNCTION WebRtcSpl_DownsampleFastNeon
- push {r4-r11}
-
- cmp r3, #0 @ data_out_length <= 0?
- movle r0, #-1
- ble END
-
- ldrsh r12, [sp, #44]
- ldr r5, [sp, #40] @ r5: factor
- add r4, r12, #1 @ r4: delay + 1
- sub r3, r3, #1 @ r3: data_out_length - 1
- smulbb r3, r5, r3
- ldr r8, [sp, #32] @ &coefficients[0]
- mov r9, r12 @ Iteration counter for outer loops.
- add r3, r4 @ delay + factor * (out_length-1) +1
-
- cmp r3, r1 @ data_in_length < endpos?
- movgt r0, #-1
- bgt END
-
- @ Initializations.
- sub r3, r5, asl #3
- add r11, r0, r12, asl #1 @ &data_in[delay]
- ldr r0, [sp, #36] @ coefficients_length
- add r3, r5 @ endpos - factor * 7
-
- cmp r0, #0 @ coefficients_length <= 0 ?
- movle r0, #-1
- ble END
-
- add r8, r0, asl #1 @ &coeffieient[coefficients_length]
- cmp r9, r3
- bge POST_LOOP_ENDPOS @ branch when Iteration < 8 times.
-
-@
-@ First part, unroll the loop 8 times, with 3 subcases (factor == 2, 4, others)
-@
- mov r4, #-2
-
- @ Direct program flow to the right channel.
-
- @ r10 is an offset to &data_in[] in the loop. After an iteration, we need to
- @ move the pointer back to original after advancing 16 bytes by a vld1, and
- @ then move 2 bytes forward to increment one more sample.
- cmp r5, #2
- moveq r10, #-14
- beq LOOP_ENDPOS_FACTOR2 @ Branch when factor == 2
-
- @ Similar here, for r10, we need to move the pointer back to original after
- @ advancing 32 bytes, then move 2 bytes forward to increment one sample.
- cmp r5, #4
- moveq r10, #-30
- beq LOOP_ENDPOS_FACTOR4 @ Branch when factor == 4
-
- @ For r10, we need to move the pointer back to original after advancing
- @ (factor * 7 * 2) bytes, then move 2 bytes forward to increment one sample.
- mov r10, r5, asl #4
- rsb r10, #2
- add r10, r5, asl #1
- lsl r5, #1 @ r5 = factor * sizeof(data_in)
-
-@ The general case (factor != 2 && factor != 4)
-LOOP_ENDPOS_GENERAL:
- @ Initializations.
- vmov.i32 q2, #2048
- vmov.i32 q3, #2048
- sub r7, r8, #2
- sub r12, r0, #1 @ coefficients_length - 1
- sub r1, r11, r12, asl #1 @ &data_in[i - j]
-
-LOOP_COEFF_LENGTH_GENERAL:
- vld1.16 {d2[], d3[]}, [r7], r4 @ coefficients[j]
- vld1.16 d0[0], [r1], r5 @ data_in[i - j]
- vld1.16 d0[1], [r1], r5 @ data_in[i + factor - j]
- vld1.16 d0[2], [r1], r5 @ data_in[i + factor * 2 - j]
- vld1.16 d0[3], [r1], r5 @ data_in[i + factor * 3 - j]
- vld1.16 d1[0], [r1], r5 @ data_in[i + factor * 4 - j]
- vld1.16 d1[1], [r1], r5 @ data_in[i + factor * 5 - j]
- vld1.16 d1[2], [r1], r5 @ data_in[i + factor * 6 - j]
- vld1.16 d1[3], [r1], r10 @ data_in[i + factor * 7 - j]
- subs r12, #1
- vmlal.s16 q2, d0, d2
- vmlal.s16 q3, d1, d3
- bge LOOP_COEFF_LENGTH_GENERAL
-
- @ Shift, saturate, and store the result.
- vqshrn.s32 d0, q2, #12
- vqshrn.s32 d1, q3, #12
- vst1.16 {d0, d1}, [r2]!
-
- add r11, r5, asl #3 @ r11 -> &data_in[i + factor * 8]
- add r9, r5, asl #2 @ Counter i = delay + factor * 8.
- cmp r9, r3 @ i < endpos - factor * 7 ?
- blt LOOP_ENDPOS_GENERAL
- asr r5, #1 @ Restore r5 to the value of factor.
- b POST_LOOP_ENDPOS
-
-@ The case for factor == 2.
-LOOP_ENDPOS_FACTOR2:
- @ Initializations.
- vmov.i32 q2, #2048
- vmov.i32 q3, #2048
- sub r7, r8, #2
- sub r12, r0, #1 @ coefficients_length - 1
- sub r1, r11, r12, asl #1 @ &data_in[i - j]
-
-LOOP_COEFF_LENGTH_FACTOR2:
- vld1.16 {d16[], d17[]}, [r7], r4 @ coefficients[j]
- vld2.16 {d0, d1}, [r1]! @ data_in[]
- vld2.16 {d2, d3}, [r1], r10 @ data_in[]
- subs r12, #1
- vmlal.s16 q2, d0, d16
- vmlal.s16 q3, d2, d17
- bge LOOP_COEFF_LENGTH_FACTOR2
-
- @ Shift, saturate, and store the result.
- vqshrn.s32 d0, q2, #12
- vqshrn.s32 d1, q3, #12
- vst1.16 {d0, d1}, [r2]!
-
- add r11, r5, asl #4 @ r11 -> &data_in[i + factor * 8]
- add r9, r5, asl #3 @ Counter i = delay + factor * 8.
- cmp r9, r3 @ i < endpos - factor * 7 ?
- blt LOOP_ENDPOS_FACTOR2
- b POST_LOOP_ENDPOS
-
-@ The case for factor == 4.
-LOOP_ENDPOS_FACTOR4:
- @ Initializations.
- vmov.i32 q2, #2048
- vmov.i32 q3, #2048
- sub r7, r8, #2
- sub r12, r0, #1 @ coefficients_length - 1
- sub r1, r11, r12, asl #1 @ &data_in[i - j]
-
-LOOP_COEFF_LENGTH_FACTOR4:
- vld1.16 {d16[], d17[]}, [r7], r4 @ coefficients[j]
- vld4.16 {d0, d1, d2, d3}, [r1]! @ data_in[]
- vld4.16 {d18, d19, d20, d21}, [r1], r10 @ data_in[]
- subs r12, #1
- vmlal.s16 q2, d0, d16
- vmlal.s16 q3, d18, d17
- bge LOOP_COEFF_LENGTH_FACTOR4
-
- add r11, r5, asl #4 @ r11 -> &data_in[i + factor * 8]
- add r9, r5, asl #3 @ Counter i = delay + factor * 8.
-
- @ Shift, saturate, and store the result.
- vqshrn.s32 d0, q2, #12
- vqshrn.s32 d1, q3, #12
- cmp r9, r3 @ i < endpos - factor * 7 ?
- vst1.16 {d0, d1}, [r2]!
-
- blt LOOP_ENDPOS_FACTOR4
-
-@
-@ Second part, do the rest iterations (if any).
-@
-
-POST_LOOP_ENDPOS:
- add r3, r5, asl #3
- sub r3, r5 @ Restore r3 to endpos.
- cmp r9, r3
- movge r0, #0
- bge END
-
-LOOP2_ENDPOS:
- @ Initializations.
- mov r7, r8
- sub r12, r0, #1 @ coefficients_length - 1
- sub r6, r11, r12, asl #1 @ &data_in[i - j]
-
- mov r1, #2048
-
-LOOP2_COEFF_LENGTH:
- ldrsh r4, [r7, #-2]! @ coefficients[j]
- ldrsh r10, [r6], #2 @ data_in[i - j]
- smlabb r1, r4, r10, r1
- subs r12, #1
- bge LOOP2_COEFF_LENGTH
-
- @ Shift, saturate, and store the result.
- ssat r1, #16, r1, asr #12
- strh r1, [r2], #2
-
- add r11, r5, asl #1 @ r11 -> &data_in[i + factor]
- add r9, r5 @ Counter i = delay + factor.
- cmp r9, r3 @ i < endpos?
- blt LOOP2_ENDPOS
-
- mov r0, #0
-
-END:
- pop {r4-r11}
- bx lr
diff --git a/webrtc/common_audio/signal_processing/include/signal_processing_library.h b/webrtc/common_audio/signal_processing/include/signal_processing_library.h
index d987e9a..a1dc6aa 100644
--- a/webrtc/common_audio/signal_processing/include/signal_processing_library.h
+++ b/webrtc/common_audio/signal_processing/include/signal_processing_library.h
@@ -154,7 +154,8 @@
typedef int16_t (*MaxAbsValueW16)(const int16_t* vector, int length);
extern MaxAbsValueW16 WebRtcSpl_MaxAbsValueW16;
int16_t WebRtcSpl_MaxAbsValueW16C(const int16_t* vector, int length);
-#if (defined WEBRTC_DETECT_ARM_NEON) || (defined WEBRTC_ARCH_ARM_NEON)
+#if (defined WEBRTC_DETECT_ARM_NEON) || (defined WEBRTC_ARCH_ARM_NEON) || \
+ (defined WEBRTC_ARCH_ARM64_NEON)
int16_t WebRtcSpl_MaxAbsValueW16Neon(const int16_t* vector, int length);
#endif
#if defined(MIPS32_LE)
@@ -172,7 +173,8 @@
typedef int32_t (*MaxAbsValueW32)(const int32_t* vector, int length);
extern MaxAbsValueW32 WebRtcSpl_MaxAbsValueW32;
int32_t WebRtcSpl_MaxAbsValueW32C(const int32_t* vector, int length);
-#if (defined WEBRTC_DETECT_ARM_NEON) || (defined WEBRTC_ARCH_ARM_NEON)
+#if (defined WEBRTC_DETECT_ARM_NEON) || (defined WEBRTC_ARCH_ARM_NEON) || \
+ (defined WEBRTC_ARCH_ARM64_NEON)
int32_t WebRtcSpl_MaxAbsValueW32Neon(const int32_t* vector, int length);
#endif
#if defined(MIPS_DSP_R1_LE)
@@ -192,7 +194,8 @@
typedef int16_t (*MaxValueW16)(const int16_t* vector, int length);
extern MaxValueW16 WebRtcSpl_MaxValueW16;
int16_t WebRtcSpl_MaxValueW16C(const int16_t* vector, int length);
-#if (defined WEBRTC_DETECT_ARM_NEON) || (defined WEBRTC_ARCH_ARM_NEON)
+#if (defined WEBRTC_DETECT_ARM_NEON) || (defined WEBRTC_ARCH_ARM_NEON) || \
+ (defined WEBRTC_ARCH_ARM64_NEON)
int16_t WebRtcSpl_MaxValueW16Neon(const int16_t* vector, int length);
#endif
#if defined(MIPS32_LE)
@@ -212,7 +215,8 @@
typedef int32_t (*MaxValueW32)(const int32_t* vector, int length);
extern MaxValueW32 WebRtcSpl_MaxValueW32;
int32_t WebRtcSpl_MaxValueW32C(const int32_t* vector, int length);
-#if (defined WEBRTC_DETECT_ARM_NEON) || (defined WEBRTC_ARCH_ARM_NEON)
+#if (defined WEBRTC_DETECT_ARM_NEON) || (defined WEBRTC_ARCH_ARM_NEON) || \
+ (defined WEBRTC_ARCH_ARM64_NEON)
int32_t WebRtcSpl_MaxValueW32Neon(const int32_t* vector, int length);
#endif
#if defined(MIPS32_LE)
@@ -232,7 +236,8 @@
typedef int16_t (*MinValueW16)(const int16_t* vector, int length);
extern MinValueW16 WebRtcSpl_MinValueW16;
int16_t WebRtcSpl_MinValueW16C(const int16_t* vector, int length);
-#if (defined WEBRTC_DETECT_ARM_NEON) || (defined WEBRTC_ARCH_ARM_NEON)
+#if (defined WEBRTC_DETECT_ARM_NEON) || (defined WEBRTC_ARCH_ARM_NEON) || \
+ (defined WEBRTC_ARCH_ARM64_NEON)
int16_t WebRtcSpl_MinValueW16Neon(const int16_t* vector, int length);
#endif
#if defined(MIPS32_LE)
@@ -252,7 +257,8 @@
typedef int32_t (*MinValueW32)(const int32_t* vector, int length);
extern MinValueW32 WebRtcSpl_MinValueW32;
int32_t WebRtcSpl_MinValueW32C(const int32_t* vector, int length);
-#if (defined WEBRTC_DETECT_ARM_NEON) || (defined WEBRTC_ARCH_ARM_NEON)
+#if (defined WEBRTC_DETECT_ARM_NEON) || (defined WEBRTC_ARCH_ARM_NEON) || \
+ (defined WEBRTC_ARCH_ARM64_NEON)
int32_t WebRtcSpl_MinValueW32Neon(const int32_t* vector, int length);
#endif
#if defined(MIPS32_LE)
@@ -552,7 +558,8 @@
int16_t dim_cross_correlation,
int16_t right_shifts,
int16_t step_seq2);
-#if (defined WEBRTC_DETECT_ARM_NEON) || (defined WEBRTC_ARCH_ARM_NEON)
+#if (defined WEBRTC_DETECT_ARM_NEON) || (defined WEBRTC_ARCH_ARM_NEON) || \
+ (defined WEBRTC_ARCH_ARM64_NEON)
void WebRtcSpl_CrossCorrelationNeon(int32_t* cross_correlation,
const int16_t* seq1,
const int16_t* seq2,
@@ -717,7 +724,8 @@
int coefficients_length,
int factor,
int delay);
-#if (defined WEBRTC_DETECT_ARM_NEON) || (defined WEBRTC_ARCH_ARM_NEON)
+#if (defined WEBRTC_DETECT_ARM_NEON) || (defined WEBRTC_ARCH_ARM_NEON) || \
+ (defined WEBRTC_ARCH_ARM64_NEON)
int WebRtcSpl_DownsampleFastNeon(const int16_t* data_in,
int data_in_length,
int16_t* data_out,
diff --git a/webrtc/common_audio/signal_processing/min_max_operations_neon.S b/webrtc/common_audio/signal_processing/min_max_operations_neon.S
deleted file mode 100644
index f427e68..0000000
--- a/webrtc/common_audio/signal_processing/min_max_operations_neon.S
+++ /dev/null
@@ -1,283 +0,0 @@
-@
-@ Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
-@
-@ Use of this source code is governed by a BSD-style license
-@ that can be found in the LICENSE file in the root of the source
-@ tree. An additional intellectual property rights grant can be found
-@ in the file PATENTS. All contributing project authors may
-@ be found in the AUTHORS file in the root of the source tree.
-@
-
-@ This file contains some minimum and maximum functions, optimized for
-@ ARM Neon platform. The description header can be found in
-@ signal_processing_library.h
-@
-@ The reference C code is in file min_max_operations.c. Code here is basically
-@ a loop unrolling by 8 with Neon instructions. Bit-exact.
-
-#include "webrtc/system_wrappers/interface/asm_defines.h"
-
-GLOBAL_FUNCTION WebRtcSpl_MaxAbsValueW16Neon
-GLOBAL_FUNCTION WebRtcSpl_MaxAbsValueW32Neon
-GLOBAL_FUNCTION WebRtcSpl_MaxValueW16Neon
-GLOBAL_FUNCTION WebRtcSpl_MaxValueW32Neon
-GLOBAL_FUNCTION WebRtcSpl_MinValueW16Neon
-GLOBAL_FUNCTION WebRtcSpl_MinValueW32Neon
-
-.align 2
-@ int16_t WebRtcSpl_MaxAbsValueW16Neon(const int16_t* vector, int length);
-DEFINE_FUNCTION WebRtcSpl_MaxAbsValueW16Neon
- mov r2, #-1 @ Initialize the return value.
- cmp r0, #0
- beq END_MAX_ABS_VALUE_W16
- cmp r1, #0
- ble END_MAX_ABS_VALUE_W16
-
- cmp r1, #8
- blt LOOP_MAX_ABS_VALUE_W16
-
- vmov.i16 q12, #0
- sub r1, #8 @ Counter for loops
-
-LOOP_UNROLLED_BY_8_MAX_ABS_VALUE_W16:
- vld1.16 {q13}, [r0]!
- subs r1, #8
- vabs.s16 q13, q13 @ Note vabs doesn't change the value of -32768.
- vmax.u16 q12, q13 @ Use u16 so we don't lose the value -32768.
- bge LOOP_UNROLLED_BY_8_MAX_ABS_VALUE_W16
-
- @ Find the maximum value in the Neon registers and move it to r2.
- vmax.u16 d24, d25
- vpmax.u16 d24, d24, d24
- vpmax.u16 d24, d24, d24
- adds r1, #8
- vmov.u16 r2, d24[0]
- beq END_MAX_ABS_VALUE_W16
-
-LOOP_MAX_ABS_VALUE_W16:
- ldrsh r3, [r0], #2
- eor r12, r3, r3, asr #31 @ eor and then sub, to get absolute value.
- sub r12, r12, r3, asr #31
- cmp r2, r12
- movlt r2, r12
- subs r1, #1
- bne LOOP_MAX_ABS_VALUE_W16
-
-END_MAX_ABS_VALUE_W16:
- cmp r2, #0x8000 @ Guard against the case for -32768.
- subeq r2, #1
- mov r0, r2
- bx lr
-
-
-
-@ int32_t WebRtcSpl_MaxAbsValueW32Neon(const int32_t* vector, int length);
-DEFINE_FUNCTION WebRtcSpl_MaxAbsValueW32Neon
- cmp r0, #0
- moveq r0, #-1
- beq EXIT @ Return -1 for a NULL pointer.
- cmp r1, #0 @ length
- movle r0, #-1
- ble EXIT @ Return -1 if length <= 0.
-
- vmov.i32 q11, #0
- vmov.i32 q12, #0
- cmp r1, #8
- blt LOOP_MAX_ABS_VALUE_W32
-
- sub r1, #8 @ Counter for loops
-
-LOOP_UNROLLED_BY_8_MAX_ABS_VALUE_W32:
- vld1.32 {q13, q14}, [r0]!
- subs r1, #8 @ Counter for loops
- vabs.s32 q13, q13 @ vabs doesn't change the value of 0x80000000.
- vabs.s32 q14, q14
- vmax.u32 q11, q13 @ Use u32 so we don't lose the value 0x80000000.
- vmax.u32 q12, q14
- bge LOOP_UNROLLED_BY_8_MAX_ABS_VALUE_W32
-
- @ Find the maximum value in the Neon registers and move it to r2.
- vmax.u32 q12, q11
- vmax.u32 d24, d25
- vpmax.u32 d24, d24, d24
- adds r1, #8
- vmov.u32 r2, d24[0]
- beq END_MAX_ABS_VALUE_W32
-
-LOOP_MAX_ABS_VALUE_W32:
- ldr r3, [r0], #4
- eor r12, r3, r3, asr #31 @ eor and then sub, to get absolute value.
- sub r12, r12, r3, asr #31
- cmp r2, r12
- movcc r2, r12
- subs r1, #1
- bne LOOP_MAX_ABS_VALUE_W32
-
-END_MAX_ABS_VALUE_W32:
- mvn r0, #0x80000000 @ Guard against the case for 0x80000000.
- cmp r2, r0
- movcc r0, r2
-
-EXIT:
- bx lr
-
-@ int16_t WebRtcSpl_MaxValueW16Neon(const int16_t* vector, int length);
-DEFINE_FUNCTION WebRtcSpl_MaxValueW16Neon
- mov r2, #0x8000 @ Initialize the return value.
- cmp r0, #0
- beq END_MAX_VALUE_W16
- cmp r1, #0
- ble END_MAX_VALUE_W16
-
- vmov.i16 q12, #0x8000
- cmp r1, #8
- blt LOOP_MAX_VALUE_W16
-
- sub r1, #8 @ Counter for loops
-
-LOOP_UNROLLED_BY_8_MAX_VALUE_W16:
- vld1.16 {q13}, [r0]!
- subs r1, #8
- vmax.s16 q12, q13
- bge LOOP_UNROLLED_BY_8_MAX_VALUE_W16
-
- @ Find the maximum value in the Neon registers and move it to r2.
- vmax.s16 d24, d25
- vpmax.s16 d24, d24, d24
- vpmax.s16 d24, d24, d24
- adds r1, #8
- vmov.u16 r2, d24[0]
- beq END_MAX_VALUE_W16
-
-LOOP_MAX_VALUE_W16:
- ldrsh r3, [r0], #2
- cmp r2, r3
- movlt r2, r3
- subs r1, #1
- bne LOOP_MAX_VALUE_W16
-
-END_MAX_VALUE_W16:
- mov r0, r2
- bx lr
-
-@ int32_t WebRtcSpl_MaxValueW32Neon(const int32_t* vector, int length);
-DEFINE_FUNCTION WebRtcSpl_MaxValueW32Neon
- mov r2, #0x80000000 @ Initialize the return value.
- cmp r0, #0
- beq END_MAX_VALUE_W32
- cmp r1, #0
- ble END_MAX_VALUE_W32
-
- vmov.i32 q11, #0x80000000
- vmov.i32 q12, #0x80000000
- cmp r1, #8
- blt LOOP_MAX_VALUE_W32
-
- sub r1, #8 @ Counter for loops
-
-LOOP_UNROLLED_BY_8_MAX_VALUE_W32:
- vld1.32 {q13, q14}, [r0]!
- subs r1, #8
- vmax.s32 q11, q13
- vmax.s32 q12, q14
- bge LOOP_UNROLLED_BY_8_MAX_VALUE_W32
-
- @ Find the maximum value in the Neon registers and move it to r2.
- vmax.s32 q12, q11
- vpmax.s32 d24, d24, d25
- vpmax.s32 d24, d24, d24
- adds r1, #8
- vmov.s32 r2, d24[0]
- beq END_MAX_VALUE_W32
-
-LOOP_MAX_VALUE_W32:
- ldr r3, [r0], #4
- cmp r2, r3
- movlt r2, r3
- subs r1, #1
- bne LOOP_MAX_VALUE_W32
-
-END_MAX_VALUE_W32:
- mov r0, r2
- bx lr
-
-@ int16_t WebRtcSpl_MinValueW16Neon(const int16_t* vector, int length);
-DEFINE_FUNCTION WebRtcSpl_MinValueW16Neon
- movw r2, #0x7FFF @ Initialize the return value.
- cmp r0, #0
- beq END_MIN_VALUE_W16
- cmp r1, #0
- ble END_MIN_VALUE_W16
-
- vdup.16 q12, r2
- cmp r1, #8
- blt LOOP_MIN_VALUE_W16
-
- sub r1, #8 @ Counter for loops
-
-LOOP_UNROLLED_BY_8_MIN_VALUE_W16:
- vld1.16 {q13}, [r0]!
- subs r1, #8
- vmin.s16 q12, q13
- bge LOOP_UNROLLED_BY_8_MIN_VALUE_W16
-
- @ Find the maximum value in the Neon registers and move it to r2.
- vmin.s16 d24, d25
- vpmin.s16 d24, d24, d24
- vpmin.s16 d24, d24, d24
- adds r1, #8
- vmov.s16 r2, d24[0]
- sxth r2, r2
- beq END_MIN_VALUE_W16
-
-LOOP_MIN_VALUE_W16:
- ldrsh r3, [r0], #2
- cmp r2, r3
- movge r2, r3
- subs r1, #1
- bne LOOP_MIN_VALUE_W16
-
-END_MIN_VALUE_W16:
- mov r0, r2
- bx lr
-
-@ int32_t WebRtcSpl_MinValueW32Neon(const int32_t* vector, int length);
-DEFINE_FUNCTION WebRtcSpl_MinValueW32Neon
- mov r2, #0x7FFFFFFF @ Initialize the return value.
- cmp r0, #0
- beq END_MIN_VALUE_W32
- cmp r1, #0
- ble END_MIN_VALUE_W32
-
- vdup.32 q11, r2
- vdup.32 q12, r2
- cmp r1, #8
- blt LOOP_MIN_VALUE_W32
-
- sub r1, #8 @ Counter for loops
-
-LOOP_UNROLLED_BY_8_MIN_VALUE_W32:
- vld1.32 {q13, q14}, [r0]!
- subs r1, #8
- vmin.s32 q11, q13
- vmin.s32 q12, q14
- bge LOOP_UNROLLED_BY_8_MIN_VALUE_W32
-
- @ Find the maximum value in the Neon registers and move it to r2.
- vmin.s32 q12, q11
- vpmin.s32 d24, d24, d25
- vpmin.s32 d24, d24, d24
- adds r1, #8
- vmov.s32 r2, d24[0]
- beq END_MIN_VALUE_W32
-
-LOOP_MIN_VALUE_W32:
- ldr r3, [r0], #4
- cmp r2, r3
- movge r2, r3
- subs r1, #1
- bne LOOP_MIN_VALUE_W32
-
-END_MIN_VALUE_W32:
- mov r0, r2
- bx lr
diff --git a/webrtc/common_audio/signal_processing/spl_init.c b/webrtc/common_audio/signal_processing/spl_init.c
index c9a1673..0a49379 100644
--- a/webrtc/common_audio/signal_processing/spl_init.c
+++ b/webrtc/common_audio/signal_processing/spl_init.c
@@ -29,7 +29,7 @@
ScaleAndAddVectorsWithRound WebRtcSpl_ScaleAndAddVectorsWithRound;
#if (defined(WEBRTC_DETECT_ARM_NEON) || !defined(WEBRTC_ARCH_ARM_NEON)) && \
- !defined(MIPS32_LE)
+ !defined(MIPS32_LE) && !defined(WEBRTC_ARCH_ARM64_NEON)
/* Initialize function pointers to the generic C version. */
static void InitPointersToC() {
WebRtcSpl_MaxAbsValueW16 = WebRtcSpl_MaxAbsValueW16C;
@@ -45,7 +45,8 @@
}
#endif
-#if defined(WEBRTC_DETECT_ARM_NEON) || defined(WEBRTC_ARCH_ARM_NEON)
+#if defined(WEBRTC_DETECT_ARM_NEON) || defined(WEBRTC_ARCH_ARM_NEON) || \
+ (defined WEBRTC_ARCH_ARM64_NEON)
/* Initialize function pointers to the Neon version. */
static void InitPointersToNeon() {
WebRtcSpl_MaxAbsValueW16 = WebRtcSpl_MaxAbsValueW16Neon;
@@ -92,7 +93,7 @@
} else {
InitPointersToC();
}
-#elif defined(WEBRTC_ARCH_ARM_NEON)
+#elif defined(WEBRTC_ARCH_ARM_NEON) || defined(WEBRTC_ARCH_ARM64_NEON)
InitPointersToNeon();
#elif defined(MIPS32_LE)
InitPointersToMIPS();