Add NEON intrinsics version for min_max_operations_neon.c
WebRtcSpl_MinValueW32Neon, WebRtcSpl_MaxValueW32Neon, WebRtcSpl_MaxValueW16Neon
and WebRtcSpl_MaxAbsValueW32Neon are added. SplTest in common_audio_unittests
is passed on ARM32/ARM64 platforms.
BUG=4002
R=andrew@webrtc.org, jridges@masque.com
Change-Id: Id461d64c3313f56147edadd2231e8845574ead2a
Review URL: https://webrtc-codereview.appspot.com/28259004
Patch from Yang Zhang <yang.zhang@arm.com>.
git-svn-id: http://webrtc.googlecode.com/svn/trunk@7889 4adac7df-926f-26a2-2b94-8c16560cd09d
diff --git a/webrtc/common_audio/signal_processing/min_max_operations_neon.c b/webrtc/common_audio/signal_processing/min_max_operations_neon.c
index 704911e..dec31ad 100644
--- a/webrtc/common_audio/signal_processing/min_max_operations_neon.c
+++ b/webrtc/common_audio/signal_processing/min_max_operations_neon.c
@@ -67,6 +67,147 @@
return (int16_t)maximum;
}
+// Maximum absolute value of word32 vector. NEON intrinsics version for
+// ARM 32-bit/64-bit platforms.
+int32_t WebRtcSpl_MaxAbsValueW32Neon(const int32_t* vector, int length) {
+ // Use uint32_t for the local variables, to accommodate the return value
+ // of abs(0x80000000), which is 0x80000000.
+
+ uint32_t absolute = 0, maximum = 0;
+ int i = 0;
+ int residual = length & 0x7;
+
+ if (vector == NULL || length <= 0) {
+ return -1;
+ }
+
+ const int32_t* p_start = vector;
+ uint32x4_t max32x4_0 = vdupq_n_u32(0);
+ uint32x4_t max32x4_1 = vdupq_n_u32(0);
+
+ // First part, unroll the loop 8 times.
+ for (i = length - residual; i >0; i -= 8) {
+ int32x4_t in32x4_0 = vld1q_s32(p_start);
+ p_start += 4;
+ int32x4_t in32x4_1 = vld1q_s32(p_start);
+ p_start += 4;
+ in32x4_0 = vabsq_s32(in32x4_0);
+ in32x4_1 = vabsq_s32(in32x4_1);
+ // vabs doesn't change the value of 0x80000000.
+ // Use u32 so we don't lose the value 0x80000000.
+ max32x4_0 = vmaxq_u32(max32x4_0, vreinterpretq_u32_s32(in32x4_0));
+ max32x4_1 = vmaxq_u32(max32x4_1, vreinterpretq_u32_s32(in32x4_1));
+ }
+
+ uint32x4_t max32x4 = vmaxq_u32(max32x4_0, max32x4_1);
+#if defined(WEBRTC_ARCH_ARM64)
+ maximum = vmaxvq_u32(max32x4);
+#else
+ uint32x2_t max32x2 = vmax_u32(vget_low_u32(max32x4), vget_high_u32(max32x4));
+ max32x2 = vpmax_u32(max32x2, max32x2);
+
+ maximum = vget_lane_u32(max32x2, 0);
+#endif
+
+ // Second part, do the remaining iterations (if any).
+ for (i = residual; i > 0; i--) {
+ absolute = abs((int)(*p_start));
+ if (absolute > maximum) {
+ maximum = absolute;
+ }
+ p_start++;
+ }
+
+ // Guard against the case for 0x80000000.
+ maximum = WEBRTC_SPL_MIN(maximum, WEBRTC_SPL_WORD32_MAX);
+
+ return (int32_t)maximum;
+}
+
+// Maximum value of word16 vector. NEON intrinsics version for
+// ARM 32-bit/64-bit platforms.
+int16_t WebRtcSpl_MaxValueW16Neon(const int16_t* vector, int length) {
+ int16_t maximum = WEBRTC_SPL_WORD16_MIN;
+ int i = 0;
+ int residual = length & 0x7;
+
+ if (vector == NULL || length <= 0) {
+ return maximum;
+ }
+
+ const int16_t* p_start = vector;
+ int16x8_t max16x8 = vdupq_n_s16(WEBRTC_SPL_WORD16_MIN);
+
+ // First part, unroll the loop 8 times.
+ for (i = length - residual; i >0; i -= 8) {
+ int16x8_t in16x8 = vld1q_s16(p_start);
+ max16x8 = vmaxq_s16(max16x8, in16x8);
+ p_start += 8;
+ }
+
+#if defined(WEBRTC_ARCH_ARM64)
+ maximum = vmaxvq_s16(max16x8);
+#else
+ int16x4_t max16x4 = vmax_s16(vget_low_s16(max16x8), vget_high_s16(max16x8));
+ max16x4 = vpmax_s16(max16x4, max16x4);
+ max16x4 = vpmax_s16(max16x4, max16x4);
+
+ maximum = vget_lane_s16(max16x4, 0);
+#endif
+
+ // Second part, do the remaining iterations (if any).
+ for (i = residual; i > 0; i--) {
+ if (*p_start > maximum)
+ maximum = *p_start;
+ p_start++;
+ }
+ return maximum;
+}
+
+// Maximum value of word32 vector. NEON intrinsics version for
+// ARM 32-bit/64-bit platforms.
+int32_t WebRtcSpl_MaxValueW32Neon(const int32_t* vector, int length) {
+ int32_t maximum = WEBRTC_SPL_WORD32_MIN;
+ int i = 0;
+ int residual = length & 0x7;
+
+ if (vector == NULL || length <= 0) {
+ return maximum;
+ }
+
+ const int32_t* p_start = vector;
+ int32x4_t max32x4_0 = vdupq_n_s32(WEBRTC_SPL_WORD32_MIN);
+ int32x4_t max32x4_1 = vdupq_n_s32(WEBRTC_SPL_WORD32_MIN);
+
+ // First part, unroll the loop 8 times.
+ for (i = length - residual; i >0; i -= 8) {
+ int32x4_t in32x4_0 = vld1q_s32(p_start);
+ p_start += 4;
+ int32x4_t in32x4_1 = vld1q_s32(p_start);
+ p_start += 4;
+ max32x4_0 = vmaxq_s32(max32x4_0, in32x4_0);
+ max32x4_1 = vmaxq_s32(max32x4_1, in32x4_1);
+ }
+
+ int32x4_t max32x4 = vmaxq_s32(max32x4_0, max32x4_1);
+#if defined(WEBRTC_ARCH_ARM64)
+ maximum = vmaxvq_s32(max32x4);
+#else
+ int32x2_t max32x2 = vmax_s32(vget_low_s32(max32x4), vget_high_s32(max32x4));
+ max32x2 = vpmax_s32(max32x2, max32x2);
+
+ maximum = vget_lane_s32(max32x2, 0);
+#endif
+
+ // Second part, do the remaining iterations (if any).
+ for (i = residual; i > 0; i--) {
+ if (*p_start > maximum)
+ maximum = *p_start;
+ p_start++;
+ }
+ return maximum;
+}
+
// Minimum value of word16 vector. NEON intrinsics version for
// ARM 32-bit/64-bit platforms.
int16_t WebRtcSpl_MinValueW16Neon(const int16_t* vector, int length) {
@@ -107,3 +248,47 @@
return minimum;
}
+// Minimum value of word32 vector. NEON intrinsics version for
+// ARM 32-bit/64-bit platforms.
+int32_t WebRtcSpl_MinValueW32Neon(const int32_t* vector, int length) {
+ int32_t minimum = WEBRTC_SPL_WORD32_MAX;
+ int i = 0;
+ int residual = length & 0x7;
+
+ if (vector == NULL || length <= 0) {
+ return minimum;
+ }
+
+ const int32_t* p_start = vector;
+ int32x4_t min32x4_0 = vdupq_n_s32(WEBRTC_SPL_WORD32_MAX);
+ int32x4_t min32x4_1 = vdupq_n_s32(WEBRTC_SPL_WORD32_MAX);
+
+ // First part, unroll the loop 8 times.
+ for (i = length - residual; i >0; i -= 8) {
+ int32x4_t in32x4_0 = vld1q_s32(p_start);
+ p_start += 4;
+ int32x4_t in32x4_1 = vld1q_s32(p_start);
+ p_start += 4;
+ min32x4_0 = vminq_s32(min32x4_0, in32x4_0);
+ min32x4_1 = vminq_s32(min32x4_1, in32x4_1);
+ }
+
+ int32x4_t min32x4 = vminq_s32(min32x4_0, min32x4_1);
+#if defined(WEBRTC_ARCH_ARM64)
+ minimum = vminvq_s32(min32x4);
+#else
+ int32x2_t min32x2 = vmin_s32(vget_low_s32(min32x4), vget_high_s32(min32x4));
+ min32x2 = vpmin_s32(min32x2, min32x2);
+
+ minimum = vget_lane_s32(min32x2, 0);
+#endif
+
+ // Second part, do the remaining iterations (if any).
+ for (i = residual; i > 0; i--) {
+ if (*p_start < minimum)
+ minimum = *p_start;
+ p_start++;
+ }
+ return minimum;
+}
+