Add WebRtcIsacfix_FilterMaLoopNeon's intrinsics version.

This intrinsics version gives bit-exact result as the current assembly
neon code. And the performance is 38% better than current assembly
neon version, 5.92 times faster than current C version. The test runs
under Cortex-a53 aarch32 mode, other cpu should give similar performance
result.

BUG=4002
R=andrew@webrtc.org, jridges@masque.com

Change-Id: I257e33ef6d634a519fd71adc4f52b06dd655bd9d

Review URL: https://webrtc-codereview.appspot.com/32749004

Patch from Zhongwei Yao <zhongwei.yao@arm.com>.

git-svn-id: http://webrtc.googlecode.com/svn/trunk@7891 4adac7df-926f-26a2-2b94-8c16560cd09d
diff --git a/webrtc/modules/audio_coding/codecs/isac/fix/source/lattice_neon.c b/webrtc/modules/audio_coding/codecs/isac/fix/source/lattice_neon.c
new file mode 100644
index 0000000..6bdaf1d
--- /dev/null
+++ b/webrtc/modules/audio_coding/codecs/isac/fix/source/lattice_neon.c
@@ -0,0 +1,173 @@
+/*
+ *  Copyright (c) 2014 The WebRTC project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "webrtc/modules/audio_coding/codecs/isac/fix/source/codec.h"
+#include "webrtc/modules/audio_coding/codecs/isac/fix/source/settings.h"
+
+// Contains a function for the core loop in the normalized lattice MA
+// filter routine for iSAC codec, optimized for ARM Neon platform.
+// It does:
+//  for 0 <= n < HALF_SUBFRAMELEN - 1:
+//    *ptr2 = input2 * (*ptr2) + input0 * (*ptr0));
+//    *ptr1 = input1 * (*ptr0) + input0 * (*ptr2);
+// Output is not bit-exact with the reference C code, due to the replacement
+// of WEBRTC_SPL_MUL_16_32_RSFT15 and LATTICE_MUL_32_32_RSFT16 with Neon
+// instructions. The difference should not be bigger than 1.
+void WebRtcIsacfix_FilterMaLoopNeon(int16_t input0,  // Filter coefficient
+                                    int16_t input1,  // Filter coefficient
+                                    int32_t input2,  // Inverse coefficient
+                                    int32_t* ptr0,   // Sample buffer
+                                    int32_t* ptr1,   // Sample buffer
+                                    int32_t* ptr2)   // Sample buffer
+{
+  int n = 0;
+  int loop = (HALF_SUBFRAMELEN - 1) >> 3;
+  int loop_tail = (HALF_SUBFRAMELEN - 1) & 0x7;
+
+  int32x4_t input0_v = vdupq_n_s32((int32_t)input0 << 16);
+  int32x4_t input1_v = vdupq_n_s32((int32_t)input1 << 16);
+  int32x4_t input2_v = vdupq_n_s32(input2);
+  int32x4_t tmp0a, tmp1a, tmp2a, tmp3a;
+  int32x4_t tmp0b, tmp1b, tmp2b, tmp3b;
+  int32x4_t ptr0va, ptr1va, ptr2va;
+  int32x4_t ptr0vb, ptr1vb, ptr2vb;
+
+  // Unroll to process 8 samples at once.
+  for (n = 0; n < loop; n++) {
+    ptr0va = vld1q_s32(ptr0);
+    ptr0vb = vld1q_s32(ptr0 + 4);
+    ptr0 += 8;
+
+    ptr2va = vld1q_s32(ptr2);
+    ptr2vb = vld1q_s32(ptr2 + 4);
+
+    // Calculate tmp0 = (*ptr0) * input0.
+    tmp0a = vqrdmulhq_s32(ptr0va, input0_v);
+    tmp0b = vqrdmulhq_s32(ptr0vb, input0_v);
+
+    // Calculate tmp1 = (*ptr0) * input1.
+    tmp1a = vqrdmulhq_s32(ptr0va, input1_v);
+    tmp1b = vqrdmulhq_s32(ptr0vb, input1_v);
+
+    // Calculate tmp2 = tmp0 + *(ptr2).
+    tmp2a = vaddq_s32(tmp0a, ptr2va);
+    tmp2b = vaddq_s32(tmp0b, ptr2vb);
+    tmp2a = vshlq_n_s32(tmp2a, 15);
+    tmp2b = vshlq_n_s32(tmp2b, 15);
+
+    // Calculate *ptr2 = input2 * tmp2.
+    ptr2va = vqrdmulhq_s32(tmp2a, input2_v);
+    ptr2vb = vqrdmulhq_s32(tmp2b, input2_v);
+
+    vst1q_s32(ptr2, ptr2va);
+    vst1q_s32(ptr2 + 4, ptr2vb);
+    ptr2 += 8;
+
+    // Calculate tmp3 = ptr2v * input0.
+    tmp3a = vqrdmulhq_s32(ptr2va, input0_v);
+    tmp3b = vqrdmulhq_s32(ptr2vb, input0_v);
+
+    // Calculate *ptr1 = tmp1 + tmp3.
+    ptr1va = vaddq_s32(tmp1a, tmp3a);
+    ptr1vb = vaddq_s32(tmp1b, tmp3b);
+
+    vst1q_s32(ptr1, ptr1va);
+    vst1q_s32(ptr1 + 4, ptr1vb);
+    ptr1 += 8;
+  }
+
+  // Process four more samples.
+  if (loop_tail & 0x4) {
+    ptr0va = vld1q_s32(ptr0);
+    ptr2va = vld1q_s32(ptr2);
+    ptr0 += 4;
+
+    // Calculate tmp0 = (*ptr0) * input0.
+    tmp0a = vqrdmulhq_s32(ptr0va, input0_v);
+
+    // Calculate tmp1 = (*ptr0) * input1.
+    tmp1a = vqrdmulhq_s32(ptr0va, input1_v);
+
+    // Calculate tmp2 = tmp0 + *(ptr2).
+    tmp2a = vaddq_s32(tmp0a, ptr2va);
+    tmp2a = vshlq_n_s32(tmp2a, 15);
+
+    // Calculate *ptr2 = input2 * tmp2.
+    ptr2va = vqrdmulhq_s32(tmp2a, input2_v);
+
+    vst1q_s32(ptr2, ptr2va);
+    ptr2 += 4;
+
+    // Calculate tmp3 = *(ptr2) * input0.
+    tmp3a = vqrdmulhq_s32(ptr2va, input0_v);
+
+    // Calculate *ptr1 = tmp1 + tmp3.
+    ptr1va = vaddq_s32(tmp1a, tmp3a);
+
+    vst1q_s32(ptr1, ptr1va);
+    ptr1 += 4;
+  }
+
+  // Process two more samples.
+  if (loop_tail & 0x2) {
+    int32x2_t ptr0v_tail, ptr2v_tail, ptr1v_tail;
+    int32x2_t tmp0_tail, tmp1_tail, tmp2_tail, tmp3_tail;
+    ptr0v_tail = vld1_s32(ptr0);
+    ptr2v_tail = vld1_s32(ptr2);
+    ptr0 += 2;
+
+    // Calculate tmp0 = (*ptr0) * input0.
+    tmp0_tail = vqrdmulh_s32(ptr0v_tail, vget_low_s32(input0_v));
+
+    // Calculate tmp1 = (*ptr0) * input1.
+    tmp1_tail = vqrdmulh_s32(ptr0v_tail, vget_low_s32(input1_v));
+
+    // Calculate tmp2 = tmp0 + *(ptr2).
+    tmp2_tail = vadd_s32(tmp0_tail, ptr2v_tail);
+    tmp2_tail = vshl_n_s32(tmp2_tail, 15);
+
+    // Calculate *ptr2 = input2 * tmp2.
+    ptr2v_tail = vqrdmulh_s32(tmp2_tail, vget_low_s32(input2_v));
+
+    vst1_s32(ptr2, ptr2v_tail);
+    ptr2 += 2;
+
+    // Calculate tmp3 = *(ptr2) * input0.
+    tmp3_tail = vqrdmulh_s32(ptr2v_tail, vget_low_s32(input0_v));
+
+    // Calculate *ptr1 = tmp1 + tmp3.
+    ptr1v_tail = vadd_s32(tmp1_tail, tmp3_tail);
+
+    vst1_s32(ptr1, ptr1v_tail);
+    ptr1 += 2;
+  }
+
+  // Process one more sample.
+  if (loop_tail & 0x1) {
+    int16_t t16a = (int16_t)(input2 >> 16);
+    int16_t t16b = (int16_t)input2;
+    if (t16b < 0) t16a++;
+    int32_t tmp32a;
+    int32_t tmp32b;
+
+    // Calculate *ptr2 = input2 * (*ptr2 + input0 * (*ptr0)).
+    tmp32a = WEBRTC_SPL_MUL_16_32_RSFT15(input0, *ptr0);
+    tmp32b = *ptr2 + tmp32a;
+    *ptr2 = (int32_t)(WEBRTC_SPL_MUL(t16a, tmp32b) +
+                       (WEBRTC_SPL_MUL_16_32_RSFT16(t16b, tmp32b)));
+
+    // Calculate *ptr1 = input1 * (*ptr0) + input0 * (*ptr2).
+    tmp32a = WEBRTC_SPL_MUL_16_32_RSFT15(input1, *ptr0);
+    tmp32b = WEBRTC_SPL_MUL_16_32_RSFT15(input0, *ptr2);
+    *ptr1 = tmp32a + tmp32b;
+  }
+}