replace inline assembly WebRtcAecm_CalcLinearEnergiesNeon by intrinsics.

The modification only uses the unique part of the CalcLinearEnergies
 function. Pass byte to byte conformance test both on ARMv7 and ARM64,
 and the single function performance is similar with original assembly
 version on different platforms. If not specified, the code is compiled
 by GCC 4.6. The result is the "X version / C version" ratio, and the
 less is better.

| run 100k times             | cortex-a7 | cortex-a9 | cortex-a15 |
| use C as the base on each  |  (1.2Ghz) |  (1.0Ghz) |   (1.7Ghz) |
| CPU target                 |           |           |            |
|----------------------------+-----------+-----------+------------|
| Neon asm                   |    19.48% |    19.26% |     13.68% |
| Neon inline                |    27.90% |    28.87% |     17.79% |
| Neon intrinsics (GCC 4.8)  |    18.69% |    20.18% |     14.69% |
| Neon intrinsics (LLVM 3.4) |    18.52% |    21.15% |     13.56% |

BUG=3580
R=andrew@webrtc.org

Review URL: https://webrtc-codereview.appspot.com/23349004

Patch from Zhongwei Yao <zhongwei.yao@arm.com>.

git-svn-id: http://webrtc.googlecode.com/svn/trunk@7686 4adac7df-926f-26a2-2b94-8c16560cd09d
diff --git a/webrtc/modules/audio_processing/aecm/aecm_core_neon.c b/webrtc/modules/audio_processing/aecm/aecm_core_neon.c
index 48d63f7..ff733b7 100644
--- a/webrtc/modules/audio_processing/aecm/aecm_core_neon.c
+++ b/webrtc/modules/audio_processing/aecm/aecm_core_neon.c
@@ -227,72 +227,85 @@
   }
 }
 
+static inline void AddLanes(uint32_t* ptr, uint32x4_t v) {
+#if defined(__aarch64__)
+  *(ptr) = vaddvq_u32(v);
+#else
+  uint32x2_t tmp_v;
+  tmp_v = vadd_u32(vget_low_u32(v), vget_high_u32(v));
+  tmp_v = vpadd_u32(tmp_v, tmp_v);
+  *(ptr) = vget_lane_u32(tmp_v, 0);
+#endif
+}
+
 void WebRtcAecm_CalcLinearEnergiesNeon(AecmCore_t* aecm,
                                        const uint16_t* far_spectrum,
                                        int32_t* echo_est,
                                        uint32_t* far_energy,
                                        uint32_t* echo_energy_adapt,
                                        uint32_t* echo_energy_stored) {
-  int i;
+  int16_t* start_stored_p = aecm->channelStored;
+  int16_t* start_adapt_p = aecm->channelAdapt16;
+  int32_t* echo_est_p = echo_est;
+  const int16_t* end_stored_p = aecm->channelStored + PART_LEN;
+  const uint16_t* far_spectrum_p = far_spectrum;
+  int16x8_t store_v, adapt_v, spectrum_v;
+  uint32x4_t echo_est_v_low, echo_est_v_high;
+  uint32x4_t far_energy_v, echo_energy_stored_v, echo_energy_adapt_v;
 
-  register uint32_t far_energy_r;
-  register uint32_t echo_energy_stored_r;
-  register uint32_t echo_energy_adapt_r;
+  far_energy_v = vdupq_n_u32(0);
+  echo_energy_adapt_v = vdupq_n_u32(0);
+  echo_energy_stored_v = vdupq_n_u32(0);
 
-  assert((uintptr_t)echo_est % 32 == 0);
-  assert((uintptr_t)(aecm->channelStored) % 16 == 0);
-  assert((uintptr_t)(aecm->channelAdapt16) % 16 == 0);
-  assert((uintptr_t)(aecm->channelStored) % 16 == 0);
-  assert((uintptr_t)(aecm->channelStored) % 16 == 0);
+  // Get energy for the delayed far end signal and estimated
+  // echo using both stored and adapted channels.
+  // The C code:
+  //  for (i = 0; i < PART_LEN1; i++) {
+  //      echo_est[i] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i],
+  //                                         far_spectrum[i]);
+  //      (*far_energy) += (uint32_t)(far_spectrum[i]);
+  //      *echo_energy_adapt += aecm->channelAdapt16[i] * far_spectrum[i];
+  //      (*echo_energy_stored) += (uint32_t)echo_est[i];
+  //  }
+  while (start_stored_p < end_stored_p) {
+    spectrum_v = vld1q_u16(far_spectrum_p);
+    adapt_v = vld1q_s16(start_adapt_p);
+    store_v = vld1q_s16(start_stored_p);
 
-  __asm __volatile("vmov.i32 q14, #0" : : : "q14"); // far_energy
-  __asm __volatile("vmov.i32 q8,  #0" : : : "q8"); // echo_energy_stored
-  __asm __volatile("vmov.i32 q9,  #0" : : : "q9"); // echo_energy_adapt
+    far_energy_v = vaddw_u16(far_energy_v, vget_low_s16(spectrum_v));
+    far_energy_v = vaddw_u16(far_energy_v, vget_high_s16(spectrum_v));
 
-  for (i = 0; i < PART_LEN - 7; i += 8) {
-    // far_energy += (uint32_t)(far_spectrum[i]);
-    __asm __volatile("vld1.16 {d26, d27}, [%0]" : : "r"(&far_spectrum[i]) : "q13");
-    __asm __volatile("vaddw.u16 q14, q14, d26" : : : "q14", "q13");
-    __asm __volatile("vaddw.u16 q14, q14, d27" : : : "q14", "q13");
+    echo_est_v_low = vmull_u16(vget_low_s16(store_v), vget_low_s16(spectrum_v));
+    echo_est_v_high = vmull_u16(vget_high_s16(store_v),
+                                vget_high_s16(spectrum_v));
+    vst1q_s32(echo_est_p, echo_est_v_low);
+    vst1q_s32(echo_est_p + 4, echo_est_v_high);
 
-    // Get estimated echo energies for adaptive channel and stored channel.
-    // echoEst[i] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i], far_spectrum[i]);
-    __asm __volatile("vld1.16 {d24, d25}, [%0, :128]" : : "r"(&aecm->channelStored[i]) : "q12");
-    __asm __volatile("vmull.u16 q10, d26, d24" : : : "q12", "q13", "q10");
-    __asm __volatile("vmull.u16 q11, d27, d25" : : : "q12", "q13", "q11");
-    __asm __volatile("vst1.32 {d20, d21, d22, d23}, [%0, :256]" : : "r"(&echo_est[i]):
-            "q10", "q11");
+    echo_energy_stored_v = vaddq_s32(echo_est_v_low, echo_energy_stored_v);
+    echo_energy_stored_v = vaddq_s32(echo_est_v_high, echo_energy_stored_v);
 
-    // echo_energy_stored += (uint32_t)echoEst[i];
-    __asm __volatile("vadd.u32 q8, q10" : : : "q10", "q8");
-    __asm __volatile("vadd.u32 q8, q11" : : : "q11", "q8");
+    echo_energy_adapt_v = vmlal_u16(echo_energy_adapt_v,
+                                    vget_low_s16(adapt_v),
+                                    vget_low_s16(spectrum_v));
+    echo_energy_adapt_v = vmlal_u16(echo_energy_adapt_v,
+                                    vget_high_s16(adapt_v),
+                                    vget_high_s16(spectrum_v));
 
-    // echo_energy_adapt += aecm->channelAdapt16[i] * far_spectrum[i];
-    __asm __volatile("vld1.16 {d24, d25}, [%0, :128]" : : "r"(&aecm->channelAdapt16[i]) : "q12");
-    __asm __volatile("vmull.u16 q10, d26, d24" : : : "q12", "q13", "q10");
-    __asm __volatile("vmull.u16 q11, d27, d25" : : : "q12", "q13", "q11");
-    __asm __volatile("vadd.u32 q9, q10" : : : "q9", "q15");
-    __asm __volatile("vadd.u32 q9, q11" : : : "q9", "q11");
+    start_stored_p += 8;
+    start_adapt_p += 8;
+    far_spectrum_p += 8;
+    echo_est_p += 8;
   }
 
-  __asm __volatile("vadd.u32 d28, d29" : : : "q14");
-  __asm __volatile("vpadd.u32 d28, d28" : : : "q14");
-  __asm __volatile("vmov.32 %0, d28[0]" : "=r"(far_energy_r): : "q14");
+  AddLanes(far_energy, far_energy_v);
+  AddLanes(echo_energy_stored, echo_energy_stored_v);
+  AddLanes(echo_energy_adapt, echo_energy_adapt_v);
 
-  __asm __volatile("vadd.u32 d18, d19" : : : "q9");
-  __asm __volatile("vpadd.u32 d18, d18" : : : "q9");
-  __asm __volatile("vmov.32 %0, d18[0]" : "=r"(echo_energy_adapt_r): : "q9");
-
-  __asm __volatile("vadd.u32 d16, d17" : : : "q8");
-  __asm __volatile("vpadd.u32 d16, d16" : : : "q8");
-  __asm __volatile("vmov.32 %0, d16[0]" : "=r"(echo_energy_stored_r): : "q8");
-
-  // Get estimated echo energies for adaptive channel and stored channel.
-  echo_est[i] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i], far_spectrum[i]);
-  *echo_energy_stored = echo_energy_stored_r + (uint32_t)echo_est[i];
-  *far_energy = far_energy_r + (uint32_t)(far_spectrum[i]);
-  *echo_energy_adapt = echo_energy_adapt_r +
-      aecm->channelAdapt16[i] * far_spectrum[i];
+  echo_est[PART_LEN] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[PART_LEN],
+                                             far_spectrum[PART_LEN]);
+  *echo_energy_stored += (uint32_t)echo_est[PART_LEN];
+  *far_energy += (uint32_t)far_spectrum[PART_LEN];
+  *echo_energy_adapt += aecm->channelAdapt16[PART_LEN] * far_spectrum[PART_LEN];
 }
 
 void WebRtcAecm_StoreAdaptiveChannelNeon(AecmCore_t* aecm,