replace inline assembly WebRtcAecm_CalcLinearEnergiesNeon by intrinsics. The modification only uses the unique part of the CalcLinearEnergies function. Pass byte to byte conformance test both on ARMv7 and ARM64, and the single function performance is similar with original assembly version on different platforms. If not specified, the code is compiled by GCC 4.6. The result is the "X version / C version" ratio, and the less is better. | run 100k times | cortex-a7 | cortex-a9 | cortex-a15 | | use C as the base on each | (1.2Ghz) | (1.0Ghz) | (1.7Ghz) | | CPU target | | | | |----------------------------+-----------+-----------+------------| | Neon asm | 19.48% | 19.26% | 13.68% | | Neon inline | 27.90% | 28.87% | 17.79% | | Neon intrinsics (GCC 4.8) | 18.69% | 20.18% | 14.69% | | Neon intrinsics (LLVM 3.4) | 18.52% | 21.15% | 13.56% | BUG=3580 R=andrew@webrtc.org Review URL: https://webrtc-codereview.appspot.com/23349004 Patch from Zhongwei Yao <zhongwei.yao@arm.com>. git-svn-id: http://webrtc.googlecode.com/svn/trunk@7686 4adac7df-926f-26a2-2b94-8c16560cd09d

commit: 0e37b898f056f7d7a140636162d8e86e2222de98 [log] [tgz]
author: andrew@webrtc.org <andrew@webrtc.org> Tue Nov 11 19:34:14 2014 +0000
committer: andrew@webrtc.org <andrew@webrtc.org> Tue Nov 11 19:34:14 2014 +0000
tree: 04181c12a549ea9bd8b9abe4fbbf7f54612b4bf0
parent: e497be3de17423226dc47e54f651c2c05b3273c1 [diff]
diff --git a/webrtc/modules/audio_processing/aecm/aecm_core_neon.c b/webrtc/modules/audio_processing/aecm/aecm_core_neon.c
index 48d63f7..ff733b7 100644
--- a/webrtc/modules/audio_processing/aecm/aecm_core_neon.c
+++ b/webrtc/modules/audio_processing/aecm/aecm_core_neon.c

@@ -227,72 +227,85 @@
   }
 }
 
+static inline void AddLanes(uint32_t* ptr, uint32x4_t v) {
+#if defined(__aarch64__)
+  *(ptr) = vaddvq_u32(v);
+#else
+  uint32x2_t tmp_v;
+  tmp_v = vadd_u32(vget_low_u32(v), vget_high_u32(v));
+  tmp_v = vpadd_u32(tmp_v, tmp_v);
+  *(ptr) = vget_lane_u32(tmp_v, 0);
+#endif
+}
+
 void WebRtcAecm_CalcLinearEnergiesNeon(AecmCore_t* aecm,
                                        const uint16_t* far_spectrum,
                                        int32_t* echo_est,
                                        uint32_t* far_energy,
                                        uint32_t* echo_energy_adapt,
                                        uint32_t* echo_energy_stored) {
-  int i;
+  int16_t* start_stored_p = aecm->channelStored;
+  int16_t* start_adapt_p = aecm->channelAdapt16;
+  int32_t* echo_est_p = echo_est;
+  const int16_t* end_stored_p = aecm->channelStored + PART_LEN;
+  const uint16_t* far_spectrum_p = far_spectrum;
+  int16x8_t store_v, adapt_v, spectrum_v;
+  uint32x4_t echo_est_v_low, echo_est_v_high;
+  uint32x4_t far_energy_v, echo_energy_stored_v, echo_energy_adapt_v;
 
-  register uint32_t far_energy_r;
-  register uint32_t echo_energy_stored_r;
-  register uint32_t echo_energy_adapt_r;
+  far_energy_v = vdupq_n_u32(0);
+  echo_energy_adapt_v = vdupq_n_u32(0);
+  echo_energy_stored_v = vdupq_n_u32(0);
 
-  assert((uintptr_t)echo_est % 32 == 0);
-  assert((uintptr_t)(aecm->channelStored) % 16 == 0);
-  assert((uintptr_t)(aecm->channelAdapt16) % 16 == 0);
-  assert((uintptr_t)(aecm->channelStored) % 16 == 0);
-  assert((uintptr_t)(aecm->channelStored) % 16 == 0);
+  // Get energy for the delayed far end signal and estimated
+  // echo using both stored and adapted channels.
+  // The C code:
+  //  for (i = 0; i < PART_LEN1; i++) {
+  //      echo_est[i] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i],
+  //                                         far_spectrum[i]);
+  //      (*far_energy) += (uint32_t)(far_spectrum[i]);
+  //      *echo_energy_adapt += aecm->channelAdapt16[i] * far_spectrum[i];
+  //      (*echo_energy_stored) += (uint32_t)echo_est[i];
+  //  }
+  while (start_stored_p < end_stored_p) {
+    spectrum_v = vld1q_u16(far_spectrum_p);
+    adapt_v = vld1q_s16(start_adapt_p);
+    store_v = vld1q_s16(start_stored_p);
 
-  __asm __volatile("vmov.i32 q14, #0" : : : "q14"); // far_energy
-  __asm __volatile("vmov.i32 q8,  #0" : : : "q8"); // echo_energy_stored
-  __asm __volatile("vmov.i32 q9,  #0" : : : "q9"); // echo_energy_adapt
+    far_energy_v = vaddw_u16(far_energy_v, vget_low_s16(spectrum_v));
+    far_energy_v = vaddw_u16(far_energy_v, vget_high_s16(spectrum_v));
 
-  for (i = 0; i < PART_LEN - 7; i += 8) {
-    // far_energy += (uint32_t)(far_spectrum[i]);
-    __asm __volatile("vld1.16 {d26, d27}, [%0]" : : "r"(&far_spectrum[i]) : "q13");
-    __asm __volatile("vaddw.u16 q14, q14, d26" : : : "q14", "q13");
-    __asm __volatile("vaddw.u16 q14, q14, d27" : : : "q14", "q13");
+    echo_est_v_low = vmull_u16(vget_low_s16(store_v), vget_low_s16(spectrum_v));
+    echo_est_v_high = vmull_u16(vget_high_s16(store_v),
+                                vget_high_s16(spectrum_v));
+    vst1q_s32(echo_est_p, echo_est_v_low);
+    vst1q_s32(echo_est_p + 4, echo_est_v_high);
 
-    // Get estimated echo energies for adaptive channel and stored channel.
-    // echoEst[i] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i], far_spectrum[i]);
-    __asm __volatile("vld1.16 {d24, d25}, [%0, :128]" : : "r"(&aecm->channelStored[i]) : "q12");
-    __asm __volatile("vmull.u16 q10, d26, d24" : : : "q12", "q13", "q10");
-    __asm __volatile("vmull.u16 q11, d27, d25" : : : "q12", "q13", "q11");
-    __asm __volatile("vst1.32 {d20, d21, d22, d23}, [%0, :256]" : : "r"(&echo_est[i]):
-            "q10", "q11");
+    echo_energy_stored_v = vaddq_s32(echo_est_v_low, echo_energy_stored_v);
+    echo_energy_stored_v = vaddq_s32(echo_est_v_high, echo_energy_stored_v);
 
-    // echo_energy_stored += (uint32_t)echoEst[i];
-    __asm __volatile("vadd.u32 q8, q10" : : : "q10", "q8");
-    __asm __volatile("vadd.u32 q8, q11" : : : "q11", "q8");
+    echo_energy_adapt_v = vmlal_u16(echo_energy_adapt_v,
+                                    vget_low_s16(adapt_v),
+                                    vget_low_s16(spectrum_v));
+    echo_energy_adapt_v = vmlal_u16(echo_energy_adapt_v,
+                                    vget_high_s16(adapt_v),
+                                    vget_high_s16(spectrum_v));
 
-    // echo_energy_adapt += aecm->channelAdapt16[i] * far_spectrum[i];
-    __asm __volatile("vld1.16 {d24, d25}, [%0, :128]" : : "r"(&aecm->channelAdapt16[i]) : "q12");
-    __asm __volatile("vmull.u16 q10, d26, d24" : : : "q12", "q13", "q10");
-    __asm __volatile("vmull.u16 q11, d27, d25" : : : "q12", "q13", "q11");
-    __asm __volatile("vadd.u32 q9, q10" : : : "q9", "q15");
-    __asm __volatile("vadd.u32 q9, q11" : : : "q9", "q11");
+    start_stored_p += 8;
+    start_adapt_p += 8;
+    far_spectrum_p += 8;
+    echo_est_p += 8;
   }
 
-  __asm __volatile("vadd.u32 d28, d29" : : : "q14");
-  __asm __volatile("vpadd.u32 d28, d28" : : : "q14");
-  __asm __volatile("vmov.32 %0, d28[0]" : "=r"(far_energy_r): : "q14");
+  AddLanes(far_energy, far_energy_v);
+  AddLanes(echo_energy_stored, echo_energy_stored_v);
+  AddLanes(echo_energy_adapt, echo_energy_adapt_v);
 
-  __asm __volatile("vadd.u32 d18, d19" : : : "q9");
-  __asm __volatile("vpadd.u32 d18, d18" : : : "q9");
-  __asm __volatile("vmov.32 %0, d18[0]" : "=r"(echo_energy_adapt_r): : "q9");
-
-  __asm __volatile("vadd.u32 d16, d17" : : : "q8");
-  __asm __volatile("vpadd.u32 d16, d16" : : : "q8");
-  __asm __volatile("vmov.32 %0, d16[0]" : "=r"(echo_energy_stored_r): : "q8");
-
-  // Get estimated echo energies for adaptive channel and stored channel.
-  echo_est[i] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i], far_spectrum[i]);
-  *echo_energy_stored = echo_energy_stored_r + (uint32_t)echo_est[i];
-  *far_energy = far_energy_r + (uint32_t)(far_spectrum[i]);
-  *echo_energy_adapt = echo_energy_adapt_r +
-      aecm->channelAdapt16[i] * far_spectrum[i];
+  echo_est[PART_LEN] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[PART_LEN],
+                                             far_spectrum[PART_LEN]);
+  *echo_energy_stored += (uint32_t)echo_est[PART_LEN];
+  *far_energy += (uint32_t)far_spectrum[PART_LEN];
+  *echo_energy_adapt += aecm->channelAdapt16[PART_LEN] * far_spectrum[PART_LEN];
 }
 
 void WebRtcAecm_StoreAdaptiveChannelNeon(AecmCore_t* aecm,
commit	0e37b898f056f7d7a140636162d8e86e2222de98	[log] [tgz]
author	andrew@webrtc.org <andrew@webrtc.org>	Tue Nov 11 19:34:14 2014 +0000
committer	andrew@webrtc.org <andrew@webrtc.org>	Tue Nov 11 19:34:14 2014 +0000
tree	04181c12a549ea9bd8b9abe4fbbf7f54612b4bf0
parent	e497be3de17423226dc47e54f651c2c05b3273c1 [diff]