| /* |
| * Copyright (c) 2014 The WebRTC project authors. All Rights Reserved. |
| * |
| * Use of this source code is governed by a BSD-style license |
| * that can be found in the LICENSE file in the root of the source |
| * tree. An additional intellectual property rights grant can be found |
| * in the file PATENTS. All contributing project authors may |
| * be found in the AUTHORS file in the root of the source tree. |
| */ |
| |
| /* |
| * The core AEC algorithm, neon version of speed-critical functions. |
| * |
| * Based on aec_core_sse2.c. |
| */ |
| |
| #include "webrtc/modules/audio_processing/aec/aec_core.h" |
| |
| #include <arm_neon.h> |
| #include <math.h> |
| #include <string.h> // memset |
| |
| #include "webrtc/modules/audio_processing/aec/aec_core_internal.h" |
| #include "webrtc/modules/audio_processing/aec/aec_rdft.h" |
| |
// Bit-manipulation constants used by vpowq_f32() when decomposing an
// IEEE-754 single-precision float into exponent and mantissa parts:
// shifting right by 8 moves the 8 exponent bits into the top of the 23-bit
// mantissa field; shifting left by 23 places an integer into the exponent
// field.
enum { kShiftExponentIntoTopMantissa = 8 };
enum { kFloatExponentShift = 23 };
| |
// Real part of the complex product (aRe + i*aIm) * (bRe + i*bIm).
// Callers that need re(conjugate(a) * b) pass a negated aIm instead.
__inline static float MulRe(float aRe, float aIm, float bRe, float bIm) {
  const float re_product = aRe * bRe;
  const float im_product = aIm * bIm;
  return re_product - im_product;
}
| |
// NEON counterpart of the scalar filter-adaptation step: for every filter
// partition, correlate the error spectrum |ef| with the stored far-end
// spectrum |aec->xfBuf| (conjugate product), constrain the result through an
// IFFT -> zero-upper-half -> FFT round trip, and accumulate it into the
// adaptive filter weights |aec->wfBuf|.
//
// |fft| is caller-owned scratch space of at least PART_LEN2 floats.
// NOTE(review): |ef| is presumably the step-size-scaled error spectrum
// (real parts in ef[0], imaginary parts in ef[1]) — confirm against caller.
static void FilterAdaptationNEON(AecCore* aec,
                                 float* fft,
                                 float ef[2][PART_LEN1]) {
  int i;
  const int num_partitions = aec->num_partitions;
  for (i = 0; i < num_partitions; i++) {
    int xPos = (i + aec->xfBufBlockPos) * PART_LEN1;
    int pos = i * PART_LEN1;
    int j;
    // Check for wrap: xfBuf is a circular buffer of |num_partitions|
    // partitions, so rewind the index once it runs past the end.
    if (i + aec->xfBufBlockPos >= num_partitions) {
      xPos -= num_partitions * PART_LEN1;
    }

    // Process the whole array, four bins per iteration...
    for (j = 0; j < PART_LEN; j += 4) {
      // Load xfBuf and ef.
      const float32x4_t xfBuf_re = vld1q_f32(&aec->xfBuf[0][xPos + j]);
      const float32x4_t xfBuf_im = vld1q_f32(&aec->xfBuf[1][xPos + j]);
      const float32x4_t ef_re = vld1q_f32(&ef[0][j]);
      const float32x4_t ef_im = vld1q_f32(&ef[1][j]);
      // Calculate the product of conjugate(xfBuf) by ef.
      //   re(conjugate(a) * b) = aRe * bRe + aIm * bIm
      //   im(conjugate(a) * b) = aRe * bIm - aIm * bRe
      const float32x4_t a = vmulq_f32(xfBuf_re, ef_re);
      const float32x4_t e = vmlaq_f32(a, xfBuf_im, ef_im);
      const float32x4_t c = vmulq_f32(xfBuf_re, ef_im);
      const float32x4_t f = vmlsq_f32(c, xfBuf_im, ef_re);
      // Interleave real and imaginary parts into (re, im) pairs.
      const float32x4x2_t g_n_h = vzipq_f32(e, f);
      // Store
      vst1q_f32(&fft[2 * j + 0], g_n_h.val[0]);
      vst1q_f32(&fft[2 * j + 4], g_n_h.val[1]);
    }
    // ... and fixup the first imaginary entry. The vector loop wrote the
    // imaginary part of bin 0 into fft[1], but the packed rdft layout keeps
    // the real part of the Nyquist bin (PART_LEN) there, so compute that
    // conjugate product here (note the negated imaginary input to MulRe).
    fft[1] = MulRe(aec->xfBuf[0][xPos + PART_LEN],
                   -aec->xfBuf[1][xPos + PART_LEN],
                   ef[0][PART_LEN],
                   ef[1][PART_LEN]);

    aec_rdft_inverse_128(fft);
    // Constrain the update: zero the second half of the time-domain
    // response before transforming back to the frequency domain.
    memset(fft + PART_LEN, 0, sizeof(float) * PART_LEN);

    // fft scaling (undo the inverse transform's gain).
    {
      const float scale = 2.0f / PART_LEN2;
      const float32x4_t scale_ps = vmovq_n_f32(scale);
      for (j = 0; j < PART_LEN; j += 4) {
        const float32x4_t fft_ps = vld1q_f32(&fft[j]);
        const float32x4_t fft_scale = vmulq_f32(fft_ps, scale_ps);
        vst1q_f32(&fft[j], fft_scale);
      }
    }
    aec_rdft_forward_128(fft);

    {
      // Accumulate the constrained update into the weights. fft[1] holds the
      // Nyquist-bin real part, which belongs in wfBuf[0][pos + PART_LEN];
      // however, the vector loop below also adds fft[1] into wfBuf[1][pos]
      // (lane 0 of the de-interleaved "imaginary" vector), so that slot is
      // saved here and restored afterwards to cancel the spurious add.
      const float wt1 = aec->wfBuf[1][pos];
      aec->wfBuf[0][pos + PART_LEN] += fft[1];
      for (j = 0; j < PART_LEN; j += 4) {
        float32x4_t wtBuf_re = vld1q_f32(&aec->wfBuf[0][pos + j]);
        float32x4_t wtBuf_im = vld1q_f32(&aec->wfBuf[1][pos + j]);
        const float32x4_t fft0 = vld1q_f32(&fft[2 * j + 0]);
        const float32x4_t fft4 = vld1q_f32(&fft[2 * j + 4]);
        // De-interleave: even indices -> real parts, odd -> imaginary.
        const float32x4x2_t fft_re_im = vuzpq_f32(fft0, fft4);
        wtBuf_re = vaddq_f32(wtBuf_re, fft_re_im.val[0]);
        wtBuf_im = vaddq_f32(wtBuf_im, fft_re_im.val[1]);

        vst1q_f32(&aec->wfBuf[0][pos + j], wtBuf_re);
        vst1q_f32(&aec->wfBuf[1][pos + j], wtBuf_im);
      }
      aec->wfBuf[1][pos] = wt1;  // Undo the spurious Nyquist-term add.
    }
  }
}
| |
// Per-band suppression curves shared with the generic C implementation;
// defined in aec_core.c (65 == PART_LEN1 entries).
extern const float WebRtcAec_weightCurve[65];
extern const float WebRtcAec_overDriveCurve[65];
| |
// Vectorized powf() substitute: computes a^b lane-wise for four floats.
// Only meaningful for lanes where a > 0 (the log2 decomposition below reads
// the IEEE-754 exponent/mantissa directly and ignores the sign bit).
// Accuracy is bounded by the two polynomial approximations used.
static float32x4_t vpowq_f32(float32x4_t a, float32x4_t b) {
  // a^b = exp2(b * log2(a))
  // exp2(x) and log2(x) are calculated using polynomial approximations.
  float32x4_t log2_a, b_log2_a, a_exp_b;

  // Calculate log2(x), x = a.
  {
    // To calculate log2(x), we decompose x like this:
    //   x = y * 2^n
    //     n is an integer
    //     y is in the [1.0, 2.0) range
    //
    //   log2(x) = log2(y) + n
    //     n       can be evaluated by playing with float representation.
    //     log2(y) in a small range can be approximated, this code uses an
    //             order five polynomial approximation. The coefficients have
    //             been estimated with the Remez algorithm and the resulting
    //             polynomial has a maximum relative error of 0.00086%.

    // Compute n.
    //    This is done by masking the exponent, shifting it into the top bit
    //    of the mantissa, putting eight into the biased exponent (to shift/
    //    compensate the fact that the exponent has been shifted in the top/
    //    fractional part and finally getting rid of the implicit leading one
    //    from the mantissa by subtracting it out.
    const uint32x4_t vec_float_exponent_mask = vdupq_n_u32(0x7F800000);
    const uint32x4_t vec_eight_biased_exponent = vdupq_n_u32(0x43800000);
    const uint32x4_t vec_implicit_leading_one = vdupq_n_u32(0x43BF8000);
    const uint32x4_t two_n = vandq_u32(vreinterpretq_u32_f32(a),
                                       vec_float_exponent_mask);
    const uint32x4_t n_1 = vshrq_n_u32(two_n, kShiftExponentIntoTopMantissa);
    const uint32x4_t n_0 = vorrq_u32(n_1, vec_eight_biased_exponent);
    const float32x4_t n =
        vsubq_f32(vreinterpretq_f32_u32(n_0),
                  vreinterpretq_f32_u32(vec_implicit_leading_one));
    // Compute y: keep the mantissa and force the exponent to zero (biased
    // value 127), yielding a float in [1.0, 2.0).
    const uint32x4_t vec_mantissa_mask = vdupq_n_u32(0x007FFFFF);
    const uint32x4_t vec_zero_biased_exponent_is_one = vdupq_n_u32(0x3F800000);
    const uint32x4_t mantissa = vandq_u32(vreinterpretq_u32_f32(a),
                                          vec_mantissa_mask);
    const float32x4_t y =
        vreinterpretq_f32_u32(vorrq_u32(mantissa,
                                        vec_zero_biased_exponent_is_one));
    // Approximate log2(y) ~= (y - 1) * pol5(y).
    //    pol5(y) = C5 * y^5 + C4 * y^4 + C3 * y^3 + C2 * y^2 + C1 * y + C0
    // evaluated below in Horner form.
    const float32x4_t C5 = vdupq_n_f32(-3.4436006e-2f);
    const float32x4_t C4 = vdupq_n_f32(3.1821337e-1f);
    const float32x4_t C3 = vdupq_n_f32(-1.2315303f);
    const float32x4_t C2 = vdupq_n_f32(2.5988452f);
    const float32x4_t C1 = vdupq_n_f32(-3.3241990f);
    const float32x4_t C0 = vdupq_n_f32(3.1157899f);
    float32x4_t pol5_y = C5;
    pol5_y = vmlaq_f32(C4, y, pol5_y);
    pol5_y = vmlaq_f32(C3, y, pol5_y);
    pol5_y = vmlaq_f32(C2, y, pol5_y);
    pol5_y = vmlaq_f32(C1, y, pol5_y);
    pol5_y = vmlaq_f32(C0, y, pol5_y);
    const float32x4_t y_minus_one =
        vsubq_f32(y, vreinterpretq_f32_u32(vec_zero_biased_exponent_is_one));
    const float32x4_t log2_y = vmulq_f32(y_minus_one, pol5_y);

    // Combine parts.
    log2_a = vaddq_f32(n, log2_y);
  }

  // b * log2(a)
  b_log2_a = vmulq_f32(b, log2_a);

  // Calculate exp2(x), x = b * log2(a).
  {
    // To calculate 2^x, we decompose x like this:
    //   x = n + y
    //     n is an integer, the value of x - 0.5 rounded down, therefore
    //     y is in the [0.5, 1.5) range
    //
    //   2^x = 2^n * 2^y
    //     2^n can be evaluated by playing with float representation.
    //     2^y in a small range can be approximated, this code uses an order
    //         two polynomial approximation. The coefficients have been
    //         estimated with the Remez algorithm and the resulting
    //         polynomial has a maximum relative error of 0.17%.
    // To avoid over/underflow, we reduce the range of input to ]-127, 129].
    const float32x4_t max_input = vdupq_n_f32(129.f);
    const float32x4_t min_input = vdupq_n_f32(-126.99999f);
    const float32x4_t x_min = vminq_f32(b_log2_a, max_input);
    const float32x4_t x_max = vmaxq_f32(x_min, min_input);
    // Compute n: truncation toward zero acts as floor here because x - 0.5
    // stays above -127.5 after the clamp above.
    const float32x4_t half = vdupq_n_f32(0.5f);
    const float32x4_t x_minus_half = vsubq_f32(x_max, half);
    const int32x4_t x_minus_half_floor = vcvtq_s32_f32(x_minus_half);

    // Compute 2^n by building its float representation directly: biased
    // exponent n + 127, zero mantissa.
    const int32x4_t float_exponent_bias = vdupq_n_s32(127);
    const int32x4_t two_n_exponent =
        vaddq_s32(x_minus_half_floor, float_exponent_bias);
    const float32x4_t two_n =
        vreinterpretq_f32_s32(vshlq_n_s32(two_n_exponent, kFloatExponentShift));
    // Compute y.
    const float32x4_t y = vsubq_f32(x_max, vcvtq_f32_s32(x_minus_half_floor));

    // Approximate 2^y ~= C2 * y^2 + C1 * y + C0 (Horner form).
    const float32x4_t C2 = vdupq_n_f32(3.3718944e-1f);
    const float32x4_t C1 = vdupq_n_f32(6.5763628e-1f);
    const float32x4_t C0 = vdupq_n_f32(1.0017247f);
    float32x4_t exp2_y = C2;
    exp2_y = vmlaq_f32(C1, y, exp2_y);
    exp2_y = vmlaq_f32(C0, y, exp2_y);

    // Combine parts.
    a_exp_b = vmulq_f32(exp2_y, two_n);
  }

  return a_exp_b;
}
| |
// NEON version of the overdrive-and-suppress step: shapes the per-band
// suppression gains |hNl| and applies them to the error spectrum |efw|.
//   1. Bands whose gain exceeds the feedback gain |hNlFb| are blended
//      toward it using WebRtcAec_weightCurve.
//   2. The gains are over-driven: hNl = hNl^(overDriveSm * overDriveCurve).
//   3. The error spectrum is multiplied by the gains, and the imaginary
//      part's sign is flipped (see the Ooura FFT note below).
static void OverdriveAndSuppressNEON(AecCore* aec,
                                     float hNl[PART_LEN1],
                                     const float hNlFb,
                                     float efw[2][PART_LEN1]) {
  int i;
  const float32x4_t vec_hNlFb = vmovq_n_f32(hNlFb);
  const float32x4_t vec_one = vdupq_n_f32(1.0f);
  const float32x4_t vec_minus_one = vdupq_n_f32(-1.0f);
  const float32x4_t vec_overDriveSm = vmovq_n_f32(aec->overDriveSm);

  // vectorized code (four at once)
  for (i = 0; i + 3 < PART_LEN1; i += 4) {
    // Weight subbands: branchless select between the original gain and the
    // blended gain weightCurve * hNlFb + (1 - weightCurve) * hNl, keyed on
    // the per-lane comparison hNl > hNlFb.
    float32x4_t vec_hNl = vld1q_f32(&hNl[i]);
    const float32x4_t vec_weightCurve = vld1q_f32(&WebRtcAec_weightCurve[i]);
    const uint32x4_t bigger = vcgtq_f32(vec_hNl, vec_hNlFb);
    const float32x4_t vec_weightCurve_hNlFb = vmulq_f32(vec_weightCurve,
                                                        vec_hNlFb);
    const float32x4_t vec_one_weightCurve = vsubq_f32(vec_one, vec_weightCurve);
    const float32x4_t vec_one_weightCurve_hNl = vmulq_f32(vec_one_weightCurve,
                                                          vec_hNl);
    // vec_if0 keeps the original gain in lanes where !(hNl > hNlFb)...
    const uint32x4_t vec_if0 = vandq_u32(vmvnq_u32(bigger),
                                         vreinterpretq_u32_f32(vec_hNl));
    const float32x4_t vec_one_weightCurve_add =
        vaddq_f32(vec_weightCurve_hNlFb, vec_one_weightCurve_hNl);
    // ... and vec_if1 takes the blended gain where (hNl > hNlFb).
    const uint32x4_t vec_if1 =
        vandq_u32(bigger, vreinterpretq_u32_f32(vec_one_weightCurve_add));

    // Merge the two disjoint lane sets.
    vec_hNl = vreinterpretq_f32_u32(vorrq_u32(vec_if0, vec_if1));

    {
      // Over-drive: raise the gain to the smoothed over-drive power.
      const float32x4_t vec_overDriveCurve =
          vld1q_f32(&WebRtcAec_overDriveCurve[i]);
      const float32x4_t vec_overDriveSm_overDriveCurve =
          vmulq_f32(vec_overDriveSm, vec_overDriveCurve);
      vec_hNl = vpowq_f32(vec_hNl, vec_overDriveSm_overDriveCurve);
      vst1q_f32(&hNl[i], vec_hNl);
    }

    // Suppress error signal
    {
      float32x4_t vec_efw_re = vld1q_f32(&efw[0][i]);
      float32x4_t vec_efw_im = vld1q_f32(&efw[1][i]);
      vec_efw_re = vmulq_f32(vec_efw_re, vec_hNl);
      vec_efw_im = vmulq_f32(vec_efw_im, vec_hNl);

      // Ooura fft returns incorrect sign on imaginary component. It matters
      // here because we are making an additive change with comfort noise.
      vec_efw_im = vmulq_f32(vec_efw_im, vec_minus_one);
      vst1q_f32(&efw[0][i], vec_efw_re);
      vst1q_f32(&efw[1][i], vec_efw_im);
    }
  }

  // scalar code for the remaining items (PART_LEN1 is not a multiple of 4).
  for (; i < PART_LEN1; i++) {
    // Weight subbands
    if (hNl[i] > hNlFb) {
      hNl[i] = WebRtcAec_weightCurve[i] * hNlFb +
               (1 - WebRtcAec_weightCurve[i]) * hNl[i];
    }

    hNl[i] = powf(hNl[i], aec->overDriveSm * WebRtcAec_overDriveCurve[i]);

    // Suppress error signal
    efw[0][i] *= hNl[i];
    efw[1][i] *= hNl[i];

    // Ooura fft returns incorrect sign on imaginary component. It matters
    // here because we are making an additive change with comfort noise.
    efw[1][i] *= -1;
  }
}
| |
| void WebRtcAec_InitAec_neon(void) { |
| WebRtcAec_FilterAdaptation = FilterAdaptationNEON; |
| WebRtcAec_OverdriveAndSuppress = OverdriveAndSuppressNEON; |
| } |
| |