| /****************************************************************************** |
| * |
| * Copyright 2022 Google LLC |
| * |
| * Licensed under the Apache License, Version 2.0 (the "License"); |
| * you may not use this file except in compliance with the License. |
| * You may obtain a copy of the License at: |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| * |
| ******************************************************************************/ |
| |
| #if __ARM_NEON && __ARM_ARCH_ISA_A64 |
| |
/* When `TEST_NEON` is defined, the test harness is expected to
 * provide its own implementation of the NEON intrinsics */

#ifndef TEST_NEON
#include <arm_neon.h>
#endif /* TEST_NEON */
| |
| |
| /** |
| * FFT 5 Points |
| * The number of interleaved transform `n` assumed to be even |
| */ |
| #ifndef fft_5 |
| #define fft_5 neon_fft_5 |
| LC3_HOT static inline void neon_fft_5( |
| const struct lc3_complex *x, struct lc3_complex *y, int n) |
| { |
| static const union { float f[2]; uint64_t u64; } |
| __cos1 = { { 0.3090169944, 0.3090169944 } }, |
| __cos2 = { { -0.8090169944, -0.8090169944 } }, |
| __sin1 = { { 0.9510565163, -0.9510565163 } }, |
| __sin2 = { { 0.5877852523, -0.5877852523 } }; |
| |
| float32x2_t sin1 = vcreate_f32(__sin1.u64); |
| float32x2_t sin2 = vcreate_f32(__sin2.u64); |
| float32x2_t cos1 = vcreate_f32(__cos1.u64); |
| float32x2_t cos2 = vcreate_f32(__cos2.u64); |
| |
| float32x4_t sin1q = vcombine_f32(sin1, sin1); |
| float32x4_t sin2q = vcombine_f32(sin2, sin2); |
| float32x4_t cos1q = vcombine_f32(cos1, cos1); |
| float32x4_t cos2q = vcombine_f32(cos2, cos2); |
| |
| for (int i = 0; i < n; i += 2, x += 2, y += 10) { |
| |
| float32x4_t y0, y1, y2, y3, y4; |
| |
| float32x4_t x0 = vld1q_f32( (float *)(x + 0*n) ); |
| float32x4_t x1 = vld1q_f32( (float *)(x + 1*n) ); |
| float32x4_t x2 = vld1q_f32( (float *)(x + 2*n) ); |
| float32x4_t x3 = vld1q_f32( (float *)(x + 3*n) ); |
| float32x4_t x4 = vld1q_f32( (float *)(x + 4*n) ); |
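        /* x0..x4 now hold points 0..4 of transforms `i` and `i+1`,
         * as two (re, im) pairs per 128-bit register */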
| |
| float32x4_t s14 = vaddq_f32(x1, x4); |
| float32x4_t s23 = vaddq_f32(x2, x3); |
| |
| float32x4_t d14 = vrev64q_f32( vsubq_f32(x1, x4) ); |
| float32x4_t d23 = vrev64q_f32( vsubq_f32(x2, x3) ); |
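        /* `vrev64q` swaps the real and imaginary parts of each complex
         * value; multiplying the swapped values by the sign-alternating
         * `sin` constants rotates them by -i (or by +i when accumulated
         * with `vfmsq_f32`) */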
| |
| y0 = vaddq_f32( x0, vaddq_f32(s14, s23) ); |
| |
| y4 = vfmaq_f32( x0, s14, cos1q ); |
| y4 = vfmaq_f32( y4, s23, cos2q ); |
| |
| y1 = vfmaq_f32( y4, d14, sin1q ); |
| y1 = vfmaq_f32( y1, d23, sin2q ); |
| |
| y4 = vfmsq_f32( y4, d14, sin1q ); |
| y4 = vfmsq_f32( y4, d23, sin2q ); |
| |
| y3 = vfmaq_f32( x0, s14, cos2q ); |
| y3 = vfmaq_f32( y3, s23, cos1q ); |
| |
| y2 = vfmaq_f32( y3, d14, sin2q ); |
| y2 = vfmsq_f32( y2, d23, sin1q ); |
| |
| y3 = vfmsq_f32( y3, d14, sin2q ); |
| y3 = vfmaq_f32( y3, d23, sin1q ); |
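
        /* De-interleave on output: the 5 points of transform `i` go to
         * y[0..4], those of transform `i+1` to y[5..9] */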
| |
| vst1_f32( (float *)(y + 0), vget_low_f32(y0) ); |
| vst1_f32( (float *)(y + 1), vget_low_f32(y1) ); |
| vst1_f32( (float *)(y + 2), vget_low_f32(y2) ); |
| vst1_f32( (float *)(y + 3), vget_low_f32(y3) ); |
| vst1_f32( (float *)(y + 4), vget_low_f32(y4) ); |
| |
| vst1_f32( (float *)(y + 5), vget_high_f32(y0) ); |
| vst1_f32( (float *)(y + 6), vget_high_f32(y1) ); |
| vst1_f32( (float *)(y + 7), vget_high_f32(y2) ); |
| vst1_f32( (float *)(y + 8), vget_high_f32(y3) ); |
| vst1_f32( (float *)(y + 9), vget_high_f32(y4) ); |
| } |
| } |
| #endif /* fft_5 */ |
| |
| /** |
| * FFT Butterfly 3 Points |
| */ |
| #ifndef fft_bf3 |
| #define fft_bf3 neon_fft_bf3 |
| LC3_HOT static inline void neon_fft_bf3( |
| const struct lc3_fft_bf3_twiddles *twiddles, |
| const struct lc3_complex *x, struct lc3_complex *y, int n) |
| { |
| int n3 = twiddles->n3; |
| const struct lc3_complex (*w0_ptr)[2] = twiddles->t; |
| const struct lc3_complex (*w1_ptr)[2] = w0_ptr + n3; |
| const struct lc3_complex (*w2_ptr)[2] = w1_ptr + n3; |
| |
| const struct lc3_complex *x0_ptr = x; |
| const struct lc3_complex *x1_ptr = x0_ptr + n*n3; |
| const struct lc3_complex *x2_ptr = x1_ptr + n*n3; |
| |
| struct lc3_complex *y0_ptr = y; |
| struct lc3_complex *y1_ptr = y0_ptr + n3; |
| struct lc3_complex *y2_ptr = y1_ptr + n3; |
| |
| for (int j, i = 0; i < n; i++, |
| y0_ptr += 3*n3, y1_ptr += 3*n3, y2_ptr += 3*n3) { |
| |
/* --- Process by pairs --- */
| |
| for (j = 0; j < (n3 >> 1); j++, |
| x0_ptr += 2, x1_ptr += 2, x2_ptr += 2) { |
| |
| float32x4_t x0 = vld1q_f32( (float *)x0_ptr ); |
| float32x4_t x1 = vld1q_f32( (float *)x1_ptr ); |
| float32x4_t x2 = vld1q_f32( (float *)x2_ptr ); |
| |
| float32x4_t x1r = vtrn1q_f32( vrev64q_f32(vnegq_f32(x1)), x1 ); |
| float32x4_t x2r = vtrn1q_f32( vrev64q_f32(vnegq_f32(x2)), x2 ); |
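            /* `x1r` / `x2r` hold (-im, re) of each complex value, so
             * that two multiply-adds by the duplicated real and
             * imaginary parts of a twiddle compute a complex product */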
| |
| float32x4x2_t wn; |
| float32x4_t yn; |
| |
| wn = vld2q_f32( (float *)(w0_ptr + 2*j) ); |
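            /* `vld2q` de-interleaves two consecutive twiddle pairs:
             * val[0] holds their real parts, val[1] their imaginary
             * parts */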
| |
| yn = vfmaq_f32( x0, x1 , vtrn1q_f32(wn.val[0], wn.val[0]) ); |
| yn = vfmaq_f32( yn, x1r, vtrn1q_f32(wn.val[1], wn.val[1]) ); |
| yn = vfmaq_f32( yn, x2 , vtrn2q_f32(wn.val[0], wn.val[0]) ); |
| yn = vfmaq_f32( yn, x2r, vtrn2q_f32(wn.val[1], wn.val[1]) ); |
| vst1q_f32( (float *)(y0_ptr + 2*j), yn ); |
| |
| wn = vld2q_f32( (float *)(w1_ptr + 2*j) ); |
| |
| yn = vfmaq_f32( x0, x1 , vtrn1q_f32(wn.val[0], wn.val[0]) ); |
| yn = vfmaq_f32( yn, x1r, vtrn1q_f32(wn.val[1], wn.val[1]) ); |
| yn = vfmaq_f32( yn, x2 , vtrn2q_f32(wn.val[0], wn.val[0]) ); |
| yn = vfmaq_f32( yn, x2r, vtrn2q_f32(wn.val[1], wn.val[1]) ); |
| vst1q_f32( (float *)(y1_ptr + 2*j), yn ); |
| |
| wn = vld2q_f32( (float *)(w2_ptr + 2*j) ); |
| |
| yn = vfmaq_f32( x0, x1 , vtrn1q_f32(wn.val[0], wn.val[0]) ); |
| yn = vfmaq_f32( yn, x1r, vtrn1q_f32(wn.val[1], wn.val[1]) ); |
| yn = vfmaq_f32( yn, x2 , vtrn2q_f32(wn.val[0], wn.val[0]) ); |
| yn = vfmaq_f32( yn, x2r, vtrn2q_f32(wn.val[1], wn.val[1]) ); |
| vst1q_f32( (float *)(y2_ptr + 2*j), yn ); |
| |
| } |
| |
/* --- Last iteration, when n3 is odd --- */
| |
| if (n3 & 1) { |
| |
| float32x2x2_t wn; |
| float32x2_t yn; |
| |
| float32x2_t x0 = vld1_f32( (float *)(x0_ptr++) ); |
| float32x2_t x1 = vld1_f32( (float *)(x1_ptr++) ); |
| float32x2_t x2 = vld1_f32( (float *)(x2_ptr++) ); |
| |
| float32x2_t x1r = vtrn1_f32( vrev64_f32(vneg_f32(x1)), x1 ); |
| float32x2_t x2r = vtrn1_f32( vrev64_f32(vneg_f32(x2)), x2 ); |
| |
| wn = vld2_f32( (float *)(w0_ptr + 2*j) ); |
| |
| yn = vfma_f32( x0, x1 , vtrn1_f32(wn.val[0], wn.val[0]) ); |
| yn = vfma_f32( yn, x1r, vtrn1_f32(wn.val[1], wn.val[1]) ); |
| yn = vfma_f32( yn, x2 , vtrn2_f32(wn.val[0], wn.val[0]) ); |
| yn = vfma_f32( yn, x2r, vtrn2_f32(wn.val[1], wn.val[1]) ); |
| vst1_f32( (float *)(y0_ptr + 2*j), yn ); |
| |
| wn = vld2_f32( (float *)(w1_ptr + 2*j) ); |
| |
| yn = vfma_f32( x0, x1 , vtrn1_f32(wn.val[0], wn.val[0]) ); |
| yn = vfma_f32( yn, x1r, vtrn1_f32(wn.val[1], wn.val[1]) ); |
| yn = vfma_f32( yn, x2 , vtrn2_f32(wn.val[0], wn.val[0]) ); |
| yn = vfma_f32( yn, x2r, vtrn2_f32(wn.val[1], wn.val[1]) ); |
| vst1_f32( (float *)(y1_ptr + 2*j), yn ); |
| |
| wn = vld2_f32( (float *)(w2_ptr + 2*j) ); |
| |
| yn = vfma_f32( x0, x1 , vtrn1_f32(wn.val[0], wn.val[0]) ); |
| yn = vfma_f32( yn, x1r, vtrn1_f32(wn.val[1], wn.val[1]) ); |
| yn = vfma_f32( yn, x2 , vtrn2_f32(wn.val[0], wn.val[0]) ); |
| yn = vfma_f32( yn, x2r, vtrn2_f32(wn.val[1], wn.val[1]) ); |
| vst1_f32( (float *)(y2_ptr + 2*j), yn ); |
| } |
| |
| } |
| } |
| #endif /* fft_bf3 */ |
| |
| /** |
| * FFT Butterfly 2 Points |
| */ |
| #ifndef fft_bf2 |
| #define fft_bf2 neon_fft_bf2 |
| LC3_HOT static inline void neon_fft_bf2( |
| const struct lc3_fft_bf2_twiddles *twiddles, |
| const struct lc3_complex *x, struct lc3_complex *y, int n) |
| { |
| int n2 = twiddles->n2; |
| const struct lc3_complex *w_ptr = twiddles->t; |
| |
| const struct lc3_complex *x0_ptr = x; |
| const struct lc3_complex *x1_ptr = x0_ptr + n*n2; |
| |
| struct lc3_complex *y0_ptr = y; |
| struct lc3_complex *y1_ptr = y0_ptr + n2; |
| |
| for (int j, i = 0; i < n; i++, y0_ptr += 2*n2, y1_ptr += 2*n2) { |
| |
/* --- Process by pairs --- */
| |
| for (j = 0; j < (n2 >> 1); j++, x0_ptr += 2, x1_ptr += 2) { |
| |
| float32x4_t x0 = vld1q_f32( (float *)x0_ptr ); |
| float32x4_t x1 = vld1q_f32( (float *)x1_ptr ); |
| float32x4_t y0, y1; |
| |
| float32x4_t x1r = vtrn1q_f32( vrev64q_f32(vnegq_f32(x1)), x1 ); |
| |
| float32x4_t w = vld1q_f32( (float *)(w_ptr + 2*j) ); |
| float32x4_t w_re = vtrn1q_f32(w, w); |
| float32x4_t w_im = vtrn2q_f32(w, w); |
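            /* `x1r` holds (-im, re) of each complex value, so the
             * multiply-adds by `w_re` / `w_im` compute the complex
             * product w.x1 (subtracted for y1 with `vfmsq_f32`) */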
| |
| y0 = vfmaq_f32( x0, x1 , w_re ); |
| y0 = vfmaq_f32( y0, x1r, w_im ); |
| vst1q_f32( (float *)(y0_ptr + 2*j), y0 ); |
| |
| y1 = vfmsq_f32( x0, x1 , w_re ); |
| y1 = vfmsq_f32( y1, x1r, w_im ); |
| vst1q_f32( (float *)(y1_ptr + 2*j), y1 ); |
| } |
| |
/* --- Last iteration, when n2 is odd --- */
| |
| if (n2 & 1) { |
| |
| float32x2_t x0 = vld1_f32( (float *)(x0_ptr++) ); |
| float32x2_t x1 = vld1_f32( (float *)(x1_ptr++) ); |
| float32x2_t y0, y1; |
| |
| float32x2_t x1r = vtrn1_f32( vrev64_f32(vneg_f32(x1)), x1 ); |
| |
| float32x2_t w = vld1_f32( (float *)(w_ptr + 2*j) ); |
| float32x2_t w_re = vtrn1_f32(w, w); |
| float32x2_t w_im = vtrn2_f32(w, w); |
| |
| y0 = vfma_f32( x0, x1 , w_re ); |
| y0 = vfma_f32( y0, x1r, w_im ); |
| vst1_f32( (float *)(y0_ptr + 2*j), y0 ); |
| |
| y1 = vfms_f32( x0, x1 , w_re ); |
| y1 = vfms_f32( y1, x1r, w_im ); |
| vst1_f32( (float *)(y1_ptr + 2*j), y1 ); |
| } |
| } |
| } |
| #endif /* fft_bf2 */ |
| |
| #endif /* __ARM_NEON && __ARM_ARCH_ISA_A64 */ |