test_pffastconv.c - platform/external/pffft - Git at Google

 /*
   Copyright (c) 2013 Julien Pommier.
   Copyright (c) 2019  Hayati Ayguen ( h_ayguen@web.de )
  */

 #define _WANT_SNAN  1

 #include "pffft.h"
 #include "pffastconv.h"

 #include <math.h>
 #include <float.h>
 #include <limits.h>
 #include <inttypes.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <time.h>
 #include <assert.h>
 #include <string.h>

 #ifdef HAVE_SYS_TIMES
 #  include <sys/times.h>
 #  include <unistd.h>
 #endif

 /*
    vector support macros: the rest of the code is independant of
    SSE/Altivec/NEON -- adding support for other platforms with 4-element
    vectors should be limited to these macros
 */
 #if 0
 #include "simd/pf_float.h"
 #endif

 #if defined(_MSC_VER)
 #  define RESTRICT __restrict
 #elif defined(__GNUC__)
 #  define RESTRICT __restrict
 #else
 #  define RESTRICT
 #endif


 #if defined(_MSC_VER)
 #pragma warning( disable : 4244 )
 #endif


 #ifdef SNANF
   #define INVALID_FLOAT_VAL  SNANF
 #elif defined(SNAN)
   #define INVALID_FLOAT_VAL  SNAN
 #elif defined(NAN)
   #define INVALID_FLOAT_VAL  NAN
 #elif defined(INFINITY)
   #define INVALID_FLOAT_VAL  INFINITY
 #else
   #define INVALID_FLOAT_VAL  FLT_MAX
 #endif


 #if defined(HAVE_SYS_TIMES)
   inline double uclock_sec(void) {
     static double ttclk = 0.;
     struct tms t;
     if (ttclk == 0.)
       ttclk = sysconf(_SC_CLK_TCK);
     times(&t);
     /* use only the user time of this process - not realtime, which depends on OS-scheduler .. */
     return ((double)t.tms_utime)) / ttclk;
   }
 # else
   double uclock_sec(void)
 { return (double)clock()/(double)CLOCKS_PER_SEC; }
 #endif


 typedef int            (*pfnConvolution)  (void * setup, const float * X, int len, float *Y, const float *Yref, int applyFlush);
 typedef void*          (*pfnConvSetup)    (float *Hfwd, int Nf, int * BlkLen, int flags);
 typedef pfnConvolution (*pfnGetConvFnPtr) (void * setup);
 typedef void           (*pfnConvDestroy)  (void * setup);


 struct ConvSetup
 {
   pfnConvolution pfn;
   int N;
   int B;
   float * H;
   int flags;
 };


 void * convSetupRev( float * H, int N, int * BlkLen, int flags )
 {
   struct ConvSetup * s = pffastconv_malloc( sizeof(struct ConvSetup) );
   int i, Nr = N;
   if (flags & PFFASTCONV_CPLX_INP_OUT)
     Nr *= 2;
   Nr += 4;
   s->pfn = NULL;
   s->N = N;
   s->B = *BlkLen;
   s->H = pffastconv_malloc((unsigned)Nr * sizeof(float));
   s->flags = flags;
   memset(s->H, 0, (unsigned)Nr * sizeof(float));
   if (flags & PFFASTCONV_CPLX_INP_OUT)
   {
     for ( i = 0; i < N; ++i ) {
       s->H[2*(N-1 -i)  ] = H[i];
       s->H[2*(N-1 -i)+1] = H[i];
     }
     /* simpler detection of overruns */
     s->H[ 2*N    ] = INVALID_FLOAT_VAL;
     s->H[ 2*N +1 ] = INVALID_FLOAT_VAL;
     s->H[ 2*N +2 ] = INVALID_FLOAT_VAL;
     s->H[ 2*N +3 ] = INVALID_FLOAT_VAL;
   }
   else
   {
     for ( i = 0; i < N; ++i )
       s->H[ N-1 -i ] = H[i];
     /* simpler detection of overruns */
     s->H[ N    ] = INVALID_FLOAT_VAL;
     s->H[ N +1 ] = INVALID_FLOAT_VAL;
     s->H[ N +2 ] = INVALID_FLOAT_VAL;
     s->H[ N +3 ] = INVALID_FLOAT_VAL;
   }
   return s;
 }

 void convDestroyRev( void * setup )
 {
   struct ConvSetup * s = (struct ConvSetup*)setup;
   pffastconv_free(s->H);
   pffastconv_free(setup);
 }


 pfnConvolution ConvGetFnPtrRev( void * setup )
 {
   struct ConvSetup * s = (struct ConvSetup*)setup;
   if (!s)
     return NULL;
   return s->pfn;
 }


 void convSimdDestroy( void * setup )
 {
   convDestroyRev(setup);
 }


 void * fastConvSetup( float * H, int N, int * BlkLen, int flags )
 {
   void * p = pffastconv_new_setup( H, N, BlkLen, flags );
   if (!p)
     printf("fastConvSetup(N = %d, *BlkLen = %d, flags = %d) = NULL\n", N, *BlkLen, flags);
   return p;
 }


 void fastConvDestroy( void * setup )
 {
   pffastconv_destroy_setup( (PFFASTCONV_Setup*)setup );
 }


 int slow_conv_R(void * setup, const float * input, int len, float *output, const float *Yref, int applyFlush)
 {
   struct ConvSetup * p = (struct ConvSetup*)setup;
   const float * RESTRICT X = input;
   const float * RESTRICT Hrev = p->H;
   float * RESTRICT Y = output;
   const int Nr = ((p->flags & PFFASTCONV_CPLX_INP_OUT) ? 2 : 1) * p->N;
   const int lenNr = ((p->flags & PFFASTCONV_CPLX_INP_OUT) ? 2 : 1) * (len - p->N);
   int i, j;
   (void)Yref;
   (void)applyFlush;

   if (p->flags & PFFASTCONV_CPLX_INP_OUT)
   {
     for ( i = 0; i <= lenNr; i += 2 )
     {
       float sumRe = 0.0F, sumIm = 0.0F;
       for ( j = 0; j < Nr; j += 2 )
       {
         sumRe += X[i+j  ] * Hrev[j];
         sumIm += X[i+j+1] * Hrev[j+1];
       }
       Y[i  ] = sumRe;
       Y[i+1] = sumIm;
     }
     return i/2;
   }
   else
   {
     for ( i = 0; i <= lenNr; ++i )
     {
       float sum = 0.0F;
       for (j = 0; j < Nr; ++j )
         sum += X[i+j]   * Hrev[j];
       Y[i] = sum;
     }
     return i;
   }
 }


 int slow_conv_A(void * setup, const float * input, int len, float *output, const float *Yref, int applyFlush)
 {
   float sum[4];
   struct ConvSetup * p = (struct ConvSetup*)setup;
   const float * RESTRICT X = input;
   const float * RESTRICT Hrev = p->H;
   float * RESTRICT Y = output;
   const int Nr = ((p->flags & PFFASTCONV_CPLX_INP_OUT) ? 2 : 1) * p->N;
   const int lenNr = ((p->flags & PFFASTCONV_CPLX_INP_OUT) ? 2 : 1) * (len - p->N);
   int i, j;
   (void)Yref;
   (void)applyFlush;

   if (p->flags & PFFASTCONV_CPLX_INP_OUT)
   {
     if ( (Nr & 3) == 0 )
     {
       for ( i = 0; i <= lenNr; i += 2 )
       {
         sum[0] = sum[1] = sum[2] = sum[3] = 0.0F;
         for (j = 0; j < Nr; j += 4 )
         {
           sum[0] += X[i+j]   * Hrev[j];
           sum[1] += X[i+j+1] * Hrev[j+1];
           sum[2] += X[i+j+2] * Hrev[j+2];
           sum[3] += X[i+j+3] * Hrev[j+3];
         }
         Y[i  ] = sum[0] + sum[2];
         Y[i+1] = sum[1] + sum[3];
       }
     }
     else
     {
       const int M = Nr & (~3);
       for ( i = 0; i <= lenNr; i += 2 )
       {
         float tailSumRe = 0.0F, tailSumIm = 0.0F;
         sum[0] = sum[1] = sum[2] = sum[3] = 0.0F;
         for (j = 0; j < M; j += 4 )
         {
           sum[0] += X[i+j  ] * Hrev[j  ];
           sum[1] += X[i+j+1] * Hrev[j+1];
           sum[2] += X[i+j+2] * Hrev[j+2];
           sum[3] += X[i+j+3] * Hrev[j+3];
         }
         for ( ; j < Nr; j += 2 ) {
           tailSumRe += X[i+j  ] * Hrev[j  ];
           tailSumIm += X[i+j+1] * Hrev[j+1];
         }
         Y[i  ] = ( sum[0] + sum[2] ) + tailSumRe;
         Y[i+1] = ( sum[1] + sum[3] ) + tailSumIm;
       }
     }
     return i/2;
   }
   else
   {
     if ( (Nr & 3) == 0 )
     {
       for ( i = 0; i <= lenNr; ++i )
       {
         sum[0] = sum[1] = sum[2] = sum[3] = 0.0F;
         for (j = 0; j < Nr; j += 4 )
         {
           sum[0] += X[i+j]   * Hrev[j];
           sum[1] += X[i+j+1] * Hrev[j+1];
           sum[2] += X[i+j+2] * Hrev[j+2];
           sum[3] += X[i+j+3] * Hrev[j+3];
         }
         Y[i] = sum[0] + sum[1] + sum[2] + sum[3];
       }
       return i;
     }
     else
     {
       const int M = Nr & (~3);
       /* printf("A: Nr = %d, M = %d, H[M] = %f, H[M+1] = %f, H[M+2] = %f, H[M+3] = %f\n", Nr, M, Hrev[M], Hrev[M+1], Hrev[M+2], Hrev[M+3] ); */
       for ( i = 0; i <= lenNr; ++i )
       {
         float tailSum = 0.0;
         sum[0] = sum[1] = sum[2] = sum[3] = 0.0F;
         for (j = 0; j < M; j += 4 )
         {
           sum[0] += X[i+j]   * Hrev[j];
           sum[1] += X[i+j+1] * Hrev[j+1];
           sum[2] += X[i+j+2] * Hrev[j+2];
           sum[3] += X[i+j+3] * Hrev[j+3];
         }
         for ( ; j < Nr; ++j )
           tailSum += X[i+j] * Hrev[j];
         Y[i] = (sum[0] + sum[1]) + (sum[2] + sum[3]) + tailSum;
       }
       return i;
     }
   }
 }


 int slow_conv_B(void * setup, const float * input, int len, float *output, const float *Yref, int applyFlush)
 {
   float sum[4];
   struct ConvSetup * p = (struct ConvSetup*)setup;
   (void)Yref;
   (void)applyFlush;
   if (p->flags & PFFASTCONV_SYMMETRIC)
   {
     const float * RESTRICT X = input;
     const float * RESTRICT Hrev = p->H;
     float * RESTRICT Y = output;
     const int Nr = ((p->flags & PFFASTCONV_CPLX_INP_OUT) ? 2 : 1) * p->N;
     const int lenNr = ((p->flags & PFFASTCONV_CPLX_INP_OUT) ? 2 : 1) * (len - p->N);
     const int h = Nr / 2 -4;
     const int E = Nr -4;
     int i, j;

     if (p->flags & PFFASTCONV_CPLX_INP_OUT)
     {
       for ( i = 0; i <= lenNr; i += 2 )
       {
         const int k = i + E;
         sum[0] = sum[1] = sum[2] = sum[3] = 0.0F;
         for (j = 0; j <= h; j += 4 )
         {
           sum[0] += Hrev[j  ] * ( X[i+j  ] + X[k-j+2] );
           sum[1] += Hrev[j+1] * ( X[i+j+1] + X[k-j+3] );
           sum[2] += Hrev[j+2] * ( X[i+j+2] + X[k-j  ] );
           sum[3] += Hrev[j+3] * ( X[i+j+3] + X[k-j+1] );
         }
         Y[i  ] = sum[0] + sum[2];
         Y[i+1] = sum[1] + sum[3];
       }
       return i/2;
     }
     else
     {
       for ( i = 0; i <= lenNr; ++i )
       {
         const int k = i + E;
         sum[0] = sum[1] = sum[2] = sum[3] = 0.0F;
         for (j = 0; j <= h; j += 4 )
         {
           sum[0] += Hrev[j  ] * ( X[i+j  ] + X[k-j+3] );
           sum[1] += Hrev[j+1] * ( X[i+j+1] + X[k-j+2] );
           sum[2] += Hrev[j+2] * ( X[i+j+2] + X[k-j+1] );
           sum[3] += Hrev[j+3] * ( X[i+j+3] + X[k-j  ] );
         }
         Y[i] = sum[0] + sum[1] + sum[2] + sum[3];
       }
       return i;
     }
   }
   else
   {
     const float * RESTRICT X = input;
     const float * RESTRICT Hrev = p->H;
     float * RESTRICT Y = output;
     const int Nr = ((p->flags & PFFASTCONV_CPLX_INP_OUT) ? 2 : 1) * p->N;
     const int lenNr = ((p->flags & PFFASTCONV_CPLX_INP_OUT) ? 2 : 1) * (len - p->N);
     int i, j;

     if (p->flags & PFFASTCONV_CPLX_INP_OUT)
     {
       for ( i = 0; i <= lenNr; i += 2 )
       {
         sum[0] = sum[1] = sum[2] = sum[3] = 0.0F;
         for (j = 0; j < Nr; j += 4 )
         {
           sum[0] += X[i+j]   * Hrev[j];
           sum[1] += X[i+j+1] * Hrev[j+1];
           sum[2] += X[i+j+2] * Hrev[j+2];
           sum[3] += X[i+j+3] * Hrev[j+3];
         }
         Y[i  ] = sum[0] + sum[2];
         Y[i+1] = sum[1] + sum[3];
       }
       return i/2;
     }
     else
     {
       if ( (Nr & 3) == 0 )
       {
         for ( i = 0; i <= lenNr; ++i )
         {
           sum[0] = sum[1] = sum[2] = sum[3] = 0.0F;
           for (j = 0; j < Nr; j += 4 )
           {
             sum[0] += X[i+j]   * Hrev[j];
             sum[1] += X[i+j+1] * Hrev[j+1];
             sum[2] += X[i+j+2] * Hrev[j+2];
             sum[3] += X[i+j+3] * Hrev[j+3];
           }
           Y[i] = (sum[0] + sum[1]) + (sum[2] + sum[3]);
         }
         return i;
       }
       else
       {
         const int M = Nr & (~3);
         /* printf("B: Nr = %d\n", Nr ); */
         for ( i = 0; i <= lenNr; ++i )
         {
           float tailSum = 0.0;
           sum[0] = sum[1] = sum[2] = sum[3] = 0.0F;
           for (j = 0; j < M; j += 4 )
           {
             sum[0] += X[i+j]   * Hrev[j];
             sum[1] += X[i+j+1] * Hrev[j+1];
             sum[2] += X[i+j+2] * Hrev[j+2];
             sum[3] += X[i+j+3] * Hrev[j+3];
           }
           for ( ; j < Nr; ++j )
             tailSum += X[i+j] * Hrev[j];
           Y[i] = (sum[0] + sum[1]) + (sum[2] + sum[3]) + tailSum;
         }
         return i;
       }
     }
   }

 }


 int fast_conv(void * setup, const float * X, int len, float *Y, const float *Yref, int applyFlush)
 {
   (void)Yref;
   return pffastconv_apply( (PFFASTCONV_Setup*)setup, X, len, Y, applyFlush );
 }


 void printFirst( const float * V, const char * st, const int N, const int perLine )
 {
   (void)V;  (void)st;  (void)N;  (void)perLine;
   return;
 #if 0
   int i;
   for ( i = 0; i < N; ++i )
   {
     if ( (i % perLine) == 0 )
       printf("\n%s[%d]", st, i);
     printf("\t%.1f", V[i]);
   }
   printf("\n");
 #endif
 }


 #define NUMY       11


 int test(int FILTERLEN, int convFlags, const int testOutLen, int printDbg, int printSpeed) {
   double t0, t1, tstop, td, tdref;
   float *X, *H;
   float *Y[NUMY];
   int64_t outN[NUMY];
   /* 256 KFloats or 16 MFloats data */
 #if 1
   const int len = testOutLen ? (1 << 18) : (1 << 24);
 #elif 0
   const int len = testOutLen ? (1 << 18) : (1 << 13);
 #else
   const int len = testOutLen ? (1 << 18) : (1024);
 #endif
   const int cplxFactor = ( convFlags & PFFASTCONV_CPLX_INP_OUT ) ? 2 : 1;
   const int lenC = len / cplxFactor;

   int yi, yc, posMaxErr;
   float yRangeMin, yRangeMax, yErrLimit, maxErr = 0.0;
   int i, j, numErrOverLimit, iter;
   int retErr = 0;

   /*                                  0               1               2               3                   4                   5                   6                   7                   8                      9  */
   pfnConvSetup   aSetup[NUMY]     = { convSetupRev,   convSetupRev,   convSetupRev,   fastConvSetup,      fastConvSetup,      fastConvSetup,      fastConvSetup,      fastConvSetup,      fastConvSetup,         fastConvSetup   };
   pfnConvDestroy aDestroy[NUMY]   = { convDestroyRev, convDestroyRev, convDestroyRev, fastConvDestroy,    fastConvDestroy,    fastConvDestroy,    fastConvDestroy,    fastConvDestroy,    fastConvDestroy,       fastConvDestroy };
   pfnGetConvFnPtr aGetFnPtr[NUMY] = { NULL,           NULL,           NULL,           NULL,               NULL,               NULL,               NULL,               NULL,               NULL,                  NULL,           };
   pfnConvolution aConv[NUMY]      = { slow_conv_R,    slow_conv_A,    slow_conv_B,    fast_conv,          fast_conv,          fast_conv,          fast_conv,          fast_conv,          fast_conv,             fast_conv       };
   const char * convText[NUMY]     = { "R(non-simd)",  "A(non-simd)",  "B(non-simd)",  "fast_conv_64",     "fast_conv_128",    "fast_conv_256",    "fast_conv_512",    "fast_conv_1K",     "fast_conv_2K",        "fast_conv_4K"  };
   int    aFastAlgo[NUMY]          = { 0,              0,              0,              1,                  1,                  1,                  1,                  1,                  1,                     1               };
   void * aSetupCfg[NUMY]          = { NULL,           NULL,           NULL,           NULL,               NULL,               NULL,               NULL,               NULL,               NULL,                  NULL            };
   int    aBlkLen[NUMY]            = { 1024,           1024,           1024,           64,                 128,                256,                512,                1024,               2048,                  4096            };
 #if 1
   int    aRunAlgo[NUMY]           = { 1,              1,              1,              FILTERLEN<64,       FILTERLEN<128,      FILTERLEN<256,      FILTERLEN<512,      FILTERLEN<1024,     FILTERLEN<2048,        FILTERLEN<4096  };
 #elif 0
   int    aRunAlgo[NUMY]           = { 1,              0,              0,              0 && FILTERLEN<64,  1 && FILTERLEN<128, 1 && FILTERLEN<256, 0 && FILTERLEN<512, 0 && FILTERLEN<1024, 0 && FILTERLEN<2048,  0 && FILTERLEN<4096  };
 #else
   int    aRunAlgo[NUMY]           = { 1,              1,              1,              0 && FILTERLEN<64,  0 && FILTERLEN<128, 1 && FILTERLEN<256, 0 && FILTERLEN<512, 0 && FILTERLEN<1024, 0 && FILTERLEN<2048,  0 && FILTERLEN<4096  };
 #endif
   double aSpeedFactor[NUMY], aDuration[NUMY], procSmpPerSec[NUMY];

   X = pffastconv_malloc( (unsigned)(len+4) * sizeof(float) );
   for ( i=0; i < NUMY; ++i)
   {
     if ( 1 || i < 2 )
       Y[i] = pffastconv_malloc( (unsigned)len * sizeof(float) );
     else
       Y[i] = Y[1];

     Y[i][0] = 123.F;  /* test for pffft_zconvolve_no_accu() */
     aSpeedFactor[i] = -1.0;
     aDuration[i] = -1.0;
     procSmpPerSec[i] = -1.0;
   }

   H = pffastconv_malloc((unsigned)FILTERLEN * sizeof(float));

   /* initialize input */
   if ( convFlags & PFFASTCONV_CPLX_INP_OUT )
   {
     for ( i = 0; i < lenC; ++i )
     {
       X[2*i  ] = (float)(i % 4093);  /* 4094 is a prime number. see https://en.wikipedia.org/wiki/List_of_prime_numbers */
       X[2*i+1] = (float)((i+2048) % 4093);
     }
   }
   else
   {
     for ( i = 0; i < len; ++i )
       X[i] = (float)(i % 4093);  /* 4094 is a prime number. see https://en.wikipedia.org/wiki/List_of_prime_numbers */
   }
   X[ len    ] = INVALID_FLOAT_VAL;
   X[ len +1 ] = INVALID_FLOAT_VAL;
   X[ len +2 ] = INVALID_FLOAT_VAL;
   X[ len +3 ] = INVALID_FLOAT_VAL;

   if (!testOutLen)
     printFirst( X, "X", 64, 8 );

   /* filter coeffs */
   memset( H, 0, FILTERLEN * sizeof(float) );
 #if 1
   if ( convFlags & PFFASTCONV_SYMMETRIC )
   {
     const int half = FILTERLEN / 2;
     for ( j = 0; j < half; ++j ) {
       switch (j % 3) {
         case 0: H[j] = H[FILTERLEN-1-j] = -1.0F;  break;
         case 1: H[j] = H[FILTERLEN-1-j] =  1.0F;  break;
         case 2: H[j] = H[FILTERLEN-1-j] =  0.5F;  break;
       }
     }
   }
   else
   {
     for ( j = 0; j < FILTERLEN; ++j ) {
       switch (j % 3) {
         case 0: H[j] = -1.0F;  break;
         case 1: H[j] = 1.0F;   break;
         case 2: H[j] = 0.5F;   break;
       }
     }
   }
 #else
   H[0] = 1.0F;
   H[FILTERLEN -1] = 1.0F;
 #endif
   if (!testOutLen)
     printFirst( H, "H", FILTERLEN, 8 );

   printf("\n");
   printf("filterLen = %d\t%s%s\t%s:\n", FILTERLEN,
     ((convFlags & PFFASTCONV_CPLX_INP_OUT)?"cplx":"real"),
     (convFlags & PFFASTCONV_CPLX_INP_OUT)?((convFlags & PFFASTCONV_CPLX_SINGLE_FFT)?" single":" 2x") : "",
     ((convFlags & PFFASTCONV_SYMMETRIC)?"symmetric":"non-sym") );

   while (1)
   {

     for ( yi = 0; yi < NUMY; ++yi )
     {
       if (!aRunAlgo[yi])
         continue;

       aSetupCfg[yi] = aSetup[yi]( H, FILTERLEN, &aBlkLen[yi], convFlags );

       /* get effective apply function ptr */
       if ( aSetupCfg[yi] && aGetFnPtr[yi] )
         aConv[yi] = aGetFnPtr[yi]( aSetupCfg[yi] );

       if ( aSetupCfg[yi] && aConv[yi] ) {
         if (testOutLen)
         {
           t0 = uclock_sec();
           outN[yi] = aConv[yi]( aSetupCfg[yi], X, lenC, Y[yi], Y[0], 1 /* applyFlush */ );
           t1 = uclock_sec();
           td = t1 - t0;
         }
         else
         {
           const int blkLen = 4096;  /* required for 'fast_conv_4K' */
           int64_t offC = 0, offS, Nout;
           int k;
           iter = 0;
           outN[yi] = 0;
           t0 = uclock_sec();
           tstop = t0 + 0.25;  /* benchmark duration: 250 ms */
           do {
             for ( k = 0; k < 128 && offC +blkLen < lenC; ++k )
             {
               offS = cplxFactor * offC;
               Nout = aConv[yi]( aSetupCfg[yi], X +offS, blkLen, Y[yi] +offS, Y[0], (offC +blkLen >= lenC) /* applyFlush */ );
               offC += Nout;
               ++iter;
               if ( !Nout )
                 break;
               if ( offC +blkLen >= lenC )
               {
                 outN[yi] += offC;
                 offC = 0;
               }
             }
             t1 = uclock_sec();
           } while ( t1 < tstop );
           outN[yi] += offC;
           td = t1 - t0;
           procSmpPerSec[yi] = cplxFactor * (double)outN[yi] / td;
         }
       }
       else
       {
         t0 = t1 = td = 0.0;
         outN[yi] = 0;
       }
       aDuration[yi] = td;
       if ( yi == 0 ) {
         const float * Yvals = Y[0];
         const int64_t refOutLen = cplxFactor * outN[0];
         tdref = td;
         if (printDbg) {
           printf("convolution '%s' took: %f ms\n", convText[yi], td*1000.0);
           printf("  convolution '%s' output size %" PRId64 " == (cplx) len %d + %" PRId64 "\n", convText[yi], outN[yi], len / cplxFactor, outN[yi] - len / cplxFactor);
         }
         aSpeedFactor[yi] = 1.0;
         /*  */
         yRangeMin = FLT_MAX;
         yRangeMax = FLT_MIN;
         for ( i = 0; i < refOutLen; ++i )
         {
           if ( yRangeMax < Yvals[i] )  yRangeMax = Yvals[i];
           if ( yRangeMin > Yvals[i] )  yRangeMin = Yvals[i];
         }
         yErrLimit = fabsf(yRangeMax - yRangeMin) / ( 100.0F * 1000.0F );
         /* yErrLimit = 0.01F; */
         if (testOutLen) {
           if (1) {
             printf("reference output len = %" PRId64 " smp\n", outN[0]);
             printf("reference output range |%.1f ..%.1f| = %.1f ==> err limit = %f\n", yRangeMin, yRangeMax, yRangeMax - yRangeMin, yErrLimit);
           }
           printFirst( Yvals, "Yref", 64, 8 );
         }
       }
       else
       {
         aSpeedFactor[yi] = tdref / td;
         if (printDbg) {
           printf("\nconvolution '%s' took: %f ms == %f %% == %f X\n", convText[yi], td*1000.0, td * 100 / tdref, tdref / td);
           printf("  convolution '%s' output size %" PRId64 " == (cplx) len %d + %" PRId64 "\n", convText[yi], outN[yi], len / cplxFactor, outN[yi] - len / cplxFactor);
         }
       }
     }

     int iMaxSpeedSlowAlgo = -1;
     int iFirstFastAlgo = -1;
     int iMaxSpeedFastAlgo = -1;
     int iPrintedRefOutLen = 0;
     {
       for ( yc = 1; yc < NUMY; ++yc )
       {
         if (!aRunAlgo[yc])
           continue;
         if (aFastAlgo[yc]) {
           if ( iMaxSpeedFastAlgo < 0 || aSpeedFactor[yc] > aSpeedFactor[iMaxSpeedFastAlgo] )
             iMaxSpeedFastAlgo = yc;

           if (iFirstFastAlgo < 0)
             iFirstFastAlgo = yc;
         }
         else
         {
           if ( iMaxSpeedSlowAlgo < 0 || aSpeedFactor[yc] > aSpeedFactor[iMaxSpeedSlowAlgo] )
             iMaxSpeedSlowAlgo = yc;
         }
       }

       if (printSpeed)
       {
         if (testOutLen)
         {
           if (iMaxSpeedSlowAlgo >= 0 )
             printf("fastest slow algorithm is '%s' at speed %f X ; abs duration %f ms\n", convText[iMaxSpeedSlowAlgo], aSpeedFactor[iMaxSpeedSlowAlgo], 1000.0 * aDuration[iMaxSpeedSlowAlgo]);
           if (0 != iMaxSpeedSlowAlgo && aRunAlgo[0])
             printf("slow algorithm '%s' at speed %f X ; abs duration %f ms\n", convText[0], aSpeedFactor[0], 1000.0 * aDuration[0]);
           if (1 != iMaxSpeedSlowAlgo && aRunAlgo[1])
             printf("slow algorithm '%s' at speed %f X ; abs duration %f ms\n", convText[1], aSpeedFactor[1], 1000.0 * aDuration[1]);

           if (iFirstFastAlgo >= 0 && iFirstFastAlgo != iMaxSpeedFastAlgo && aRunAlgo[iFirstFastAlgo])
             printf("first   fast algorithm is '%s' at speed %f X ; abs duration %f ms\n", convText[iFirstFastAlgo],    aSpeedFactor[iFirstFastAlgo],    1000.0 * aDuration[iFirstFastAlgo]);
           if (iFirstFastAlgo >= 0 && iFirstFastAlgo+1 != iMaxSpeedFastAlgo && iFirstFastAlgo+1 < NUMY && aRunAlgo[iFirstFastAlgo+1])
             printf("2nd     fast algorithm is '%s' at speed %f X ; abs duration %f ms\n", convText[iFirstFastAlgo+1],  aSpeedFactor[iFirstFastAlgo+1],  1000.0 * aDuration[iFirstFastAlgo+1]);

           if ( 0 <= iMaxSpeedFastAlgo && iMaxSpeedFastAlgo < NUMY && aRunAlgo[iMaxSpeedFastAlgo] )
           {
             printf("fastest fast algorithm is '%s' at speed %f X ; abs duration %f ms\n", convText[iMaxSpeedFastAlgo], aSpeedFactor[iMaxSpeedFastAlgo], 1000.0 * aDuration[iMaxSpeedFastAlgo]);
             if ( 0 <= iMaxSpeedSlowAlgo && iMaxSpeedSlowAlgo < NUMY && aRunAlgo[iMaxSpeedSlowAlgo] )
               printf("fast / slow ratio: %f X\n", aSpeedFactor[iMaxSpeedFastAlgo] / aSpeedFactor[iMaxSpeedSlowAlgo] );
           }
           printf("\n");
         }
         else
         {
           for ( yc = 0; yc < NUMY; ++yc )
           {
             if (!aRunAlgo[yc] || procSmpPerSec[yc] <= 0.0)
               continue;
             printf("algo '%s':\t%.2f MSmp\tin\t%.1f ms\t= %g kSmpPerSec\n",
               convText[yc], (double)outN[yc]/(1000.0 * 1000.0), 1000.0 * aDuration[yc], procSmpPerSec[yc] * 0.001 );
           }
         }

       }
     }


     for ( yc = 1; yc < NUMY; ++yc )
     {
       const float * Yref;
       const float * Ycurr;
       int outMin;

       if (!aRunAlgo[yc])
         continue;

       if (printDbg)
         printf("\n");

       if ( outN[yc] == 0 )
       {
         printf("output size 0: '%s' not implemented\n", convText[yc]);
       }
       else if ( outN[0] != outN[yc] /* && aFastAlgo[yc] */ && testOutLen )
       {
         if (!iPrintedRefOutLen)
         {
           printf("reference output size = %" PRId64 ", delta to (cplx) input length = %" PRId64 " smp\n", outN[0], (len / cplxFactor) - outN[0]);
           iPrintedRefOutLen = 1;
         }
         printf("output size doesn't match!: ref (FILTERLEN %d) returned %" PRId64 " smp, '%s' returned %" PRId64 " smp : delta = %" PRId64 " smp\n",
           FILTERLEN, outN[0], convText[yc], outN[yc], outN[yc] - outN[0] );
         retErr = 1;
       }

       posMaxErr = 0;
       maxErr = -1.0;
       Yref = Y[0];
       Ycurr = Y[yc];
       outMin = ( outN[yc] < outN[0] ) ? outN[yc] : outN[0];
       numErrOverLimit = 0;
       for ( i = 0; i < outMin; ++i )
       {
         if ( numErrOverLimit < 6 && fabs(Ycurr[i] - Yref[i]) >= yErrLimit )
         {
           printf("algo '%s': at %d: ***ERROR*** = %f, errLimit = %f, ref = %f, actual = %f\n",
             convText[yc], i, fabs(Ycurr[i] - Yref[i]), yErrLimit, Yref[i], Ycurr[i] );
           ++numErrOverLimit;
         }

         if ( fabs(Ycurr[i] - Yref[i]) > maxErr )
         {
           maxErr = fabsf(Ycurr[i] - Yref[i]);
           posMaxErr = i;
         }
       }

       if ( printDbg || (iMaxSpeedSlowAlgo == i) || (iMaxSpeedFastAlgo == i) )
         printf("max difference for '%s' is %g at sample idx %d of max inp 4093-1 == %f %%\n", convText[yc], maxErr, posMaxErr, maxErr * 100.0 / 4092.0 );
     }

     break;
   }

   pffastconv_free(X);
   for ( i=0; i < NUMY; ++i)
   {
     if ( 1 || i < 2 )
       pffastconv_free( Y[i] );
     if (!aRunAlgo[i])
       continue;
     aDestroy[i]( aSetupCfg[i] );
   }

   pffastconv_free(H);

   return retErr;
 }

 /* small functions inside pffft.c that will detect (compiler) bugs with respect to simd instructions */
 void validate_pffft_simd();
 int  validate_pffft_simd_ex(FILE * DbgOut);


 int main(int argc, char **argv)
 {
   int result = 0;
   int i, k, M, flagsA, flagsB, flagsC, testOutLen, printDbg, printSpeed;
   int testOutLens = 1, benchConv = 1, quickTest = 0, slowTest = 0;
   int testReal = 1, testCplx = 1, testSymetric = 0;

   for ( i = 1; i < argc; ++i ) {

     if (!strcmp(argv[i], "--test-simd")) {
       int numErrs = validate_pffft_simd_ex(stdout);
       fprintf( ( numErrs != 0 ? stderr : stdout ), "validate_pffft_simd_ex() returned %d errors!\n", numErrs);
       return ( numErrs > 0 ? 1 : 0 );
     }

     if (!strcmp(argv[i], "--no-len")) {
       testOutLens = 0;
     }
     else if (!strcmp(argv[i], "--no-bench")) {
       benchConv = 0;
     }
     else if (!strcmp(argv[i], "--quick")) {
       quickTest = 1;
     }
     else if (!strcmp(argv[i], "--slow")) {
       slowTest = 1;
     }
     else if (!strcmp(argv[i], "--real")) {
       testCplx = 0;
     }
     else if (!strcmp(argv[i], "--cplx")) {
       testReal = 0;
     }
     else if (!strcmp(argv[i], "--sym")) {
       testSymetric = 1;
     }
     else /* if (!strcmp(argv[i], "--help")) */ {
       printf("usage: %s [--test-simd] [--no-len] [--no-bench] [--quick|--slow] [--real|--cplx] [--sym]\n", argv[0]);
       exit(1);
     }
   }


   if (testOutLens)
   {
     for ( k = 0; k < 3; ++k )
     {
       if ( (k == 0 && !testReal) || (k > 0 && !testCplx) )
         continue;
       printf("\n\n==========\n");
       printf("testing %s %s output lengths ..\n", (k == 0 ? "real" : "cplx"), ( k == 0 ? "" : (k==1 ? "2x" : "single") ) );
       printf("==========\n");
       flagsA = (k == 0) ? 0 : PFFASTCONV_CPLX_INP_OUT;
       flagsB = flagsA | ( testSymetric ? PFFASTCONV_SYMMETRIC : 0 );
       flagsC = flagsB | PFFASTCONV_CPLX_SINGLE_FFT;
       testOutLen = 1;
       printDbg = 0;
       printSpeed = 0;
       for ( M = 128 - 4; M <= (quickTest ? 128+16 : 256); ++M )
       {
         if ( (M % 16) != 0 && testSymetric )
           continue;
         result |= test(M, flagsB, testOutLen, printDbg, printSpeed);
       }
     }
   }

   if (benchConv)
   {
     for ( k = 0; k < 3; ++k )
     {
       if ( (k == 0 && !testReal) || (k > 0 && !testCplx) )
         continue;
       printf("\n\n==========\n");
       printf("starting %s %s benchmark against linear convolutions ..\n", (k == 0 ? "real" : "cplx"), ( k == 0 ? "" : (k==1 ? "2x" : "single") ) );
       printf("==========\n");
       flagsA = (k == 0) ? 0 : PFFASTCONV_CPLX_INP_OUT;
       flagsB = flagsA | ( testSymetric ? PFFASTCONV_SYMMETRIC : 0 );
       flagsC = flagsB | ( k == 2 ? PFFASTCONV_CPLX_SINGLE_FFT : 0 );
       testOutLen = 0;
       printDbg = 0;
       printSpeed = 1;
       if (!slowTest) {
         result |= test( 32,     flagsC, testOutLen, printDbg, printSpeed);
         result |= test( 32+ 16, flagsC, testOutLen, printDbg, printSpeed);
         result |= test( 64,     flagsC, testOutLen, printDbg, printSpeed);
         result |= test( 64+ 32, flagsC, testOutLen, printDbg, printSpeed);
         result |= test(128,     flagsC, testOutLen, printDbg, printSpeed);
       }
       if (!quickTest) {
         result |= test(128+ 64, flagsC, testOutLen, printDbg, printSpeed);
         result |= test(256,     flagsC, testOutLen, printDbg, printSpeed);
         result |= test(256+128, flagsC, testOutLen, printDbg, printSpeed);
         result |= test(512,     flagsC, testOutLen, printDbg, printSpeed);
         result |= test(1024,    flagsC, testOutLen, printDbg, printSpeed);
       }
     }
   }

   return result;
 }