/* stm-arm-neon-ref.h - platform/external/arm-neon-tests - Git at Google */

 /*

 Copyright (c) 2009, 2010, 2011, 2012, 2013 STMicroelectronics
 Written by Christophe Lyon

 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
 in the Software without restriction, including without limitation the rights
 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 copies of the Software, and to permit persons to whom the Software is
 furnished to do so, subject to the following conditions:

 The above copyright notice and this permission notice shall be included in
 all copies or substantial portions of the Software.

 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 THE SOFTWARE.

 */

 #ifndef _STM_ARM_NEON_REF_H_
 #define _STM_ARM_NEON_REF_H_

 #if defined(__cplusplus)
 #include <cstdio>
 #include <cinttypes>
 #include <cstring>
 #else
 #include <stdio.h>
 #if defined(_MSC_VER)
 #include "msinttypes.h"
 #include <float.h> /* for isnan() ... */
 static int32_t _ptrNan[]={0x7fc00000L};
 #define NAN (*(float*)_ptrNan)
 static int32_t _ptrInf[]={0x7f800000L};
 #define INFINITY (*(float*)_ptrInf)
 #define HUGE_VALF INFINITY
 #else
 #include <inttypes.h>
 #endif
 #include <string.h>
 #endif

 /* Two-level stringification: STR() lets its argument be macro-expanded
    before xSTR() turns it into a string literal.  */
 #define xSTR(TOKENS) #TOKENS
 #define STR(TOKENS) xSTR(TOKENS)

 /* Two-level token pasting: xNAME() expands both arguments and xNAME1()
    joins them as <PREFIX>_<SUFFIX>.  */
 #define xNAME1(PREFIX,SUFFIX) PREFIX ## _ ## SUFFIX
 #define xNAME(PREFIX,SUFFIX) xNAME1(PREFIX,SUFFIX)

 /* Scalar variable <NAME>_<KIND><BITS>, and its declaration with type
    <KIND><BITS>_t.  */
 #define VAR(NAME,KIND,BITS) xNAME(NAME,KIND##BITS)
 #define VAR_DECL(NAME, KIND, BITS) KIND##BITS##_t VAR(NAME,KIND,BITS)

 /* Vector names <KIND><BITS>x<LANES> (optionally followed by x<COUNT> for
    arrays of vectors) and the matching <...>_t type names.  */
 #define VECT_NAME(KIND, BITS, LANES) KIND##BITS##x##LANES
 #define VECT_ARRAY_NAME(KIND, BITS, LANES, COUNT) KIND##BITS##x##LANES##x##COUNT
 #define VECT_TYPE(KIND, BITS, LANES) xNAME(VECT_NAME(KIND,BITS,LANES),t)
 #define VECT_ARRAY_TYPE(KIND, BITS, LANES, COUNT) xNAME(VECT_ARRAY_NAME(KIND,BITS,LANES,COUNT),t)

 /* Variable <NAME>_<KIND><BITS>x<LANES>, and its declaration with the
    element type <KIND><BITS>_t.  */
 #define VECT_VAR(NAME,KIND,BITS,LANES) xNAME(NAME,VECT_NAME(KIND,BITS,LANES))
 #define VECT_VAR_DECL(NAME, KIND, BITS, LANES) KIND##BITS##_t VECT_VAR(NAME,KIND,BITS,LANES)

 /* Padding object placed between input buffers: a single char holding 42.
    The expansion already ends with a semicolon.  */
 #define PAD(NAME, KIND, BITS, LANES) char VECT_VAR(NAME,KIND,BITS,LANES)=42;

 /* Array declarations: ARRAY has one element per lane, ARRAY4 always has
    four elements.  */
 #define ARRAY(NAME, KIND, BITS, LANES) VECT_VAR_DECL(NAME,KIND,BITS,LANES)[LANES]
 #define ARRAY4(NAME, KIND, BITS, LANES) VECT_VAR_DECL(NAME,KIND,BITS,LANES)[4]

 /* Arrays of vectors: COUNT vectors of LANES elements stored in one flat
    array of LANES*COUNT elements.  */
 #define VECT_ARRAY_VAR(NAME,KIND,BITS,LANES,COUNT) xNAME(NAME,VECT_ARRAY_NAME(KIND,BITS,LANES,COUNT))
 #define VECT_ARRAY(NAME, KIND, BITS, LANES, COUNT) KIND##BITS##_t VECT_ARRAY_VAR(NAME,KIND,BITS,LANES,COUNT)[LANES*COUNT]

 /* Index of the next dump line.  Every DUMP* macro and
    dump_neon_cumulative_sat() post-increment it; clean_results() resets
    it to 0.  */
 static int result_idx = 0;

 /* Write the N elements of result_<T><W>x<N> to ref_file as
      MSG:<result_idx>:<variable name> [] = { e0, e1, ...,  }
    formatting each element with "%" FMT, then emit the same data to
    gcc_tests_file through DUMP4GCC.
    The expansion is several statements (not wrapped in braces) and uses an
    integer variable `i' that must exist in the expanding scope.  */
 #define DUMP(MSG,T,W,N,FMT)						\
   fprintf(ref_file, "%s:%d:%s [] = { ", MSG, result_idx++,		\
 	  STR(VECT_VAR(result, T, W, N)));				\
   for(i=0; i<N ; i++)							\
     {									\
       fprintf(ref_file, "%" FMT ", ", VECT_VAR(result, T, W, N)[i]);	\
     }									\
   fprintf(ref_file, " }\n");						\
   DUMP4GCC(MSG,T,W,N,FMT);

 /* Same output as DUMP, but every element is cast to uint<W>_t before
    being printed, in order to remove the sign bits.  Also uses the
    variable `i' of the expanding scope and finishes with DUMP4GCC.  */
 #define DUMP_POLY(MSG,T,W,N,FMT)					\
   fprintf(ref_file, "%s:%d:%s [] = { ", MSG, result_idx++,		\
 	  STR(VECT_VAR(result, T, W, N)));				\
   for(i=0; i<N ; i++)							\
     {									\
       fprintf(ref_file, "%" FMT ", ",					\
 	      (uint##W##_t)VECT_VAR(result, T, W, N)[i]);		\
     }									\
   fprintf(ref_file, " }\n");						\
   DUMP4GCC(MSG,T,W,N,FMT);

 /* Floating-point variant of DUMP: each element is stored in the
    float<W>_t member of a union and read back through the uint<W>_t
    member, so what gets printed with "%" FMT is the bit pattern of the
    value rather than its numeric value.  The gcc_tests_file output is
    produced by DUMP4GCC_FP.  Uses the variable `i' of the expanding
    scope.  */
 #define DUMP_FP(MSG,T,W,N,FMT)						\
   fprintf(ref_file, "%s:%d:%s [] = { ", MSG, result_idx++,		\
 	  STR(VECT_VAR(result, T, W, N)));				\
   for(i=0; i<N ; i++)							\
     {									\
       union fp_operand {						\
 	uint##W##_t i;							\
 	float##W##_t f;							\
       } tmp;								\
       tmp.f = VECT_VAR(result, T, W, N)[i];				\
       fprintf(ref_file, "%" FMT ", ", tmp.i);				\
     }									\
   fprintf(ref_file, " }\n");						\
   DUMP4GCC_FP(MSG,T,W,N,FMT);

 /* Write the N elements of result_<T><W>x<N> to gcc_tests_file as
      VECT_VAR_DECL(expected,<T>,<W>,<N>) [] = { 0x.., 0x.. };
    The first N-1 elements are printed in the loop followed by ", "; the
    last one is printed after the loop so that it is not followed by a
    comma.  When W < 32 the element is first cast to uint<W>_t and stored
    in a uint32_t, otherwise it is passed to fprintf unchanged.
    MSG is not used.  Uses the variable `i' of the expanding scope.  */
 #define DUMP4GCC(MSG,T,W,N,FMT)						\
   fprintf(gcc_tests_file, "VECT_VAR_DECL(expected,%s,%d,%d) [] = { ",	\
 	  STR(T), W, N);						\
   for(i=0; i<(N-1) ; i++)						\
     {									\
       if (W < 32) {							\
 	uint32_t tmp = (uint##W##_t) VECT_VAR(result, T, W, N)[i];	\
 	fprintf(gcc_tests_file, "0x%" FMT ", ", tmp);			\
       } else {								\
 	fprintf(gcc_tests_file, "0x%" FMT ", ", VECT_VAR(result, T, W, N)[i]); \
       }									\
     }									\
   if (W < 32) {								\
     uint32_t tmp = (uint##W##_t) VECT_VAR(result, T, W, N)[i];		\
     fprintf(gcc_tests_file, "0x%" FMT, tmp);				\
   } else {								\
     fprintf(gcc_tests_file, "0x%" FMT, VECT_VAR(result, T, W, N)[i]);	\
   }									\
   fprintf(gcc_tests_file, " };\n");

 /* Floating-point variant of DUMP4GCC.  The type name written to
    gcc_tests_file is always "hfloat", and each element is printed as the
    uint<W>_t view of a union whose float<W>_t member holds the value,
    i.e. as its bit pattern.  The last element is printed after the loop
    so that it is not followed by a comma.  MSG is not used.  Uses the
    variable `i' of the expanding scope.  */
 #define DUMP4GCC_FP(MSG,T,W,N,FMT)					\
   {									\
     union fp_operand {							\
       uint##W##_t i;							\
       float##W##_t f;							\
     } tmp;								\
     fprintf(gcc_tests_file, "VECT_VAR_DECL(expected,%s,%d,%d) [] = { ",	\
 	    "hfloat", W, N);						\
     for(i=0; i<(N-1) ; i++)						\
       {									\
 	tmp.f = VECT_VAR(result, T, W, N)[i];				\
 	fprintf(gcc_tests_file, "0x%" FMT ", ", tmp.i);			\
       }									\
     tmp.f = VECT_VAR(result, T, W, N)[i];				\
     fprintf(gcc_tests_file, "0x%" FMT, tmp.i);				\
     fprintf(gcc_tests_file, " };\n");					\
   }

 #if defined(__ARM_FP16_FORMAT_IEEE) && ( ((__ARM_FP & 0x2) != 0) || ((__ARM_NEON_FP16_INTRINSICS & 1) != 0) )
 /* Half-precision support: float16_t is mapped onto the compiler's __fp16.  */
 #define float16_t __fp16

 /* Half-precision variant of DUMP_FP.  Each element is stored in the
    float<W>_t member of a union and read back through the uint<W>_t
    member, so the bit pattern of the value is printed.  A plain
    (uint<W>_t) cast must not be used here: it converts the numeric value,
    which loses the encoding and is undefined for negative, NaN or
    out-of-range inputs.  The gcc_tests_file output is produced by
    DUMP4GCC_FP16.  Uses the variable `i' of the expanding scope.  */
 #define DUMP_FP16(MSG,T,W,N,FMT)					\
   fprintf(ref_file, "%s:%d:%s [] = { ", MSG, result_idx++,		\
 	  STR(VECT_VAR(result, T, W, N)));				\
   for(i=0; i<N ; i++)							\
     {									\
       union fp16_operand {						\
 	uint##W##_t i;							\
 	float##W##_t f;							\
       } tmp;								\
       tmp.f = VECT_VAR(result, T, W, N)[i];				\
       fprintf(ref_file, "%" FMT ", ", tmp.i);				\
     }									\
   fprintf(ref_file, " }\n");						\
   DUMP4GCC_FP16(MSG,T,W,N,FMT);

 /* Half-precision variant of DUMP4GCC_FP: writes the bit patterns of the
    N elements to gcc_tests_file under the type name "hfloat".  The last
    element is printed after the loop so that it is not followed by a
    comma.  MSG is not used.  Uses the variable `i' of the expanding
    scope.  */
 #define DUMP4GCC_FP16(MSG,T,W,N,FMT)					\
   {									\
     union fp16_operand {						\
       uint##W##_t i;							\
       float##W##_t f;							\
     } tmp;								\
     fprintf(gcc_tests_file, "VECT_VAR_DECL(expected,%s,%d,%d) [] = { ", \
 	    "hfloat", W, N);						\
     for(i=0; i<(N-1) ; i++)						\
       {									\
 	tmp.f = VECT_VAR(result, T, W, N)[i];				\
 	fprintf(gcc_tests_file, "0x%" FMT ", ", tmp.i);			\
       }									\
     tmp.f = VECT_VAR(result, T, W, N)[i];				\
     fprintf(gcc_tests_file, "0x%" FMT, tmp.i);				\
     fprintf(gcc_tests_file, " };\n");					\
   }
 #endif

 /* Fill patterns, one per element width: the byte 0x33 repeated.  */
 #define CLEAN_PATTERN_8  0x33
 #define CLEAN_PATTERN_16 0x3333
 #define CLEAN_PATTERN_32 0x33333333
 #define CLEAN_PATTERN_64 0x3333333333333333

 /* Overwrite every byte of the array <NAME>_<KIND><BITS>x<LANES> with
    CLEAN_PATTERN_8.  The expansion already ends with a semicolon.  */
 #define CLEAN(NAME,KIND,BITS,LANES)					\
   memset(VECT_VAR(NAME, KIND, BITS, LANES), CLEAN_PATTERN_8,		\
 	 sizeof(VECT_VAR(NAME, KIND, BITS, LANES)));

 /* Store the vector VAR_<T1><W>x<N> into a local array with
    vst1<Q>_<T2><W> and print one line per lane on stdout (function name,
    line number, variable name, lane index and lane value).
    NOTE(review): the comparison against CLEAN_PATTERN_<W> is commented
    out, so as written the message is printed for every lane, whatever
    its value -- confirm whether this is intentional.  */
 #define CHECK_INIT(VAR,Q,T1,T2,W,N)					\
   {									\
     ARRAY(check_result, T1, W, N);					\
     int i;								\
 									\
     vst1##Q##_##T2##W(VECT_VAR(check_result, T1, W, N),			\
 		      VECT_VAR(VAR, T1, W, N));				\
     for(i=0; i<N ; i++)							\
       {									\
 	/*if (VECT_VAR(check_result, T1, W, N)[i] == CLEAN_PATTERN_##W)*/ { \
 	  fprintf(stdout, "%s:%d: %s[%d] unintialized! %#x\n",		\
 		  __FUNCTION__,	__LINE__,				\
 		  STR(VECT_VAR(VAR, T1, W, N)), i,			\
 		  VECT_VAR(check_result, T1, W, N)[i]);			\
 	}								\
       }									\
   }

 /* Generic declarations: output streams shared by the tests.  They are
    only declared here; the definitions live outside this header.
    ref_file and gcc_tests_file are the streams written by the DUMP*
    macros and the dump_* functions below.  */
 extern FILE* log_file;
 extern FILE* ref_file;
 extern FILE* gcc_tests_file;

 /* Input buffers, one of each size: first the 64-bit vector shapes, then
    the 128-bit ones.  Each array has one element per lane.  The float16
    buffers exist only when half-precision support is enabled.  These are
    declarations only; the definitions live outside this header.  */
 extern ARRAY(buffer, int, 8, 8);
 extern ARRAY(buffer, int, 16, 4);
 extern ARRAY(buffer, int, 32, 2);
 extern ARRAY(buffer, int, 64, 1);
 extern ARRAY(buffer, uint, 8, 8);
 extern ARRAY(buffer, uint, 16, 4);
 extern ARRAY(buffer, uint, 32, 2);
 extern ARRAY(buffer, uint, 64, 1);
 extern ARRAY(buffer, poly, 8, 8);
 extern ARRAY(buffer, poly, 16, 4);
 extern ARRAY(buffer, float, 32, 2);
 #if defined(__ARM_FP16_FORMAT_IEEE) && ( ((__ARM_FP & 0x2) != 0) || ((__ARM_NEON_FP16_INTRINSICS & 1) != 0) )
 extern ARRAY(buffer, float, 16, 4);
 #endif
 extern ARRAY(buffer, int, 8, 16);
 extern ARRAY(buffer, int, 16, 8);
 extern ARRAY(buffer, int, 32, 4);
 extern ARRAY(buffer, int, 64, 2);
 extern ARRAY(buffer, uint, 8, 16);
 extern ARRAY(buffer, uint, 16, 8);
 extern ARRAY(buffer, uint, 32, 4);
 extern ARRAY(buffer, uint, 64, 2);
 extern ARRAY(buffer, poly, 8, 16);
 extern ARRAY(buffer, poly, 16, 8);
 extern ARRAY(buffer, float, 32, 4);
 #if defined(__ARM_FP16_FORMAT_IEEE) && ( ((__ARM_FP & 0x2) != 0) || ((__ARM_NEON_FP16_INTRINSICS & 1) != 0) )
 extern ARRAY(buffer, float, 16, 8);
 #endif

 /* The tests for vld1_dup and vdup expect at least 4 entries in the
    input buffer, so the buffers whose shape has only 1 or 2 lanes are
    declared with ARRAY4, which forces them to have 4 entries.  The
    float16 buffers exist only when half-precision support is enabled.  */
 extern ARRAY(buffer_dup, int, 8, 8);
 extern ARRAY(buffer_dup, int, 16, 4);
 extern ARRAY4(buffer_dup, int, 32, 2);
 extern ARRAY4(buffer_dup, int, 64, 1);
 extern ARRAY(buffer_dup, uint, 8, 8);
 extern ARRAY(buffer_dup, uint, 16, 4);
 extern ARRAY4(buffer_dup, uint, 32, 2);
 extern ARRAY4(buffer_dup, uint, 64, 1);
 extern ARRAY(buffer_dup, poly, 8, 8);
 extern ARRAY(buffer_dup, poly, 16, 4);
 extern ARRAY4(buffer_dup, float, 32, 2);
 #if defined(__ARM_FP16_FORMAT_IEEE) && ( ((__ARM_FP & 0x2) != 0) || ((__ARM_NEON_FP16_INTRINSICS & 1) != 0) )
 extern ARRAY4(buffer_dup, float, 16, 4);
 #endif
 extern ARRAY(buffer_dup, int, 8, 16);
 extern ARRAY(buffer_dup, int, 16, 8);
 extern ARRAY(buffer_dup, int, 32, 4);
 extern ARRAY4(buffer_dup, int, 64, 2);
 extern ARRAY(buffer_dup, uint, 8, 16);
 extern ARRAY(buffer_dup, uint, 16, 8);
 extern ARRAY(buffer_dup, uint, 32, 4);
 extern ARRAY4(buffer_dup, uint, 64, 2);
 extern ARRAY(buffer_dup, poly, 8, 16);
 extern ARRAY(buffer_dup, poly, 16, 8);
 extern ARRAY(buffer_dup, float, 32, 4);
 #if defined(__ARM_FP16_FORMAT_IEEE) && ( ((__ARM_FP & 0x2) != 0) || ((__ARM_NEON_FP16_INTRINSICS & 1) != 0) )
 extern ARRAY(buffer_dup, float, 16, 8);
 #endif

 /* Input buffers for vld2, one of each size.  Each buffer is a flat
    array holding 2 vectors' worth of elements (N*2 entries).  The float16
    buffers exist only when half-precision support is enabled.  */
 extern VECT_ARRAY(buffer_vld2, int, 8, 8, 2);
 extern VECT_ARRAY(buffer_vld2, int, 16, 4, 2);
 extern VECT_ARRAY(buffer_vld2, int, 32, 2, 2);
 extern VECT_ARRAY(buffer_vld2, int, 64, 1, 2);
 extern VECT_ARRAY(buffer_vld2, uint, 8, 8, 2);
 extern VECT_ARRAY(buffer_vld2, uint, 16, 4, 2);
 extern VECT_ARRAY(buffer_vld2, uint, 32, 2, 2);
 extern VECT_ARRAY(buffer_vld2, uint, 64, 1, 2);
 extern VECT_ARRAY(buffer_vld2, poly, 8, 8, 2);
 extern VECT_ARRAY(buffer_vld2, poly, 16, 4, 2);
 extern VECT_ARRAY(buffer_vld2, float, 32, 2, 2);
 #if defined(__ARM_FP16_FORMAT_IEEE) && ( ((__ARM_FP & 0x2) != 0) || ((__ARM_NEON_FP16_INTRINSICS & 1) != 0) )
 extern VECT_ARRAY(buffer_vld2, float, 16, 4, 2);
 #endif
 extern VECT_ARRAY(buffer_vld2, int, 8, 16, 2);
 extern VECT_ARRAY(buffer_vld2, int, 16, 8, 2);
 extern VECT_ARRAY(buffer_vld2, int, 32, 4, 2);
 extern VECT_ARRAY(buffer_vld2, int, 64, 2, 2);
 extern VECT_ARRAY(buffer_vld2, uint, 8, 16, 2);
 extern VECT_ARRAY(buffer_vld2, uint, 16, 8, 2);
 extern VECT_ARRAY(buffer_vld2, uint, 32, 4, 2);
 extern VECT_ARRAY(buffer_vld2, uint, 64, 2, 2);
 extern VECT_ARRAY(buffer_vld2, poly, 8, 16, 2);
 extern VECT_ARRAY(buffer_vld2, poly, 16, 8, 2);
 extern VECT_ARRAY(buffer_vld2, float, 32, 4, 2);
 #if defined(__ARM_FP16_FORMAT_IEEE) && ( ((__ARM_FP & 0x2) != 0) || ((__ARM_NEON_FP16_INTRINSICS & 1) != 0) )
 extern VECT_ARRAY(buffer_vld2, float, 16, 8, 2);
 #endif

 /* Input buffers for vld3, one of each size.  Each buffer is a flat
    array holding 3 vectors' worth of elements (N*3 entries).  The float16
    buffers exist only when half-precision support is enabled.  */
 extern VECT_ARRAY(buffer_vld3, int, 8, 8, 3);
 extern VECT_ARRAY(buffer_vld3, int, 16, 4, 3);
 extern VECT_ARRAY(buffer_vld3, int, 32, 2, 3);
 extern VECT_ARRAY(buffer_vld3, int, 64, 1, 3);
 extern VECT_ARRAY(buffer_vld3, uint, 8, 8, 3);
 extern VECT_ARRAY(buffer_vld3, uint, 16, 4, 3);
 extern VECT_ARRAY(buffer_vld3, uint, 32, 2, 3);
 extern VECT_ARRAY(buffer_vld3, uint, 64, 1, 3);
 extern VECT_ARRAY(buffer_vld3, poly, 8, 8, 3);
 extern VECT_ARRAY(buffer_vld3, poly, 16, 4, 3);
 extern VECT_ARRAY(buffer_vld3, float, 32, 2, 3);
 #if defined(__ARM_FP16_FORMAT_IEEE) && ( ((__ARM_FP & 0x2) != 0) || ((__ARM_NEON_FP16_INTRINSICS & 1) != 0) )
 extern VECT_ARRAY(buffer_vld3, float, 16, 4, 3);
 #endif
 extern VECT_ARRAY(buffer_vld3, int, 8, 16, 3);
 extern VECT_ARRAY(buffer_vld3, int, 16, 8, 3);
 extern VECT_ARRAY(buffer_vld3, int, 32, 4, 3);
 extern VECT_ARRAY(buffer_vld3, int, 64, 2, 3);
 extern VECT_ARRAY(buffer_vld3, uint, 8, 16, 3);
 extern VECT_ARRAY(buffer_vld3, uint, 16, 8, 3);
 extern VECT_ARRAY(buffer_vld3, uint, 32, 4, 3);
 extern VECT_ARRAY(buffer_vld3, uint, 64, 2, 3);
 extern VECT_ARRAY(buffer_vld3, poly, 8, 16, 3);
 extern VECT_ARRAY(buffer_vld3, poly, 16, 8, 3);
 extern VECT_ARRAY(buffer_vld3, float, 32, 4, 3);
 #if defined(__ARM_FP16_FORMAT_IEEE) && ( ((__ARM_FP & 0x2) != 0) || ((__ARM_NEON_FP16_INTRINSICS & 1) != 0) )
 extern VECT_ARRAY(buffer_vld3, float, 16, 8, 3);
 #endif

 /* Input buffers for vld4, one of each size.  Each buffer is a flat
    array holding 4 vectors' worth of elements (N*4 entries).  The float16
    buffers exist only when half-precision support is enabled.  */
 extern VECT_ARRAY(buffer_vld4, int, 8, 8, 4);
 extern VECT_ARRAY(buffer_vld4, int, 16, 4, 4);
 extern VECT_ARRAY(buffer_vld4, int, 32, 2, 4);
 extern VECT_ARRAY(buffer_vld4, int, 64, 1, 4);
 extern VECT_ARRAY(buffer_vld4, uint, 8, 8, 4);
 extern VECT_ARRAY(buffer_vld4, uint, 16, 4, 4);
 extern VECT_ARRAY(buffer_vld4, uint, 32, 2, 4);
 extern VECT_ARRAY(buffer_vld4, uint, 64, 1, 4);
 extern VECT_ARRAY(buffer_vld4, poly, 8, 8, 4);
 extern VECT_ARRAY(buffer_vld4, poly, 16, 4, 4);
 extern VECT_ARRAY(buffer_vld4, float, 32, 2, 4);
 #if defined(__ARM_FP16_FORMAT_IEEE) && ( ((__ARM_FP & 0x2) != 0) || ((__ARM_NEON_FP16_INTRINSICS & 1) != 0) )
 extern VECT_ARRAY(buffer_vld4, float, 16, 4, 4);
 #endif
 extern VECT_ARRAY(buffer_vld4, int, 8, 16, 4);
 extern VECT_ARRAY(buffer_vld4, int, 16, 8, 4);
 extern VECT_ARRAY(buffer_vld4, int, 32, 4, 4);
 extern VECT_ARRAY(buffer_vld4, int, 64, 2, 4);
 extern VECT_ARRAY(buffer_vld4, uint, 8, 16, 4);
 extern VECT_ARRAY(buffer_vld4, uint, 16, 8, 4);
 extern VECT_ARRAY(buffer_vld4, uint, 32, 4, 4);
 extern VECT_ARRAY(buffer_vld4, uint, 64, 2, 4);
 extern VECT_ARRAY(buffer_vld4, poly, 8, 16, 4);
 extern VECT_ARRAY(buffer_vld4, poly, 16, 8, 4);
 extern VECT_ARRAY(buffer_vld4, float, 32, 4, 4);
 #if defined(__ARM_FP16_FORMAT_IEEE) && ( ((__ARM_FP & 0x2) != 0) || ((__ARM_NEON_FP16_INTRINSICS & 1) != 0) )
 extern VECT_ARRAY(buffer_vld4, float, 16, 8, 4);
 #endif

 /* Input buffers for vld2_lane: one 2-element array per element type.
    The float16 buffer exists only when half-precision support is
    enabled.  */
 extern VECT_VAR_DECL(buffer_vld2_lane, int, 8, 2)[2];
 extern VECT_VAR_DECL(buffer_vld2_lane, int, 16, 2)[2];
 extern VECT_VAR_DECL(buffer_vld2_lane, int, 32, 2)[2];
 extern VECT_VAR_DECL(buffer_vld2_lane, int, 64, 2)[2];
 extern VECT_VAR_DECL(buffer_vld2_lane, uint, 8, 2)[2];
 extern VECT_VAR_DECL(buffer_vld2_lane, uint, 16, 2)[2];
 extern VECT_VAR_DECL(buffer_vld2_lane, uint, 32, 2)[2];
 extern VECT_VAR_DECL(buffer_vld2_lane, uint, 64, 2)[2];
 extern VECT_VAR_DECL(buffer_vld2_lane, poly, 8, 2)[2];
 extern VECT_VAR_DECL(buffer_vld2_lane, poly, 16, 2)[2];
 extern VECT_VAR_DECL(buffer_vld2_lane, float, 32, 2)[2];
 #if defined(__ARM_FP16_FORMAT_IEEE) && ( ((__ARM_FP & 0x2) != 0) || ((__ARM_NEON_FP16_INTRINSICS & 1) != 0) )
 extern VECT_VAR_DECL(buffer_vld2_lane, float, 16, 2)[2];
 #endif

 /* Input buffers for vld3_lane: one 3-element array per element type.
    The float16 buffer exists only when half-precision support is
    enabled.  */
 extern VECT_VAR_DECL(buffer_vld3_lane, int, 8, 3)[3];
 extern VECT_VAR_DECL(buffer_vld3_lane, int, 16, 3)[3];
 extern VECT_VAR_DECL(buffer_vld3_lane, int, 32, 3)[3];
 extern VECT_VAR_DECL(buffer_vld3_lane, int, 64, 3)[3];
 extern VECT_VAR_DECL(buffer_vld3_lane, uint, 8, 3)[3];
 extern VECT_VAR_DECL(buffer_vld3_lane, uint, 16, 3)[3];
 extern VECT_VAR_DECL(buffer_vld3_lane, uint, 32, 3)[3];
 extern VECT_VAR_DECL(buffer_vld3_lane, uint, 64, 3)[3];
 extern VECT_VAR_DECL(buffer_vld3_lane, poly, 8, 3)[3];
 extern VECT_VAR_DECL(buffer_vld3_lane, poly, 16, 3)[3];
 extern VECT_VAR_DECL(buffer_vld3_lane, float, 32, 3)[3];
 #if defined(__ARM_FP16_FORMAT_IEEE) && ( ((__ARM_FP & 0x2) != 0) || ((__ARM_NEON_FP16_INTRINSICS & 1) != 0) )
 extern VECT_VAR_DECL(buffer_vld3_lane, float, 16, 3)[3];
 #endif

 /* Input buffers for vld4_lane: one 4-element array per element type.
    The float16 buffer exists only when half-precision support is
    enabled.  */
 extern VECT_VAR_DECL(buffer_vld4_lane, int, 8, 4)[4];
 extern VECT_VAR_DECL(buffer_vld4_lane, int, 16, 4)[4];
 extern VECT_VAR_DECL(buffer_vld4_lane, int, 32, 4)[4];
 extern VECT_VAR_DECL(buffer_vld4_lane, int, 64, 4)[4];
 extern VECT_VAR_DECL(buffer_vld4_lane, uint, 8, 4)[4];
 extern VECT_VAR_DECL(buffer_vld4_lane, uint, 16, 4)[4];
 extern VECT_VAR_DECL(buffer_vld4_lane, uint, 32, 4)[4];
 extern VECT_VAR_DECL(buffer_vld4_lane, uint, 64, 4)[4];
 extern VECT_VAR_DECL(buffer_vld4_lane, poly, 8, 4)[4];
 extern VECT_VAR_DECL(buffer_vld4_lane, poly, 16, 4)[4];
 extern VECT_VAR_DECL(buffer_vld4_lane, float, 32, 4)[4];
 #if defined(__ARM_FP16_FORMAT_IEEE) && ( ((__ARM_FP & 0x2) != 0) || ((__ARM_NEON_FP16_INTRINSICS & 1) != 0) )
 extern VECT_VAR_DECL(buffer_vld4_lane, float, 16, 4)[4];
 #endif

 /* Output buffers, one of each size: first the 64-bit vector shapes, then
    the 128-bit ones.  They are defined `static' in this header, so every
    translation unit that includes it gets its own private copy.  These
    are the arrays printed by the DUMP* macros and filled with the clean
    pattern by clean_results().  The float16 buffers exist only when
    half-precision support is enabled.  */
 static ARRAY(result, int, 8, 8);
 static ARRAY(result, int, 16, 4);
 static ARRAY(result, int, 32, 2);
 static ARRAY(result, int, 64, 1);
 static ARRAY(result, uint, 8, 8);
 static ARRAY(result, uint, 16, 4);
 static ARRAY(result, uint, 32, 2);
 static ARRAY(result, uint, 64, 1);
 static ARRAY(result, poly, 8, 8);
 static ARRAY(result, poly, 16, 4);
 static ARRAY(result, float, 32, 2);
 #if defined(__ARM_FP16_FORMAT_IEEE) && ( ((__ARM_FP & 0x2) != 0) || ((__ARM_NEON_FP16_INTRINSICS & 1) != 0) )
 static ARRAY(result, float, 16, 4);
 #endif
 static ARRAY(result, int, 8, 16);
 static ARRAY(result, int, 16, 8);
 static ARRAY(result, int, 32, 4);
 static ARRAY(result, int, 64, 2);
 static ARRAY(result, uint, 8, 16);
 static ARRAY(result, uint, 16, 8);
 static ARRAY(result, uint, 32, 4);
 static ARRAY(result, uint, 64, 2);
 static ARRAY(result, poly, 8, 16);
 static ARRAY(result, poly, 16, 8);
 static ARRAY(result, float, 32, 4);
 #if defined(__ARM_FP16_FORMAT_IEEE) && ( ((__ARM_FP & 0x2) != 0) || ((__ARM_NEON_FP16_INTRINSICS & 1) != 0) )
 static ARRAY(result, float, 16, 8);
 #endif

 /* Dump results (generic function).

    Writes a "<test_name> output:" header to both ref_file and
    gcc_tests_file, then dumps every result buffer: integers in decimal,
    float32 as the hexadecimal bit pattern, float16 (when enabled) with
    PRIu16.  The 64-bit vector shapes come first, then the 128-bit ones.

    test_name is only read, so it is taken as a pointer to const; this
    matches dump_results_hex2() and lets callers pass string literals.  */
 static void dump_results (const char *test_name)
 {
   /* Loop counter used by the DUMP* macro expansions below.  */
   int i;

   fprintf(ref_file, "\n%s output:\n", test_name);
   fprintf(gcc_tests_file, "\n%s output:\n", test_name);

   /* 64-bit vector shapes.  */
   DUMP(test_name, int, 8, 8, PRId8);
   DUMP(test_name, int, 16, 4, PRId16);
   DUMP(test_name, int, 32, 2, PRId32);
   DUMP(test_name, int, 64, 1, PRId64);
   DUMP(test_name, uint, 8, 8, PRIu8);
   DUMP(test_name, uint, 16, 4, PRIu16);
   DUMP(test_name, uint, 32, 2, PRIu32);
   DUMP(test_name, uint, 64, 1, PRIu64);
   DUMP_POLY(test_name, poly, 8, 8, PRIu8);
   DUMP_POLY(test_name, poly, 16, 4, PRIu16);
   DUMP_FP(test_name, float, 32, 2, PRIx32);
 #if defined(__ARM_FP16_FORMAT_IEEE) && ( ((__ARM_FP & 0x2) != 0) || ((__ARM_NEON_FP16_INTRINSICS & 1) != 0) )
   DUMP_FP16(test_name, float, 16, 4, PRIu16);
 #endif

   /* 128-bit vector shapes.  */
   DUMP(test_name, int, 8, 16, PRId8);
   DUMP(test_name, int, 16, 8, PRId16);
   DUMP(test_name, int, 32, 4, PRId32);
   DUMP(test_name, int, 64, 2, PRId64);
   DUMP(test_name, uint, 8, 16, PRIu8);
   DUMP(test_name, uint, 16, 8, PRIu16);
   DUMP(test_name, uint, 32, 4, PRIu32);
   DUMP(test_name, uint, 64, 2, PRIu64);
   DUMP_POLY(test_name, poly, 8, 16, PRIu8);
   DUMP_POLY(test_name, poly, 16, 8, PRIu16);
   DUMP_FP(test_name, float, 32, 4, PRIx32);
 #if defined(__ARM_FP16_FORMAT_IEEE) && ( ((__ARM_FP & 0x2) != 0) || ((__ARM_NEON_FP16_INTRINSICS & 1) != 0) )
   DUMP_FP16(test_name, float, 16, 8, PRIu16);
 #endif
 }

 /* Dump results in hex (generic function).

    Writes a "<test_name><comment> output:" header to both ref_file and
    gcc_tests_file, then dumps every result buffer using the hexadecimal
    PRIx* formats: the 64-bit vector shapes first, then the 128-bit ones.
    The float16 buffers are dumped only when half-precision support is
    enabled.  */
 static void dump_results_hex2 (const char *test_name, const char* comment)
 {
   /* Loop counter used by the DUMP* macro expansions below.  */
   int i;

   fprintf(ref_file, "\n%s%s output:\n", test_name, comment);
   fprintf(gcc_tests_file, "\n%s%s output:\n", test_name, comment);

   /* 64-bit vector shapes.  */
   DUMP(test_name, int, 8, 8, PRIx8);
   DUMP(test_name, int, 16, 4, PRIx16);
   DUMP(test_name, int, 32, 2, PRIx32);
   DUMP(test_name, int, 64, 1, PRIx64);
   DUMP(test_name, uint, 8, 8, PRIx8);
   DUMP(test_name, uint, 16, 4, PRIx16);
   DUMP(test_name, uint, 32, 2, PRIx32);
   DUMP(test_name, uint, 64, 1, PRIx64);
   DUMP_POLY(test_name, poly, 8, 8, PRIx8);
   DUMP_POLY(test_name, poly, 16, 4, PRIx16);
   DUMP_FP(test_name, float, 32, 2, PRIx32);
 #if defined(__ARM_FP16_FORMAT_IEEE) && ( ((__ARM_FP & 0x2) != 0) || ((__ARM_NEON_FP16_INTRINSICS & 1) != 0) )
   DUMP_FP16(test_name, float, 16, 4, PRIx16);
 #endif

   /* 128-bit vector shapes.  */
   DUMP(test_name, int, 8, 16, PRIx8);
   DUMP(test_name, int, 16, 8, PRIx16);
   DUMP(test_name, int, 32, 4, PRIx32);
   DUMP(test_name, int, 64, 2, PRIx64);
   DUMP(test_name, uint, 8, 16, PRIx8);
   DUMP(test_name, uint, 16, 8, PRIx16);
   DUMP(test_name, uint, 32, 4, PRIx32);
   DUMP(test_name, uint, 64, 2, PRIx64);
   DUMP_POLY(test_name, poly, 8, 16, PRIx8);
   DUMP_POLY(test_name, poly, 16, 8, PRIx16);
   DUMP_FP(test_name, float, 32, 4, PRIx32);
 #if defined(__ARM_FP16_FORMAT_IEEE) && ( ((__ARM_FP & 0x2) != 0) || ((__ARM_NEON_FP16_INTRINSICS & 1) != 0) )
   DUMP_FP16(test_name, float, 16, 8, PRIx16);
 #endif
 }

 /* Dump results in hex without any extra comment in the header line:
    forwards to dump_results_hex2() with an empty comment string.  */
 static void dump_results_hex (const char *test_name)
 {
   static const char no_comment[] = "";

   dump_results_hex2(test_name, no_comment);
 }

 #ifndef STM_ARM_NEON_MODELS

 /* This hack is to cope with various compilers/libc which may not
    provide endian.h or cross-compilers such as llvm which includes the
    host's endian.h.

    THIS_ENDIAN ends up equal to either __LITTLE_ENDIAN or __BIG_ENDIAN:
    - when __arm__ is not defined it is taken from <endian.h>;
    - when __arm__ is defined it is derived from __ARMEL__, and
      <endian.h> is not included.  */
 #ifndef __arm__
 #include <endian.h>
 #define THIS_ENDIAN __BYTE_ORDER
 #else /* __arm__ */
 /* <endian.h> is not included on this path, so the two byte-order
    constants may be undefined.  Undefined identifiers evaluate to 0 in
    #if expressions, which would make any "THIS_ENDIAN == __LITTLE_ENDIAN"
    test true even for big-endian targets.  Provide the conventional
    values when nothing else has defined them.  */
 #ifndef __LITTLE_ENDIAN
 #define __LITTLE_ENDIAN 1234
 #endif
 #ifndef __BIG_ENDIAN
 #define __BIG_ENDIAN 4321
 #endif
 #ifdef __ARMEL__
 #define THIS_ENDIAN __LITTLE_ENDIAN
 #else /* __ARMEL__ */
 #define THIS_ENDIAN __BIG_ENDIAN
 #endif
 #endif /* __arm__ */

 /* _ARM_FPSCR: view of a 32-bit status word either as a whole (`word') or
    as bit-fields (`b').  The bit-fields are five 1-bit flags N, Z, C, V
    and QC plus a 27-bit filler; QC is the only flag read or written in
    this header (it holds the Neon cumulative saturation flag).
    The declaration order of the fields is reversed between the two
    variants so that the flags are declared last on little-endian targets
    and first on big-endian ones.
    NOTE(review): this presumably places the flags in the five most
    significant bits in both cases, but that depends on the compiler's
    bit-field allocation order -- confirm for the toolchain in use.  */
 #if THIS_ENDIAN == __LITTLE_ENDIAN

 typedef union {
   struct {
     int _xxx:27;		/* filler */
     unsigned int QC:1;		/* cumulative saturation flag */
     int V:1;
     int C:1;
     int Z:1;
     int N:1;
   } b;
   unsigned int word;
 } _ARM_FPSCR;

 #else /* __BIG_ENDIAN */

 typedef union {
   struct {
     int N:1;
     int Z:1;
     int C:1;
     int V:1;
     unsigned int QC:1;		/* cumulative saturation flag */
     int _dnm:27;		/* filler */
   } b;
   unsigned int word;
 } _ARM_FPSCR;

 #endif /* __BIG_ENDIAN */

 /* Access to the Neon cumulative saturation (QC) flag.
    Two macros are provided in every configuration:
      Neon_Cumulative_Sat              - current value of the QC flag;
      Set_Neon_Cumulative_Sat(x, dep)  - set the QC flag to x.  */
 #ifdef __ARMCC_VERSION
 /* With this compiler the status register is bound to a named register
    variable, so the flag is read and written as a plain bit-field.  The
    `depend' argument is not used.  */
 register _ARM_FPSCR _afpscr_for_qc __asm("fpscr");
 # define Neon_Cumulative_Sat _afpscr_for_qc.b.QC
 # define Set_Neon_Cumulative_Sat(x, depend)  {Neon_Cumulative_Sat = (x);}
 #else
 /* GCC/ARM does not know this register */
 # define Neon_Cumulative_Sat  __read_neon_cumulative_sat()
 /* We need a fake dependency to ensure correct ordering of asm
    statements to preset the QC flag value, and Neon operators writing
    to QC. */
 #define Set_Neon_Cumulative_Sat(x, depend)	\
   __set_neon_cumulative_sat((x), (depend))

 # if defined(__aarch64__)
 /* AArch64: read the status word with "mrs ..., fpsr" and return its QC
    bit.  */
 static volatile int __read_neon_cumulative_sat (void) {
     _ARM_FPSCR _afpscr_for_qc;
     asm volatile ("mrs %0,fpsr" : "=r" (_afpscr_for_qc));
     return _afpscr_for_qc.b.QC;
 }

 /* AArch64: read-modify-write of the status word: read it with mrs,
    replace the QC bit with x, write it back with msr.  `depend' is
    listed as an output of the write so that the compiler sees a
    dependency between the write and later uses of `depend'.  */
 #define __set_neon_cumulative_sat(x, depend) {				\
     _ARM_FPSCR _afpscr_for_qc;						\
     asm volatile ("mrs %0,fpsr" : "=r" (_afpscr_for_qc));		\
     _afpscr_for_qc.b.QC = x;						\
     asm volatile ("msr fpsr,%1" : "=X" (depend) : "r" (_afpscr_for_qc)); \
   }

 # else
 /* Other targets: same as above, using "vmrs ..., fpscr".  */
 static volatile int __read_neon_cumulative_sat (void) {
     _ARM_FPSCR _afpscr_for_qc;
     asm volatile ("vmrs %0,fpscr" : "=r" (_afpscr_for_qc));
     return _afpscr_for_qc.b.QC;
 }

 /* Other targets: same read-modify-write sequence, using vmrs/vmsr on
    fpscr.  */
 #define __set_neon_cumulative_sat(x, depend) {				\
     _ARM_FPSCR _afpscr_for_qc;						\
     asm volatile ("vmrs %0,fpscr" : "=r" (_afpscr_for_qc));		\
     _afpscr_for_qc.b.QC = x;						\
     asm volatile ("vmsr fpscr,%1" : "=X" (depend) : "r" (_afpscr_for_qc)); \
   }

 # endif
 #endif

 #endif /* STM_ARM_NEON_MODELS */

 /* Dump the current value of the Neon cumulative saturation flag.

    ref_file receives "<msg>:<result_idx>:<name> Neon cumulative
    saturation <flag>" and result_idx is incremented; gcc_tests_file
    receives the matching "int VECT_VAR(expected_cumulative_sat,<t1>,<w>,<n>)
    = <flag>;" line.  The flag is read once for each of the two lines.  */
 static void dump_neon_cumulative_sat(const char* msg, const char *name,
 				     const char* t1, int w, int n)
 {
   const int line_idx = result_idx++;

   fprintf(ref_file,
 	  "%s:%d:%s Neon cumulative saturation %d\n",
 	  msg, line_idx, name, Neon_Cumulative_Sat);

   fprintf(gcc_tests_file,
 	  "int VECT_VAR(expected_cumulative_sat,%s,%d,%d) = %d;\n",
 	  t1, w, n, Neon_Cumulative_Sat);
 }

 /* Clean output buffers before execution.

    Resets result_idx to 0 and fills every result buffer with the clean
    pattern byte, so that data left over from a previous test cannot show
    up in the next dump.  The float16 buffers are cleaned as well when
    half-precision support is enabled, since dump_results() and
    dump_results_hex2() print them in that configuration.  */
 static void clean_results (void)
 {
   result_idx = 0;

   /* 64-bit vector shapes.  */
   CLEAN(result, int, 8, 8);
   CLEAN(result, int, 16, 4);
   CLEAN(result, int, 32, 2);
   CLEAN(result, int, 64, 1);
   CLEAN(result, uint, 8, 8);
   CLEAN(result, uint, 16, 4);
   CLEAN(result, uint, 32, 2);
   CLEAN(result, uint, 64, 1);
   CLEAN(result, poly, 8, 8);
   CLEAN(result, poly, 16, 4);
   CLEAN(result, float, 32, 2);
 #if defined(__ARM_FP16_FORMAT_IEEE) && ( ((__ARM_FP & 0x2) != 0) || ((__ARM_NEON_FP16_INTRINSICS & 1) != 0) )
   CLEAN(result, float, 16, 4);
 #endif

   /* 128-bit vector shapes.  */
   CLEAN(result, int, 8, 16);
   CLEAN(result, int, 16, 8);
   CLEAN(result, int, 32, 4);
   CLEAN(result, int, 64, 2);
   CLEAN(result, uint, 8, 16);
   CLEAN(result, uint, 16, 8);
   CLEAN(result, uint, 32, 4);
   CLEAN(result, uint, 64, 2);
   CLEAN(result, poly, 8, 16);
   CLEAN(result, poly, 16, 8);
   CLEAN(result, float, 32, 4);
 #if defined(__ARM_FP16_FORMAT_IEEE) && ( ((__ARM_FP & 0x2) != 0) || ((__ARM_NEON_FP16_INTRINSICS & 1) != 0) )
   CLEAN(result, float, 16, 8);
 #endif
 }


 /* Helpers to declare variables of various types.
    DECL_VARIABLE declares one volatile vector variable named
    <NAME>_<KIND><BITS>x<LANES> of type <KIND><BITS>x<LANES>_t; the other
    macros declare whole families of them.  None of the expansions ends
    with a semicolon.  */
 #define DECL_VARIABLE(NAME, KIND, BITS, LANES)				\
   volatile VECT_TYPE(KIND, BITS, LANES) VECT_VAR(NAME, KIND, BITS, LANES)

 /* 64-bit vectors, signed integer elements.  */
 #define DECL_VARIABLE_64BITS_SIGNED_VARIANTS(NAME)	\
   DECL_VARIABLE(NAME, int, 8, 8);			\
   DECL_VARIABLE(NAME, int, 16, 4);			\
   DECL_VARIABLE(NAME, int, 32, 2);			\
   DECL_VARIABLE(NAME, int, 64, 1)

 /* 64-bit vectors, unsigned integer elements.  */
 #define DECL_VARIABLE_64BITS_UNSIGNED_VARIANTS(NAME)	\
   DECL_VARIABLE(NAME, uint, 8, 8);			\
   DECL_VARIABLE(NAME, uint, 16, 4);			\
   DECL_VARIABLE(NAME, uint, 32, 2);			\
   DECL_VARIABLE(NAME, uint, 64, 1)

 /* 128-bit vectors, signed integer elements.  */
 #define DECL_VARIABLE_128BITS_SIGNED_VARIANTS(NAME)	\
   DECL_VARIABLE(NAME, int, 8, 16);			\
   DECL_VARIABLE(NAME, int, 16, 8);			\
   DECL_VARIABLE(NAME, int, 32, 4);			\
   DECL_VARIABLE(NAME, int, 64, 2)

 /* 128-bit vectors, unsigned integer elements.  */
 #define DECL_VARIABLE_128BITS_UNSIGNED_VARIANTS(NAME)	\
   DECL_VARIABLE(NAME, uint, 8, 16);			\
   DECL_VARIABLE(NAME, uint, 16, 8);			\
   DECL_VARIABLE(NAME, uint, 32, 4);			\
   DECL_VARIABLE(NAME, uint, 64, 2)

 /* All 64-bit vectors: signed, unsigned, poly and float32.  */
 #define DECL_VARIABLE_64BITS_VARIANTS(NAME)		\
   DECL_VARIABLE_64BITS_SIGNED_VARIANTS(NAME);		\
   DECL_VARIABLE_64BITS_UNSIGNED_VARIANTS(NAME);		\
   DECL_VARIABLE(NAME, poly, 8, 8);			\
   DECL_VARIABLE(NAME, poly, 16, 4);			\
   DECL_VARIABLE(NAME, float, 32, 2)

 /* All 128-bit vectors: signed, unsigned, poly and float32.  */
 #define DECL_VARIABLE_128BITS_VARIANTS(NAME)		\
   DECL_VARIABLE_128BITS_SIGNED_VARIANTS(NAME);		\
   DECL_VARIABLE_128BITS_UNSIGNED_VARIANTS(NAME);	\
   DECL_VARIABLE(NAME, poly, 8, 16);			\
   DECL_VARIABLE(NAME, poly, 16, 8);			\
   DECL_VARIABLE(NAME, float, 32, 4)

 /* Every 64-bit and 128-bit variant.  */
 #define DECL_VARIABLE_ALL_VARIANTS(NAME)		\
   DECL_VARIABLE_64BITS_VARIANTS(NAME);			\
   DECL_VARIABLE_128BITS_VARIANTS(NAME)

 /* Signed integer variants of both vector sizes.  */
 #define DECL_VARIABLE_SIGNED_VARIANTS(NAME)		\
   DECL_VARIABLE_64BITS_SIGNED_VARIANTS(NAME);		\
   DECL_VARIABLE_128BITS_SIGNED_VARIANTS(NAME)

 /* Unsigned integer variants of both vector sizes.  */
 #define DECL_VARIABLE_UNSIGNED_VARIANTS(NAME)		\
   DECL_VARIABLE_64BITS_UNSIGNED_VARIANTS(NAME);		\
   DECL_VARIABLE_128BITS_UNSIGNED_VARIANTS(NAME)

 /* Helpers to initialize vectors.  */

 /* Set every lane of <NAME>_<KIND><BITS>x<LANES> to VALUE using
    vdup<Q>_n_<SFX><BITS>.  */
 #define VDUP(NAME, Q, KIND, SFX, BITS, LANES, VALUE)			\
   VECT_VAR(NAME, KIND, BITS, LANES) = vdup##Q##_n_##SFX##BITS(VALUE)

 /* Replace lane LANE of <NAME>_<KIND><BITS>x<LANES> with VALUE using
    vset<Q>_lane_<SFX><BITS>.  */
 #define TEST_VSET_LANE(NAME, Q, KIND, SFX, BITS, LANES, LANE, VALUE)	\
   VECT_VAR(NAME, KIND, BITS, LANES) =					\
     vset##Q##_lane_##SFX##BITS(VALUE,					\
 			       VECT_VAR(NAME, KIND, BITS, LANES),	\
 			       LANE)

 /* We need to load initial values first, so rely on VLD1: load
    <NAME>_<KIND><BITS>x<LANES> from the buffer <SRC>_<KIND><BITS>x<LANES>
    with vld1<Q>_<SFX><BITS>.  */
 #define VLOAD(NAME, SRC, Q, KIND, SFX, BITS, LANES)			\
   VECT_VAR(NAME, KIND, BITS, LANES) =					\
     vld1##Q##_##SFX##BITS(VECT_VAR(SRC, KIND, BITS, LANES))

 /* Helpers for macros with 1 constant and 5 variable arguments.
    Each helper invokes OP(NAME, Q, type, suffix, width, lanes) once per
    vector shape of the family; Q is empty for 64-bit vectors and `q' for
    128-bit ones.  None of the expansions ends with a semicolon.  */
 #define TEST_MACRO_64BITS_SIGNED_VARIANTS_1_5(OP, NAME)	\
   OP(NAME, , int, s, 8, 8);				\
   OP(NAME, , int, s, 16, 4);				\
   OP(NAME, , int, s, 32, 2);				\
   OP(NAME, , int, s, 64, 1)

 #define TEST_MACRO_64BITS_UNSIGNED_VARIANTS_1_5(OP, NAME)	\
   OP(NAME, , uint, u, 8, 8);					\
   OP(NAME, , uint, u, 16, 4);					\
   OP(NAME, , uint, u, 32, 2);					\
   OP(NAME, , uint, u, 64, 1)

 #define TEST_MACRO_128BITS_SIGNED_VARIANTS_1_5(OP, NAME)	\
   OP(NAME, q, int, s, 8, 16);					\
   OP(NAME, q, int, s, 16, 8);					\
   OP(NAME, q, int, s, 32, 4);					\
   OP(NAME, q, int, s, 64, 2)

 #define TEST_MACRO_128BITS_UNSIGNED_VARIANTS_1_5(OP,NAME)	\
   OP(NAME, q, uint, u, 8, 16);					\
   OP(NAME, q, uint, u, 16, 8);					\
   OP(NAME, q, uint, u, 32, 4);					\
   OP(NAME, q, uint, u, 64, 2)

 /* Signed followed by unsigned, 64-bit vectors.  */
 #define TEST_MACRO_64BITS_VARIANTS_1_5(OP, NAME)		\
   TEST_MACRO_64BITS_SIGNED_VARIANTS_1_5(OP, NAME);		\
   TEST_MACRO_64BITS_UNSIGNED_VARIANTS_1_5(OP, NAME)

 /* Signed followed by unsigned, 128-bit vectors.  */
 #define TEST_MACRO_128BITS_VARIANTS_1_5(OP, NAME)		\
   TEST_MACRO_128BITS_SIGNED_VARIANTS_1_5(OP, NAME);		\
   TEST_MACRO_128BITS_UNSIGNED_VARIANTS_1_5(OP, NAME)

 /* Every integer variant of both vector sizes.  */
 #define TEST_MACRO_ALL_VARIANTS_1_5(OP, NAME)			\
   TEST_MACRO_64BITS_VARIANTS_1_5(OP, NAME);			\
   TEST_MACRO_128BITS_VARIANTS_1_5(OP, NAME)

 /* Signed variants of both vector sizes.  */
 #define TEST_MACRO_SIGNED_VARIANTS_1_5(OP, NAME)		\
   TEST_MACRO_64BITS_SIGNED_VARIANTS_1_5(OP, NAME);		\
   TEST_MACRO_128BITS_SIGNED_VARIANTS_1_5(OP, NAME)

 /* Helpers for macros with 2 constant and 5 variable arguments.
    Each helper invokes OP(NAME1, NAME2, Q, type, suffix, width, lanes)
    once per vector shape of the family; Q is empty for 64-bit vectors and
    `q' for 128-bit ones.  Unlike the 1_5 helpers, the per-size
    "VARIANTS" families here also cover the poly8 and poly16 shapes.
    None of the expansions ends with a semicolon.  */
 #define TEST_MACRO_64BITS_SIGNED_VARIANTS_2_5(OP, NAME1, NAME2)	\
   OP(NAME1, NAME2, , int, s, 8, 8);				\
   OP(NAME1, NAME2, , int, s, 16, 4);				\
   OP(NAME1, NAME2, , int, s, 32, 2);				\
   OP(NAME1, NAME2, , int, s, 64, 1)

 #define TEST_MACRO_64BITS_UNSIGNED_VARIANTS_2_5(OP, NAME1, NAME2)	\
   OP(NAME1, NAME2, , uint, u, 8, 8);					\
   OP(NAME1, NAME2, , uint, u, 16, 4);					\
   OP(NAME1, NAME2, , uint, u, 32, 2);					\
   OP(NAME1, NAME2, , uint, u, 64, 1)

 #define TEST_MACRO_128BITS_SIGNED_VARIANTS_2_5(OP, NAME1, NAME2)	\
   OP(NAME1, NAME2, q, int, s, 8, 16);					\
   OP(NAME1, NAME2, q, int, s, 16, 8);					\
   OP(NAME1, NAME2, q, int, s, 32, 4);					\
   OP(NAME1, NAME2, q, int, s, 64, 2)

 #define TEST_MACRO_128BITS_UNSIGNED_VARIANTS_2_5(OP, NAME1, NAME2)	\
   OP(NAME1, NAME2, q, uint, u, 8, 16);					\
   OP(NAME1, NAME2, q, uint, u, 16, 8);					\
   OP(NAME1, NAME2, q, uint, u, 32, 4);					\
   OP(NAME1, NAME2, q, uint, u, 64, 2)

 /* Signed, unsigned, then poly: 64-bit vectors.  */
 #define TEST_MACRO_64BITS_VARIANTS_2_5(OP, NAME1, NAME2)		\
   TEST_MACRO_64BITS_SIGNED_VARIANTS_2_5(OP, NAME1, NAME2);		\
   TEST_MACRO_64BITS_UNSIGNED_VARIANTS_2_5(OP, NAME1, NAME2);		\
   OP(NAME1, NAME2, , poly, p, 8, 8);					\
   OP(NAME1, NAME2, , poly, p, 16, 4)

 /* Signed, unsigned, then poly: 128-bit vectors.  */
 #define TEST_MACRO_128BITS_VARIANTS_2_5(OP, NAME1, NAME2)		\
   TEST_MACRO_128BITS_SIGNED_VARIANTS_2_5(OP, NAME1, NAME2);		\
   TEST_MACRO_128BITS_UNSIGNED_VARIANTS_2_5(OP, NAME1, NAME2);		\
   OP(NAME1, NAME2, q, poly, p, 8, 16);					\
   OP(NAME1, NAME2, q, poly, p, 16, 8)

 /* Every variant of both vector sizes.  */
 #define TEST_MACRO_ALL_VARIANTS_2_5(OP, NAME1, NAME2)			\
   TEST_MACRO_64BITS_VARIANTS_2_5(OP, NAME1, NAME2);			\
   TEST_MACRO_128BITS_VARIANTS_2_5(OP, NAME1, NAME2)

 /* Signed variants of both vector sizes.  */
 #define TEST_MACRO_SIGNED_VARIANTS_2_5(OP, NAME1, NAME2)		\
   TEST_MACRO_64BITS_SIGNED_VARIANTS_2_5(OP, NAME1, NAME2);		\
   TEST_MACRO_128BITS_SIGNED_VARIANTS_2_5(OP, NAME1, NAME2)

 #endif /* _STM_ARM_NEON_REF_H_ */