test_conformance/geometrics/test_geometrics.cpp - platform/external/OpenCL-CTS - Git at Google

 //
 // Copyright (c) 2017 The Khronos Group Inc.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
 //    http://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
 //
 #include "harness/compat.h"

 #include "testBase.h"
 #include "harness/testHarness.h"
 #include "harness/typeWrappers.h"
 #include "harness/conversions.h"
 #include "harness/errorHelpers.h"
 #include <float.h>

 const char *crossKernelSource =
 "__kernel void sample_test(__global float4 *sourceA, __global float4 *sourceB, __global float4 *destValues)\n"
 "{\n"
 "    int  tid = get_global_id(0);\n"
 "    destValues[tid] = cross( sourceA[tid], sourceB[tid] );\n"
 "\n"
 "}\n" ;

 const char *crossKernelSourceV3 =
 "__kernel void sample_test(__global float *sourceA, __global float *sourceB, __global float *destValues)\n"
 "{\n"
 "    int  tid = get_global_id(0);\n"
 "    vstore3( cross( vload3( tid, sourceA), vload3( tid,  sourceB) ), tid, destValues );\n"
 "\n"
 "}\n";

 const char *twoToFloatKernelPattern =
 "__kernel void sample_test(__global float%s *sourceA, __global float%s *sourceB, __global float *destValues)\n"
 "{\n"
 "    int  tid = get_global_id(0);\n"
 "    destValues[tid] = %s( sourceA[tid], sourceB[tid] );\n"
 "\n"
 "}\n";

 const char *twoToFloatKernelPatternV3 =
 "__kernel void sample_test(__global float%s *sourceA, __global float%s *sourceB, __global float *destValues)\n"
 "{\n"
 "    int  tid = get_global_id(0);\n"
 "    destValues[tid] = %s( vload3( tid, (__global float*) sourceA), vload3( tid, (__global float*) sourceB) );\n"
 "\n"
 "}\n";

 const char *oneToFloatKernelPattern =
 "__kernel void sample_test(__global float%s *sourceA, __global float *destValues)\n"
 "{\n"
 "    int  tid = get_global_id(0);\n"
 "    destValues[tid] = %s( sourceA[tid] );\n"
 "\n"
 "}\n";

 const char *oneToFloatKernelPatternV3 =
 "__kernel void sample_test(__global float%s *sourceA, __global float *destValues)\n"
 "{\n"
 "    int  tid = get_global_id(0);\n"
 "    destValues[tid] = %s( vload3( tid, (__global float*) sourceA) );\n"
 "\n"
 "}\n";

 const char *oneToOneKernelPattern =
 "__kernel void sample_test(__global float%s *sourceA, __global float%s *destValues)\n"
 "{\n"
 "    int  tid = get_global_id(0);\n"
 "    destValues[tid] = %s( sourceA[tid] );\n"
 "\n"
 "}\n";

 const char *oneToOneKernelPatternV3 =
 "__kernel void sample_test(__global float%s *sourceA, __global float%s *destValues)\n"
 "{\n"
 "    int  tid = get_global_id(0);\n"
 "    vstore3( %s( vload3( tid, (__global float*) sourceA) ), tid, (__global float*) destValues );\n"
 "\n"
 "}\n";

 #define TEST_SIZE (1 << 20)

 double verifyFastDistance( float *srcA, float *srcB, size_t vecSize );
 double verifyFastLength( float *srcA, size_t vecSize );


 void vector2string( char *string, float *vector, size_t elements )
 {
     *string++ = '{';
     *string++ = ' ';
     string += sprintf( string, "%a", vector[0] );
     size_t i;
     for( i = 1; i < elements; i++ )
         string += sprintf( string, ", %a", vector[i] );
     *string++ = ' ';
     *string++ = '}';
     *string = '\0';
 }

 void fillWithTrickyNumbers( float *aVectors, float *bVectors, size_t vecSize )
 {
     static const cl_float trickyValues[] = { -FLT_EPSILON, FLT_EPSILON,
         MAKE_HEX_FLOAT(0x1.0p63f, 0x1L, 63), MAKE_HEX_FLOAT(0x1.8p63f, 0x18L, 59), MAKE_HEX_FLOAT(0x1.0p64f, 0x1L, 64), MAKE_HEX_FLOAT(-0x1.0p63f, -0x1L, 63), MAKE_HEX_FLOAT(-0x1.8p-63f, -0x18L, -67), MAKE_HEX_FLOAT(-0x1.0p64f, -0x1L, 64),
         MAKE_HEX_FLOAT(0x1.0p-63f, 0x1L, -63), MAKE_HEX_FLOAT(0x1.8p-63f, 0x18L, -67), MAKE_HEX_FLOAT(0x1.0p-64f, 0x1L, -64), MAKE_HEX_FLOAT(-0x1.0p-63f, -0x1L, -63), MAKE_HEX_FLOAT(-0x1.8p-63f, -0x18L, -67), MAKE_HEX_FLOAT(-0x1.0p-64f, -0x1L, -64),
         FLT_MAX / 2.f, -FLT_MAX / 2.f, INFINITY,  -INFINITY, 0.f, -0.f };
     static const size_t trickyCount = sizeof( trickyValues ) / sizeof( trickyValues[0] );
     static const size_t stride[4] = {1, trickyCount, trickyCount*trickyCount, trickyCount*trickyCount*trickyCount };
     size_t i, j, k;

     for( j = 0; j < vecSize; j++ )
         for( k = 0; k < vecSize; k++ )
             for( i = 0; i < trickyCount; i++ )
                 aVectors[ j + stride[j] * (i + k*trickyCount)*vecSize] = trickyValues[i];

     if( bVectors )
     {
         size_t copySize = vecSize * vecSize * trickyCount;
         memset( bVectors, 0, sizeof(float) * copySize );
         memset( aVectors + copySize, 0, sizeof(float) * copySize );
         memcpy( bVectors + copySize, aVectors, sizeof(float) * copySize );
     }
 }


 void cross_product( const float *vecA, const float *vecB, float *outVector, float *errorTolerances, float ulpTolerance )
 {
     outVector[ 0 ] = ( vecA[ 1 ] * vecB[ 2 ] ) - ( vecA[ 2 ] * vecB[ 1 ] );
     outVector[ 1 ] = ( vecA[ 2 ] * vecB[ 0 ] ) - ( vecA[ 0 ] * vecB[ 2 ] );
     outVector[ 2 ] = ( vecA[ 0 ] * vecB[ 1 ] ) - ( vecA[ 1 ] * vecB[ 0 ] );
     outVector[ 3 ] = 0.0f;

     errorTolerances[ 0 ] = fmaxf( fabsf( vecA[ 1 ] ), fmaxf( fabsf( vecB[ 2 ] ), fmaxf( fabsf( vecA[ 2 ] ), fabsf( vecB[ 1 ] ) ) ) );
     errorTolerances[ 1 ] = fmaxf( fabsf( vecA[ 2 ] ), fmaxf( fabsf( vecB[ 0 ] ), fmaxf( fabsf( vecA[ 0 ] ), fabsf( vecB[ 2 ] ) ) ) );
     errorTolerances[ 2 ] = fmaxf( fabsf( vecA[ 0 ] ), fmaxf( fabsf( vecB[ 1 ] ), fmaxf( fabsf( vecA[ 1 ] ), fabsf( vecB[ 0 ] ) ) ) );

     errorTolerances[ 0 ] = errorTolerances[ 0 ] * errorTolerances[ 0 ] * ( ulpTolerance * FLT_EPSILON );    // This gives us max squared times ulp tolerance, i.e. the worst-case expected variance we could expect from this result
     errorTolerances[ 1 ] = errorTolerances[ 1 ] * errorTolerances[ 1 ] * ( ulpTolerance * FLT_EPSILON );
     errorTolerances[ 2 ] = errorTolerances[ 2 ] * errorTolerances[ 2 ] * ( ulpTolerance * FLT_EPSILON );
 }


 int test_geom_cross(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
 {
     int vecsize;
     RandomSeed seed(gRandomSeed);

     /* Get the default rounding mode */
     cl_device_fp_config defaultRoundingMode = get_default_rounding_mode(deviceID);
     if( 0 == defaultRoundingMode )
         return -1;


     for(vecsize = 3; vecsize <= 4; ++vecsize)
     {
         clProgramWrapper program;
         clKernelWrapper kernel;
         clMemWrapper streams[3];
         BufferOwningPtr<cl_float> A(malloc(sizeof(cl_float) * TEST_SIZE * vecsize));
         BufferOwningPtr<cl_float> B(malloc(sizeof(cl_float) * TEST_SIZE * vecsize));
         BufferOwningPtr<cl_float> C(malloc(sizeof(cl_float) * TEST_SIZE * vecsize));
         cl_float testVector[4];
         int error, i;
         cl_float *inDataA = A;
         cl_float *inDataB = B;
         cl_float *outData = C;
         size_t threads[1], localThreads[1];

         /* Create kernels */
         if( create_single_kernel_helper( context, &program, &kernel, 1, vecsize == 3 ? &crossKernelSourceV3 : &crossKernelSource, "sample_test" ) )
             return -1;

         /* Generate some streams. Note: deliberately do some random data in w to verify that it gets ignored */
         for( i = 0; i < TEST_SIZE * vecsize; i++ )
         {
             inDataA[ i ] = get_random_float( -512.f, 512.f, seed );
             inDataB[ i ] = get_random_float( -512.f, 512.f, seed );
         }
         fillWithTrickyNumbers( inDataA, inDataB, vecsize );

         streams[0] = clCreateBuffer(context, (cl_mem_flags)(CL_MEM_COPY_HOST_PTR), sizeof(cl_float) * vecsize * TEST_SIZE, inDataA, NULL);
         if( streams[0] == NULL )
         {
             log_error("ERROR: Creating input array A failed!\n");
             return -1;
         }
         streams[1] = clCreateBuffer(context, (cl_mem_flags)(CL_MEM_COPY_HOST_PTR), sizeof(cl_float) * vecsize * TEST_SIZE, inDataB, NULL);
         if( streams[1] == NULL )
         {
             log_error("ERROR: Creating input array B failed!\n");
             return -1;
         }
         streams[2] = clCreateBuffer(context, (cl_mem_flags)(CL_MEM_READ_WRITE), sizeof(cl_float) * vecsize * TEST_SIZE, NULL, NULL);
         if( streams[2] == NULL )
         {
             log_error("ERROR: Creating output array failed!\n");
             return -1;
         }

         /* Assign streams and execute */
         for( i = 0; i < 3; i++ )
         {
             error = clSetKernelArg(kernel, i, sizeof( streams[i] ), &streams[i]);
             test_error( error, "Unable to set indexed kernel arguments" );
         }

         /* Run the kernel */
         threads[0] = TEST_SIZE;

         error = get_max_common_work_group_size( context, kernel, threads[0], &localThreads[0] );
         test_error( error, "Unable to get work group size to use" );

         error = clEnqueueNDRangeKernel( queue, kernel, 1, NULL, threads, localThreads, 0, NULL, NULL );
         test_error( error, "Unable to execute test kernel" );

         /* Now get the results */
         error = clEnqueueReadBuffer( queue, streams[2], true, 0, sizeof( cl_float ) * TEST_SIZE * vecsize, outData, 0, NULL, NULL );
         test_error( error, "Unable to read output array!" );

         /* And verify! */
         for( i = 0; i < TEST_SIZE; i++ )
         {
             float errorTolerances[ 4 ];
             // On an embedded device w/ round-to-zero, 3 ulps is the worst-case tolerance for cross product
             cross_product( inDataA + i * vecsize, inDataB + i * vecsize, testVector, errorTolerances, 3.f );

         // RTZ devices accrue approximately double the amount of error per operation.  Allow for that.
         if( defaultRoundingMode == CL_FP_ROUND_TO_ZERO )
         {
             errorTolerances[0] *= 2.0f;
             errorTolerances[1] *= 2.0f;
             errorTolerances[2] *= 2.0f;
             errorTolerances[3] *= 2.0f;
         }

             float errs[] = { fabsf( testVector[ 0 ] - outData[ i * vecsize + 0 ] ),
                              fabsf( testVector[ 1 ] - outData[ i * vecsize + 1 ] ),
                              fabsf( testVector[ 2 ] - outData[ i * vecsize + 2 ] ) };

             if( errs[ 0 ] > errorTolerances[ 0 ] || errs[ 1 ] > errorTolerances[ 1 ] || errs[ 2 ] > errorTolerances[ 2 ] )
             {
                 log_error( "ERROR: Data sample %d does not validate! Expected (%a,%a,%a,%a), got (%a,%a,%a,%a)\n",
                           i, testVector[0], testVector[1], testVector[2], testVector[3],
                           outData[i*vecsize], outData[i*vecsize+1], outData[i*vecsize+2], outData[i*vecsize+3] );
                 log_error( "    Input: (%a %a %a) and (%a %a %a)\n",
                           inDataA[ i * vecsize + 0 ], inDataA[ i * vecsize + 1 ], inDataA[ i * vecsize + 2 ],
                           inDataB[ i * vecsize + 0 ], inDataB[ i * vecsize + 1 ], inDataB[ i * vecsize + 2 ] );
                 log_error( "    Errors: (%a out of %a), (%a out of %a), (%a out of %a)\n",
                           errs[ 0 ], errorTolerances[ 0 ], errs[ 1 ], errorTolerances[ 1 ], errs[ 2 ], errorTolerances[ 2 ] );
                 log_error("     ulp %f\n", Ulp_Error( outData[ i * vecsize + 1 ], testVector[ 1 ] ) );
                 return -1;
             }
         }
     } // for(vecsize=...

     if(!is_extension_available(deviceID, "cl_khr_fp64")) {
         log_info("Extension cl_khr_fp64 not supported; skipping double tests.\n");
         return 0;
     } else {
         log_info("Testing doubles...\n");
         return test_geom_cross_double( deviceID,  context,  queue,  num_elements, seed);
     }
 }

 float getMaxValue( float vecA[], float vecB[], size_t vecSize )
 {
     float a = fmaxf( fabsf( vecA[ 0 ] ), fabsf( vecB[ 0 ] ) );
     for( size_t i = 1; i < vecSize; i++ )
         a = fmaxf( fabsf( vecA[ i ] ), fmaxf( fabsf( vecB[ i ] ), a ) );
     return a;
 }

 typedef double (*twoToFloatVerifyFn)( float *srcA, float *srcB, size_t vecSize );

 int test_twoToFloat_kernel(cl_command_queue queue, cl_context context, const char *fnName,
                            size_t vecSize, twoToFloatVerifyFn verifyFn, float ulpLimit, MTdata d )
 {
     clProgramWrapper program;
     clKernelWrapper kernel;
     clMemWrapper streams[3];
     int error;
     size_t i, threads[1], localThreads[1];
     char kernelSource[10240];
     char *programPtr;
     char sizeNames[][4] = { "", "2", "3", "4", "", "", "", "8", "", "", "", "", "", "", "", "16" };
     int hasInfNan = 1;
     cl_device_id device = NULL;

     error = clGetCommandQueueInfo( queue, CL_QUEUE_DEVICE, sizeof( device ), &device, NULL );
     test_error( error, "Unable to get command queue device" );

     /* Check for embedded devices doing nutty stuff */
     error = clGetDeviceInfo( device, CL_DEVICE_PROFILE, sizeof( kernelSource ), kernelSource, NULL );
     test_error( error, "Unable to get device profile" );
     if( 0 == strcmp( kernelSource, "EMBEDDED_PROFILE" ) )
     {
         cl_device_fp_config config = 0;
         error = clGetDeviceInfo( device, CL_DEVICE_SINGLE_FP_CONFIG, sizeof( config ), &config, NULL );
         test_error( error, "Unable to get CL_DEVICE_SINGLE_FP_CONFIG" );

         if( CL_FP_ROUND_TO_ZERO == (config & (CL_FP_ROUND_TO_NEAREST|CL_FP_ROUND_TO_ZERO)))
             ulpLimit *= 2.0f; // rtz operations average twice the accrued error of rte operations

         if( 0 == (config & CL_FP_INF_NAN) )
             hasInfNan = 0;
     }

     BufferOwningPtr<cl_float> A(malloc(sizeof(cl_float) * TEST_SIZE * 4));
     BufferOwningPtr<cl_float> B(malloc(sizeof(cl_float) * TEST_SIZE * 4));
     BufferOwningPtr<cl_float> C(malloc(sizeof(cl_float) * TEST_SIZE));

     cl_float *inDataA = A;
     cl_float *inDataB = B;
     cl_float *outData = C;

     /* Create the source */
     sprintf( kernelSource, vecSize == 3 ? twoToFloatKernelPatternV3 : twoToFloatKernelPattern, sizeNames[vecSize-1], sizeNames[vecSize-1], fnName );

     /* Create kernels */
     programPtr = kernelSource;
     if( create_single_kernel_helper( context, &program, &kernel, 1, (const char **)&programPtr, "sample_test" ) )
     {
         return -1;
     }
     /* Generate some streams */
     for( i = 0; i < TEST_SIZE * vecSize; i++ )
     {
         inDataA[ i ] = get_random_float( -512.f, 512.f, d );
         inDataB[ i ] = get_random_float( -512.f, 512.f, d );
     }
     fillWithTrickyNumbers( inDataA, inDataB, vecSize );

     /* Clamp values to be in range for fast_ functions */
     if( verifyFn == verifyFastDistance )
     {
         for( i = 0; i < TEST_SIZE * vecSize; i++ )
         {
             if( fabsf( inDataA[i] ) > MAKE_HEX_FLOAT(0x1.0p62f, 0x1L, 62) || fabsf( inDataA[i] ) < MAKE_HEX_FLOAT(0x1.0p-62f, 0x1L, -62) )
                 inDataA[ i ] = get_random_float( -512.f, 512.f, d );
             if( fabsf( inDataB[i] ) > MAKE_HEX_FLOAT(0x1.0p62f, 0x1L, 62) || fabsf( inDataB[i] ) < MAKE_HEX_FLOAT(0x1.0p-62f, 0x1L, -62) )
                 inDataB[ i ] = get_random_float( -512.f, 512.f, d );
         }
     }


     streams[0] = clCreateBuffer(context, (cl_mem_flags)(CL_MEM_COPY_HOST_PTR), sizeof(cl_float) * vecSize * TEST_SIZE, inDataA, NULL);
     if( streams[0] == NULL )
     {
         log_error("ERROR: Creating input array A failed!\n");
         return -1;
     }
     streams[1] = clCreateBuffer(context, (cl_mem_flags)(CL_MEM_COPY_HOST_PTR), sizeof(cl_float) * vecSize * TEST_SIZE, inDataB, NULL);
     if( streams[1] == NULL )
     {
         log_error("ERROR: Creating input array B failed!\n");
         return -1;
     }
     streams[2] = clCreateBuffer(context, (cl_mem_flags)(CL_MEM_READ_WRITE), sizeof(cl_float) * TEST_SIZE, NULL, NULL);
     if( streams[2] == NULL )
     {
         log_error("ERROR: Creating output array failed!\n");
         return -1;
     }

     /* Assign streams and execute */
     for( i = 0; i < 3; i++ )
     {
         error = clSetKernelArg(kernel, (int)i, sizeof( streams[i] ), &streams[i]);
         test_error( error, "Unable to set indexed kernel arguments" );
     }

     /* Run the kernel */
     threads[0] = TEST_SIZE;

     error = get_max_common_work_group_size( context, kernel, threads[0], &localThreads[0] );
     test_error( error, "Unable to get work group size to use" );

     error = clEnqueueNDRangeKernel( queue, kernel, 1, NULL, threads, localThreads, 0, NULL, NULL );
     test_error( error, "Unable to execute test kernel" );

     /* Now get the results */
     error = clEnqueueReadBuffer( queue, streams[2], true, 0, sizeof( cl_float ) * TEST_SIZE, outData, 0, NULL, NULL );
     test_error( error, "Unable to read output array!" );


     /* And verify! */
     int skipCount = 0;
     for( i = 0; i < TEST_SIZE; i++ )
     {
         cl_float *src1 = inDataA + i * vecSize;
         cl_float *src2 = inDataB + i * vecSize;
         double expected = verifyFn( src1, src2, vecSize );
         if( (float) expected != outData[ i ] )
         {
             if( isnan(expected) && isnan( outData[i] ) )
                 continue;

             if( ! hasInfNan )
             {
                 size_t ii;
                 for( ii = 0; ii < vecSize; ii++ )
                 {
                     if( ! isfinite( src1[ii] ) || ! isfinite( src2[ii] ) )
                     {
                         skipCount++;
                         continue;
                     }
                 }
                 if( ! isfinite( (cl_float) expected ) )
                 {
                     skipCount++;
                     continue;
                 }
             }

             if( ulpLimit < 0 )
             {
                 // Limit below zero means we need to test via a computed error (like cross product does)
                 float maxValue =
                 getMaxValue( inDataA + i * vecSize, inDataB + i * vecSize,vecSize );
                 // In this case (dot is the only one that gets here), the ulp is 2*vecSize - 1 (n + n-1 max # of errors)
                 float errorTolerance = maxValue * maxValue * ( 2.f * (float)vecSize - 1.f ) * FLT_EPSILON;

                 // Limit below zero means test via epsilon instead
                 double error =
                 fabs( (double)expected - (double)outData[ i ] );
                 if( error > errorTolerance )
                 {

                     log_error( "ERROR: Data sample %d at size %d does not validate! Expected (%a), got (%a), sources (%a and %a) error of %g against tolerance %g\n",
                               (int)i, (int)vecSize, expected,
                               outData[ i ],
                               inDataA[i*vecSize],
                               inDataB[i*vecSize],
                               (float)error,
                               (float)errorTolerance );

                     char vecA[1000], vecB[1000];
                     vector2string( vecA, inDataA +i * vecSize, vecSize );
                     vector2string( vecB, inDataB + i * vecSize, vecSize );
                     log_error( "\tvector A: %s, vector B: %s\n", vecA, vecB );
                     return -1;
                 }
             }
             else
             {
                 float error = Ulp_Error( outData[ i ], expected );
                 if( fabsf(error) > ulpLimit )
                 {
                     log_error( "ERROR: Data sample %d at size %d does not validate! Expected (%a), got (%a), sources (%a and %a) ulp of %f\n",
                               (int)i, (int)vecSize, expected, outData[ i ], inDataA[i*vecSize], inDataB[i*vecSize], error );

                     char vecA[1000], vecB[1000];
                     vector2string( vecA, inDataA + i * vecSize, vecSize );
                     vector2string( vecB, inDataB + i * vecSize, vecSize );
                     log_error( "\tvector A: %s, vector B: %s\n", vecA, vecB );
                     return -1;
                 }
             }
         }
     }

     if( skipCount )
         log_info( "Skipped %d tests out of %d because they contained Infs or NaNs\n\tEMBEDDED_PROFILE Device does not support CL_FP_INF_NAN\n", skipCount, TEST_SIZE );

     return 0;
 }

 double verifyDot( float *srcA, float *srcB, size_t vecSize )
 {
     double total = 0.f;

     for( unsigned int i = 0; i < vecSize; i++ )
         total += (double)srcA[ i ] * (double)srcB[ i ];

     return total;
 }

 int test_geom_dot(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
 {
     size_t sizes[] = { 1, 2, 3, 4, 0 };
     unsigned int size;
     int retVal = 0;
     RandomSeed seed(gRandomSeed);

     for( size = 0; sizes[ size ] != 0 ; size++ )
     {
         if( test_twoToFloat_kernel( queue, context, "dot", sizes[size], verifyDot, -1.0f /*magic value*/, seed ) != 0 )
         {
             log_error( "   dot vector size %d FAILED\n", (int)sizes[ size ] );
             retVal = -1;
         }
     }

     if (retVal)
         return retVal;

     if(!is_extension_available(deviceID, "cl_khr_fp64"))
     {
         log_info("Extension cl_khr_fp64 not supported; skipping double tests.\n");
         return 0;
     }

     log_info("Testing doubles...\n");
     return test_geom_dot_double( deviceID,  context,  queue,  num_elements, seed);
 }

 double verifyFastDistance( float *srcA, float *srcB, size_t vecSize )
 {
     double total = 0, value;
     unsigned int i;

     // We calculate the distance as a double, to try and make up for the fact that
     // the GPU has better precision distance since it's a single op
     for( i = 0; i < vecSize; i++ )
     {
         value = (double)srcA[i] - (double)srcB[i];
         total += value * value;
     }

     return sqrt( total );
 }

 int test_geom_fast_distance(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
 {
     size_t sizes[] = { 1, 2, 3, 4, 0 };
     unsigned int size;
     int retVal = 0;
     RandomSeed seed(gRandomSeed);

     for( size = 0; sizes[ size ] != 0 ; size++ )
     {
         float maxUlps = 8192.0f +                           // error in sqrt
         ( 1.5f * (float) sizes[size] +      // cumulative error for multiplications  (a-b+0.5ulp)**2 = (a-b)**2 + a*0.5ulp + b*0.5 ulp + 0.5 ulp for multiplication
          0.5f * (float) (sizes[size]-1));    // cumulative error for additions

         if( test_twoToFloat_kernel( queue, context, "fast_distance",
                                    sizes[ size ], verifyFastDistance,
                                    maxUlps, seed ) != 0 )
         {
             log_error( "   fast_distance vector size %d FAILED\n",
                       (int)sizes[ size ] );
             retVal = -1;
         }
         else
         {
             log_info( "   fast_distance vector size %d passed\n",
                      (int)sizes[ size ] );
         }
     }
     return retVal;
 }


 double verifyDistance( float *srcA, float *srcB, size_t vecSize )
 {
     double total = 0, value;
     unsigned int i;

     // We calculate the distance as a double, to try and make up for the fact that
     // the GPU has better precision distance since it's a single op
     for( i = 0; i < vecSize; i++ )
     {
         value = (double)srcA[i] - (double)srcB[i];
         total += value * value;
     }

     return sqrt( total );
 }

 int test_geom_distance(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
 {
     size_t sizes[] = { 1, 2, 3, 4, 0 };
     unsigned int size;
     int retVal = 0;
     RandomSeed seed(gRandomSeed );

     for( size = 0; sizes[ size ] != 0 ; size++ )
     {
         float maxUlps = 3.0f +                              // error in sqrt
         ( 1.5f * (float) sizes[size] +      // cumulative error for multiplications  (a-b+0.5ulp)**2 = (a-b)**2 + a*0.5ulp + b*0.5 ulp + 0.5 ulp for multiplication
          0.5f * (float) (sizes[size]-1));    // cumulative error for additions

         if( test_twoToFloat_kernel( queue, context, "distance", sizes[ size ], verifyDistance, maxUlps, seed ) != 0 )
         {
             log_error( "   distance vector size %d FAILED\n",
                       (int)sizes[ size ] );
             retVal = -1;
         }
         else
         {
             log_info( "   distance vector size %d passed\n", (int)sizes[ size ] );
         }
     }
     if (retVal)
         return retVal;

     if(!is_extension_available(deviceID, "cl_khr_fp64"))
     {
         log_info("Extension cl_khr_fp64 not supported; skipping double tests.\n");
         return 0;
     } else {
         log_info("Testing doubles...\n");
         return test_geom_distance_double( deviceID,  context,  queue,  num_elements, seed);
     }
 }

 typedef double (*oneToFloatVerifyFn)( float *srcA, size_t vecSize );

 int test_oneToFloat_kernel(cl_command_queue queue, cl_context context, const char *fnName,
                            size_t vecSize, oneToFloatVerifyFn verifyFn, float ulpLimit, MTdata d )
 {
     clProgramWrapper program;
     clKernelWrapper kernel;
     clMemWrapper streams[2];
     BufferOwningPtr<cl_float> A(malloc(sizeof(cl_float) * TEST_SIZE * 4));
     BufferOwningPtr<cl_float> B(malloc(sizeof(cl_float) * TEST_SIZE));
     int error;
     size_t i, threads[1], localThreads[1];
     char kernelSource[10240];
     char *programPtr;
     char sizeNames[][4] = { "", "2", "3", "4", "", "", "", "8", "", "", "", "", "", "", "", "16" };
     cl_float *inDataA = A;
     cl_float *outData = B;

     /* Create the source */
     sprintf( kernelSource, vecSize == 3? oneToFloatKernelPatternV3 : oneToFloatKernelPattern, sizeNames[vecSize-1], fnName );

     /* Create kernels */
     programPtr = kernelSource;
     if( create_single_kernel_helper( context, &program, &kernel, 1, (const char **)&programPtr, "sample_test" ) )
     {
         return -1;
     }

     /* Generate some streams */
     for( i = 0; i < TEST_SIZE * vecSize; i++ )
     {
         inDataA[ i ] = get_random_float( -512.f, 512.f, d );
     }
     fillWithTrickyNumbers( inDataA, NULL, vecSize );

     /* Clamp values to be in range for fast_ functions */
     if( verifyFn == verifyFastLength )
     {
         for( i = 0; i < TEST_SIZE * vecSize; i++ )
         {
             if( fabsf( inDataA[i] ) > MAKE_HEX_FLOAT(0x1.0p62f, 0x1L, 62) || fabsf( inDataA[i] ) < MAKE_HEX_FLOAT(0x1.0p-62f, 0x1L, -62) )
                 inDataA[ i ] = get_random_float( -512.f, 512.f, d );
         }
     }

     streams[0] = clCreateBuffer(context, (cl_mem_flags)(CL_MEM_COPY_HOST_PTR),
                                 sizeof(cl_float) * vecSize * TEST_SIZE, inDataA, NULL);
     if( streams[0] == NULL )
     {
         log_error("ERROR: Creating input array A failed!\n");
         return -1;
     }
     streams[1] = clCreateBuffer(context, (cl_mem_flags)(CL_MEM_READ_WRITE),
                                 sizeof(cl_float) * TEST_SIZE, NULL, NULL);
     if( streams[1] == NULL )
     {
         log_error("ERROR: Creating output array failed!\n");
         return -1;
     }

     /* Assign streams and execute */
     error = clSetKernelArg( kernel, 0, sizeof( streams[ 0 ] ), &streams[0] );
     test_error( error, "Unable to set indexed kernel arguments" );
     error = clSetKernelArg( kernel, 1, sizeof( streams[ 1 ] ), &streams[1] );
     test_error( error, "Unable to set indexed kernel arguments" );

     /* Run the kernel */
     threads[0] = TEST_SIZE;

     error = get_max_common_work_group_size( context, kernel, threads[0],
                                            &localThreads[0] );
     test_error( error, "Unable to get work group size to use" );

     error = clEnqueueNDRangeKernel( queue, kernel, 1, NULL, threads,
                                    localThreads, 0, NULL, NULL );
     test_error( error, "Unable to execute test kernel" );

     /* Now get the results */
     error = clEnqueueReadBuffer( queue, streams[1], true, 0,
                                 sizeof( cl_float ) * TEST_SIZE, outData,
                                 0, NULL, NULL );
     test_error( error, "Unable to read output array!" );

     /* And verify! */
     for( i = 0; i < TEST_SIZE; i++ )
     {
         double expected = verifyFn( inDataA + i * vecSize, vecSize );
         if( (float) expected != outData[ i ] )
         {
             float ulps = Ulp_Error( outData[i], expected );
             if( fabsf( ulps ) <= ulpLimit )
                 continue;

             // We have to special case NAN
             if( isnan( outData[ i ] ) && isnan( expected ) )
                 continue;

             if(! (fabsf(ulps) < ulpLimit) )
             {
                 log_error( "ERROR: Data sample %d at size %d does not validate! Expected (%a), got (%a), source (%a), ulp %f\n",
                           (int)i, (int)vecSize, expected, outData[ i ],  inDataA[i*vecSize], ulps );
                 char vecA[1000];
                 vector2string( vecA, inDataA + i *vecSize, vecSize );
                 log_error( "\tvector: %s", vecA );
                 return -1;
             }
         }
     }

     return 0;
 }

 double verifyLength( float *srcA, size_t vecSize )
 {
     double total = 0;
     unsigned int i;

     // We calculate the distance as a double, to try and make up for the fact that
     // the GPU has better precision distance since it's a single op
     for( i = 0; i < vecSize; i++ )
     {
         total += (double)srcA[i] * (double)srcA[i];
     }

     return sqrt( total );
 }

 int test_geom_length(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
 {
     size_t sizes[] = { 1, 2, 3, 4, 0 };
     unsigned int size;
     int retVal = 0;
     RandomSeed seed( gRandomSeed );

     for( size = 0; sizes[ size ] != 0 ; size++ )
     {
         float maxUlps = 3.0f +                              // error in sqrt
         0.5f *                              // effect on e of taking sqrt( x + e )
         ( 0.5f * (float) sizes[size] +      // cumulative error for multiplications
          0.5f * (float) (sizes[size]-1));    // cumulative error for additions

         if( test_oneToFloat_kernel( queue, context, "length", sizes[ size ], verifyLength, maxUlps, seed ) != 0 )
         {
             log_error( "   length vector size %d FAILED\n", (int)sizes[ size ] );
             retVal = -1;
         }
         else
         {
             log_info( "   length vector vector size %d passed\n", (int)sizes[ size ] );
         }
     }
     if (retVal)
         return retVal;

     if(!is_extension_available(deviceID, "cl_khr_fp64"))
     {
         log_info("Extension cl_khr_fp64 not supported; skipping double tests.\n");
         return 0;
     }
     else
     {
         log_info("Testing doubles...\n");
         return test_geom_length_double( deviceID,  context,  queue,  num_elements, seed);
     }
 }


 double verifyFastLength( float *srcA, size_t vecSize )
 {
     double total = 0;
     unsigned int i;

     // We calculate the distance as a double, to try and make up for the fact that
     // the GPU has better precision distance since it's a single op
     for( i = 0; i < vecSize; i++ )
     {
         total += (double)srcA[i] * (double)srcA[i];
     }

     return sqrt( total );
 }

 int test_geom_fast_length(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
 {
     size_t sizes[] = { 1, 2, 3, 4, 0 };
     unsigned int size;
     int retVal = 0;
     RandomSeed seed(gRandomSeed);

     for( size = 0; sizes[ size ] != 0 ; size++ )
     {
         float maxUlps = 8192.0f +                           // error in half_sqrt
         ( 0.5f * (float) sizes[size] +      // cumulative error for multiplications
          0.5f * (float) (sizes[size]-1));    // cumulative error for additions

         if( test_oneToFloat_kernel( queue, context, "fast_length", sizes[ size ], verifyFastLength, maxUlps, seed ) != 0 )
         {
             log_error( "   fast_length vector size %d FAILED\n", (int)sizes[ size ] );
             retVal = -1;
         }
         else
         {
             log_info( "   fast_length vector size %d passed\n", (int)sizes[ size ] );
         }
     }
     return retVal;
 }


 typedef void (*oneToOneVerifyFn)( float *srcA, float *dstA, size_t vecSize );


 int test_oneToOne_kernel(cl_command_queue queue, cl_context context, const char *fnName,
                          size_t vecSize, oneToOneVerifyFn verifyFn, float ulpLimit, int softball, MTdata d )
 {
     clProgramWrapper program;
     clKernelWrapper kernel;
     clMemWrapper streams[2];
     BufferOwningPtr<cl_float> A(malloc(sizeof(cl_float) * TEST_SIZE
                                        * vecSize));
     BufferOwningPtr<cl_float> B(malloc(sizeof(cl_float) * TEST_SIZE
                                        * vecSize));
     int error;
     size_t i, j, threads[1], localThreads[1];
     char kernelSource[10240];
     char *programPtr;
     char sizeNames[][4] = { "", "2", "3", "4", "", "", "", "8", "", "", "", "", "", "", "", "16" };
     cl_float *inDataA = A;
     cl_float *outData = B;
     float ulp_error = 0;

     /* Create the source */
     sprintf( kernelSource, vecSize == 3 ? oneToOneKernelPatternV3: oneToOneKernelPattern, sizeNames[vecSize-1], sizeNames[vecSize-1], fnName );

     /* Create kernels */
     programPtr = kernelSource;
     if( create_single_kernel_helper( context, &program, &kernel, 1, (const char **)&programPtr,  "sample_test" ) )
         return -1;

     /* Initialize data.  First element always 0. */
     memset( inDataA, 0, sizeof(cl_float) * vecSize );
     if( 0 == strcmp( fnName, "fast_normalize" ))
     { // keep problematic cases out of the fast function
         for( i = vecSize; i < TEST_SIZE * vecSize; i++ )
         {
             cl_float z = get_random_float( -MAKE_HEX_FLOAT( 0x1.0p60f, 1, 60), MAKE_HEX_FLOAT( 0x1.0p60f, 1, 60), d);
             if( fabsf(z) < MAKE_HEX_FLOAT( 0x1.0p-60f, 1, -60) )
                 z = copysignf( 0.0f, z );
             inDataA[i] = z;
         }
     }
     else
     {
         for( i = vecSize; i < TEST_SIZE * vecSize; i++ )
             inDataA[i] = any_float(d);
     }

     streams[0] = clCreateBuffer(context, (cl_mem_flags)(CL_MEM_COPY_HOST_PTR), sizeof(cl_float) * vecSize* TEST_SIZE, inDataA, NULL);
     if( streams[0] == NULL )
     {
         log_error("ERROR: Creating input array A failed!\n");
         return -1;
     }
     streams[1] = clCreateBuffer(context, (cl_mem_flags)(CL_MEM_READ_WRITE), sizeof(cl_float) * vecSize  * TEST_SIZE, NULL, NULL);
     if( streams[1] == NULL )
     {
         log_error("ERROR: Creating output array failed!\n");
         return -1;
     }

     /* Assign streams and execute */
     error = clSetKernelArg(kernel, 0, sizeof( streams[0] ), &streams[0] );
     test_error( error, "Unable to set indexed kernel arguments" );
     error = clSetKernelArg(kernel, 1, sizeof( streams[1] ), &streams[1] );
     test_error( error, "Unable to set indexed kernel arguments" );

     /* Run the kernel */
     threads[0] = TEST_SIZE;

     error = get_max_common_work_group_size( context, kernel, threads[0], &localThreads[0] );
     test_error( error, "Unable to get work group size to use" );

     error = clEnqueueNDRangeKernel( queue, kernel, 1, NULL, threads, localThreads, 0, NULL, NULL );
     test_error( error, "Unable to execute test kernel" );

     /* Now get the results */
     error = clEnqueueReadBuffer( queue, streams[1], true, 0, sizeof( cl_float ) * TEST_SIZE  * vecSize, outData, 0, NULL, NULL );
     test_error( error, "Unable to read output array!" );

     /* And verify! */
     for( i = 0; i < TEST_SIZE; i++ )
     {
         float expected[4];
         int fail = 0;
         verifyFn( inDataA + i * vecSize, expected, vecSize );
         for( j = 0; j < vecSize; j++ )
         {
             // We have to special case NAN
             if( isnan( outData[ i * vecSize + j ] )
                && isnan( expected[ j ] ) )
                 continue;

             if( expected[j] != outData[ i * vecSize + j ] ) {
                 ulp_error = Ulp_Error(  outData[i*vecSize+j], expected[ j ] );

                 if( fabsf(ulp_error) > ulpLimit ) {
                     fail = 1;
                     break;
                 }
             }

         }

         // try again with subnormals flushed to zero if the platform flushes
         if( fail && gFlushDenormsToZero )
         {
             float temp[4], expected2[4];
             for( j = 0; j < vecSize; j++ )
             {
                 if( IsFloatSubnormal(inDataA[i*vecSize+j] ) )
                     temp[j] = copysignf( 0.0f, inDataA[i*vecSize+j] );
                 else
                     temp[j] = inDataA[ i*vecSize +j];
             }

             verifyFn( temp, expected2, vecSize );
             fail = 0;

             for( j = 0; j < vecSize; j++ )
             {
                 // We have to special case NAN
                 if( isnan( outData[ i * vecSize + j ] ) && isnan( expected[ j ] ) )
                     continue;

                 if( expected2[j] != outData[ i * vecSize + j ] )
                 {
                     ulp_error = Ulp_Error(outData[i*vecSize + j ], expected[ j ]  );

                     if( fabsf(ulp_error) > ulpLimit )
                     {
                         if( IsFloatSubnormal(expected2[j]) )
                         {
                             expected2[j] = 0.0f;
                             if( expected2[j] !=  outData[i*vecSize + j ] )
                             {
                                 ulp_error = Ulp_Error(  outData[ i * vecSize + j ], expected[ j ] );
                                 if( fabsf(ulp_error) > ulpLimit ) {
                                     fail = 1;
                                     break;
                                 }
                             }
                         }
                     }
                 }
             }
         }

         if( fail )
         {
             log_error( "ERROR: Data sample {%d,%d} at size %d does not validate! Expected %12.24f (%a), got %12.24f (%a), ulp %f\n",
                       (int)i, (int)j, (int)vecSize, expected[j], expected[j], outData[ i*vecSize+j], outData[ i*vecSize+j], ulp_error );
             log_error( "       Source: " );
             for( size_t q = 0; q < vecSize; q++ )
                 log_error( "%g ", inDataA[ i * vecSize+q]);
             log_error( "\n             : " );
             for( size_t q = 0; q < vecSize; q++ )
                 log_error( "%a ", inDataA[i*vecSize +q] );
             log_error( "\n" );
             log_error( "       Result: " );
             for( size_t q = 0; q < vecSize; q++ )
                 log_error( "%g ", outData[ i *vecSize + q ] );
             log_error( "\n             : " );
             for( size_t q = 0; q < vecSize; q++ )
                 log_error( "%a ", outData[ i * vecSize + q ] );
             log_error( "\n" );
             log_error( "       Expected: " );
             for( size_t q = 0; q < vecSize; q++ )
                 log_error( "%g ", expected[ q ] );
             log_error( "\n             : " );
             for( size_t q = 0; q < vecSize; q++ )
                 log_error( "%a ", expected[ q ] );
             log_error( "\n" );
             return -1;
         }
     }

     return 0;
 }

 void verifyNormalize( float *srcA, float *dst, size_t vecSize )
 {
     double total = 0, value;
     unsigned int i;

     // We calculate everything as a double, to try and make up for the fact that
     // the GPU has better precision distance since it's a single op
     for( i = 0; i < vecSize; i++ )
         total += (double)srcA[i] * (double)srcA[i];

     if( total == 0.f )
     {
         // Special edge case: copy vector over without change
         for( i = 0; i < vecSize; i++ )
             dst[i] = srcA[i];
         return;
     }

     // Deal with infinities
     if( total == INFINITY )
     {
         total = 0.0f;
         for( i = 0; i < vecSize; i++ )
         {
             if( fabsf( srcA[i]) == INFINITY )
                 dst[i] = copysignf( 1.0f, srcA[i] );
             else
                 dst[i] = copysignf( 0.0f, srcA[i] );
             total += (double)dst[i] * (double)dst[i];
         }

         srcA = dst;
     }

     value = sqrt( total );
     for( i = 0; i < vecSize; i++ )
         dst[i] = (float)( (double)srcA[i] / value );
 }

 int test_geom_normalize(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
 {
     size_t sizes[] = { 1, 2, 3, 4, 0 };
     unsigned int size;
     int retVal = 0;
     RandomSeed seed(gRandomSeed);

     for( size = 0; sizes[ size ] != 0 ; size++ )
     {
         float maxUlps = 2.5f +                              // error in rsqrt + error in multiply
         ( 0.5f * (float) sizes[size] +      // cumulative error for multiplications
          0.5f * (float) (sizes[size]-1));    // cumulative error for additions
         if( test_oneToOne_kernel( queue, context, "normalize", sizes[ size ], verifyNormalize, maxUlps, 0, seed ) != 0 )
         {
             log_error( "   normalized vector size %d FAILED\n", (int)sizes[ size ] );
             retVal = -1;
         }
         else
         {
             log_info( "   normalized vector size %d passed\n", (int)sizes[ size ] );
         }
     }
     if (retVal)
         return retVal;

     if(!is_extension_available(deviceID, "cl_khr_fp64"))
     {
         log_info("Extension cl_khr_fp64 not supported; skipping double tests.\n");
         return 0;
     } else {
         log_info("Testing doubles...\n");
         return test_geom_normalize_double( deviceID,  context,  queue,  num_elements, seed);
     }
 }


 int test_geom_fast_normalize(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
 {
     size_t sizes[] = { 1, 2, 3, 4, 0 };
     unsigned int size;
     int retVal = 0;
     RandomSeed seed( gRandomSeed );

     for( size = 0; sizes[ size ] != 0 ; size++ )
     {
         float maxUlps = 8192.5f +                           // error in rsqrt + error in multiply
         ( 0.5f * (float) sizes[size] +      // cumulative error for multiplications
          0.5f * (float) (sizes[size]-1));    // cumulative error for additions

         if( test_oneToOne_kernel( queue, context, "fast_normalize", sizes[ size ], verifyNormalize, maxUlps, 1, seed ) != 0 )
         {
             log_error( "   fast_normalize vector size %d FAILED\n", (int)sizes[ size ] );
             retVal = -1;
         }
         else
         {
             log_info( "   fast_normalize vector size %d passed\n", (int)sizes[ size ] );
         }
     }
     return retVal;
 }