//
// Copyright (c) 2017 The Khronos Group Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
#include "ThreadPool.h"
#include "errorHelpers.h"
#include "fpcontrol.h"
#include <stdio.h>
#include <stdlib.h>
#if defined( __APPLE__ ) || defined( __linux__ ) || defined( _WIN32 ) // or any other POSIX system
#if defined( _WIN32 )
#include <windows.h>
#if defined(_MSC_VER)
#include <intrin.h>
#endif
#include "mingw_compat.h"
#include <process.h>
#else // !_WIN32
#include <pthread.h>
#include <unistd.h>
#include <sys/errno.h>
#ifdef __linux__
#include <sched.h>
#endif
#endif // !_WIN32
// declarations
#ifdef _WIN32
void ThreadPool_WorkerFunc( void *p );
#else
void *ThreadPool_WorkerFunc( void *p );
#endif
void ThreadPool_Init(void);
void ThreadPool_Exit(void);
#if defined (__MINGW32__)
// Mutex for implementing super heavy atomic operations if you don't have GCC or MSVC
CRITICAL_SECTION gAtomicLock;
#elif defined( __GNUC__ ) || defined( _MSC_VER)
#else
pthread_mutex_t gAtomicLock;
#endif
// Atomic add operator with mem barrier. Mem barrier needed to protect state modified by the worker functions.
cl_int ThreadPool_AtomicAdd( volatile cl_int *a, cl_int b )
{
#if defined (__MINGW32__)
// No atomics on Mingw32
EnterCriticalSection(&gAtomicLock);
cl_int old = *a;
*a = old + b;
LeaveCriticalSection(&gAtomicLock);
return old;
#elif defined( __GNUC__ )
// GCC extension: http://gcc.gnu.org/onlinedocs/gcc/Atomic-Builtins.html#Atomic-Builtins
return __sync_fetch_and_add( a, b );
// do we need __sync_synchronize() here, too? GCC docs are unclear whether __sync_fetch_and_add does a synchronize
#elif defined( _MSC_VER )
return (cl_int) _InterlockedExchangeAdd( (volatile LONG*) a, (LONG) b );
#else
#warning Please add an atomic add implementation here, with a memory barrier. The fallback code below is slow.
if( pthread_mutex_lock(&gAtomicLock) )
log_error( "Atomic operation failed. pthread_mutex_lock(&gAtomicLock) returned an error\n");
cl_int old = *a;
*a = old + b;
if( pthread_mutex_unlock(&gAtomicLock) )
log_error( "Failed to release gAtomicLock. Further atomic operations may deadlock!\n");
return old;
#endif
}
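// Usage sketch (illustrative only, not compiled in): because ThreadPool_AtomicAdd
// returns the value *before* the addition, concurrent callers each observe a
// distinct value. The pool uses this below to hand out unique thread IDs and to
// claim job indices from gRunCount. exampleCounter is hypothetical.
#if 0
static volatile cl_int exampleCounter = 0;
static cl_int example_claim_slot( void )
{
// each caller gets 0, 1, 2, ... exactly once, regardless of interleaving
return ThreadPool_AtomicAdd( &exampleCounter, 1 );
}
#endif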
#if defined( _WIN32 )
// Uncomment the following line if Windows XP support is not required.
// #define HAS_INIT_ONCE_EXECUTE_ONCE 1
#if defined(HAS_INIT_ONCE_EXECUTE_ONCE)
#define _INIT_ONCE INIT_ONCE
#define _PINIT_ONCE PINIT_ONCE
#define _InitOnceExecuteOnce InitOnceExecuteOnce
#else // !HAS_INIT_ONCE_EXECUTE_ONCE
typedef volatile LONG _INIT_ONCE;
typedef _INIT_ONCE *_PINIT_ONCE;
typedef BOOL (CALLBACK *_PINIT_ONCE_FN)(_PINIT_ONCE, PVOID, PVOID *);
#define _INIT_ONCE_UNINITIALIZED 0
#define _INIT_ONCE_IN_PROGRESS 1
#define _INIT_ONCE_DONE 2
static BOOL _InitOnceExecuteOnce(
_PINIT_ONCE InitOnce,
_PINIT_ONCE_FN InitFn,
PVOID Parameter,
LPVOID *Context
)
{
while ( *InitOnce != _INIT_ONCE_DONE )
{
if (*InitOnce != _INIT_ONCE_IN_PROGRESS && _InterlockedCompareExchange( InitOnce, _INIT_ONCE_IN_PROGRESS, _INIT_ONCE_UNINITIALIZED ) == _INIT_ONCE_UNINITIALIZED )
{
InitFn( InitOnce, Parameter, Context );
*InitOnce = _INIT_ONCE_DONE;
return TRUE;
}
Sleep( 1 );
}
return TRUE;
}
#endif // !HAS_INIT_ONCE_EXECUTE_ONCE
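// Usage sketch (illustrative): many threads may race on a zero-initialized
// _INIT_ONCE; exactly one runs the InitFn while the rest Sleep(1) until the
// control word reads _INIT_ONCE_DONE. ThreadPool_Do and GetThreadCount below
// use it with _ThreadPool_Init as the InitFn. example_once and example_init
// here are hypothetical names.
#if 0
static _INIT_ONCE example_once; // zero-initialized == _INIT_ONCE_UNINITIALIZED
static BOOL CALLBACK example_init( _PINIT_ONCE once, PVOID param, PVOID *ctx )
{
/* one-time setup goes here */
return TRUE;
}
static void example_lazy_init( void )
{
// safe to call from any number of threads; only the first runs example_init
_InitOnceExecuteOnce( &example_once, example_init, NULL, NULL );
}
#endif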
// Uncomment the following line if Windows XP support is not required.
// #define HAS_CONDITION_VARIABLE 1
#if defined(HAS_CONDITION_VARIABLE)
#define _CONDITION_VARIABLE CONDITION_VARIABLE
#define _InitializeConditionVariable InitializeConditionVariable
#define _SleepConditionVariableCS SleepConditionVariableCS
#define _WakeAllConditionVariable WakeAllConditionVariable
#else // !HAS_CONDITION_VARIABLE
typedef struct
{
HANDLE mEvent; // Used to park the thread.
CRITICAL_SECTION mLock[1]; // Used to protect mWaiters, mGeneration and mReleaseCount.
volatile cl_int mWaiters; // Number of threads waiting on this cond var.
volatile cl_int mGeneration; // Wait generation count.
volatile cl_int mReleaseCount; // Number of releases to execute before resetting the event.
} _CONDITION_VARIABLE;
typedef _CONDITION_VARIABLE *_PCONDITION_VARIABLE;
static void _InitializeConditionVariable( _PCONDITION_VARIABLE cond_var )
{
cond_var->mEvent = CreateEvent( NULL, TRUE, FALSE, NULL );
InitializeCriticalSection( cond_var->mLock );
cond_var->mWaiters = 0;
cond_var->mGeneration = 0;
#if !defined ( NDEBUG )
cond_var->mReleaseCount = 0;
#endif // !NDEBUG
}
static void _SleepConditionVariableCS( _PCONDITION_VARIABLE cond_var, PCRITICAL_SECTION cond_lock, DWORD ignored)
{
EnterCriticalSection( cond_var->mLock );
cl_int generation = cond_var->mGeneration;
++cond_var->mWaiters;
LeaveCriticalSection( cond_var->mLock );
LeaveCriticalSection( cond_lock );
while ( TRUE )
{
WaitForSingleObject( cond_var->mEvent, INFINITE );
EnterCriticalSection( cond_var->mLock );
BOOL done = cond_var->mReleaseCount > 0 && cond_var->mGeneration != generation;
LeaveCriticalSection( cond_var->mLock );
if ( done )
{
break;
}
}
EnterCriticalSection( cond_lock );
EnterCriticalSection( cond_var->mLock );
if ( --cond_var->mReleaseCount == 0 )
{
ResetEvent( cond_var->mEvent );
}
--cond_var->mWaiters;
LeaveCriticalSection( cond_var->mLock );
}
static void _WakeAllConditionVariable( _PCONDITION_VARIABLE cond_var )
{
EnterCriticalSection( cond_var->mLock );
if (cond_var->mWaiters > 0 )
{
++cond_var->mGeneration;
cond_var->mReleaseCount = cond_var->mWaiters;
SetEvent( cond_var->mEvent );
}
LeaveCriticalSection( cond_var->mLock );
}
#endif // !HAS_CONDITION_VARIABLE
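// Usage sketch (illustrative): the emulation above is meant to be driven with
// the usual condition-variable discipline, exactly as the worker loop below
// does it -- hold cond_lock, re-test the predicate in a loop, and let
// _SleepConditionVariableCS release and reacquire cond_lock around the wait.
// The generation and release counts keep stale waiters from consuming wakeups
// aimed at a later generation. work_available is a hypothetical predicate.
#if 0
EnterCriticalSection( cond_lock );
while( !work_available )
_SleepConditionVariableCS( cond_var, cond_lock, INFINITE );
LeaveCriticalSection( cond_lock );
#endif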
#endif // _WIN32
#define MAX_COUNT (1<<29)
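// Job IDs live in [0, MAX_COUNT). ThreadPool_Exit sets gRunCount to CL_INT_MAX,
// which is >= MAX_COUNT, so a worker whose claimed value is at or above
// MAX_COUNT treats it as a shutdown signal rather than a job index.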
// Global state to coordinate whether the threads have been launched successfully or not
#if defined( _MSC_VER ) && (_WIN32_WINNT >= 0x600)
static _INIT_ONCE threadpool_init_control;
#elif defined (_WIN32) // MinGW or XP
static int threadpool_init_control;
#else // Posix platforms
pthread_once_t threadpool_init_control = PTHREAD_ONCE_INIT;
#endif
cl_int threadPoolInitErr = -1; // set to CL_SUCCESS on successful thread launch
// critical region lock around ThreadPool_Do. We can only run one ThreadPool_Do at a time,
// because we are too lazy to set up a queue here, and don't expect to need one.
#if defined( _WIN32 )
CRITICAL_SECTION gThreadPoolLock[1];
#else // !_WIN32
pthread_mutex_t gThreadPoolLock;
#endif // !_WIN32
// Condition variable to park ThreadPool threads when not working
#if defined( _WIN32 )
CRITICAL_SECTION cond_lock[1];
_CONDITION_VARIABLE cond_var[1];
#else // !_WIN32
pthread_mutex_t cond_lock;
pthread_cond_t cond_var;
#endif // !_WIN32
volatile cl_int gRunCount = 0; // Condition variable state. How many iterations on the function left to run.
// set to CL_INT_MAX to cause worker threads to exit. Note: this value might go negative.
// State that only changes when the threadpool is not working.
volatile TPFuncPtr gFunc_ptr = NULL;
volatile void *gUserInfo = NULL;
volatile cl_int gJobCount = 0;
// State that may change while the thread pool is working
volatile cl_int jobError = CL_SUCCESS; // err code return for the job as a whole
// Condition variable to park caller while waiting
#if defined( _WIN32 )
HANDLE caller_event;
#else // !_WIN32
pthread_mutex_t caller_cond_lock;
pthread_cond_t caller_cond_var;
#endif // !_WIN32
volatile cl_int gRunning = 0; // # of threads intended to be running. Running threads will decrement this as they discover they've run out of work to do.
// The total number of threads launched.
volatile cl_int gThreadCount = 0;
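// Protocol summary: ThreadPool_Do sets gRunCount to the job count and
// broadcasts on cond_var. Each worker claims jobs by atomically decrementing
// gRunCount until the claimed value goes non-positive, then decrements gRunning
// and parks on cond_var. The last worker to park (gRunning reaching 0) wakes
// the caller, which waits until gRunning is zero before returning.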
#ifdef _WIN32
void ThreadPool_WorkerFunc( void *p )
#else
void *ThreadPool_WorkerFunc( void *p )
#endif
{
cl_uint threadID = ThreadPool_AtomicAdd( (volatile cl_int *) p, 1 );
cl_int item = ThreadPool_AtomicAdd( &gRunCount, -1 );
// log_info( "ThreadPool_WorkerFunc start: gRunning = %d\n", gRunning );
while( MAX_COUNT > item )
{
cl_int err;
// check for more work to do
if( 0 >= item )
{
// log_info( "Thread %d has run out of work.\n", threadID );
// No work to do. Attempt to block waiting for work
#if defined( _WIN32 )
EnterCriticalSection( cond_lock );
#else // !_WIN32
if((err = pthread_mutex_lock( &cond_lock) ))
{
log_error("Error %d from pthread_mutex_lock. Worker %d unable to block waiting for work. ThreadPool_WorkerFunc failed.\n", err, threadID );
goto exit;
}
#endif // !_WIN32
cl_int remaining = ThreadPool_AtomicAdd( &gRunning, -1 );
// log_info( "ThreadPool_WorkerFunc: gRunning = %d\n", remaining - 1 );
if( 1 == remaining )
{ // last thread out signal the main thread to wake up
#if defined( _WIN32 )
SetEvent( caller_event );
#else // !_WIN32
if((err = pthread_mutex_lock( &caller_cond_lock) ))
{
log_error("Error %d from pthread_mutex_lock. Unable to wake caller.\n", err );
goto exit;
}
if( (err = pthread_cond_broadcast( &caller_cond_var )))
{
log_error("Error %d from pthread_cond_broadcast. Unable to wake up main thread. ThreadPool_WorkerFunc failed.\n", err );
goto exit;
}
if((err = pthread_mutex_unlock( &caller_cond_lock) ))
{
log_error("Error %d from pthread_mutex_lock. Unable to wake caller.\n", err );
goto exit;
}
#endif // !_WIN32
}
// loop in case we are woken only to discover that some other thread already did all the work
while( 0 >= item )
{
#if defined( _WIN32 )
_SleepConditionVariableCS( cond_var, cond_lock, INFINITE );
#else // !_WIN32
if((err = pthread_cond_wait( &cond_var, &cond_lock) ))
{
log_error("Error %d from pthread_cond_wait. Unable to block for waiting for work. ThreadPool_WorkerFunc failed.\n", err );
pthread_mutex_unlock( &cond_lock);
goto exit;
}
#endif // !_WIN32
// try again to get a valid item id
item = ThreadPool_AtomicAdd( &gRunCount, -1 );
if( MAX_COUNT <= item ) // exit if we are done
{
#if defined( _WIN32 )
LeaveCriticalSection( cond_lock );
#else // !_WIN32
pthread_mutex_unlock( &cond_lock);
#endif // !_WIN32
goto exit;
}
}
ThreadPool_AtomicAdd( &gRunning, 1 );
// log_info( "Thread %d has found work.\n", threadID);
#if defined( _WIN32 )
LeaveCriticalSection( cond_lock );
#else // !_WIN32
if((err = pthread_mutex_unlock( &cond_lock) ))
{
log_error("Error %d from pthread_mutex_unlock. Unable to block for waiting for work. ThreadPool_WorkerFunc failed.\n", err );
goto exit;
}
#endif // !_WIN32
}
// we have a valid item, so do the work
if( CL_SUCCESS == jobError ) // but only if we haven't already encountered an error
{
// log_info( "Thread %d doing job %d\n", threadID, item - 1);
#if defined(__APPLE__) && defined(__arm__)
// On most platforms that support denormals, the default is FTZ off. However,
// on some hardware where the reference is computed (e.g. ARM), the default may
// be to flush denormals to zero. This creates issues in result verification:
// since the spec allows an implementation either to flush denormals or not, it
// may choose not to flush, i.e. return a denormal result, whereas the reference
// result may be zero (a flushed denormal). Hence we disable denormal flushing on
// the host side where the reference is computed, so we get a non-flushed
// reference result. If the implementation returns a flushed result, the
// verification code handles that correctly.
FPU_mode_type oldMode;
DisableFTZ( &oldMode );
#endif
// Call the user's function with this item ID
err = gFunc_ptr( item - 1, threadID, (void*) gUserInfo );
#if defined(__APPLE__) && defined(__arm__)
// Restore FP state
RestoreFPState( &oldMode );
#endif
if( err )
{
#if defined(__MINGW32__)
EnterCriticalSection(&gAtomicLock);
// set the new error if we are the first one there
if( jobError == CL_SUCCESS )
jobError = err;
gRunCount = 0;
LeaveCriticalSection(&gAtomicLock);
#elif defined( __GNUC__ )
// GCC extension: http://gcc.gnu.org/onlinedocs/gcc/Atomic-Builtins.html#Atomic-Builtins
// set the new error if we are the first one there.
__sync_val_compare_and_swap( &jobError, CL_SUCCESS, err );
// drop run count to 0
gRunCount = 0;
__sync_synchronize();
#elif defined( _MSC_VER )
// set the new error if we are the first one there.
_InterlockedCompareExchange( (volatile LONG*) &jobError, err, CL_SUCCESS );
// drop run count to 0
gRunCount = 0;
_mm_mfence();
#else
if( pthread_mutex_lock(&gAtomicLock) )
log_error( "Atomic operation failed. pthread_mutex_lock(&gAtomicLock) returned an error\n");
// set the new error if we are the first one there
if( jobError == CL_SUCCESS )
jobError = err;
gRunCount = 0;
if( pthread_mutex_unlock(&gAtomicLock) )
log_error( "Failed to release gAtomicLock. Further atomic operations may deadlock\n");
#endif
}
}
// get the next item
item = ThreadPool_AtomicAdd( &gRunCount, -1 );
}
exit:
log_info( "ThreadPool: thread %d exiting.\n", threadID );
ThreadPool_AtomicAdd( &gThreadCount, -1 );
#if !defined(_WIN32)
return NULL;
#endif
}
// SetThreadCount() may be used to artificially set the number of worker threads.
// If the value is 0 (the default) the number of threads is determined from the
// number of CPU cores; on a unicore machine 2 is used, so that we still get
// some testing of thread safety.
//
// If count < 2 or the CL_TEST_SINGLE_THREADED environment variable is set, the
// code runs single threaded, but reports an error to indicate that the test is
// invalid. This option is intended for debugging purposes only. It is suggested
// as a convention that test apps set the thread count to 1 in response to the -m flag.
//
// SetThreadCount() must be called before the first call to GetThreadCount() or
// ThreadPool_Do(); otherwise the behavior is undefined.
void SetThreadCount( int count )
{
if( threadPoolInitErr == CL_SUCCESS )
{
log_error( "Error: It is illegal to set the thread count after the first call to ThreadPool_Do or GetThreadCount\n" );
abort();
}
gThreadCount = count;
}
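// Usage sketch (illustrative; the -m flag follows the convention suggested
// above and is parsed by the test app, not by this file):
#if 0
if( user_passed_m_flag )    // hypothetical flag from the test's argv parsing
SetThreadCount( 1 );        // must precede GetThreadCount()/ThreadPool_Do()
#endif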
void ThreadPool_Init(void)
{
cl_int i;
int err;
volatile cl_uint threadID = 0;
// Check for manual override of multithreading code. We add this for better debuggability.
if( getenv( "CL_TEST_SINGLE_THREADED" ) )
{
log_error("ERROR: CL_TEST_SINGLE_THREADED is set in the environment. Running single threaded.\n*** TEST IS INVALID! ***\n");
gThreadCount = 1;
return;
}
// Figure out how many threads to run -- check for non-zero first, so that a count set via SetThreadCount() is honored
if( 0 == gThreadCount )
{
#if defined(_MSC_VER) || defined (__MINGW64__)
PSYSTEM_LOGICAL_PROCESSOR_INFORMATION buffer = NULL;
DWORD length = 0;
GetLogicalProcessorInformation( NULL, &length );
buffer = (PSYSTEM_LOGICAL_PROCESSOR_INFORMATION) malloc( length );
if( buffer != NULL )
{
if ( GetLogicalProcessorInformation( buffer, &length ) == TRUE )
{
PSYSTEM_LOGICAL_PROCESSOR_INFORMATION ptr = buffer;
while( ptr < &buffer[ length / sizeof( SYSTEM_LOGICAL_PROCESSOR_INFORMATION ) ] )
{
if( ptr->Relationship == RelationProcessorCore )
{
// Count the number of bits in ProcessorMask (number of logical cores)
ULONG mask = ptr->ProcessorMask;
while( mask )
{
++gThreadCount;
mask &= mask - 1; // Remove 1 bit at a time
}
}
++ptr;
}
}
free(buffer);
}
#elif defined (__MINGW32__)
{
#warning How about this, instead of hard coding it to 2?
SYSTEM_INFO sysinfo;
GetSystemInfo( &sysinfo );
gThreadCount = sysinfo.dwNumberOfProcessors;
}
#elif defined (__linux__) && !defined(__ANDROID__)
cpu_set_t affinity;
if ( 0 == sched_getaffinity(0, sizeof(cpu_set_t), &affinity) )
{
#if !(defined(CPU_COUNT))
gThreadCount = 1;
#else
gThreadCount = CPU_COUNT(&affinity);
#endif
}
else
{
gThreadCount = (cl_int) sysconf(_SC_NPROCESSORS_CONF); // Hopefully your system returns logical CPUs here, as Mac OS X does
}
#else // !_WIN32
gThreadCount = (cl_int) sysconf(_SC_NPROCESSORS_CONF); // Hopefully your system returns logical CPUs here, as Mac OS X does
#endif // !_WIN32
// Multithreaded tests are required to run multithreaded even on unicore systems so as to test thread safety
if( 1 == gThreadCount )
gThreadCount = 2;
}
// When running 32-bit, limit the thread count to 12.
// This fix was made due to memory issues in the integer_ops test:
// integer_ops opens as many threads as the machine has, and each thread
// allocates a fixed amount of memory. When the test runs on a dual-socket
// machine as a 32-bit process, the process address space is not sufficient
// and the test fails.
#if defined(_WIN32) && !defined(_M_X64)
if (gThreadCount > 12) {
gThreadCount = 12;
}
#endif
// Allow the app to set the thread count below 2 for debugging purposes. This will cause the test to run single threaded.
if( gThreadCount < 2 )
{
log_error( "ERROR: Running single threaded because thread count < 2. \n*** TEST IS INVALID! ***\n");
gThreadCount = 1;
return;
}
#if defined( _WIN32 )
InitializeCriticalSection( gThreadPoolLock );
InitializeCriticalSection( cond_lock );
_InitializeConditionVariable( cond_var );
caller_event = CreateEvent( NULL, FALSE, FALSE, NULL );
#elif defined (__GNUC__)
// Don't rely on PTHREAD_MUTEX_INITIALIZER to initialize a mutex, since it
// might cause problems with some flavors of GCC.
pthread_cond_init(&cond_var, NULL);
pthread_mutex_init(&cond_lock ,NULL);
pthread_cond_init(&caller_cond_var, NULL);
pthread_mutex_init(&caller_cond_lock, NULL);
pthread_mutex_init(&gThreadPoolLock, NULL);
#endif
#if !(defined(__GNUC__) || defined(_MSC_VER) || defined(__MINGW32__))
pthread_mutex_init(&gAtomicLock, NULL);
#elif defined (__MINGW32__)
InitializeCriticalSection(&gAtomicLock);
#endif
// Make sure the last thread done in the work pool doesn't signal us to wake before we get to the point where we are supposed to wait
// That would cause a deadlock.
#if !defined( _WIN32 )
if((err = pthread_mutex_lock( &caller_cond_lock) ))
{
log_error("Error %d from pthread_mutex_lock. Unable to block for work to finish. ThreadPool_Init failed.\n", err );
gThreadCount = 1;
return;
}
#endif // !_WIN32
gRunning = gThreadCount;
// init threads
for( i = 0; i < gThreadCount; i++ )
{
#if defined( _WIN32 )
uintptr_t handle = _beginthread(ThreadPool_WorkerFunc, 0, (void*) &threadID);
err = ( handle == (uintptr_t) -1L ); // _beginthread returns -1L on error, not 0
#else // !_WIN32
pthread_t tid = 0;
err = pthread_create( &tid, NULL, ThreadPool_WorkerFunc, (void*) &threadID );
#endif // !_WIN32
if( err )
{
log_error( "Error %d launching thread %d\n", err, i );
threadPoolInitErr = err;
gThreadCount = i;
break;
}
}
atexit( ThreadPool_Exit );
// block until they are done launching.
do
{
#if defined( _WIN32 )
WaitForSingleObject( caller_event, INFINITE );
#else // !_WIN32
if((err = pthread_cond_wait( &caller_cond_var, &caller_cond_lock) ))
{
log_error("Error %d from pthread_cond_wait. Unable to block for work to finish. ThreadPool_Init failed.\n", err );
pthread_mutex_unlock( &caller_cond_lock);
return;
}
#endif // !_WIN32
}
while( gRunCount != -gThreadCount );
#if !defined( _WIN32 )
if((err = pthread_mutex_unlock( &caller_cond_lock) ))
{
log_error("Error %d from pthread_mutex_unlock. Unable to block for work to finish. ThreadPool_Init failed.\n", err );
return;
}
#endif // !_WIN32
threadPoolInitErr = CL_SUCCESS;
}
#if defined(_MSC_VER)
static BOOL CALLBACK _ThreadPool_Init(_PINIT_ONCE InitOnce, PVOID Parameter, PVOID *lpContex)
{
ThreadPool_Init();
return TRUE;
}
#endif
void ThreadPool_Exit(void)
{
int err, count;
gRunCount = CL_INT_MAX;
#if defined( __GNUC__ )
// GCC extension: http://gcc.gnu.org/onlinedocs/gcc/Atomic-Builtins.html#Atomic-Builtins
__sync_synchronize();
#elif defined( _MSC_VER )
_mm_mfence();
#else
#warning If this is a weakly ordered memory system, please add a memory barrier here to force this and everything else to memory before we proceed
#endif
// spin waiting for threads to die
for (count = 0; 0 != gThreadCount && count < 1000; count++)
{
#if defined( _WIN32 )
_WakeAllConditionVariable( cond_var );
Sleep(1);
#else // !_WIN32
if( (err = pthread_cond_broadcast( &cond_var )))
{
log_error("Error %d from pthread_cond_broadcast. Unable to wake up work threads. ThreadPool_Exit failed.\n", err );
break;
}
usleep(1000);
#endif // !_WIN32
}
if( gThreadCount )
log_error( "Error: Thread pool timed out after 1 second with %d threads still active.\n", gThreadCount );
else
log_info( "Thread pool exited in a orderly fashion.\n" );
}
// Blocking API that farms out count jobs to a thread pool.
// It may return with some work undone if func_ptr() returns a non-zero
// result.
//
// This function obviously has its shortcomings. Only one call to ThreadPool_Do
// can be running at a time. It is not intended for general purpose use.
// If clEnqueueNativeKernelFn, out-of-order queues and a CL_DEVICE_TYPE_CPU were
// all available, then it would make more sense to use those features.
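// Usage sketch (illustrative; the job signature mirrors how gFunc_ptr is
// invoked below: (job ID, thread ID, userInfo) returning cl_int). Each job ID
// in [0, count) is handed to the callback exactly once from some worker
// thread; a non-zero return cancels the remaining jobs and is reported back
// as ThreadPool_Do's result. example_job and example_caller are hypothetical.
#if 0
static cl_int example_job( cl_uint jobID, cl_uint threadID, void *userInfo )
{
int *results = (int*) userInfo;
results[ jobID ] = (int) jobID * 2;   // this job's slice of the work
return CL_SUCCESS;                    // non-zero aborts the remaining jobs
}
static cl_int example_caller( void )
{
int results[ 64 ];
return ThreadPool_Do( example_job, 64, results ); // blocks until all 64 jobs ran
}
#endif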
cl_int ThreadPool_Do( TPFuncPtr func_ptr,
cl_uint count,
void *userInfo )
{
cl_int newErr;
cl_int err = 0;
// Lazily set up our threads
#if defined(_MSC_VER) && (_WIN32_WINNT >= 0x600)
err = !_InitOnceExecuteOnce( &threadpool_init_control, _ThreadPool_Init, NULL, NULL );
#elif defined (_WIN32)
if (threadpool_init_control == 0) {
#warning This is buggy and race prone. Find a better way.
ThreadPool_Init();
threadpool_init_control = 1;
}
#else //posix platform
err = pthread_once( &threadpool_init_control, ThreadPool_Init );
if( err )
{
log_error("Error %d from pthread_once. Unable to init threads. ThreadPool_Do failed.\n", err );
return err;
}
#endif
// Single-threaded code to handle the case where the thread pool wasn't allocated or was disabled by the environment variable
if( threadPoolInitErr )
{
cl_uint currentJob = 0;
cl_int result = CL_SUCCESS;
#if defined(__APPLE__) && defined(__arm__)
// On most platforms that support denormals, the default is FTZ off. However,
// on some hardware where the reference is computed (e.g. ARM), the default may
// be to flush denormals to zero. This creates issues in result verification:
// since the spec allows an implementation either to flush denormals or not, it
// may choose not to flush, i.e. return a denormal result, whereas the reference
// result may be zero (a flushed denormal). Hence we disable denormal flushing on
// the host side where the reference is computed, so we get a non-flushed
// reference result. If the implementation returns a flushed result, the
// verification code handles that correctly.
FPU_mode_type oldMode;
DisableFTZ( &oldMode );
#endif
for( currentJob = 0; currentJob < count; currentJob++ )
if((result = func_ptr( currentJob, 0, userInfo )))
{
#if defined(__APPLE__) && defined(__arm__)
// Restore FP state before leaving
RestoreFPState( &oldMode );
#endif
return result;
}
#if defined(__APPLE__) && defined(__arm__)
// Restore FP state before leaving
RestoreFPState( &oldMode );
#endif
return CL_SUCCESS;
}
if( count >= MAX_COUNT )
{
log_error("Error: ThreadPool_Do count %d >= max threadpool count of %d\n", count, MAX_COUNT );
return -1;
}
// Enter critical region
#if defined( _WIN32 )
EnterCriticalSection( gThreadPoolLock );
#else // !_WIN32
if( (err = pthread_mutex_lock( &gThreadPoolLock )))
{
switch (err)
{
case EDEADLK:
log_error("Error EDEADLK returned in ThreadPool_Do(). ThreadPool_Do is not designed to work recursively!\n" );
break;
case EINVAL:
log_error("Error EINVAL returned in ThreadPool_Do(). How did we end up with an invalid gThreadPoolLock?\n" );
break;
default:
break;
}
return err;
}
#endif // !_WIN32
// Start modifying the job state observable by worker threads
#if defined( _WIN32 )
EnterCriticalSection( cond_lock );
#else // !_WIN32
if((err = pthread_mutex_lock( &cond_lock) ))
{
log_error("Error %d from pthread_mutex_lock. Unable to wake up work threads. ThreadPool_Do failed.\n", err );
goto exit;
}
#endif // !_WIN32
// Make sure the last thread done in the work pool doesn't signal us to wake before we get to the point where we are supposed to wait
// That would cause a deadlock.
#if !defined( _WIN32 )
if((err = pthread_mutex_lock( &caller_cond_lock) ))
{
log_error("Error %d from pthread_mutex_lock. Unable to block for work to finish. ThreadPool_Do failed.\n", err );
goto exit;
}
#endif // !_WIN32
// Prime the worker threads to get going
jobError = CL_SUCCESS;
gRunCount = gJobCount = count;
gFunc_ptr = func_ptr;
gUserInfo = userInfo;
#if defined( _WIN32 )
ResetEvent(caller_event);
_WakeAllConditionVariable( cond_var );
LeaveCriticalSection( cond_lock );
#else // !_WIN32
if( (err = pthread_cond_broadcast( &cond_var )))
{
log_error("Error %d from pthread_cond_broadcast. Unable to wake up work threads. ThreadPool_Do failed.\n", err );
goto exit;
}
if((err = pthread_mutex_unlock( &cond_lock) ))
{
log_error("Error %d from pthread_mutex_unlock. Unable to wake up work threads. ThreadPool_Do failed.\n", err );
goto exit;
}
#endif // !_WIN32
// block until they are done. It would be slightly more efficient to do some of the work here though.
do
{
#if defined( _WIN32 )
WaitForSingleObject( caller_event, INFINITE );
#else // !_WIN32
if((err = pthread_cond_wait( &caller_cond_var, &caller_cond_lock) ))
{
log_error("Error %d from pthread_cond_wait. Unable to block for work to finish. ThreadPool_Do failed.\n", err );
pthread_mutex_unlock( &caller_cond_lock);
goto exit;
}
#endif // !_WIN32
}
while( gRunning );
#if !defined(_WIN32)
if((err = pthread_mutex_unlock( &caller_cond_lock) ))
{
log_error("Error %d from pthread_mutex_unlock. Unable to block for work to finish. ThreadPool_Do failed.\n", err );
goto exit;
}
#endif // !_WIN32
err = jobError;
exit:
// exit critical region
#if defined( _WIN32 )
LeaveCriticalSection( gThreadPoolLock );
#else // !_WIN32
newErr = pthread_mutex_unlock( &gThreadPoolLock );
if( newErr)
{
log_error("Error %d from pthread_mutex_unlock. Unable to exit critical region. ThreadPool_Do failed.\n", newErr );
return err;
}
#endif // !_WIN32
return err;
}
cl_uint GetThreadCount( void )
{
// Lazily set up our threads
#if defined(_MSC_VER) && (_WIN32_WINNT >= 0x600)
cl_int err = !_InitOnceExecuteOnce( &threadpool_init_control, _ThreadPool_Init, NULL, NULL );
#elif defined (_WIN32)
if (threadpool_init_control == 0) {
#warning This is buggy and race prone. Find a better way.
ThreadPool_Init();
threadpool_init_control = 1;
}
#else
cl_int err = pthread_once( &threadpool_init_control, ThreadPool_Init );
if( err )
{
log_error("Error %d from pthread_once. Unable to init threads. ThreadPool_Do failed.\n", err );
return err;
}
#endif // !_WIN32
if( gThreadCount < 1 )
return 1;
return gThreadCount;
}
#else
#ifndef MY_OS_REALLY_REALLY_DOESNT_SUPPORT_THREADS
#error ThreadPool implementation has not been multithreaded for this operating system. You must multithread this section.
#endif
//
// We require multithreading in parts of the test as a means of simultaneously
// exercising the reentrancy requirements of the OpenCL API.
//
// A sample single threaded implementation follows, for documentation / bootstrapping purposes.
// It is not okay to use this for conformance testing!!!
//
// Exception: If your operating system does not support multithreaded execution of any kind, then you may use this code.
//
cl_int ThreadPool_AtomicAdd( volatile cl_int *a, cl_int b )
{
cl_int r = *a;
// since this fallback code path is not multithreaded, we just do a regular add here
// If your operating system supports memory-barrier-atomics, use those here
*a = r + b;
return r;
}
// Blocking API that farms out count jobs to a thread pool.
// It may return with some work undone if func_ptr() returns a non-zero
// result.
cl_int ThreadPool_Do( TPFuncPtr func_ptr,
cl_uint count,
void *userInfo )
{
cl_uint currentJob = 0;
cl_int result = CL_SUCCESS;
#ifndef MY_OS_REALLY_REALLY_DOESNT_SUPPORT_THREADS
// THIS FUNCTION IS NOT INTENDED FOR USE!!
log_error( "ERROR: Test must be multithreaded!\n" );
exit(-1);
#else
static int spewCount = 0;
if( 0 == spewCount )
{
log_info( "\nWARNING: The operating system is claimed not to support threads of any sort. Running single threaded.\n" );
spewCount = 1;
}
#endif
// The multithreaded code should mimic this behavior:
for( currentJob = 0; currentJob < count; currentJob++ )
if((result = func_ptr( currentJob, 0, userInfo )))
return result;
return CL_SUCCESS;
}
cl_uint GetThreadCount( void )
{
return 1;
}
void SetThreadCount( int count )
{
if( count > 1 )
log_info( "WARNING: SetThreadCount(%d) ignored\n", count );
}
#endif