/* blob: 59c4abf50ec8342fc975ee9ac85718888b7e6170 [file] [log] [blame] */
#ifndef PTHREADPOOL_H_
#define PTHREADPOOL_H_
#include <stddef.h>
#include <stdint.h>
/* Opaque handle to a thread pool; the structure itself is not defined in this header. */
typedef struct pthreadpool* pthreadpool_t;
/*
 * Task callback signatures accepted by the pthreadpool_parallelize_* functions.
 *
 * Every callback receives the caller-supplied context pointer as its first
 * argument. The size_t arguments that follow are the grid indices (one per
 * grid dimension) and then, for the *_tile_1d / *_tile_2d variants, the number
 * of items in the current tile along each tiled dimension (at most the
 * requested tile size; the last tile along a dimension may be smaller).
 */
/* 1D grid: (context, i) and (context, i, tile size). */
typedef void (*pthreadpool_task_1d_t)(void*, size_t);
typedef void (*pthreadpool_task_1d_tile_1d_t)(void*, size_t, size_t);
/* 2D grid: (context, i, j), optionally followed by the tile size(s). */
typedef void (*pthreadpool_task_2d_t)(void*, size_t, size_t);
typedef void (*pthreadpool_task_2d_tile_1d_t)(void*, size_t, size_t, size_t);
typedef void (*pthreadpool_task_2d_tile_2d_t)(void*, size_t, size_t, size_t, size_t);
/* 3D grid: (context, i, j, k), optionally followed by the tile size(s). */
typedef void (*pthreadpool_task_3d_t)(void*, size_t, size_t, size_t);
typedef void (*pthreadpool_task_3d_tile_1d_t)(void*, size_t, size_t, size_t, size_t);
typedef void (*pthreadpool_task_3d_tile_2d_t)(void*, size_t, size_t, size_t, size_t, size_t);
/* 4D grid: (context, i, j, k, l), optionally followed by the tile size(s). */
typedef void (*pthreadpool_task_4d_t)(void*, size_t, size_t, size_t, size_t);
typedef void (*pthreadpool_task_4d_tile_1d_t)(void*, size_t, size_t, size_t, size_t, size_t);
typedef void (*pthreadpool_task_4d_tile_2d_t)(void*, size_t, size_t, size_t, size_t, size_t, size_t);
/*
 * 5D grid: (context, i, j, k, l, m), optionally followed by the tile size(s).
 * NOTE(review): no function taking the 5d_tile_2d type is declared in the part
 * of the header reviewed here; it presumably follows the same pattern - confirm.
 */
typedef void (*pthreadpool_task_5d_t)(void*, size_t, size_t, size_t, size_t, size_t);
typedef void (*pthreadpool_task_5d_tile_1d_t)(void*, size_t, size_t, size_t, size_t, size_t, size_t);
typedef void (*pthreadpool_task_5d_tile_2d_t)(void*, size_t, size_t, size_t, size_t, size_t, size_t, size_t);
/*
 * 6D grid.
 * NOTE(review): no function taking the 6D task types is declared in the part
 * of the header reviewed here; the argument layout presumably mirrors the
 * lower-dimensional variants (context, six indices, then tile sizes) - confirm.
 */
typedef void (*pthreadpool_task_6d_t)(void*, size_t, size_t, size_t, size_t, size_t, size_t);
typedef void (*pthreadpool_task_6d_tile_1d_t)(void*, size_t, size_t, size_t, size_t, size_t, size_t, size_t);
typedef void (*pthreadpool_task_6d_tile_2d_t)(void*, size_t, size_t, size_t, size_t, size_t, size_t, size_t, size_t);
/*
 * Microarchitecture-aware variants used by the *_with_uarch functions: the
 * uint32_t second argument is the microarchitecture index, followed by the
 * same grid indices and tile sizes as the corresponding plain task type.
 */
typedef void (*pthreadpool_task_1d_with_id_t)(void*, uint32_t, size_t);
typedef void (*pthreadpool_task_2d_tile_2d_with_id_t)(void*, uint32_t, size_t, size_t, size_t, size_t);
typedef void (*pthreadpool_task_3d_tile_2d_with_id_t)(void*, uint32_t, size_t, size_t, size_t, size_t, size_t);
typedef void (*pthreadpool_task_4d_tile_2d_with_id_t)(void*, uint32_t, size_t, size_t, size_t, size_t, size_t, size_t);
/**
* Disable support for denormalized numbers to the maximum extent possible for
* the duration of the computation.
*
* Handling denormalized floating-point numbers is often implemented in
* microcode, and incurs significant performance degradation. This hint
* instructs the thread pool to disable support for denormalized numbers before
* running the computation by manipulating architecture-specific control
* registers, and restore the initial value of control registers after the
* computation is complete. The thread pool temporarily disables denormalized
* numbers on all threads involved in the computation (i.e. the caller threads,
* and potentially worker threads).
*
* Disabling denormalized numbers may have a small negative effect on results'
* accuracy. As various architectures differ in capabilities to control
* processing of denormalized numbers, using this flag may also hurt results'
* reproducibility across different instruction set architectures.
*/
#define PTHREADPOOL_FLAG_DISABLE_DENORMALS 0x00000001
/**
* Yield worker threads to the system scheduler after the operation is finished.
*
* Force workers to use kernel wait (instead of active spin-wait by default) for
* new commands after this command is processed. This flag affects only the
* immediate next operation on this thread pool. To make the thread pool always
* use kernel wait, pass this flag to all parallelization functions.
*/
#define PTHREADPOOL_FLAG_YIELD_WORKERS 0x00000002
#ifdef __cplusplus
extern "C" {
#endif
/**
* Create a thread pool with the specified number of threads.
*
* @param threads_count the number of threads in the thread pool.
* A value of 0 has special interpretation: it creates a thread pool with as
* many threads as there are logical processors in the system.
*
* @returns A pointer to an opaque thread pool object if the call is
* successful, or NULL pointer if the call failed.
*/
pthreadpool_t pthreadpool_create(size_t threads_count);
/**
* Query the number of threads in a thread pool.
*
* @param threadpool the thread pool to query.
*
* @returns The number of threads in the thread pool.
*/
size_t pthreadpool_get_threads_count(pthreadpool_t threadpool);
/**
* Process items on a 1D grid.
*
* The function implements a parallel version of the following snippet:
*
* for (size_t i = 0; i < range; i++)
* function(context, i);
*
* When the function returns, all items have been processed and the thread pool
* is ready for a new task.
*
* @note If multiple threads call this function with the same thread pool, the
* calls are serialized.
*
* @param threadpool the thread pool to use for parallelisation. If threadpool
* is NULL, all items are processed serially on the calling thread.
* @param function the function to call for each item.
* @param context the first argument passed to the specified function.
* @param range the number of items on the 1D grid to process. The
* specified function will be called once for each item.
* @param flags a bitwise combination of zero or more optional flags
* (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS)
*/
void pthreadpool_parallelize_1d(
pthreadpool_t threadpool,
pthreadpool_task_1d_t function,
void* context,
size_t range,
uint32_t flags);
/**
* Process items on a 1D grid using a microarchitecture-aware task function.
*
* The function implements a parallel version of the following snippet:
*
* uint32_t uarch_index = cpuinfo_initialize() ?
* cpuinfo_get_current_uarch_index() : default_uarch_index;
* if (uarch_index > max_uarch_index) uarch_index = default_uarch_index;
* for (size_t i = 0; i < range; i++)
* function(context, uarch_index, i);
*
* When the function returns, all items have been processed and the thread pool
* is ready for a new task.
*
* @note If multiple threads call this function with the same thread pool, the
* calls are serialized.
*
* @param threadpool the thread pool to use for parallelisation. If
* threadpool is NULL, all items are processed serially on the calling
* thread.
* @param function the function to call for each item.
* @param context the first argument passed to the specified
* function.
* @param default_uarch_index the microarchitecture index to use when
* pthreadpool is configured without cpuinfo, cpuinfo initialization failed,
* or index returned by cpuinfo_get_current_uarch_index() exceeds the
* max_uarch_index value.
* @param max_uarch_index the maximum microarchitecture index expected by
* the specified function. If the index returned by
* cpuinfo_get_current_uarch_index() exceeds this value, default_uarch_index
* will be used instead. default_uarch_index can exceed max_uarch_index.
* @param range the number of items on the 1D grid to process.
* The specified function will be called once for each item.
* @param flags a bitwise combination of zero or more optional
* flags (PTHREADPOOL_FLAG_DISABLE_DENORMALS or
* PTHREADPOOL_FLAG_YIELD_WORKERS)
*/
void pthreadpool_parallelize_1d_with_uarch(
pthreadpool_t threadpool,
pthreadpool_task_1d_with_id_t function,
void* context,
uint32_t default_uarch_index,
uint32_t max_uarch_index,
size_t range,
uint32_t flags);
/**
* Process items on a 1D grid with specified maximum tile size.
*
* The function implements a parallel version of the following snippet:
*
* for (size_t i = 0; i < range; i += tile)
* function(context, i, min(range - i, tile));
*
* When the call returns, all items have been processed and the thread pool is
* ready for a new task.
*
* @note If multiple threads call this function with the same thread pool,
* the calls are serialized.
*
* @param threadpool the thread pool to use for parallelisation. If threadpool
* is NULL, all items are processed serially on the calling thread.
* @param function the function to call for each tile.
* @param context the first argument passed to the specified function.
* @param range the number of items on the 1D grid to process.
* @param tile the maximum number of items on the 1D grid to process in
* one function call.
* @param flags a bitwise combination of zero or more optional flags
* (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS)
*/
void pthreadpool_parallelize_1d_tile_1d(
pthreadpool_t threadpool,
pthreadpool_task_1d_tile_1d_t function,
void* context,
size_t range,
size_t tile,
uint32_t flags);
/**
* Process items on a 2D grid.
*
* The function implements a parallel version of the following snippet:
*
* for (size_t i = 0; i < range_i; i++)
* for (size_t j = 0; j < range_j; j++)
* function(context, i, j);
*
* When the function returns, all items have been processed and the thread pool
* is ready for a new task.
*
* @note If multiple threads call this function with the same thread pool, the
* calls are serialized.
*
* @param threadpool the thread pool to use for parallelisation. If threadpool
* is NULL, all items are processed serially on the calling thread.
* @param function the function to call for each item.
* @param context the first argument passed to the specified function.
* @param range_i the number of items to process along the first dimension
* of the 2D grid.
* @param range_j the number of items to process along the second dimension
* of the 2D grid.
* @param flags a bitwise combination of zero or more optional flags
* (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS)
*/
void pthreadpool_parallelize_2d(
pthreadpool_t threadpool,
pthreadpool_task_2d_t function,
void* context,
size_t range_i,
size_t range_j,
uint32_t flags);
/**
* Process items on a 2D grid with the specified maximum tile size along the
* last grid dimension.
*
* The function implements a parallel version of the following snippet:
*
* for (size_t i = 0; i < range_i; i++)
* for (size_t j = 0; j < range_j; j += tile_j)
* function(context, i, j, min(range_j - j, tile_j));
*
* When the function returns, all items have been processed and the thread pool
* is ready for a new task.
*
* @note If multiple threads call this function with the same thread pool, the
* calls are serialized.
*
* @param threadpool the thread pool to use for parallelisation. If threadpool
* is NULL, all items are processed serially on the calling thread.
* @param function the function to call for each tile.
* @param context the first argument passed to the specified function.
* @param range_i the number of items to process along the first dimension
* of the 2D grid.
* @param range_j the number of items to process along the second dimension
* of the 2D grid.
* @param tile_j the maximum number of items along the second dimension of
* the 2D grid to process in one function call.
* @param flags a bitwise combination of zero or more optional flags
* (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS)
*/
void pthreadpool_parallelize_2d_tile_1d(
pthreadpool_t threadpool,
pthreadpool_task_2d_tile_1d_t function,
void* context,
size_t range_i,
size_t range_j,
size_t tile_j,
uint32_t flags);
/**
* Process items on a 2D grid with the specified maximum tile size along each
* grid dimension.
*
* The function implements a parallel version of the following snippet:
*
* for (size_t i = 0; i < range_i; i += tile_i)
* for (size_t j = 0; j < range_j; j += tile_j)
* function(context, i, j,
* min(range_i - i, tile_i), min(range_j - j, tile_j));
*
* When the function returns, all items have been processed and the thread pool
* is ready for a new task.
*
* @note If multiple threads call this function with the same thread pool, the
* calls are serialized.
*
* @param threadpool the thread pool to use for parallelisation. If threadpool
* is NULL, all items are processed serially on the calling thread.
* @param function the function to call for each tile.
* @param context the first argument passed to the specified function.
* @param range_i the number of items to process along the first dimension
* of the 2D grid.
* @param range_j the number of items to process along the second dimension
* of the 2D grid.
* @param tile_i the maximum number of items along the first dimension of
* the 2D grid to process in one function call.
* @param tile_j the maximum number of items along the second dimension of
* the 2D grid to process in one function call.
* @param flags a bitwise combination of zero or more optional flags
* (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS)
*/
void pthreadpool_parallelize_2d_tile_2d(
pthreadpool_t threadpool,
pthreadpool_task_2d_tile_2d_t function,
void* context,
size_t range_i,
size_t range_j,
size_t tile_i,
size_t tile_j,
uint32_t flags);
/**
* Process items on a 2D grid with the specified maximum tile size along each
* grid dimension using a microarchitecture-aware task function.
*
* The function implements a parallel version of the following snippet:
*
* uint32_t uarch_index = cpuinfo_initialize() ?
* cpuinfo_get_current_uarch_index() : default_uarch_index;
* if (uarch_index > max_uarch_index) uarch_index = default_uarch_index;
* for (size_t i = 0; i < range_i; i += tile_i)
* for (size_t j = 0; j < range_j; j += tile_j)
* function(context, uarch_index, i, j,
* min(range_i - i, tile_i), min(range_j - j, tile_j));
*
* When the function returns, all items have been processed and the thread pool
* is ready for a new task.
*
* @note If multiple threads call this function with the same thread pool, the
* calls are serialized.
*
* @param threadpool the thread pool to use for parallelisation. If
* threadpool is NULL, all items are processed serially on the calling
* thread.
* @param function the function to call for each tile.
* @param context the first argument passed to the specified
* function.
* @param default_uarch_index the microarchitecture index to use when
* pthreadpool is configured without cpuinfo,
* cpuinfo initialization failed, or index returned
* by cpuinfo_get_current_uarch_index() exceeds
* the max_uarch_index value.
* @param max_uarch_index the maximum microarchitecture index expected
* by the specified function. If the index returned
* by cpuinfo_get_current_uarch_index() exceeds this
* value, default_uarch_index will be used instead.
* default_uarch_index can exceed max_uarch_index.
* @param range_i the number of items to process along the first
* dimension of the 2D grid.
* @param range_j the number of items to process along the second
* dimension of the 2D grid.
* @param tile_i the maximum number of items along the first
* dimension of the 2D grid to process in one function call.
* @param tile_j the maximum number of items along the second
* dimension of the 2D grid to process in one function call.
* @param flags a bitwise combination of zero or more optional
* flags (PTHREADPOOL_FLAG_DISABLE_DENORMALS or
* PTHREADPOOL_FLAG_YIELD_WORKERS)
*/
void pthreadpool_parallelize_2d_tile_2d_with_uarch(
pthreadpool_t threadpool,
pthreadpool_task_2d_tile_2d_with_id_t function,
void* context,
uint32_t default_uarch_index,
uint32_t max_uarch_index,
size_t range_i,
size_t range_j,
size_t tile_i,
size_t tile_j,
uint32_t flags);
/**
* Process items on a 3D grid.
*
* The function implements a parallel version of the following snippet:
*
* for (size_t i = 0; i < range_i; i++)
* for (size_t j = 0; j < range_j; j++)
* for (size_t k = 0; k < range_k; k++)
* function(context, i, j, k);
*
* When the function returns, all items have been processed and the thread pool
* is ready for a new task.
*
* @note If multiple threads call this function with the same thread pool, the
* calls are serialized.
*
* @param threadpool the thread pool to use for parallelisation. If threadpool
* is NULL, all items are processed serially on the calling thread.
* @param function the function to call for each item.
* @param context the first argument passed to the specified function.
* @param range_i the number of items to process along the first dimension
* of the 3D grid.
* @param range_j the number of items to process along the second dimension
* of the 3D grid.
* @param range_k the number of items to process along the third dimension
* of the 3D grid.
* @param flags a bitwise combination of zero or more optional flags
* (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS)
*/
void pthreadpool_parallelize_3d(
pthreadpool_t threadpool,
pthreadpool_task_3d_t function,
void* context,
size_t range_i,
size_t range_j,
size_t range_k,
uint32_t flags);
/**
* Process items on a 3D grid with the specified maximum tile size along the
* last grid dimension.
*
* The function implements a parallel version of the following snippet:
*
* for (size_t i = 0; i < range_i; i++)
* for (size_t j = 0; j < range_j; j++)
* for (size_t k = 0; k < range_k; k += tile_k)
* function(context, i, j, k, min(range_k - k, tile_k));
*
* When the function returns, all items have been processed and the thread pool
* is ready for a new task.
*
* @note If multiple threads call this function with the same thread pool, the
* calls are serialized.
*
* @param threadpool the thread pool to use for parallelisation. If threadpool
* is NULL, all items are processed serially on the calling thread.
* @param function the function to call for each tile.
* @param context the first argument passed to the specified function.
* @param range_i the number of items to process along the first dimension
* of the 3D grid.
* @param range_j the number of items to process along the second dimension
* of the 3D grid.
* @param range_k the number of items to process along the third dimension
* of the 3D grid.
* @param tile_k the maximum number of items along the third dimension of
* the 3D grid to process in one function call.
* @param flags a bitwise combination of zero or more optional flags
* (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS)
*/
void pthreadpool_parallelize_3d_tile_1d(
pthreadpool_t threadpool,
pthreadpool_task_3d_tile_1d_t function,
void* context,
size_t range_i,
size_t range_j,
size_t range_k,
size_t tile_k,
uint32_t flags);
/**
* Process items on a 3D grid with the specified maximum tile size along the
* last two grid dimensions.
*
* The function implements a parallel version of the following snippet:
*
* for (size_t i = 0; i < range_i; i++)
* for (size_t j = 0; j < range_j; j += tile_j)
* for (size_t k = 0; k < range_k; k += tile_k)
* function(context, i, j, k,
* min(range_j - j, tile_j), min(range_k - k, tile_k));
*
* When the function returns, all items have been processed and the thread pool
* is ready for a new task.
*
* @note If multiple threads call this function with the same thread pool, the
* calls are serialized.
*
* @param threadpool the thread pool to use for parallelisation. If threadpool
* is NULL, all items are processed serially on the calling thread.
* @param function the function to call for each tile.
* @param context the first argument passed to the specified function.
* @param range_i the number of items to process along the first dimension
* of the 3D grid.
* @param range_j the number of items to process along the second dimension
* of the 3D grid.
* @param range_k the number of items to process along the third dimension
* of the 3D grid.
* @param tile_j the maximum number of items along the second dimension of
* the 3D grid to process in one function call.
* @param tile_k the maximum number of items along the third dimension of
* the 3D grid to process in one function call.
* @param flags a bitwise combination of zero or more optional flags
* (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS)
*/
void pthreadpool_parallelize_3d_tile_2d(
pthreadpool_t threadpool,
pthreadpool_task_3d_tile_2d_t function,
void* context,
size_t range_i,
size_t range_j,
size_t range_k,
size_t tile_j,
size_t tile_k,
uint32_t flags);
/**
* Process items on a 3D grid with the specified maximum tile size along the
* last two grid dimensions using a microarchitecture-aware task function.
*
* The function implements a parallel version of the following snippet:
*
* uint32_t uarch_index = cpuinfo_initialize() ?
* cpuinfo_get_current_uarch_index() : default_uarch_index;
* if (uarch_index > max_uarch_index) uarch_index = default_uarch_index;
* for (size_t i = 0; i < range_i; i++)
* for (size_t j = 0; j < range_j; j += tile_j)
* for (size_t k = 0; k < range_k; k += tile_k)
* function(context, uarch_index, i, j, k,
* min(range_j - j, tile_j), min(range_k - k, tile_k));
*
* When the function returns, all items have been processed and the thread pool
* is ready for a new task.
*
* @note If multiple threads call this function with the same thread pool, the
* calls are serialized.
*
* @param threadpool the thread pool to use for parallelisation. If
* threadpool is NULL, all items are processed serially on the calling
* thread.
* @param function the function to call for each tile.
* @param context the first argument passed to the specified
* function.
* @param default_uarch_index the microarchitecture index to use when
* pthreadpool is configured without cpuinfo, cpuinfo initialization failed,
* or index returned by cpuinfo_get_current_uarch_index() exceeds the
* max_uarch_index value.
* @param max_uarch_index the maximum microarchitecture index expected by
* the specified function. If the index returned by
* cpuinfo_get_current_uarch_index() exceeds this value, default_uarch_index
* will be used instead. default_uarch_index can exceed max_uarch_index.
* @param range_i the number of items to process along the first
* dimension of the 3D grid.
* @param range_j the number of items to process along the second
* dimension of the 3D grid.
* @param range_k the number of items to process along the third
* dimension of the 3D grid.
* @param tile_j the maximum number of items along the second
* dimension of the 3D grid to process in one function call.
* @param tile_k the maximum number of items along the third
* dimension of the 3D grid to process in one function call.
* @param flags a bitwise combination of zero or more optional
* flags (PTHREADPOOL_FLAG_DISABLE_DENORMALS or
* PTHREADPOOL_FLAG_YIELD_WORKERS)
*/
void pthreadpool_parallelize_3d_tile_2d_with_uarch(
pthreadpool_t threadpool,
pthreadpool_task_3d_tile_2d_with_id_t function,
void* context,
uint32_t default_uarch_index,
uint32_t max_uarch_index,
size_t range_i,
size_t range_j,
size_t range_k,
size_t tile_j,
size_t tile_k,
uint32_t flags);
/**
* Process items on a 4D grid.
*
* The function implements a parallel version of the following snippet:
*
* for (size_t i = 0; i < range_i; i++)
* for (size_t j = 0; j < range_j; j++)
* for (size_t k = 0; k < range_k; k++)
* for (size_t l = 0; l < range_l; l++)
* function(context, i, j, k, l);
*
* When the function returns, all items have been processed and the thread pool
* is ready for a new task.
*
* @note If multiple threads call this function with the same thread pool, the
* calls are serialized.
*
* @param threadpool the thread pool to use for parallelisation. If threadpool
* is NULL, all items are processed serially on the calling thread.
* @param function the function to call for each item.
* @param context the first argument passed to the specified function.
* @param range_i the number of items to process along the first dimension
* of the 4D grid.
* @param range_j the number of items to process along the second dimension
* of the 4D grid.
* @param range_k the number of items to process along the third dimension
* of the 4D grid.
* @param range_l the number of items to process along the fourth dimension
* of the 4D grid.
* @param flags a bitwise combination of zero or more optional flags
* (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS)
*/
void pthreadpool_parallelize_4d(
pthreadpool_t threadpool,
pthreadpool_task_4d_t function,
void* context,
size_t range_i,
size_t range_j,
size_t range_k,
size_t range_l,
uint32_t flags);
/**
* Process items on a 4D grid with the specified maximum tile size along the
* last grid dimension.
*
* The function implements a parallel version of the following snippet:
*
* for (size_t i = 0; i < range_i; i++)
* for (size_t j = 0; j < range_j; j++)
* for (size_t k = 0; k < range_k; k++)
* for (size_t l = 0; l < range_l; l += tile_l)
* function(context, i, j, k, l, min(range_l - l, tile_l));
*
* When the function returns, all items have been processed and the thread pool
* is ready for a new task.
*
* @note If multiple threads call this function with the same thread pool, the
* calls are serialized.
*
* @param threadpool the thread pool to use for parallelisation. If threadpool
* is NULL, all items are processed serially on the calling thread.
* @param function the function to call for each tile.
* @param context the first argument passed to the specified function.
* @param range_i the number of items to process along the first dimension
* of the 4D grid.
* @param range_j the number of items to process along the second dimension
* of the 4D grid.
* @param range_k the number of items to process along the third dimension
* of the 4D grid.
* @param range_l the number of items to process along the fourth dimension
* of the 4D grid.
* @param tile_l the maximum number of items along the fourth dimension of
* the 4D grid to process in one function call.
* @param flags a bitwise combination of zero or more optional flags
* (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS)
*/
void pthreadpool_parallelize_4d_tile_1d(
pthreadpool_t threadpool,
pthreadpool_task_4d_tile_1d_t function,
void* context,
size_t range_i,
size_t range_j,
size_t range_k,
size_t range_l,
size_t tile_l,
uint32_t flags);
/**
* Process items on a 4D grid with the specified maximum tile size along the
* last two grid dimensions.
*
* The function implements a parallel version of the following snippet:
*
* for (size_t i = 0; i < range_i; i++)
* for (size_t j = 0; j < range_j; j++)
* for (size_t k = 0; k < range_k; k += tile_k)
* for (size_t l = 0; l < range_l; l += tile_l)
* function(context, i, j, k, l,
* min(range_k - k, tile_k), min(range_l - l, tile_l));
*
* When the function returns, all items have been processed and the thread pool
* is ready for a new task.
*
* @note If multiple threads call this function with the same thread pool, the
* calls are serialized.
*
* @param threadpool the thread pool to use for parallelisation. If threadpool
* is NULL, all items are processed serially on the calling thread.
* @param function the function to call for each tile.
* @param context the first argument passed to the specified function.
* @param range_i the number of items to process along the first dimension
* of the 4D grid.
* @param range_j the number of items to process along the second dimension
* of the 4D grid.
* @param range_k the number of items to process along the third dimension
* of the 4D grid.
* @param range_l the number of items to process along the fourth dimension
* of the 4D grid.
* @param tile_k the maximum number of items along the third dimension of
* the 4D grid to process in one function call.
* @param tile_l the maximum number of items along the fourth dimension of
* the 4D grid to process in one function call.
* @param flags a bitwise combination of zero or more optional flags
* (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS)
*/
void pthreadpool_parallelize_4d_tile_2d(
pthreadpool_t threadpool,
pthreadpool_task_4d_tile_2d_t function,
void* context,
size_t range_i,
size_t range_j,
size_t range_k,
size_t range_l,
size_t tile_k,
size_t tile_l,
uint32_t flags);
/**
* Process items on a 4D grid with the specified maximum tile size along the
* last two grid dimensions using a microarchitecture-aware task function.
*
* The function implements a parallel version of the following snippet:
*
* uint32_t uarch_index = cpuinfo_initialize() ?
* cpuinfo_get_current_uarch_index() : default_uarch_index;
* if (uarch_index > max_uarch_index) uarch_index = default_uarch_index;
* for (size_t i = 0; i < range_i; i++)
* for (size_t j = 0; j < range_j; j++)
* for (size_t k = 0; k < range_k; k += tile_k)
* for (size_t l = 0; l < range_l; l += tile_l)
* function(context, uarch_index, i, j, k, l,
* min(range_k - k, tile_k), min(range_l - l, tile_l));
*
* When the function returns, all items have been processed and the thread pool
* is ready for a new task.
*
* @note If multiple threads call this function with the same thread pool, the
* calls are serialized.
*
* @param threadpool the thread pool to use for parallelisation. If
* threadpool is NULL, all items are processed serially on the calling
* thread.
* @param function the function to call for each tile.
* @param context the first argument passed to the specified
* function.
* @param default_uarch_index the microarchitecture index to use when
* pthreadpool is configured without cpuinfo, cpuinfo initialization failed,
* or index returned by cpuinfo_get_current_uarch_index() exceeds the
* max_uarch_index value.
* @param max_uarch_index the maximum microarchitecture index expected by
* the specified function. If the index returned by
* cpuinfo_get_current_uarch_index() exceeds this value, default_uarch_index
* will be used instead. default_uarch_index can exceed max_uarch_index.
* @param range_i the number of items to process along the first
* dimension of the 4D grid.
* @param range_j the number of items to process along the second
* dimension of the 4D grid.
* @param range_k the number of items to process along the third
* dimension of the 4D grid.
* @param range_l the number of items to process along the fourth
* dimension of the 4D grid.
* @param tile_k the maximum number of items along the third
* dimension of the 4D grid to process in one function call.
* @param tile_l the maximum number of items along the fourth
* dimension of the 4D grid to process in one function call.
* @param flags a bitwise combination of zero or more optional
* flags (PTHREADPOOL_FLAG_DISABLE_DENORMALS or
* PTHREADPOOL_FLAG_YIELD_WORKERS)
*/
void pthreadpool_parallelize_4d_tile_2d_with_uarch(
pthreadpool_t threadpool,
pthreadpool_task_4d_tile_2d_with_id_t function,
void* context,
uint32_t default_uarch_index,
uint32_t max_uarch_index,
size_t range_i,
size_t range_j,
size_t range_k,
size_t range_l,
size_t tile_k,
size_t tile_l,
uint32_t flags);
/**
* Process items on a 5D grid.
*
* The function implements a parallel version of the following snippet:
*
* for (size_t i = 0; i < range_i; i++)
* for (size_t j = 0; j < range_j; j++)
* for (size_t k = 0; k < range_k; k++)
* for (size_t l = 0; l < range_l; l++)
* for (size_t m = 0; m < range_m; m++)
* function(context, i, j, k, l, m);
*
* When the function returns, all items have been processed and the thread pool
* is ready for a new task.
*
* @note If multiple threads call this function with the same thread pool, the
* calls are serialized.
*
* @param threadpool the thread pool to use for parallelisation. If threadpool
* is NULL, all items are processed serially on the calling thread.
* @param function the function to call for each item.
* @param context the first argument passed to the specified function.
* @param range_i the number of items to process along the first dimension
* of the 5D grid.
* @param range_j the number of items to process along the second dimension
* of the 5D grid.
* @param range_k the number of items to process along the third dimension
* of the 5D grid.
* @param range_l the number of items to process along the fourth dimension
* of the 5D grid.
* @param range_m the number of items to process along the fifth dimension
* of the 5D grid.
* @param flags a bitwise combination of zero or more optional flags
* (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS)
*/
void pthreadpool_parallelize_5d(
pthreadpool_t threadpool,
pthreadpool_task_5d_t function,
void* context,
size_t range_i,
size_t range_j,
size_t range_k,
size_t range_l,
size_t range_m,
uint32_t flags);
/**
* Process items on a 5D grid with the specified maximum tile size along the
* last grid dimension.
*
* The function implements a parallel version of the following snippet:
*
*   for (size_t i = 0; i < range_i; i++)
*     for (size_t j = 0; j < range_j; j++)
*       for (size_t k = 0; k < range_k; k++)
*         for (size_t l = 0; l < range_l; l++)
*           for (size_t m = 0; m < range_m; m += tile_m)
*             function(context, i, j, k, l, m, min(range_m - m, tile_m));
*
* The last argument passed to the function is the size of the current tile.
* For the final tile along the fifth dimension it is the remainder
* range_m - m, which may be smaller than tile_m.
*
* When the function returns, all items have been processed and the thread pool
* is ready for a new task.
*
* @note If multiple threads call this function with the same thread pool, the
*       calls are serialized.
*
* @param threadpool the thread pool to use for parallelisation. If threadpool
*                   is NULL, all items are processed serially on the calling
*                   thread.
* @param function   the function to call for each tile.
* @param context    the first argument passed to the specified function.
* @param range_i    the number of items to process along the first dimension
*                   of the 5D grid.
* @param range_j    the number of items to process along the second dimension
*                   of the 5D grid.
* @param range_k    the number of items to process along the third dimension
*                   of the 5D grid.
* @param range_l    the number of items to process along the fourth dimension
*                   of the 5D grid.
* @param range_m    the number of items to process along the fifth dimension
*                   of the 5D grid.
* @param tile_m     the maximum number of items along the fifth dimension of
*                   the 5D grid to process in one function call.
* @param flags      a bitwise combination of zero or more optional flags
*                   (PTHREADPOOL_FLAG_DISABLE_DENORMALS or
*                   PTHREADPOOL_FLAG_YIELD_WORKERS)
*/
void pthreadpool_parallelize_5d_tile_1d(
pthreadpool_t threadpool,
pthreadpool_task_5d_tile_1d_t function,
void* context,
size_t range_i,
size_t range_j,
size_t range_k,
size_t range_l,
size_t range_m,
size_t tile_m,
uint32_t flags);
/**
* Process items on a 5D grid with the specified maximum tile size along the
* last two grid dimensions.
*
* The function implements a parallel version of the following snippet:
*
*   for (size_t i = 0; i < range_i; i++)
*     for (size_t j = 0; j < range_j; j++)
*       for (size_t k = 0; k < range_k; k++)
*         for (size_t l = 0; l < range_l; l += tile_l)
*           for (size_t m = 0; m < range_m; m += tile_m)
*             function(context, i, j, k, l, m,
*               min(range_l - l, tile_l), min(range_m - m, tile_m));
*
* The last two arguments passed to the function are the sizes of the current
* tile along the fourth and fifth dimensions. For the final tile along either
* dimension the size is the remainder (range_l - l or range_m - m), which may
* be smaller than tile_l or tile_m respectively.
*
* When the function returns, all items have been processed and the thread pool
* is ready for a new task.
*
* @note If multiple threads call this function with the same thread pool, the
*       calls are serialized.
*
* @param threadpool the thread pool to use for parallelisation. If threadpool
*                   is NULL, all items are processed serially on the calling
*                   thread.
* @param function   the function to call for each tile.
* @param context    the first argument passed to the specified function.
* @param range_i    the number of items to process along the first dimension
*                   of the 5D grid.
* @param range_j    the number of items to process along the second dimension
*                   of the 5D grid.
* @param range_k    the number of items to process along the third dimension
*                   of the 5D grid.
* @param range_l    the number of items to process along the fourth dimension
*                   of the 5D grid.
* @param range_m    the number of items to process along the fifth dimension
*                   of the 5D grid.
* @param tile_l     the maximum number of items along the fourth dimension of
*                   the 5D grid to process in one function call.
* @param tile_m     the maximum number of items along the fifth dimension of
*                   the 5D grid to process in one function call.
* @param flags      a bitwise combination of zero or more optional flags
*                   (PTHREADPOOL_FLAG_DISABLE_DENORMALS or
*                   PTHREADPOOL_FLAG_YIELD_WORKERS)
*/
void pthreadpool_parallelize_5d_tile_2d(
pthreadpool_t threadpool,
pthreadpool_task_5d_tile_2d_t function,
void* context,
size_t range_i,
size_t range_j,
size_t range_k,
size_t range_l,
size_t range_m,
size_t tile_l,
size_t tile_m,
uint32_t flags);
/**
* Process items on a 6D grid.
*
* The function implements a parallel version of the following snippet:
*
*   for (size_t i = 0; i < range_i; i++)
*     for (size_t j = 0; j < range_j; j++)
*       for (size_t k = 0; k < range_k; k++)
*         for (size_t l = 0; l < range_l; l++)
*           for (size_t m = 0; m < range_m; m++)
*             for (size_t n = 0; n < range_n; n++)
*               function(context, i, j, k, l, m, n);
*
* When the function returns, all items have been processed and the thread pool
* is ready for a new task.
*
* @note If multiple threads call this function with the same thread pool, the
*       calls are serialized.
*
* @param threadpool the thread pool to use for parallelisation. If threadpool
*                   is NULL, all items are processed serially on the calling
*                   thread.
* @param function   the function to call for each item.
* @param context    the first argument passed to the specified function.
* @param range_i    the number of items to process along the first dimension
*                   of the 6D grid.
* @param range_j    the number of items to process along the second dimension
*                   of the 6D grid.
* @param range_k    the number of items to process along the third dimension
*                   of the 6D grid.
* @param range_l    the number of items to process along the fourth dimension
*                   of the 6D grid.
* @param range_m    the number of items to process along the fifth dimension
*                   of the 6D grid.
* @param range_n    the number of items to process along the sixth dimension
*                   of the 6D grid.
* @param flags      a bitwise combination of zero or more optional flags
*                   (PTHREADPOOL_FLAG_DISABLE_DENORMALS or
*                   PTHREADPOOL_FLAG_YIELD_WORKERS)
*/
void pthreadpool_parallelize_6d(
pthreadpool_t threadpool,
pthreadpool_task_6d_t function,
void* context,
size_t range_i,
size_t range_j,
size_t range_k,
size_t range_l,
size_t range_m,
size_t range_n,
uint32_t flags);
/**
* Process items on a 6D grid with the specified maximum tile size along the
* last grid dimension.
*
* The function implements a parallel version of the following snippet:
*
*   for (size_t i = 0; i < range_i; i++)
*     for (size_t j = 0; j < range_j; j++)
*       for (size_t k = 0; k < range_k; k++)
*         for (size_t l = 0; l < range_l; l++)
*           for (size_t m = 0; m < range_m; m++)
*             for (size_t n = 0; n < range_n; n += tile_n)
*               function(context, i, j, k, l, m, n, min(range_n - n, tile_n));
*
* The last argument passed to the function is the size of the current tile.
* For the final tile along the sixth dimension it is the remainder
* range_n - n, which may be smaller than tile_n.
*
* When the function returns, all items have been processed and the thread pool
* is ready for a new task.
*
* @note If multiple threads call this function with the same thread pool, the
*       calls are serialized.
*
* @param threadpool the thread pool to use for parallelisation. If threadpool
*                   is NULL, all items are processed serially on the calling
*                   thread.
* @param function   the function to call for each tile.
* @param context    the first argument passed to the specified function.
* @param range_i    the number of items to process along the first dimension
*                   of the 6D grid.
* @param range_j    the number of items to process along the second dimension
*                   of the 6D grid.
* @param range_k    the number of items to process along the third dimension
*                   of the 6D grid.
* @param range_l    the number of items to process along the fourth dimension
*                   of the 6D grid.
* @param range_m    the number of items to process along the fifth dimension
*                   of the 6D grid.
* @param range_n    the number of items to process along the sixth dimension
*                   of the 6D grid.
* @param tile_n     the maximum number of items along the sixth dimension of
*                   the 6D grid to process in one function call.
* @param flags      a bitwise combination of zero or more optional flags
*                   (PTHREADPOOL_FLAG_DISABLE_DENORMALS or
*                   PTHREADPOOL_FLAG_YIELD_WORKERS)
*/
void pthreadpool_parallelize_6d_tile_1d(
pthreadpool_t threadpool,
pthreadpool_task_6d_tile_1d_t function,
void* context,
size_t range_i,
size_t range_j,
size_t range_k,
size_t range_l,
size_t range_m,
size_t range_n,
size_t tile_n,
uint32_t flags);
/**
* Process items on a 6D grid with the specified maximum tile size along the
* last two grid dimensions.
*
* The function implements a parallel version of the following snippet:
*
*   for (size_t i = 0; i < range_i; i++)
*     for (size_t j = 0; j < range_j; j++)
*       for (size_t k = 0; k < range_k; k++)
*         for (size_t l = 0; l < range_l; l++)
*           for (size_t m = 0; m < range_m; m += tile_m)
*             for (size_t n = 0; n < range_n; n += tile_n)
*               function(context, i, j, k, l, m, n,
*                 min(range_m - m, tile_m), min(range_n - n, tile_n));
*
* The last two arguments passed to the function are the sizes of the current
* tile along the fifth and sixth dimensions. For the final tile along either
* dimension the size is the remainder (range_m - m or range_n - n), which may
* be smaller than tile_m or tile_n respectively.
*
* When the function returns, all items have been processed and the thread pool
* is ready for a new task.
*
* @note If multiple threads call this function with the same thread pool, the
*       calls are serialized.
*
* @param threadpool the thread pool to use for parallelisation. If threadpool
*                   is NULL, all items are processed serially on the calling
*                   thread.
* @param function   the function to call for each tile.
* @param context    the first argument passed to the specified function.
* @param range_i    the number of items to process along the first dimension
*                   of the 6D grid.
* @param range_j    the number of items to process along the second dimension
*                   of the 6D grid.
* @param range_k    the number of items to process along the third dimension
*                   of the 6D grid.
* @param range_l    the number of items to process along the fourth dimension
*                   of the 6D grid.
* @param range_m    the number of items to process along the fifth dimension
*                   of the 6D grid.
* @param range_n    the number of items to process along the sixth dimension
*                   of the 6D grid.
* @param tile_m     the maximum number of items along the fifth dimension of
*                   the 6D grid to process in one function call.
* @param tile_n     the maximum number of items along the sixth dimension of
*                   the 6D grid to process in one function call.
* @param flags      a bitwise combination of zero or more optional flags
*                   (PTHREADPOOL_FLAG_DISABLE_DENORMALS or
*                   PTHREADPOOL_FLAG_YIELD_WORKERS)
*/
void pthreadpool_parallelize_6d_tile_2d(
pthreadpool_t threadpool,
pthreadpool_task_6d_tile_2d_t function,
void* context,
size_t range_i,
size_t range_j,
size_t range_k,
size_t range_l,
size_t range_m,
size_t range_n,
size_t tile_m,
size_t tile_n,
uint32_t flags);
/**
* Terminates the threads in the thread pool and releases the resources
* associated with it.
*
* @warning Accessing the thread pool after a call to this function constitutes
*          undefined behaviour and may cause data corruption. The handle must
*          not be used again once this function has been called.
*
* @param[in,out] threadpool The thread pool to destroy.
*/
void pthreadpool_destroy(pthreadpool_t threadpool);
#ifndef PTHREADPOOL_NO_DEPRECATED_API
/* Legacy API for compatibility with pre-existing users (e.g. NNPACK) */
/*
* PTHREADPOOL_DEPRECATED is appended to the legacy declarations below.
* When the compiler defines __GNUC__ it expands to the __deprecated__
* attribute; with any other compiler it expands to nothing, so the legacy
* declarations stay usable but carry no deprecation marker.
*/
#if defined(__GNUC__)
#define PTHREADPOOL_DEPRECATED __attribute__((__deprecated__))
#else
#define PTHREADPOOL_DEPRECATED
#endif
/*
* Callback types taken by the deprecated pthreadpool_compute_* functions.
* Each callback returns void and receives a void* followed by size_t
* arguments: 1 for 1d, 2 for 1d_tiled and 2d, 4 for 2d_tiled, 6 for 3d_tiled
* and 8 for 4d_tiled. All of the types are marked PTHREADPOOL_DEPRECATED.
*
* NOTE(review): for the tiled variants the size_t arguments are presumably
* the per-dimension indices followed by the per-dimension tile sizes --
* confirm against the implementation.
*/
typedef void (*pthreadpool_function_1d_t)(void*, size_t) PTHREADPOOL_DEPRECATED;
typedef void (*pthreadpool_function_1d_tiled_t)(void*, size_t, size_t) PTHREADPOOL_DEPRECATED;
typedef void (*pthreadpool_function_2d_t)(void*, size_t, size_t) PTHREADPOOL_DEPRECATED;
typedef void (*pthreadpool_function_2d_tiled_t)(void*, size_t, size_t, size_t, size_t) PTHREADPOOL_DEPRECATED;
typedef void (*pthreadpool_function_3d_tiled_t)(void*, size_t, size_t, size_t, size_t, size_t, size_t) PTHREADPOOL_DEPRECATED;
typedef void (*pthreadpool_function_4d_tiled_t)(void*, size_t, size_t, size_t, size_t, size_t, size_t, size_t, size_t) PTHREADPOOL_DEPRECATED;
/**
* Legacy 1D compute entry point, retained for compatibility with pre-existing
* users of the library.
*
* @deprecated Declared with PTHREADPOOL_DEPRECATED and omitted entirely when
*             PTHREADPOOL_NO_DEPRECATED_API is defined.
*
* NOTE(review): presumably calls function(argument, i) for each i in
* [0, range) -- confirm against the implementation.
*
* @param threadpool the thread pool handle.
* @param function   the legacy callback, of type pthreadpool_function_1d_t.
* @param argument   pointer for the callback (presumably forwarded as its
*                   first argument -- confirm).
* @param range      extent of the 1D range (presumably the number of items
*                   -- confirm).
*/
void pthreadpool_compute_1d(
pthreadpool_t threadpool,
pthreadpool_function_1d_t function,
void* argument,
size_t range) PTHREADPOOL_DEPRECATED;
/**
* Legacy tiled 1D compute entry point, retained for compatibility with
* pre-existing users of the library.
*
* @deprecated Declared with PTHREADPOOL_DEPRECATED and omitted entirely when
*             PTHREADPOOL_NO_DEPRECATED_API is defined.
*
* NOTE(review): presumably splits [0, range) into tiles of at most tile items
* and calls the callback once per tile -- confirm against the implementation.
*
* @param threadpool the thread pool handle.
* @param function   the legacy callback, of type
*                   pthreadpool_function_1d_tiled_t.
* @param argument   pointer for the callback (presumably forwarded as its
*                   first argument -- confirm).
* @param range      extent of the 1D range.
* @param tile       tile size along the range (presumably a maximum
*                   -- confirm).
*/
void pthreadpool_compute_1d_tiled(
pthreadpool_t threadpool,
pthreadpool_function_1d_tiled_t function,
void* argument,
size_t range,
size_t tile) PTHREADPOOL_DEPRECATED;
/**
* Legacy 2D compute entry point, retained for compatibility with pre-existing
* users of the library.
*
* @deprecated Declared with PTHREADPOOL_DEPRECATED and omitted entirely when
*             PTHREADPOOL_NO_DEPRECATED_API is defined.
*
* NOTE(review): presumably calls function(argument, i, j) for each (i, j) in
* the range_i x range_j grid -- confirm against the implementation.
*
* @param threadpool the thread pool handle.
* @param function   the legacy callback, of type pthreadpool_function_2d_t.
* @param argument   pointer for the callback (presumably forwarded as its
*                   first argument -- confirm).
* @param range_i    extent of the first dimension.
* @param range_j    extent of the second dimension.
*/
void pthreadpool_compute_2d(
pthreadpool_t threadpool,
pthreadpool_function_2d_t function,
void* argument,
size_t range_i,
size_t range_j) PTHREADPOOL_DEPRECATED;
/**
* Legacy tiled 2D compute entry point, retained for compatibility with
* pre-existing users of the library.
*
* @deprecated Declared with PTHREADPOOL_DEPRECATED and omitted entirely when
*             PTHREADPOOL_NO_DEPRECATED_API is defined.
*
* NOTE(review): presumably tiles both dimensions and calls the callback once
* per tile -- confirm against the implementation.
*
* @param threadpool the thread pool handle.
* @param function   the legacy callback, of type
*                   pthreadpool_function_2d_tiled_t.
* @param argument   pointer for the callback (presumably forwarded as its
*                   first argument -- confirm).
* @param range_i    extent of the first dimension.
* @param range_j    extent of the second dimension.
* @param tile_i     tile size along the first dimension.
* @param tile_j     tile size along the second dimension.
*/
void pthreadpool_compute_2d_tiled(
pthreadpool_t threadpool,
pthreadpool_function_2d_tiled_t function,
void* argument,
size_t range_i,
size_t range_j,
size_t tile_i,
size_t tile_j) PTHREADPOOL_DEPRECATED;
/**
* Legacy tiled 3D compute entry point, retained for compatibility with
* pre-existing users of the library.
*
* @deprecated Declared with PTHREADPOOL_DEPRECATED and omitted entirely when
*             PTHREADPOOL_NO_DEPRECATED_API is defined.
*
* NOTE(review): presumably tiles all three dimensions and calls the callback
* once per tile -- confirm against the implementation.
*
* @param threadpool the thread pool handle.
* @param function   the legacy callback, of type
*                   pthreadpool_function_3d_tiled_t.
* @param argument   pointer for the callback (presumably forwarded as its
*                   first argument -- confirm).
* @param range_i    extent of the first dimension.
* @param range_j    extent of the second dimension.
* @param range_k    extent of the third dimension.
* @param tile_i     tile size along the first dimension.
* @param tile_j     tile size along the second dimension.
* @param tile_k     tile size along the third dimension.
*/
void pthreadpool_compute_3d_tiled(
pthreadpool_t threadpool,
pthreadpool_function_3d_tiled_t function,
void* argument,
size_t range_i,
size_t range_j,
size_t range_k,
size_t tile_i,
size_t tile_j,
size_t tile_k) PTHREADPOOL_DEPRECATED;
/**
* Legacy tiled 4D compute entry point, retained for compatibility with
* pre-existing users of the library.
*
* @deprecated Declared with PTHREADPOOL_DEPRECATED and omitted entirely when
*             PTHREADPOOL_NO_DEPRECATED_API is defined.
*
* NOTE(review): presumably tiles all four dimensions and calls the callback
* once per tile -- confirm against the implementation.
*
* @param threadpool the thread pool handle.
* @param function   the legacy callback, of type
*                   pthreadpool_function_4d_tiled_t.
* @param argument   pointer for the callback (presumably forwarded as its
*                   first argument -- confirm).
* @param range_i    extent of the first dimension.
* @param range_j    extent of the second dimension.
* @param range_k    extent of the third dimension.
* @param range_l    extent of the fourth dimension.
* @param tile_i     tile size along the first dimension.
* @param tile_j     tile size along the second dimension.
* @param tile_k     tile size along the third dimension.
* @param tile_l     tile size along the fourth dimension.
*/
void pthreadpool_compute_4d_tiled(
pthreadpool_t threadpool,
pthreadpool_function_4d_tiled_t function,
void* argument,
size_t range_i,
size_t range_j,
size_t range_k,
size_t range_l,
size_t tile_i,
size_t tile_j,
size_t tile_k,
size_t tile_l) PTHREADPOOL_DEPRECATED;
#endif /* PTHREADPOOL_NO_DEPRECATED_API */
#ifdef __cplusplus
} /* extern "C" */
#endif
#endif /* PTHREADPOOL_H_ */