aten/src/ATen/Parallel.h - platform/external/pytorch - Git at Google

 #pragma once
 #include <ATen/ATen.h>
 #include <ATen/core/ivalue.h>

 namespace at {
 namespace internal {
 // Heuristic lower bound on the amount of work that justifies parallelism.
 // For example, when summing an array it is considered inefficient to
 // parallelise over arrays with fewer than 32768 elements. In addition, no
 // parallel algorithm (such as parallel_reduce) should split its work into
 // chunks smaller than GRAIN_SIZE.
 constexpr int64_t GRAIN_SIZE = 32 * 1024; // 32768
 } // namespace internal

 // Divides x by y after biasing the numerator by (y - 1). For x >= 0 and
 // y > 0 this yields the ceiling of x / y. y must be non-zero.
 inline int64_t divup(int64_t x, int64_t y) {
   const int64_t biased_numerator = x + y - 1;
   return biased_numerator / y;
 }

 // Called during new thread initialization.
 // NOTE(review): what exactly is initialized is not visible in this header;
 // confirm against the implementation.
 CAFFE2_API void init_num_threads();

 // Sets the number of threads to be used in parallel regions.
 // NOTE(review): the parameter is unnamed; presumably it is the requested
 // thread count. Handling of non-positive values is not visible here.
 CAFFE2_API void set_num_threads(int);

 // Returns the number of threads used in parallel regions.
 CAFFE2_API int get_num_threads();

 // Returns the current thread number (starting from 0)
 // in the current parallel region, or 0 in the sequential region.
 CAFFE2_API int get_thread_num();

 // Checks whether the calling code runs inside a parallel region.
 CAFFE2_API bool in_parallel_region();

 /*
 parallel_for

 Applies a user function to chunks of an index range. Only the declaration
 lives in this header; the definition is presumably supplied by one of the
 backend headers included after namespace at (TODO confirm).

 begin: index at which to start applying user function

 end: index at which to stop applying user function
 (NOTE(review): whether end is inclusive or exclusive is not visible in this
 header; confirm against the backend implementation)

 grain_size: number of elements per chunk. Impacts the degree of
 parallelization.

 f: user function applied in parallel to the chunks, signature:
   void f(int64_t begin, int64_t end)
 */
 template <class F>
 inline void parallel_for(
     const int64_t begin,
     const int64_t end,
     const int64_t grain_size,
     const F& f);

 /*
 parallel_reduce

 begin: index at which to start applying reduction

 end: index at which to stop applying reduction

 grain_size: number of elements per chunk. Impacts the number of elements in
 the intermediate results tensor and the degree of parallelization.

 ident: identity for binary combination function sf. sf(ident, x) needs to return
 x.

 f: function for reduction over a chunk. f needs to be of signature scalar_t
 f(int64_t partial_begin, int64_t partial_end, scalar_t identity)

 sf: function to combine two partial results. sf needs to be of signature
 scalar_t sf(scalar_t x, scalar_t y)

 For example, you might have a tensor of 10000 entries and want to sum together
 all the elements. parallel_reduce with a grain_size of 2500 will then allocate
 an intermediate result tensor with 4 elements. Then it will execute the function
 "f" you provide and pass the beginning and end index of these chunks, so
 0-2499, 2500-4999, etc. and the combination identity. It will then write out
 the result from each of these chunks into the intermediate result tensor. After
 that it'll reduce the partial results from each chunk into a single number using
 the combination function sf and the identity ident. For a total summation this
 would be "+" and 0 respectively. This is similar to tbb's approach [1], where
 you need to provide a function to accumulate a subrange, a function to combine
 two partial results and an identity.

 NOTE(review): whether the partial_end value handed to f is inclusive or
 exclusive is not visible in this header; confirm against the backend
 implementation before relying on the ranges in the example above.

 [1] https://software.intel.com/en-us/node/506154
 */
 template <class scalar_t, class F, class SF>
 inline scalar_t parallel_reduce(
     const int64_t begin,
     const int64_t end,
     const int64_t grain_size,
     const scalar_t ident,
     const F& f,
     const SF& sf);

 // Returns a detailed string describing the parallelization settings.
 CAFFE2_API std::string get_parallel_info();

 // Sets the number of threads used for inter-op parallelism.
 // NOTE(review): the parameter is unnamed; presumably it is the requested
 // thread count. Confirm accepted values against the implementation.
 CAFFE2_API void set_num_interop_threads(int);

 // Returns the number of threads used for inter-op parallelism.
 CAFFE2_API int get_num_interop_threads();

 // Launches an inter-op parallel task; func is the callable to run.
 CAFFE2_API void launch(std::function<void()> func);

 // Launches an intra-op parallel task; func is the callable to run.
 CAFFE2_API void intraop_launch(std::function<void()> func);

 // Launches an intra-op parallel task and returns a future.
 // NOTE(review): presumably the future completes once func has run; this is
 // not visible in this header, so confirm against the implementation.
 CAFFE2_API std::shared_ptr<c10::ivalue::Future> intraop_launch_future(
     std::function<void()> func);

 // Returns the number of intra-op threads used by default.
 CAFFE2_API int intraop_default_num_threads();

 } // namespace at

 #if AT_PARALLEL_OPENMP
 #include <ATen/ParallelOpenMP.h>
 #elif AT_PARALLEL_NATIVE
 #include <ATen/ParallelNative.h>
 #elif AT_PARALLEL_NATIVE_TBB
 #include <ATen/ParallelNativeTBB.h>
 #endif
	#pragma once
	#include <ATen/ATen.h>
	#include <ATen/core/ivalue.h>

	namespace at {
	namespace internal {
	// Heuristic lower bound on the amount of work that justifies parallelism.
	// For example, when summing an array it is considered inefficient to
	// parallelise over arrays with fewer than 32768 elements. In addition, no
	// parallel algorithm (such as parallel_reduce) should split its work into
	// chunks smaller than GRAIN_SIZE.
	constexpr int64_t GRAIN_SIZE = 32 * 1024; // 32768
	} // namespace internal

	// Divides x by y after biasing the numerator by (y - 1). For x >= 0 and
	// y > 0 this yields the ceiling of x / y. y must be non-zero.
	inline int64_t divup(int64_t x, int64_t y) {
	const int64_t biased_numerator = x + y - 1;
	return biased_numerator / y;
	}

	// Called during new thread initialization.
	// NOTE(review): what exactly is initialized is not visible in this header;
	// confirm against the implementation.
	CAFFE2_API void init_num_threads();

	// Sets the number of threads to be used in parallel regions.
	// NOTE(review): the parameter is unnamed; presumably it is the requested
	// thread count. Handling of non-positive values is not visible here.
	CAFFE2_API void set_num_threads(int);

	// Returns the number of threads used in parallel regions.
	CAFFE2_API int get_num_threads();

	// Returns the current thread number (starting from 0)
	// in the current parallel region, or 0 in the sequential region.
	CAFFE2_API int get_thread_num();

	// Checks whether the calling code runs inside a parallel region.
	CAFFE2_API bool in_parallel_region();

	/*
	parallel_for

	Applies a user function to chunks of an index range. Only the declaration
	lives in this header; the definition is presumably supplied by one of the
	backend headers included after namespace at (TODO confirm).

	begin: index at which to start applying user function

	end: index at which to stop applying user function
	(NOTE(review): whether end is inclusive or exclusive is not visible in this
	header; confirm against the backend implementation)

	grain_size: number of elements per chunk. Impacts the degree of
	parallelization.

	f: user function applied in parallel to the chunks, signature:
	void f(int64_t begin, int64_t end)
	*/
	template <class F>
	inline void parallel_for(
	const int64_t begin,
	const int64_t end,
	const int64_t grain_size,
	const F& f);

	/*
	parallel_reduce

	begin: index at which to start applying reduction

	end: index at which to stop applying reduction

	grain_size: number of elements per chunk. Impacts the number of elements in
	the intermediate results tensor and the degree of parallelization.

	ident: identity for binary combination function sf. sf(ident, x) needs to return
	x.

	f: function for reduction over a chunk. f needs to be of signature scalar_t
	f(int64_t partial_begin, int64_t partial_end, scalar_t identity)

	sf: function to combine two partial results. sf needs to be of signature
	scalar_t sf(scalar_t x, scalar_t y)

	For example, you might have a tensor of 10000 entries and want to sum together
	all the elements. parallel_reduce with a grain_size of 2500 will then allocate
	an intermediate result tensor with 4 elements. Then it will execute the function
	"f" you provide and pass the beginning and end index of these chunks, so
	0-2499, 2500-4999, etc. and the combination identity. It will then write out
	the result from each of these chunks into the intermediate result tensor. After
	that it'll reduce the partial results from each chunk into a single number using
	the combination function sf and the identity ident. For a total summation this
	would be "+" and 0 respectively. This is similar to tbb's approach [1], where
	you need to provide a function to accumulate a subrange, a function to combine
	two partial results and an identity.

	NOTE(review): whether the partial_end value handed to f is inclusive or
	exclusive is not visible in this header; confirm against the backend
	implementation before relying on the ranges in the example above.

	[1] https://software.intel.com/en-us/node/506154
	*/
	template <class scalar_t, class F, class SF>
	inline scalar_t parallel_reduce(
	const int64_t begin,
	const int64_t end,
	const int64_t grain_size,
	const scalar_t ident,
	const F& f,
	const SF& sf);

	// Returns a detailed string describing the parallelization settings.
	CAFFE2_API std::string get_parallel_info();

	// Sets the number of threads used for inter-op parallelism.
	// NOTE(review): the parameter is unnamed; presumably it is the requested
	// thread count. Confirm accepted values against the implementation.
	CAFFE2_API void set_num_interop_threads(int);

	// Returns the number of threads used for inter-op parallelism.
	CAFFE2_API int get_num_interop_threads();

	// Launches an inter-op parallel task; func is the callable to run.
	CAFFE2_API void launch(std::function<void()> func);

	// Launches an intra-op parallel task; func is the callable to run.
	CAFFE2_API void intraop_launch(std::function<void()> func);

	// Launches an intra-op parallel task and returns a future.
	// NOTE(review): presumably the future completes once func has run; this is
	// not visible in this header, so confirm against the implementation.
	CAFFE2_API std::shared_ptr<c10::ivalue::Future> intraop_launch_future(
	std::function<void()> func);

	// Returns the number of intra-op threads used by default.
	CAFFE2_API int intraop_default_num_threads();

	} // namespace at

	#if AT_PARALLEL_OPENMP
	#include <ATen/ParallelOpenMP.h>
	#elif AT_PARALLEL_NATIVE
	#include <ATen/ParallelNative.h>
	#elif AT_PARALLEL_NATIVE_TBB
	#include <ATen/ParallelNativeTBB.h>
	#endif