| #pragma once |
| #include <ATen/ATen.h> |
| #include <ATen/core/ivalue.h> |
| |
| namespace at { |
namespace internal {
// Heuristic lower bound on the amount of work that justifies going parallel.
// Summing an array is the motivating case: for arrays shorter than 32768
// elements, parallelising is deemed inefficient. In addition, parallel
// algorithms (parallel_reduce, for instance) should never split work into
// chunks smaller than GRAIN_SIZE.
constexpr int64_t GRAIN_SIZE{32768};
} // namespace internal
| |
// Integer division of x by y, rounded up (ceiling) for non-negative x and
// positive y; e.g. divup(10000, 2500) == 4 and divup(10001, 2500) == 5.
//
// x: dividend
// y: divisor; must be non-zero (division by zero is not guarded against)
//
// For x >= 0 and y > 0 the result is built from x / y and x % y instead of
// the textbook (x + y - 1) / y, because that intermediate sum overflows
// int64_t (undefined behaviour) once x gets within y - 1 of the maximum
// value. Every other sign combination keeps the original formula, so results
// are unchanged wherever the original expression was well defined.
inline int64_t divup(int64_t x, int64_t y) {
  if (x >= 0 && y > 0) {
    return x / y + (x % y != 0 ? 1 : 0);
  }
  return (x + y - 1) / y;
}
| |
| // Called during new thread initialization; takes no arguments and returns nothing |
| CAFFE2_API void init_num_threads(); |
| |
| // Sets the number of threads to be used in a parallel region (presumably the int argument - TODO confirm) |
| CAFFE2_API void set_num_threads(int); |
| |
| // Returns the number of threads used in a parallel region |
| CAFFE2_API int get_num_threads(); |
| |
| // Returns the number of the current thread (numbering starts from 0) within |
| // the current parallel region; in the sequential region the result is 0 |
| CAFFE2_API int get_thread_num(); |
| |
| // Checks whether the calling code is running inside a parallel region |
| CAFFE2_API bool in_parallel_region(); |
| |
| /* |
| parallel_for |
| |
| Applies a user function in parallel to chunks of an index range. |
| Declaration only; the definition is presumably supplied by the backend header |
| selected through the AT_PARALLEL_* macros at the end of this file (TODO |
| confirm). |
| |
| begin: index at which to start applying the user function |
| |
| end: index at which to stop applying the user function |
| (NOTE(review): presumably exclusive, i.e. the range is [begin, end) - confirm |
| against the backend implementation) |
| |
| grain_size: number of elements per chunk; impacts the degree of parallelization |
| |
| f: user function applied in parallel to the chunks, with signature |
| void f(int64_t begin, int64_t end) |
| */ |
| template <class F> |
| inline void parallel_for( |
| const int64_t begin, |
| const int64_t end, |
| const int64_t grain_size, |
| const F& f); |
| |
| /* |
| parallel_reduce |
| |
| begin: index at which to start applying the reduction |
| |
| end: index at which to stop applying the reduction |
| |
| grain_size: number of elements per chunk. Impacts the number of elements in |
| the intermediate results tensor and the degree of parallelization. |
| |
| ident: identity for the binary combination function sf. sf(ident, x) needs to |
| return x. |
| |
| f: function for reduction over a chunk. f needs to be of signature |
| scalar_t f(int64_t partial_begin, int64_t partial_end, scalar_t identity) |
| |
| sf: function to combine two partial results. sf needs to be of signature |
| scalar_t sf(scalar_t x, scalar_t y) |
| |
| returns: the single scalar_t value obtained by combining the per-chunk partial |
| results with sf. |
| |
| For example, you might have a tensor of 10000 entries and want to sum together |
| all the elements. parallel_reduce with a grain_size of 2500 will then allocate |
| an intermediate result tensor with 4 elements. Then it will execute the function |
| "f" you provide and pass the beginning and end index of these chunks, so |
| 0-2499, 2500-4999, etc. and the combination identity. It will then write out |
| the result from each of these chunks into the intermediate result tensor. After |
| that it'll reduce the partial results from each chunk into a single number using |
| the combination function sf and the identity ident. For a total summation this |
| would be "+" and 0 respectively. This is similar to tbb's approach [1], where |
| you need to provide a function to accumulate a subrange, a function to combine |
| two partial results and an identity. |
| |
| [1] https://software.intel.com/en-us/node/506154 |
| */ |
| template <class scalar_t, class F, class SF> |
| inline scalar_t parallel_reduce( |
| const int64_t begin, |
| const int64_t end, |
| const int64_t grain_size, |
| const scalar_t ident, |
| const F& f, |
| const SF& sf); |
| |
| // Returns a std::string with a detailed description of the parallelization settings |
| CAFFE2_API std::string get_parallel_info(); |
| |
| // Sets the number of threads used for inter-op parallelism (presumably the int argument - TODO confirm) |
| CAFFE2_API void set_num_interop_threads(int); |
| |
| // Returns the number of threads used for inter-op parallelism |
| CAFFE2_API int get_num_interop_threads(); |
| |
| // Launches func as an inter-op parallel task |
| CAFFE2_API void launch(std::function<void()> func); |
| |
| // Launches func as an intra-op parallel task |
| CAFFE2_API void intraop_launch(std::function<void()> func); |
| |
| // Launches func as an intra-op parallel task; returns a shared_ptr to a c10::ivalue::Future |
| CAFFE2_API std::shared_ptr<c10::ivalue::Future> intraop_launch_future( |
| std::function<void()> func); |
| |
| // Returns the number of intra-op threads used by default |
| CAFFE2_API int intraop_default_num_threads(); |
| |
| } // namespace at |
| |
| #if AT_PARALLEL_OPENMP |
| #include <ATen/ParallelOpenMP.h> |
| #elif AT_PARALLEL_NATIVE |
| #include <ATen/ParallelNative.h> |
| #elif AT_PARALLEL_NATIVE_TBB |
| #include <ATen/ParallelNativeTBB.h> |
| #endif |