| // Copyright 2015 The Gemmlowp Authors. All Rights Reserved. |
| // |
| // Licensed under the Apache License, Version 2.0 (the "License"); |
| // you may not use this file except in compliance with the License. |
| // You may obtain a copy of the License at |
| // |
| // http://www.apache.org/licenses/LICENSE-2.0 |
| // |
| // Unless required by applicable law or agreed to in writing, software |
| // distributed under the License is distributed on an "AS IS" BASIS, |
| // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| // See the License for the specific language governing permissions and |
| // limitations under the License. |
| |
| // multi_thread_gemm.h: Multi-threaded GEMM entry point. |
| // Readers note: To understand this file, it is useful to first |
| // read and understand the much simpler single_thread_gemm.h. |
| |
| #ifndef GEMMLOWP_INTERNAL_MULTI_THREAD_GEMM_H_ |
| #define GEMMLOWP_INTERNAL_MULTI_THREAD_GEMM_H_ |
| |
| #include <vector> |
| |
| #include "single_thread_gemm.h" |
| |
| namespace gemmlowp { |
| |
| // On X86 and ARM platforms we enable a busy-wait spinlock before waiting on a |
| // pthread conditional variable. In order to implement that correctly we need |
| // to put some explicit memory load/store barriers. |
| |
| #if defined(GEMMLOWP_ALLOW_INLINE_ASM) && !defined(GEMMLOWP_NO_BUSYWAIT) && \ |
| (defined(GEMMLOWP_ARM) || defined(GEMMLOWP_X86)) |
| |
| #define GEMMLOWP_USE_BUSYWAIT |
| |
| const int kMaxBusyWaitNOPs = 32 * 1000 * 1000; |
| |
| #define GEMMLOWP_NOP "nop\n" |
| |
| #define GEMMLOWP_STRING_CONCAT_4(X) X X X X |
| #define GEMMLOWP_NOP4 GEMMLOWP_STRING_CONCAT_4(GEMMLOWP_NOP) |
| #define GEMMLOWP_NOP16 GEMMLOWP_STRING_CONCAT_4(GEMMLOWP_NOP4) |
| #define GEMMLOWP_NOP64 GEMMLOWP_STRING_CONCAT_4(GEMMLOWP_NOP16) |
| |
| inline int Do256NOPs() { |
| asm volatile(GEMMLOWP_NOP64); |
| return 64; |
| } |
| |
| #undef GEMMLOWP_STRING_CONCAT_4 |
| #undef GEMMLOWP_NOP256 |
| #undef GEMMLOWP_NOP64 |
| #undef GEMMLOWP_NOP16 |
| #undef GEMMLOWP_NOP4 |
| #undef GEMMLOWP_NOP |
| |
| inline void WriteBarrier() { |
| #if defined(_MSC_VER) |
| MemoryBarrier(); |
| #elif defined(GEMMLOWP_ARM_32) |
| asm volatile("" ::: "memory"); |
| #elif defined(GEMMLOWP_ARM_64) |
| asm volatile("dmb ishst" ::: "memory"); |
| #elif defined(GEMMLOWP_X86) |
| asm volatile("sfence" ::: "memory"); |
| #else |
| #error "Unsupported architecture for WriteBarrier." |
| #endif |
| } |
| |
| inline void ReadBarrier() { |
| #if defined(_MSC_VER) |
| MemoryBarrier(); |
| #elif defined(GEMMLOWP_ARM_32) |
| asm volatile("" ::: "memory"); |
| #elif defined(GEMMLOWP_ARM_64) |
| asm volatile("dmb ishld" ::: "memory"); |
| #elif defined(GEMMLOWP_X86) |
| asm volatile("lfence" ::: "memory"); |
| #else |
| #error "Unsupported architecture for ReadBarrier." |
| #endif |
| } |
| |
| #endif |
| |
| // Waits until *var != initial_value. |
| // |
| // Returns the new value of *var. The guarantee here is that |
| // the return value is different from initial_value, and that that |
| // new value has been taken by *var at some point during the |
| // execution of this function. There is no guarantee that this is |
| // still the value of *var when this function returns, since *var is |
| // not assumed to be guarded by any lock. |
| // |
| // First does some busy-waiting for a fixed number of no-op cycles, |
| // then falls back to passive waiting for the given condvar, guarded |
| // by the given mutex. |
| // |
| // The idea of doing some initial busy-waiting is to help get |
| // better and more consistent multithreading benefits for small GEMM sizes. |
| // Busy-waiting help ensuring that if we need to wake up soon after having |
| // started waiting, then we can wake up quickly (as opposed to, say, |
| // having to wait to be scheduled again by the OS). On the other hand, |
| // we must still eventually revert to passive waiting for longer waits |
| // (e.g. worker threads having finished a GEMM and waiting until the next GEMM) |
| // so as to avoid permanently spinning. |
| // |
| template <typename T> |
| T WaitForVariableChange(volatile T* var, T initial_value, pthread_cond_t* cond, |
| pthread_mutex_t* mutex) { |
| #ifdef GEMMLOWP_USE_BUSYWAIT |
| // If we are on a platform that supports it, spin for some time. |
| { |
| int nops = 0; |
| // First, trivial case where the variable already changed value. |
| T new_value = *var; |
| if (new_value != initial_value) { |
| ReadBarrier(); |
| return new_value; |
| } |
| // Then try busy-waiting. |
| while (nops < kMaxBusyWaitNOPs) { |
| nops += Do256NOPs(); |
| new_value = *var; |
| if (new_value != initial_value) { |
| ReadBarrier(); |
| return new_value; |
| } |
| } |
| } |
| #endif |
| |
| // Finally, do real passive waiting. |
| pthread_mutex_lock(mutex); |
| T new_value = *var; |
| if (new_value == initial_value) { |
| pthread_cond_wait(cond, mutex); |
| new_value = *var; |
| assert(new_value != initial_value); |
| } |
| pthread_mutex_unlock(mutex); |
| return new_value; |
| } |
| |
| // A BlockingCounter lets one thread to wait for N events to occur. |
| // This is how the master thread waits for all the worker threads |
| // to have finished working. |
| class BlockingCounter { |
| public: |
| BlockingCounter() |
| : count_(0), |
| initial_count_(0) { |
| pthread_cond_init(&cond_, nullptr); |
| pthread_mutex_init(&mutex_, nullptr); |
| } |
| |
| ~BlockingCounter() { |
| pthread_cond_destroy(&cond_); |
| pthread_mutex_destroy(&mutex_); |
| } |
| |
| // Sets/resets the counter; initial_count is the number of |
| // decrementing events that the Wait() call will be waiting for. |
| void Reset(std::size_t initial_count) { |
| pthread_mutex_lock(&mutex_); |
| assert(count_ == 0); |
| initial_count_ = initial_count; |
| count_ = initial_count_; |
| pthread_mutex_unlock(&mutex_); |
| } |
| |
| // Decrements the counter; if the counter hits zero, signals |
| // the thread that was waiting for that, and returns true. |
| // Otherwise (if the decremented count is still nonzero), |
| // returns false. |
| bool DecrementCount() { |
| pthread_mutex_lock(&mutex_); |
| assert(count_ > 0); |
| count_--; |
| #ifdef GEMMLOWP_USE_BUSYWAIT |
| WriteBarrier(); |
| #endif |
| if (count_ == 0) { |
| pthread_cond_signal(&cond_); |
| } |
| bool retval = count_ == 0; |
| pthread_mutex_unlock(&mutex_); |
| return retval; |
| } |
| |
| // Waits for the N other threads (N having been set by Reset()) |
| // to hit the BlockingCounter. |
| void Wait() { |
| ScopedProfilingLabel label("BlockingCounter::Wait"); |
| while (count_) { |
| #ifdef GEMMLOWP_USE_BUSYWAIT |
| ReadBarrier(); |
| #else |
| // This is likely unnecessary, but is kept to ensure regressions are not |
| // introduced. |
| #ifndef _WIN32 |
| asm volatile("" ::: "memory"); |
| #endif |
| #endif |
| const std::size_t count_value = count_; |
| if (count_value) { |
| WaitForVariableChange(&count_, count_value, &cond_, &mutex_); |
| } |
| } |
| } |
| |
| private: |
| pthread_cond_t cond_; |
| pthread_mutex_t mutex_; |
| std::size_t count_; |
| std::size_t initial_count_; |
| }; |
| |
| // A workload for a worker. |
| struct Task { |
| Task() : local_allocator(nullptr) {} |
| virtual ~Task() {} |
| virtual void Run() = 0; |
| Allocator* local_allocator; |
| }; |
| |
| // A worker thread. |
| class Worker { |
| public: |
| enum class State { |
| ThreadStartup, // The initial state before the thread main loop runs. |
| Ready, // Is not working, has not yet received new work to do. |
| HasWork, // Has work to do. |
| ExitAsSoonAsPossible // Should exit at earliest convenience. |
| }; |
| |
| explicit Worker(BlockingCounter* counter_to_decrement_when_ready) |
| : task_(nullptr), |
| state_(State::ThreadStartup), |
| counter_to_decrement_when_ready_(counter_to_decrement_when_ready) { |
| pthread_cond_init(&state_cond_, nullptr); |
| pthread_mutex_init(&state_mutex_, nullptr); |
| pthread_create(&thread_, nullptr, ThreadFunc, this); |
| } |
| |
| ~Worker() { |
| ChangeState(State::ExitAsSoonAsPossible); |
| pthread_join(thread_, nullptr); |
| pthread_cond_destroy(&state_cond_); |
| pthread_mutex_destroy(&state_mutex_); |
| } |
| |
| // Changes State; may be called from either the worker thread |
| // or the master thread; however, not all state transitions are legal, |
| // which is guarded by assertions. |
| void ChangeState(State new_state) { |
| ScopedProfilingLabel label("Worker::ChangeState"); |
| pthread_mutex_lock(&state_mutex_); |
| assert(new_state != state_); |
| switch (state_) { |
| case State::ThreadStartup: |
| assert(new_state == State::Ready); |
| break; |
| case State::Ready: |
| assert(new_state == State::HasWork || |
| new_state == State::ExitAsSoonAsPossible); |
| break; |
| case State::HasWork: |
| assert(new_state == State::Ready || |
| new_state == State::ExitAsSoonAsPossible); |
| break; |
| default: |
| abort(); |
| } |
| state_ = new_state; |
| pthread_cond_signal(&state_cond_); |
| if (state_ == State::Ready) { |
| counter_to_decrement_when_ready_->DecrementCount(); |
| } |
| pthread_mutex_unlock(&state_mutex_); |
| } |
| |
| // Thread entry point. |
| void ThreadFunc() { |
| ScopedProfilingLabel label("Worker::ThreadFunc"); |
| RegisterCurrentThreadForProfiling(); |
| |
| ChangeState(State::Ready); |
| |
| // Thread main loop |
| while (true) { |
| // Get a state to act on |
| // In the 'Ready' state, we have nothing to do but to wait until |
| // we switch to another state. |
| State state_to_act_upon = WaitForVariableChange( |
| &state_, State::Ready, &state_cond_, &state_mutex_); |
| |
| // We now have a state to act on, so act. |
| switch (state_to_act_upon) { |
| case State::HasWork: |
| // Got work to do! So do it, and then revert to 'Ready' state. |
| assert(task_); |
| task_->Run(); |
| task_ = nullptr; |
| ChangeState(State::Ready); |
| break; |
| case State::ExitAsSoonAsPossible: |
| return; |
| default: |
| abort(); |
| } |
| } |
| } |
| |
| static void* ThreadFunc(void* arg) { |
| static_cast<Worker*>(arg)->ThreadFunc(); |
| return nullptr; |
| } |
| |
| // Called by the master thead to give this worker work to do. |
| // It is only legal to call this if the worker |
| void StartWork(Task* task) { |
| assert(!task_); |
| task->local_allocator = &local_allocator_; |
| task_ = task; |
| #ifdef GEMMLOWP_USE_BUSYWAIT |
| WriteBarrier(); |
| #endif |
| assert(state_ == State::Ready); |
| ChangeState(State::HasWork); |
| } |
| |
| private: |
| // The underlying thread. |
| pthread_t thread_; |
| |
| // The task to be worked on. |
| Task* task_; |
| |
| // The condition variable and mutex guarding state changes. |
| pthread_cond_t state_cond_; |
| pthread_mutex_t state_mutex_; |
| |
| // The state enum tells if we're currently working, waiting for work, etc. |
| State state_; |
| |
| // Each thread had a local allocator so they can allocate temporary |
| // buffers without blocking each other. |
| Allocator local_allocator_; |
| |
| // pointer to the master's thread BlockingCounter object, to notify the |
| // master thread of when this worker switches to the 'Ready' state. |
| BlockingCounter* const counter_to_decrement_when_ready_; |
| }; |
| |
| // A very simple pool of workers, that only allows the very |
| // specific parallelization pattern that we use here: |
| // a fixed number of workers can be given work, and one then |
| // waits for all of them to finish. |
| // |
| // See MultiThreadGemmContextBase for how other WorkersPool implementations can |
| // be used. Note that in those implementations, StartWorker can be free to |
| // ignore the <index> value; that is, the caller of WorkersPool does not rely on |
| // <index> to order tasks with equal <index>. |
| class WorkersPool { |
| public: |
| WorkersPool() {} |
| |
| ~WorkersPool() { |
| for (auto w : workers_) { |
| delete w; |
| } |
| } |
| |
| void Execute(const std::vector<Task*>& tasks) { |
| assert(tasks.size() >= 1); |
| // One of the tasks will be run on the current thread. |
| std::size_t workers_count = tasks.size() - 1; |
| CreateWorkers(workers_count); |
| assert(workers_count <= workers_.size()); |
| counter_to_decrement_when_ready_.Reset(workers_count); |
| int n = 0; |
| std::for_each(tasks.begin(), --tasks.end(), |
| [this, &n](Task* task) { workers_[n++]->StartWork(task); }); |
| // Execute the remaining workload immediately on the current thread. |
| Task* task = tasks.back(); |
| task->local_allocator = &main_thread_task_allocator_; |
| task->Run(); |
| // Wait for the workers submitted above to finish. |
| counter_to_decrement_when_ready_.Wait(); |
| // Cleanup tasks (best to do this from the same thread that allocated |
| // the memory). |
| std::for_each(tasks.begin(), tasks.end(), [](Task* task) { delete task; }); |
| } |
| |
| private: |
| // Ensures that the pool has at least the given count of workers. |
| // If any new worker has to be created, this function waits for it to |
| // be ready. |
| void CreateWorkers(std::size_t workers_count) { |
| if (workers_.size() >= workers_count) { |
| return; |
| } |
| counter_to_decrement_when_ready_.Reset(workers_count - workers_.size()); |
| while (workers_.size() < workers_count) { |
| workers_.push_back(new Worker(&counter_to_decrement_when_ready_)); |
| } |
| counter_to_decrement_when_ready_.Wait(); |
| } |
| |
| // copy construction disallowed |
| WorkersPool(const WorkersPool&) = delete; |
| |
| // The workers in this pool. They are owned by the pool: |
| // the pool creates workers and destroys them in its destructor. |
| std::vector<Worker*> workers_; |
| |
| // The BlockingCounter used to wait for the workers. |
| BlockingCounter counter_to_decrement_when_ready_; |
| |
| // For N-threaded operations, we will use only N-1 worker threads |
| // while the last task will be run directly on the main thread. |
| // It will then use this main_thread_task_allocator_; having a |
| // dedicated allocator for that (separate from the base allocator_) |
| // allows to use the same code for all tasks regardless of which |
| // thread they run on. |
| Allocator main_thread_task_allocator_; |
| }; |
| |
| // The task we use to implement a multi-threaded Gemm: a block of the |
| // RHS has been packed by the master thread; each worker thread |
| // then has to pack a block of the LHS and accumulate the Gemm of these |
| // packed LHS and RHS blocks. |
| template <typename KernelFormat, typename InputScalar, typename OutputScalar, |
| typename BitDepthParams, MapOrder LhsOrder, MapOrder RhsOrder, |
| MapOrder ResultOrder, typename LhsOffset, typename RhsOffset, |
| typename OutputPipelineType, typename GemmContextType> |
| struct GemmWithPackedRhsTask : Task { |
| typedef PackedSideBlock<typename KernelFormat::Lhs> PackedLhs; |
| typedef PackedSideBlock<typename KernelFormat::Rhs> PackedRhs; |
| GemmWithPackedRhsTask(GemmContextType* _context, const KernelBase& _kernel, |
| const MatrixMap<const InputScalar, LhsOrder>& _lhs, |
| const PackedRhs& _packed_rhs, |
| MatrixMap<OutputScalar, ResultOrder>* _result, |
| const MatrixBlockBounds& _result_block, |
| const LhsOffset& _lhs_offset, |
| const RhsOffset& _rhs_offset, |
| const BlockParams& _block_params, |
| const OutputPipelineType& _output_pipeline) |
| : context(_context), |
| kernel(_kernel), |
| lhs(_lhs), |
| packed_rhs(_packed_rhs), |
| result(*_result), |
| result_block(_result_block), |
| lhs_offset(_lhs_offset), |
| rhs_offset(_rhs_offset), |
| block_params(_block_params), |
| output_pipeline(_output_pipeline) {} |
| |
| void Run() override { |
| ScopedProfilingLabel label("GemmWithPackedRhsTask"); |
| |
| const int rows = result_block.rows; |
| const int cols = result_block.cols; |
| const int depth = lhs.cols(); |
| |
| PackedLhs packed_lhs(Side::Lhs, local_allocator, block_params); |
| |
| PackedResult packed_result(local_allocator, block_params); |
| |
| local_allocator->Commit(); |
| |
| for (int c = 0; c < cols; c += block_params.l2_cols) { |
| int cs = std::min(block_params.l2_cols, cols - c); |
| |
| for (int r = 0; r < rows; r += block_params.l2_rows) { |
| int rs = std::min(block_params.l2_rows, rows - r); |
| |
| PackLhs(&packed_lhs, lhs.block(r, 0, rs, depth)); |
| |
| Compute(kernel, block_params, &packed_result, packed_lhs, packed_rhs, |
| depth); |
| |
| auto curr_result_block = MatrixBlockBounds( |
| result_block.start_row + r, result_block.start_col + c, rs, cs); |
| UnpackResult<KernelFormat>( |
| &result, curr_result_block, packed_result, depth, |
| packed_lhs.sums_of_each_slice(), packed_rhs.sums_of_each_slice(), |
| lhs_offset.block(curr_result_block.start_row, rs), |
| rhs_offset.block(curr_result_block.start_col, cs), output_pipeline); |
| } |
| } |
| |
| local_allocator->Decommit(); |
| } |
| |
| const GemmContextType* context; |
| const KernelBase& kernel; |
| const MatrixMap<const InputScalar, LhsOrder> lhs; |
| const PackedRhs packed_rhs; |
| MatrixMap<OutputScalar, ResultOrder> result; |
| const MatrixBlockBounds result_block; |
| const LhsOffset& lhs_offset; |
| const RhsOffset& rhs_offset; |
| const BlockParams& block_params; |
| const OutputPipelineType& output_pipeline; |
| }; |
| |
| // This base class for multi-threading allows subclasses to implement their own |
| // workers_pool() method. See MultiThreadGemmContext below for an example; |
| // any other implementation of workers_pool() must return an object with the |
| // same public methods as WorkersPool. |
| class MultiThreadGemmContextBase : public SingleThreadGemmContext { |
| public: |
| void set_max_num_threads(int n) { max_num_threads_ = n; } |
| |
| int max_num_threads() const { return max_num_threads_; } |
| |
| protected: |
| // The maximum number of worker threads to use (including |
| // the master thread). |
| // The default value 1 means single-threading. That is the default |
| // because gemmlowp's primary target is mobile hardware, where thermal |
| // constraints usually mean that it may not be realistic to use more |
| // than 1 CPU core even if multiple cores are present. |
| // The special value 0 means try to detect the number of hardware threads. |
| // Note: this assumes that all CPU cores are equivalent. That assumption |
| // is defeated on big.LITTLE ARM devices, where we have no API to query |
| // the number of big cores (which is typically what we would want to use, |
| // leaving aside above-mentioned thermal issues). That is the other reason |
| // why the best compromise here is to let max_num_threads_ default to 1, |
| // so users who want multi-threading have to make the decision of how many |
| // threads to use by themselves. |
| int max_num_threads_ = 1; |
| }; |
| |
| class MultiThreadGemmContext : public MultiThreadGemmContextBase { |
| public: |
| WorkersPool* workers_pool() { return &workers_pool_; } |
| |
| private: |
| // The workers pool used by MultiThreadGemm. Making |
| // this part of the context allows it to be persistent, |
| // avoiding recreating threads on every Gemm. |
| WorkersPool workers_pool_; |
| }; |
| |
| // Needed by chrome native builds |
| #ifndef _SC_NPROCESSORS_CONF |
| #define _SC_NPROCESSORS_CONF _SC_NPROCESSORS_ONLN |
| #endif |
| |
| // Determines how many threads should be used for a given Gemm |
| // operation. |
| template <int KernelRows> |
| inline int HowManyThreads(int max_num_threads, int rows, int cols, int depth) { |
| // Early-exit in the default case where multi-threading is disabled. |
| if (max_num_threads == 1) { |
| return 1; |
| } |
| |
| // Determine the maximum number of threads. |
| int max_count = GetHardwareConcurrency(max_num_threads); |
| |
| // Basic calculation: take into account max pool size, and |
| // how many rows we have to feed our kernel. |
| // The motivation for an absolute minimum number of rows per thread, |
| // potentially higher than KernelRows, is that very thin thread workload |
| // currently defeat assumptions of the AddMod generator, resulting |
| // in substantial bias in TestWithRealData on 24 threads. |
| // Ideally, the AddMod generator should be aware of global (r,c) coordinates |
| // so as to be independent of the number of threads. |
| static const int AbsoluteMinRowsPerThread = 16; |
| static const int MinRowsPerThread = KernelRows > AbsoluteMinRowsPerThread |
| ? KernelRows |
| : AbsoluteMinRowsPerThread; |
| int thread_count = std::min(max_count, CeilQuotient(rows, MinRowsPerThread)); |
| |
| // At this point for small products we already have thread_count==1 so |
| // we can avoid doing more work; otherwise, we still want to check |
| // that the cubic size (rows*cols*depth) is big enough to keep |
| // workers_ busy. |
| if (thread_count > 1) { |
| // Empirically determined value. |
| static const std::uint64_t min_cubic_size_per_thread = 64 * 1024; |
| |
| // We can only multiply two out of three sizes without risking overflow |
| const std::uint64_t cubic_size = |
| std::uint64_t(rows) * std::uint64_t(cols) * std::uint64_t(depth); |
| |
| thread_count = |
| std::min(thread_count, int(cubic_size / min_cubic_size_per_thread)); |
| |
| if (thread_count < 1) { |
| thread_count = 1; |
| } |
| } |
| |
| assert(thread_count > 0 && thread_count <= max_count); |
| return thread_count; |
| } |
| |
| // The main multi-threaded Gemm function. |
| // To understand it, first read the code of SingleThreadGemm(). |
| // The parallelization scheme used here is to have this master function |
| // pack a block of RHS and then start worker threads to pack a block of LHS |
| // each, and accumulate the corresponding products. |
| template <typename KernelFormat, typename InputScalar, typename OutputScalar, |
| typename BitDepthParams, MapOrder LhsOrder, MapOrder RhsOrder, |
| MapOrder ResultOrder, typename LhsOffset, typename RhsOffset, |
| typename OutputPipelineType, typename GemmContextType> |
| void MultiThreadGemm(GemmContextType* context, const KernelBase& kernel, |
| const MatrixMap<const InputScalar, LhsOrder>& lhs, |
| const MatrixMap<const InputScalar, RhsOrder>& rhs, |
| MatrixMap<OutputScalar, ResultOrder>* result, |
| const LhsOffset& lhs_offset, const RhsOffset& rhs_offset, |
| const OutputPipelineType& output_pipeline) { |
| ScopedProfilingLabel label("gemmlowp::MultiThreadGemm"); |
| |
| assert(lhs.cols() == rhs.rows()); |
| |
| int rows = result->rows(); |
| int cols = result->cols(); |
| int depth = lhs.cols(); |
| |
| // zero sizes should have been caught earlier and early-returned. |
| assert(rows > 0); |
| assert(cols > 0); |
| assert(depth > 0); |
| |
| // The case of rows<cols should have been caught earlier and transposed. |
| assert(rows >= cols); |
| |
| const int thread_count = HowManyThreads<KernelFormat::kRows>( |
| context->max_num_threads(), rows, cols, depth); |
| if (thread_count == 1) { |
| return SingleThreadGemm<KernelFormat, InputScalar, OutputScalar, |
| BitDepthParams>(context, kernel, lhs, rhs, result, |
| lhs_offset, rhs_offset, |
| output_pipeline); |
| } |
| assert(thread_count > 1); |
| |
| // Simple 1:1 mapping of tasks to physical cores, which is very important |
| // to getting good multithreaded performance, specially for not-very-large |
| // GEMMs, and especially on Android. |
| const int task_count = thread_count; |
| |
| Allocator* allocator = context->allocator(); |
| auto* workers_pool = context->workers_pool(); |
| |
| BlockParams block_params; |
| block_params.Init<KernelFormat>( |
| rows, cols, depth, task_count, context->l1_bytes_to_use(), |
| context->l2_bytes_to_use(), context->l2_rhs_factor()); |
| |
| PackedSideBlock<typename KernelFormat::Rhs> packed_rhs(Side::Rhs, allocator, |
| block_params); |
| allocator->Commit(); |
| |
| // We loop over large blocks of the RHS. |
| for (int c = 0; c < cols; c += block_params.l2_cols) { |
| int cs = std::min(block_params.l2_cols, cols - c); |
| |
| // Pack a large block of the RHS. |
| PackRhs(&packed_rhs, rhs.block(0, c, depth, cs)); |
| |
| // Give work to each worker. |
| std::vector<Task*> tasks; |
| int next_start_row = 0; |
| for (int n = 0; n < task_count; ++n) { |
| int start_row = next_start_row; |
| next_start_row = std::min( |
| rows, RoundUp<KernelFormat::kRows>(rows * (n + 1) / task_count)); |
| |
| int block_rows = next_start_row - start_row; |
| auto lhs_block = lhs.block(start_row, 0, block_rows, depth); |
| typedef GemmWithPackedRhsTask<KernelFormat, InputScalar, OutputScalar, |
| BitDepthParams, LhsOrder, RhsOrder, |
| ResultOrder, LhsOffset, RhsOffset, |
| OutputPipelineType, GemmContextType> |
| TaskType; |
| tasks.push_back( |
| new TaskType(context, kernel, lhs_block, packed_rhs, result, |
| MatrixBlockBounds(start_row, c, block_rows, cs), |
| lhs_offset, rhs_offset, block_params, output_pipeline)); |
| } |
| // Execute the work on the workers (and partially on this thread). |
| workers_pool->Execute(tasks); |
| } |
| |
| allocator->Decommit(); |
| } |
| |
| } // namespace gemmlowp |
| |
| #endif // GEMMLOWP_INTERNAL_MULTI_THREAD_GEMM_H_ |