/* Copyright 2019 Google LLC. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "trmul.h"

#include <atomic>
#include <cstdint>
#include <cstring>

#include "third_party/gemmlowp/profiling/instrumentation.h"
#include "allocator.h"
#include "block_map.h"
#include "common.h"
#include "opt_set.h"
#include "thread_pool.h"
#include "trace.h"

namespace ruy {

namespace {

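// Task run by each worker thread: it repeatedly claims a block of the
// destination matrix from the shared atomic block counter, packs the
// corresponding LHS/RHS blocks if no thread has packed them yet, and runs the
// kernel on that block, until all blocks of the block_map have been handled.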
struct TrMulTask final : Task {
  TrMulTask(TrMulParams* params_, const BlockMap& block_map_,
            std::atomic<std::uint32_t>* atomic_n_, std::uint32_t thread_id_,
            std::atomic<bool>* lhs_packed_, std::atomic<bool>* rhs_packed_,
            TuningResolver* tuning_resolver_, Allocator* local_allocator_,
            Trace* trace_)
      : params(params_),
        block_map(block_map_),
        atomic_n(atomic_n_),
        thread_id(thread_id_),
        lhs_packed(lhs_packed_),
        rhs_packed(rhs_packed_),
        tuning_resolver(tuning_resolver_),
        local_allocator(local_allocator_),
        trace(trace_) {}

  void Run() override {
    TraceRecordThreadStart(thread_id, trace);

    std::uint16_t num_blocks_of_rows = NumBlocksOfRows(block_map);
    std::uint16_t num_blocks_of_cols = NumBlocksOfCols(block_map);
    std::uint32_t num_blocks = NumBlocks(block_map);

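    // Local (non-atomic) copies of the shared packed-block flags. Checking
    // the local flag first lets the common already-packed case skip an atomic
    // load on the shared lhs_packed / rhs_packed arrays.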
    bool* local_lhs_packed = nullptr;
    bool* local_rhs_packed = nullptr;

    if (lhs_packed) {
      local_allocator->Allocate(num_blocks_of_rows, &local_lhs_packed);
      memset(local_lhs_packed, 0, num_blocks_of_rows * sizeof(bool));
    }
    if (rhs_packed) {
      local_allocator->Allocate(num_blocks_of_cols, &local_rhs_packed);
      memset(local_rhs_packed, 0, num_blocks_of_cols * sizeof(bool));
    }

    const Tuning tuning = tuning_resolver->Resolve();

    TraceRecordThreadLoopStart(thread_id, trace);

    std::uint16_t block_r, block_c;
    int start_r, start_c, end_r, end_c;
    std::uint16_t next_block_r, next_block_c;
    int next_start_r, next_start_c, next_end_r, next_end_c;

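    // Each task starts with the block whose index equals its thread_id;
    // further blocks are claimed from the shared atomic counter, which TrMul
    // initializes to thread_count.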
    std::uint32_t n = thread_id;
    std::uint32_t next_n;

    GetBlockByIndex(block_map, n, &block_r, &block_c);
    TraceRecordBlockReserved(thread_id, n, trace);
    GetBlockMatrixCoords(block_map, block_r, block_c, &start_r, &start_c,
                         &end_r, &end_c);
    TraceRecordBlockCoordsComputed(n, trace);

    while (n < num_blocks) {
      // Get index of next block to handle
      next_n = atomic_n->fetch_add(1, std::memory_order_relaxed);
      // If we actually got a next block to handle (not already at end)
      if (next_n < num_blocks) {
        // Get coords of that next block to handle
        // TODO(benoitjacob): is this whole next_* business worth it?
        // The idea was to have more independent things to do in parallel
        // (Pack+Kernel on current block while we resolve next block atomic
        // index and coords) but we don't seem to be actually taking advantage
        // of this unless the compiler is doing a really good code-reordering
        // job here, which is made unlikely by the conditional enclosing this.
        GetBlockByIndex(block_map, next_n, &next_block_r, &next_block_c);
        TraceRecordBlockReserved(thread_id, next_n, trace);
        GetBlockMatrixCoords(block_map, next_block_r, next_block_c,
                             &next_start_r, &next_start_c, &next_end_r,
                             &next_end_c);
        TraceRecordBlockCoordsComputed(next_n, trace);
      }
      // Maybe pack the current LHS block, if not already packed.
      // Note that if two threads concurrently hit the same LHS block to pack,
      // we allow them to concurrently pack it, writing the same packed matrix
      // data to the same location. That is considered worth it to avoid
      // having one thread blocked on another one. Avoiding that is considered
      // important especially on mobile, where there can be a large speed
      // discrepancy between threads, e.g. if different threads are scheduled
      // on CPU cores of different types (big/little), run at different clock
      // speeds, or experience different contention from other processes.
      if (local_lhs_packed && !local_lhs_packed[block_r]) {
        if (!lhs_packed[block_r].load(std::memory_order_acquire)) {
          params->LhsRunPack(tuning, start_r, end_r);
          TraceRecordBlockPackedLhs(n, trace);
          local_lhs_packed[block_r] = true;
          lhs_packed[block_r].store(true, std::memory_order_release);
        }
      }
      // Maybe pack the current RHS block. Same comments as above for LHS.
      if (local_rhs_packed && !local_rhs_packed[block_c]) {
        if (!rhs_packed[block_c].load(std::memory_order_acquire)) {
          params->RhsRunPack(tuning, start_c, end_c);
          TraceRecordBlockPackedRhs(n, trace);
          local_rhs_packed[block_c] = true;
          rhs_packed[block_c].store(true, std::memory_order_release);
        }
      }
      // Actually do matrix multiplication work
      params->RunKernel(tuning, start_r, start_c, end_r, end_c);
      TraceRecordBlockFinished(n, trace);
      n = next_n;
      block_r = next_block_r;
      block_c = next_block_c;
      start_r = next_start_r;
      start_c = next_start_c;
      end_r = next_end_r;
      end_c = next_end_c;
    }

    local_allocator->FreeAll();

    TraceRecordThreadEnd(thread_id, trace);
  }

 private:
  TrMulParams* params;
  const BlockMap& block_map;
  std::atomic<std::uint32_t>* atomic_n;
  std::uint32_t thread_id;
  std::atomic<bool>* lhs_packed;
  std::atomic<bool>* rhs_packed;
  TuningResolver* tuning_resolver;
  Allocator* local_allocator;
  Trace* trace;
};

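// Allocates the backing storage (packed data and sums buffers) for a packed
// matrix out of the given allocator.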
void AllocatePMatrix(Allocator* allocator, PMatrix* packed) {
  packed->data = allocator->AllocateBytes(DataSize(*packed));
  packed->sums = allocator->AllocateBytes(SumsSize(*packed));
}

int GetThreadCount(Context* context, int rows, int cols, int depth) {
  // Empirically determined rule for reasonable number of
  // threads to use. This is proportional to the number of arithmetic ops
  // in this Mul (product of the 3 sizes).
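  // For example, a 100x100x100 Mul has 1e6 multiply-adds, giving a guess of
  // 1000000 >> 13 = 122 before clamping to [1, max_num_threads].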
  int guess = (std::uint64_t(rows) * cols * depth) >> 13;
  return clamp(guess, 1, context->max_num_threads);
}

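// The simple loop packs each operand once and runs the kernel in a single
// call over the whole destination; it is only chosen for single-threaded,
// small problems. Everything else takes the general, block-decomposed path.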
LoopStructure GetLoopStructure(int thread_count, int rows, int cols, int depth,
                               int cache_friendly_traversal_threshold) {
  if (thread_count == 1 &&
      (rows + cols) * depth < cache_friendly_traversal_threshold) {
    return LoopStructure::kSimple;
  }
  return LoopStructure::kGeneral;
}

}  // namespace

void TrMul(TrMulParams* params, Context* context) {
  gemmlowp::ScopedProfilingLabel label("TrMul");

  PMatrix& packed_lhs = params->packed_lhs;
  PMatrix& packed_rhs = params->packed_rhs;
  DMatrix& lhs = params->lhs;
  DMatrix& rhs = params->rhs;

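  // TrMul works on a transposed LHS: the destination's row count comes from
  // the columns of `lhs`, and the accumulation depth from the rows of `lhs`.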
  const int rows = lhs.layout.cols;
  const int cols = rhs.layout.cols;
  const int depth = lhs.layout.rows;
  const int rows_rounded_up = packed_lhs.layout.cols;
  const int cols_rounded_up = packed_rhs.layout.cols;

  int thread_count = GetThreadCount(context, rows, cols, depth);
  const auto loop_structure =
      GetLoopStructure(thread_count, rows, cols, depth,
                       params->cache_friendly_traversal_threshold);
  Allocator* allocator = context->GetMainAllocator();

  if (!params->lhs_is_prepacked) {
    AllocatePMatrix(allocator, &packed_lhs);
  }
  if (!params->rhs_is_prepacked) {
    AllocatePMatrix(allocator, &packed_rhs);
  }

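  // Simple loop: single-threaded and small. Pack each (non-prepacked) operand
  // in one go, then run the kernel once over the whole destination.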
  if (loop_structure == LoopStructure::kSimple) {
    gemmlowp::ScopedProfilingLabel label_simple("TrMulImpl, simple loop");
    Tuning tuning = context->GetMainThreadTuning();

    if (!params->lhs_is_prepacked) {
      params->LhsRunPack(tuning, 0, rows_rounded_up);
    }
    if (!params->rhs_is_prepacked) {
      params->RhsRunPack(tuning, 0, cols_rounded_up);
    }
    params->RunKernel(tuning, 0, 0, rows_rounded_up, cols_rounded_up);

    allocator->FreeAll();
    return;
  }

  gemmlowp::ScopedProfilingLabel label_general("TrMulImpl, general case");

  auto* trace = NewTraceOrNull(&context->tracing, rows, depth, cols);
  TraceRecordStart(trace);

  // Initialize block map.
  BlockMap block_map;
  MakeBlockMap(rows_rounded_up, cols_rounded_up, depth,
               packed_lhs.layout.kernel.cols, packed_rhs.layout.kernel.cols,
               packed_lhs.data_type.size, packed_rhs.data_type.size,
               params->cache_friendly_traversal_threshold, &block_map);
  std::uint16_t num_blocks_of_rows = NumBlocksOfRows(block_map);
  std::uint16_t num_blocks_of_cols = NumBlocksOfCols(block_map);
  std::uint32_t num_blocks = NumBlocks(block_map);
  RUY_DCHECK_EQ(num_blocks, num_blocks_of_rows * num_blocks_of_cols);

  // Initialize per-thread state.
  thread_count = clamp(thread_count, 1, static_cast<int>(num_blocks));
  context->EnsureNPerThreadStates(thread_count);
  for (auto& per_thread_state : context->per_thread_states) {
    per_thread_state->tuning_resolver.SetTuning(context->explicit_tuning);
  }

  // Allocate memory.
  std::atomic<bool>* lhs_packed = nullptr;
  if (!params->lhs_is_prepacked) {
    allocator->Allocate(num_blocks_of_rows, &lhs_packed);
  }
  std::atomic<bool>* rhs_packed = nullptr;
  if (!params->rhs_is_prepacked) {
    allocator->Allocate(num_blocks_of_cols, &rhs_packed);
  }
  std::atomic<std::uint32_t>* atomic_n;
  allocator->Allocate(1, &atomic_n);
  TrMulTask* tasks;
  allocator->Allocate(thread_count, &tasks);

  // Initialize allocated data.
  if (lhs_packed != nullptr) {
    for (int i = 0; i < num_blocks_of_rows; i++) {
      lhs_packed[i].store(false, std::memory_order_release);
    }
  }
  if (rhs_packed != nullptr) {
    for (int i = 0; i < num_blocks_of_cols; i++) {
      rhs_packed[i].store(false, std::memory_order_release);
    }
  }
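  // Blocks 0 .. thread_count-1 are statically assigned, one per task (each
  // task starts at n == thread_id), so the shared counter starts handing out
  // blocks at thread_count.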
  atomic_n->store(thread_count);

  for (int i = 0; i < thread_count; i++) {
    new (tasks + i)
        TrMulTask(params, block_map, atomic_n, i, lhs_packed, rhs_packed,
                  &context->per_thread_states[i]->tuning_resolver,
                  &context->per_thread_states[i]->allocator, trace);
  }

  // Do the computation.
  TraceRecordExecute(trace);
  TraceStartRecordingBlockAndThreadFields(block_map, thread_count, trace);

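  // The tasks are destroyed as soon as Execute() returns, which relies on
  // Execute() waiting for every task to finish running.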
  context->workers_pool.Execute(thread_count, tasks);

  // Finish up.
  for (int i = 0; i < thread_count; i++) {
    tasks[i].~TrMulTask();
  }

  TraceRecordEnd(trace);

  allocator->FreeAll();
}

}  // namespace ruy