/* Copyright 2019 Google LLC. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "block_map.h"

#include <algorithm>
#include <cstdint>

#include "third_party/gemmlowp/profiling/instrumentation.h"
#include "check_macros.h"
#include "opt_set.h"
#include "size_util.h"

namespace ruy {

void GetBlockByIndex(const BlockMap& block_map, int index,
                     SidePair<int>* block) {
  gemmlowp::ScopedProfilingLabel label("GetBlockByIndex");
  const std::uint32_t index_u32 = index;

  const std::uint32_t num_blocks_per_local_curve =
      1u << (2 * block_map.num_blocks_base_log2);
  const std::uint32_t n1 = index_u32 & (num_blocks_per_local_curve - 1);

  SidePair<int> local_pos;
  if (block_map.traversal_order == BlockMapTraversalOrder::kLinear) {
    local_pos[Side::kLhs] = n1 & ((1u << block_map.num_blocks_base_log2) - 1);
    local_pos[Side::kRhs] = n1 >> block_map.num_blocks_base_log2;
  } else {
    // Decode fractal z-order
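    // The z-order (Morton order) curve interleaves the bits of the two
    // block coordinates: even-position bits of n1 hold the LHS coordinate,
    // odd-position bits hold the RHS coordinate. The four bit shuffles
    // below de-interleave them. E.g., for the illustrative value
    // n1 == 0b1101, bits 0 and 2 give LHS == 0b11 and bits 1 and 3 give
    // RHS == 0b10.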
    const std::uint32_t n2 = (n1 & 0x99999999u) | ((n1 & 0x44444444u) >> 1) |
                             ((n1 & 0x22222222u) << 1);
    const std::uint32_t n4 = (n2 & 0xc3c3c3c3u) | ((n2 & 0x30303030u) >> 2) |
                             ((n2 & 0x0c0c0c0cu) << 2);
    const std::uint32_t n8 = (n4 & 0xf00ff00fu) | ((n4 & 0x0f000f00u) >> 4) |
                             ((n4 & 0x00f000f0u) << 4);
    const std::uint32_t n16 = (n8 & 0xff0000ffu) | ((n8 & 0x00ff0000u) >> 8) |
                              ((n8 & 0x0000ff00u) << 8);
    local_pos[Side::kLhs] = n16 & 0xffff;
    local_pos[Side::kRhs] = n16 >> 16;
    if (block_map.traversal_order == BlockMapTraversalOrder::kFractalU) {
      // Change fractal z-order to u-order: XOR-ing the LHS coordinate with
      // the RHS coordinate mirrors the 'Z' visited within each 2x2 group of
      // blocks into a 'U', so that more pairs of consecutively visited
      // blocks are adjacent, improving reuse of packed data.
      local_pos[Side::kLhs] ^= local_pos[Side::kRhs];
    }
  }

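  // The high bits of the index, above the 2 * num_blocks_base_log2 low bits,
  // select the position along the extra 'rectangularness' subdivision; at
  // most one side has a nonzero rectangularness_log2 (see MakeBlockMap).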
  const std::uint32_t rectangular_index =
      index_u32 >> 2 * block_map.num_blocks_base_log2;
  for (Side side : {Side::kLhs, Side::kRhs}) {
    const std::uint32_t mask =
        (1u << block_map.rectangularness_log2[side]) - 1;
    const int rectangular_offset = (rectangular_index & mask)
                                   << block_map.num_blocks_base_log2;
    (*block)[side] = local_pos[side] + rectangular_offset;
  }
}

namespace {

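// Returns floor(log2(num / denom)), i.e. the largest n >= 0 such that
// (denom << n) <= num, clamped to 0 when num <= denom. The initial estimate
// floor_log2(num) - ceil_log2(denom) can undershoot by at most one, which
// the subsequent check corrects.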
int floor_log2_quotient(int num, int denom) {
  if (num <= denom) {
    return 0;
  }
  int log2_quotient = floor_log2(num) - ceil_log2(denom);
  if ((denom << (log2_quotient + 1)) <= num) {
    log2_quotient++;
  }
  return log2_quotient;
}

}  // namespace

void MakeBlockMap(int rows, int cols, int depth, int kernel_rows,
                  int kernel_cols, int lhs_scalar_size, int rhs_scalar_size,
                  int cache_friendly_traversal_threshold,
                  BlockMap* block_map) {
  gemmlowp::ScopedProfilingLabel label("MakeBlockMap");
  RUY_DCHECK_GE(rows, kernel_rows);
  RUY_DCHECK_GE(cols, kernel_cols);
  RUY_DCHECK_EQ(rows % kernel_rows, 0);
  RUY_DCHECK_EQ(cols % kernel_cols, 0);

  block_map->traversal_order = BlockMapTraversalOrder::kLinear;
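  // (rows * lhs_scalar_size + cols * rhs_scalar_size) * depth is the total
  // size in bytes of the packed LHS and RHS data. Only switch to the
  // cache-friendly fractal traversal when that working set crosses the given
  // threshold; small multiplications keep the cheaper linear order.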
  if (RUY_OPT_ENABLED(RUY_OPT_FRACTAL) &&
      (rows * lhs_scalar_size + cols * rhs_scalar_size) * depth >=
          cache_friendly_traversal_threshold) {
    block_map->traversal_order = RUY_OPT_ENABLED(RUY_OPT_FRACTAL_U)
                                     ? BlockMapTraversalOrder::kFractalU
                                     : BlockMapTraversalOrder::kFractalZ;
  }

  // See the comment on BlockMap in block_map.h.
  // The destination matrix shape (rows x cols) is to be subdivided into a
  // square (N x N) grid of blocks, whose shapes must be multiples of the
  // kernel block shape (kernel_rows x kernel_cols).
  // Inside each of these N*N blocks, there may be one further level of
  // subdivision, along rows or along cols but not both, to better handle
  // highly rectangular cases. That is what we call 'rectangularness'. This
  // extra level of subdivision is into (1 << rows_rectangularness_log2)
  // blocks along the rows dimension, or into
  // (1 << cols_rectangularness_log2) blocks along the cols dimension.
  int rows_rectangularness_log2 = 0;
  int cols_rectangularness_log2 = 0;
  // In order to compute these rectangularness values, we need to divide
  // the destination matrix's aspect ratio,
  //   rows / cols,
  // by the kernel block's aspect ratio,
  //   kernel_rows / kernel_cols.
  // The resulting quotient simplifies to
  //   (rows * kernel_cols) / (cols * kernel_rows).
  // Whence the introduction of the following products:
  const int rows_times_kernel_cols = rows * kernel_cols;
  const int cols_times_kernel_rows = cols * kernel_rows;
  if (rows_times_kernel_cols > cols_times_kernel_rows) {
    rows_rectangularness_log2 =
        floor_log2_quotient(rows_times_kernel_cols, cols_times_kernel_rows);
    // Sanity check that we did not over-estimate rows_rectangularness_log2.
    RUY_DCHECK_GE(rows_times_kernel_cols >> rows_rectangularness_log2,
                  cols_times_kernel_rows);
  } else if (cols_times_kernel_rows > rows_times_kernel_cols) {
    cols_rectangularness_log2 =
        floor_log2_quotient(cols_times_kernel_rows, rows_times_kernel_cols);
    // Sanity check that we did not over-estimate cols_rectangularness_log2.
    RUY_DCHECK_GE(cols_times_kernel_rows >> cols_rectangularness_log2,
                  rows_times_kernel_cols);
  }
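  // Illustrative example (hypothetical values): rows = 320, cols = 80 with
  // an 8x8 kernel block gives rows_times_kernel_cols = 2560 and
  // cols_times_kernel_rows = 640, so rows_rectangularness_log2 =
  // floor_log2_quotient(2560, 640) = 2, i.e. each grid cell is further
  // split into 4 blocks along the rows dimension.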

  RUY_DCHECK(!rows_rectangularness_log2 || !cols_rectangularness_log2);

  const int size = std::min(rows, cols);
  const int size_floor_log2 = floor_log2(size);
  const int depth_ceil_log2 = ceil_log2(depth);
  const int kernel_width_log2 = pot_log2(std::max(kernel_cols, kernel_rows));

  // l1_size_log2 was originally, coarsely speaking, the log2 of the number
  // of rows of the LHS, or columns of the RHS, that we expect to fit in the
  // L1 cache during a matrix multiplication.
  //
  // This initial rationale is not necessarily still relevant. The logic
  // below was determined empirically, not in a principled way.
  int l1_size_log2;
  if (size_floor_log2 <= 3) {
    l1_size_log2 = size_floor_log2;
  } else if (size_floor_log2 <= 6) {
    l1_size_log2 = 4;
  } else {
    l1_size_log2 = 5;
  }

  // The 15 here implicitly encodes a target 32KiB L1 cache (2^15 == 32k).
  // Once again, this logic retains only a distant memory of originally being
  // motivated by such clear principles linking it to cache sizes.
  l1_size_log2 = std::min(
      l1_size_log2, 15 - depth_ceil_log2 -
                        pot_log2(std::max(lhs_scalar_size, rhs_scalar_size)));
  l1_size_log2 = std::max(l1_size_log2, kernel_width_log2);
  l1_size_log2 = std::min(l1_size_log2, size_floor_log2);
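  // Illustrative example (hypothetical values): with 4-byte scalars and
  // depth = 512, the cap above is 15 - 9 - 2 = 4, i.e. blocks of roughly
  // 2^4 = 16 rows/columns, so one side's packed block holds about
  // 16 * 512 * 4 = 32KiB, matching the L1 target.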

  int num_blocks_base_log2 = size_floor_log2 - l1_size_log2;
  RUY_DCHECK_GE(num_blocks_base_log2, 0);

  const int num_blocks_of_rows_log2 =
      num_blocks_base_log2 + rows_rectangularness_log2;
  const int num_blocks_of_cols_log2 =
      num_blocks_base_log2 + cols_rectangularness_log2;

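  // rows and cols do not generally divide evenly into the grid of blocks.
  // Each block gets at least smallr rows and smallc cols, rounded down to a
  // multiple of the kernel block shape; the missr / missc leftover
  // kernel-sized slices are then handed out, one each, to the first blocks
  // along each dimension (see GetBlockMatrixCoords).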
  const int smallr =
      round_down_pot(rows >> num_blocks_of_rows_log2, kernel_rows);
  const int smallc =
      round_down_pot(cols >> num_blocks_of_cols_log2, kernel_cols);
  const int missr =
      round_up_pot(rows - (smallr << num_blocks_of_rows_log2), kernel_rows) >>
      pot_log2(kernel_rows);
  const int missc =
      round_up_pot(cols - (smallc << num_blocks_of_cols_log2), kernel_cols) >>
      pot_log2(kernel_cols);

  block_map->dims[Side::kLhs] = rows;
  block_map->dims[Side::kRhs] = cols;
  block_map->kernel_dims[Side::kLhs] = kernel_rows;
  block_map->kernel_dims[Side::kRhs] = kernel_cols;
  block_map->num_blocks_base_log2 = num_blocks_base_log2;
  block_map->rectangularness_log2[Side::kLhs] = rows_rectangularness_log2;
  block_map->rectangularness_log2[Side::kRhs] = cols_rectangularness_log2;
  block_map->small_block_dims[Side::kLhs] = smallr;
  block_map->small_block_dims[Side::kRhs] = smallc;
  block_map->large_blocks[Side::kLhs] = missr;
  block_map->large_blocks[Side::kRhs] = missc;
}

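// Computes the half-open range [*start, *end) of destination-matrix rows
// (LHS side) or columns (RHS side) covered by the given block: every block
// spans small_block_dims[side], and the first large_blocks[side] blocks
// additionally span one extra kernel_dims[side] to absorb the remainder.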
void GetBlockMatrixCoords(Side side, const BlockMap& block_map, int block,
                          int* start, int* end) {
  gemmlowp::ScopedProfilingLabel label("GetBlockMatrixCoords");
  *start = block * block_map.small_block_dims[side] +
           std::min(block, block_map.large_blocks[side]) *
               block_map.kernel_dims[side];
  *end =
      *start + block_map.small_block_dims[side] +
      (block < block_map.large_blocks[side] ? block_map.kernel_dims[side] : 0);

  RUY_DCHECK_EQ(0, *start % block_map.kernel_dims[side]);
  RUY_DCHECK_EQ(0, *end % block_map.kernel_dims[side]);
  RUY_DCHECK_LE(*end, block_map.dims[side]);
  RUY_DCHECK_LT(*start, *end);
  RUY_DCHECK_GE(*start, 0);
}

void GetBlockMatrixCoords(const BlockMap& block_map,
                          const SidePair<int>& block, SidePair<int>* start,
                          SidePair<int>* end) {
  for (Side side : {Side::kLhs, Side::kRhs}) {
    GetBlockMatrixCoords(side, block_map, block[side], &(*start)[side],
                         &(*end)[side]);
  }
}

}  // namespace ruy