/* Copyright 2019 Google LLC. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

| #include "block_map.h" |
| |
| #include "third_party/gemmlowp/profiling/instrumentation.h" |
| #include "check_macros.h" |
| #include "opt_set.h" |
| #include "size_util.h" |
| |
| namespace ruy { |
| |
| void GetBlockByIndex(const BlockMap& block_map, std::uint32_t index, |
| std::uint16_t* block_r, std::uint16_t* block_c) { |
| gemmlowp::ScopedProfilingLabel label("GetBlockByIndex"); |
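  // The low bits of the index select the position within the extra
  // 'rectangularness' subdivision. At most one of rows_rectangularness_log2
  // and cols_rectangularness_log2 is nonzero (see the DCHECK in MakeBlockMap),
  // so rectr and rectc below read the same low bits without conflicting.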
  std::uint16_t rectr =
      index & ((1 << block_map.rows_rectangularness_log2) - 1);
  std::uint16_t rectc =
      index & ((1 << block_map.cols_rectangularness_log2) - 1);

  std::uint16_t n1 = index >> (block_map.rows_rectangularness_log2 +
                               block_map.cols_rectangularness_log2);
  RUY_DCHECK_EQ(index, (n1 << (block_map.rows_rectangularness_log2 +
                               block_map.cols_rectangularness_log2)) +
                           rectr + rectc);

  std::uint16_t br, bc;
  if (block_map.traversal_order == BlockMapTraversalOrder::kLinear) {
    br = n1 & ((1 << block_map.num_blocks_base_log2) - 1);
    bc = n1 >> block_map.num_blocks_base_log2;
  } else {
    // Decode fractal z-order: de-interleave the bits of n1 so that its
    // even-position bits end up in br and its odd-position bits in bc, by
    // swapping the middle bits of each nibble, then the middle bit-pairs of
    // each byte, then the middle nibbles of the word.
    std::uint16_t n2 =
        (n1 & 0x9999) | ((n1 & 0x4444) >> 1) | ((n1 & 0x2222) << 1);
    std::uint16_t n4 =
        (n2 & 0xc3c3) | ((n2 & 0x3030) >> 2) | ((n2 & 0x0c0c) << 2);
    std::uint16_t n8 =
        (n4 & 0xf00f) | ((n4 & 0x0f00) >> 4) | ((n4 & 0x00f0) << 4);
    br = n8 & 0xff;
    bc = n8 >> 8;
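    // For example, n1 = 0b0110 (z-index 6) has even-position bits
    // (b0, b2) = (0, 1), giving br = 2, and odd-position bits (b1, b3) =
    // (1, 0), giving bc = 1: the z-curve visits
    // (0,0), (1,0), (0,1), (1,1), (2,0), (3,0), (2,1), ...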
    if (block_map.traversal_order == BlockMapTraversalOrder::kFractalU) {
      // Change fractal z-order to u-order: XOR-ing br with bc flips the row
      // bits wherever the corresponding column bit is set, turning each Z
      // into a U at every scale for better locality between adjacent blocks.
      br ^= bc;
    }
  }

  br = (br << block_map.rows_rectangularness_log2) + rectr;
  bc = (bc << block_map.cols_rectangularness_log2) + rectc;

  // Store the decoded block position.
  *block_r = br;
  *block_c = bc;
}

namespace {

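// Returns the largest k such that (denom << k) <= num, i.e.
// floor(log2(num / denom)), for num > denom > 0; returns 0 when num <= denom.
// The initial estimate below can be one too small, hence the correction step.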
int floor_log2_quotient(int num, int denom) {
  if (num <= denom) {
    return 0;
  }
  int log2_quotient = floor_log2(num) - ceil_log2(denom);
  if ((denom << (log2_quotient + 1)) <= num) {
    log2_quotient++;
  }
  return log2_quotient;
}

}  // namespace

void MakeBlockMap(int rows, int cols, int depth, int kernel_rows,
                  int kernel_cols, int lhs_scalar_size, int rhs_scalar_size,
                  int cache_friendly_traversal_threshold,
                  BlockMap* block_map) {
  gemmlowp::ScopedProfilingLabel label("MakeBlockMap");
  RUY_DCHECK_GE(rows, kernel_rows);
  RUY_DCHECK_GE(cols, kernel_cols);

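  // Default to linear traversal; switch to a cache-friendly fractal order
  // when the combined size in bytes of the LHS and RHS working set,
  // (rows * lhs_scalar_size + cols * rhs_scalar_size) * depth, reaches the
  // threshold.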
  block_map->traversal_order = BlockMapTraversalOrder::kLinear;
  if (RUY_OPT_ENABLED(RUY_OPT_FRACTAL) &&
      (rows * lhs_scalar_size + cols * rhs_scalar_size) * depth >=
          cache_friendly_traversal_threshold) {
    block_map->traversal_order = RUY_OPT_ENABLED(RUY_OPT_FRACTAL_U)
                                     ? BlockMapTraversalOrder::kFractalU
                                     : BlockMapTraversalOrder::kFractalZ;
  }

  // See the comment on BlockMap in block_map.h.
  // The destination matrix shape (rows x cols) is to be subdivided into a
  // square (N x N) grid of blocks, whose shapes must be multiples of the
  // kernel block shape (kernel_rows x kernel_cols).
  // Inside each of these N*N blocks, there may be one further level of
  // subdivision, along rows or along cols but not both, to better handle
  // highly rectangular cases. That is what we call 'rectangularness'.
  // This extra level of subdivision is into
  // (1 << rows_rectangularness_log2) blocks along the rows dimension, or into
  // (1 << cols_rectangularness_log2) blocks along the cols dimension.
  int rows_rectangularness_log2 = 0;
  int cols_rectangularness_log2 = 0;
  // In order to compute these rectangularness values, we need to divide
  // the destination matrix's aspect ratio,
  //   rows / cols
  // by the kernel block's aspect ratio,
  //   kernel_rows / kernel_cols.
  // The quotient of these two quotients simplifies to
  //   (rows * kernel_cols) / (cols * kernel_rows)
  // Whence the introduction of the following products:
  const int rows_times_kernel_cols = rows * kernel_cols;
  const int cols_times_kernel_rows = cols * kernel_rows;
  if (rows_times_kernel_cols > cols_times_kernel_rows) {
    rows_rectangularness_log2 =
        floor_log2_quotient(rows_times_kernel_cols, cols_times_kernel_rows);
    // Sanity check that we did not over-estimate rows_rectangularness_log2.
    RUY_DCHECK_GE(rows_times_kernel_cols >> rows_rectangularness_log2,
                  cols_times_kernel_rows);
  } else if (cols_times_kernel_rows > rows_times_kernel_cols) {
    cols_rectangularness_log2 =
        floor_log2_quotient(cols_times_kernel_rows, rows_times_kernel_cols);
    // Sanity check that we did not over-estimate cols_rectangularness_log2.
    RUY_DCHECK_GE(cols_times_kernel_rows >> cols_rectangularness_log2,
                  rows_times_kernel_cols);
  }
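  // For example, with rows = 1000, cols = 100 and a 4x4 kernel, the quotient
  // is 4000 / 400 = 10, whose floor-log2 is 3, so rows_rectangularness_log2
  // becomes 3: each grid cell is further split into 8 sub-blocks along rows.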

  RUY_DCHECK(!rows_rectangularness_log2 || !cols_rectangularness_log2);

  const int size = std::min(rows, cols);
  const int size_floor_log2 = floor_log2(size);
  const int depth_ceil_log2 = ceil_log2(depth);
  const int kernel_width_log2 = ceil_log2(std::max(kernel_cols, kernel_rows));

  // l1_size_log2 was originally, coarsely speaking, the log2 of the number of
  // rows of the LHS, or of columns of the RHS, that we expect to fit in the
  // L1 cache in a matrix multiplication.
  //
  // This initial rationale is not necessarily still relevant. The logic below
  // was determined empirically, not in a principled way.
  int l1_size_log2;
  if (size_floor_log2 <= 3) {
    l1_size_log2 = size_floor_log2;
  } else if (size_floor_log2 <= 6) {
    l1_size_log2 = 4;
  } else {
    l1_size_log2 = 5;
  }

  // The 15 here implicitly encodes a 32k L1 cache target (2^15 == 32k):
  // subtracting the log2 of the depth and of the scalar size estimates how
  // many rows/columns of packed data fit in that budget. Once again, this
  // only has a distant memory of being originally motivated by such clear
  // principles linking this logic to cache sizes.
  l1_size_log2 = std::min(
      l1_size_log2, 15 - depth_ceil_log2 -
                        ceil_log2(std::max(lhs_scalar_size, rhs_scalar_size)));
  l1_size_log2 = std::max(l1_size_log2, kernel_width_log2);
  l1_size_log2 = std::min(l1_size_log2, size_floor_log2);
  l1_size_log2 = std::max(l1_size_log2, size_floor_log2 - 8);

  int num_blocks_base_log2 = size_floor_log2 - l1_size_log2;
  RUY_DCHECK_GE(num_blocks_base_log2, 0);
  RUY_DCHECK_LE(num_blocks_base_log2, 8);
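  // When the dimensions are not exact multiples of the kernel block size,
  // use at least a 2x2 grid of blocks, so that the unaligned fringes are
  // confined to edge blocks rather than affecting the single block covering
  // the whole matrix.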
  if (num_blocks_base_log2 == 0 &&
      ((rows % kernel_rows) || (cols % kernel_cols))) {
    num_blocks_base_log2 = 1;
  }
  RUY_DCHECK_LE(num_blocks_base_log2 + rows_rectangularness_log2, 16);
  RUY_DCHECK_LE(num_blocks_base_log2 + cols_rectangularness_log2, 16);

  int rows_rounded_up = round_up_pot(rows, kernel_rows);
  int cols_rounded_up = round_up_pot(cols, kernel_cols);

  const int num_blocks_of_rows_log2 =
      num_blocks_base_log2 + rows_rectangularness_log2;
  const int num_blocks_of_cols_log2 =
      num_blocks_base_log2 + cols_rectangularness_log2;

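  // Split each dimension into (1 << num_blocks_of_*_log2) blocks: smallr and
  // smallc are the 'small' block sizes, rounded down to a multiple of the
  // kernel block size, and missr and missc count the kernel-size increments
  // left over; the first missr/missc blocks each grow by one increment to
  // absorb them.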
  std::uint16_t smallr =
      round_down_pot(rows_rounded_up >> num_blocks_of_rows_log2, kernel_rows);
  std::uint16_t smallc =
      round_down_pot(cols_rounded_up >> num_blocks_of_cols_log2, kernel_cols);
  std::uint16_t missr =
      round_up_pot(rows_rounded_up - (smallr << num_blocks_of_rows_log2),
                   kernel_rows) /
      kernel_rows;
  std::uint16_t missc =
      round_up_pot(cols_rounded_up - (smallc << num_blocks_of_cols_log2),
                   kernel_cols) /
      kernel_cols;
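  // For example, rows = 1000, kernel_rows = 4, num_blocks_of_rows_log2 = 3:
  // smallr = round_down_pot(125, 4) = 124, leaving 1000 - 8 * 124 = 8 rows,
  // so missr = 2: the first 2 blocks span 128 rows and the other 6 span 124.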

  block_map->rows = rows;
  block_map->cols = cols;
  block_map->kernel_rows = kernel_rows;
  block_map->kernel_cols = kernel_cols;
  block_map->num_blocks_base_log2 = num_blocks_base_log2;
  block_map->rows_rectangularness_log2 = rows_rectangularness_log2;
  block_map->cols_rectangularness_log2 = cols_rectangularness_log2;
  block_map->smallr = smallr;
  block_map->smallc = smallc;
  block_map->missr = missr;
  block_map->missc = missc;
}

void GetBlockMatrixCoords(const BlockMap& block_map, std::uint16_t block_r,
                          std::uint16_t block_c, int* start_r, int* start_c,
                          int* end_r, int* end_c) {
  gemmlowp::ScopedProfilingLabel label("GetBlockMatrixCoords");
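  // Compute the provisional half-open ranges [sr, er) x [sc, ec) covered by
  // block (block_r, block_c): each block spans smallr/smallc rows/cols, and
  // the first missr/missc blocks are one kernel-size increment larger.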
  int sr = block_r * block_map.smallr +
           std::min(block_r, block_map.missr) * block_map.kernel_rows;
  int er = sr + block_map.smallr +
           (block_r < block_map.missr) * block_map.kernel_rows;
  int sc = block_c * block_map.smallc +
           std::min(block_c, block_map.missc) * block_map.kernel_cols;
  int ec = sc + block_map.smallc +
           (block_c < block_map.missc) * block_map.kernel_cols;
  sc = round_down_pot(sc, block_map.kernel_cols);
  ec = round_down_pot(ec, block_map.kernel_cols);
  sr = round_down_pot(sr, block_map.kernel_rows);
  er = round_down_pot(er, block_map.kernel_rows);

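  // Clamp the block ends to the matrix bounds, then pull the starts back so
  // that each block still spans a whole number of kernel blocks (an edge
  // block may overlap its neighbor rather than shrink below that multiple).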
  ec = std::min(ec, block_map.cols);
  er = std::min(er, block_map.rows);
  sc = std::max(0, ec - round_up_pot(ec - sc, block_map.kernel_cols));
  sr = std::max(0, er - round_up_pot(er - sr, block_map.kernel_rows));

  *start_c = sc;
  *end_c = ec;
  *start_r = sr;
  *end_r = er;

  RUY_DCHECK_LE(ec, block_map.cols);
  RUY_DCHECK_LE(er, block_map.rows);
  RUY_DCHECK_LT(sc, ec);
  RUY_DCHECK_LT(sr, er);
  RUY_DCHECK_GE(sc, 0);
  RUY_DCHECK_GE(sr, 0);
}

}  // namespace ruy