| // Copyright 2015 Google Inc. All Rights Reserved. |
| // |
| // Licensed under the Apache License, Version 2.0 (the "License"); |
| // you may not use this file except in compliance with the License. |
| // You may obtain a copy of the License at |
| // |
| // http://www.apache.org/licenses/LICENSE-2.0 |
| // |
| // Unless required by applicable law or agreed to in writing, software |
| // distributed under the License is distributed on an "AS IS" BASIS, |
| // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| // See the License for the specific language governing permissions and |
| // limitations under the License. |
| |
| // unpack.h: unpacking the result blocks computed by compute.h, |
| // storing them into the destination matrix. |
| |
| #ifndef GEMMLOWP_INTERNAL_UNPACK_H_ |
| #define GEMMLOWP_INTERNAL_UNPACK_H_ |
| |
| #include "allocator.h" |
| #include "block_params.h" |
| #include "pack.h" |
| |
| #include <cmath> |
| |
| namespace gemmlowp { |
| |
| class PackedResult { |
| public: |
| PackedResult(Allocator* _allocator, const BlockParams& _block_params) |
| : allocator_(_allocator), block_params_(_block_params) { |
| matrix_handle_ = allocator_->Reserve<std::int32_t>(block_params_.l2_rows * |
| block_params_.l2_cols); |
| } |
| |
| ~PackedResult() {} |
| |
| MatrixMap<std::int32_t, MapOrder::ColMajor> Map() { |
| return MatrixMap<std::int32_t, MapOrder::ColMajor>( |
| allocator_->GetPointer<std::int32_t>(matrix_handle_), |
| block_params_.l2_rows, block_params_.l2_cols, block_params_.l2_rows); |
| } |
| |
| MatrixMap<const std::int32_t, MapOrder::ColMajor> Map() const { |
| return MatrixMap<const std::int32_t, MapOrder::ColMajor>( |
| allocator_->GetPointer<const std::int32_t>(matrix_handle_), |
| block_params_.l2_rows, block_params_.l2_cols, block_params_.l2_rows); |
| } |
| |
| private: |
| Allocator* allocator_; |
| Allocator::Handle matrix_handle_; |
| const BlockParams& block_params_; |
| }; |
| |
| template <std::uint32_t numerator, std::uint32_t denominator> |
| std::int32_t MultiplyByConstantFraction(std::int32_t x) { |
| if (numerator == denominator) { |
| return x; |
| } |
| |
| // We'll use only signed arithmetic here. This is |
| // simpler (since this function operates on signed int32's) and |
| // more friendly to ARM NEON, where this allows us to use the |
| // VQRDMULH instruction. |
| static const std::int32_t int_quotient = |
| (numerator + denominator / 2) / denominator; |
| static const std::int32_t remaining_numerator = |
| numerator - int_quotient * denominator; |
| static const std::int32_t scaled_remaining_numerator = |
| static_cast<std::int32_t>( |
| (static_cast<std::int64_t>(remaining_numerator) << 31) / denominator); |
| |
| const std::int64_t scaled_remaining_product = |
| static_cast<std::int64_t>(x) * |
| static_cast<std::int64_t>(scaled_remaining_numerator); |
| |
| const std::int32_t scaled_remaining_product_nudge = |
| (scaled_remaining_product > 0 ? 1 : -1) * (1 << 30); |
| |
| const std::int32_t remaining_product = |
| (scaled_remaining_product + scaled_remaining_product_nudge) / (1u << 31); |
| |
| return x * int_quotient + remaining_product; |
| } |
| |
| template <BitDepthSetting BitDepth, |
| typename ResultBlockType, typename PackedResultType> |
| struct UnpackResultImplGeneric { |
| static void Unpack(ResultBlockType* dst, const PackedResultType& src, |
| int depth, const std::int32_t* lhs_rank_one_update, |
| const std::int32_t* rhs_rank_one_update, |
| std::int32_t lhs_offset, std::int32_t rhs_offset, |
| std::int32_t result_offset, std::int32_t result_mult_int, |
| std::int32_t result_shift) { |
| std::int32_t term_11 = lhs_offset * rhs_offset * depth + result_offset; |
| auto src_map = src.Map(); |
| // No top-level blocking in the depth dimension at the moment. |
| // Too much loss of precision. |
| const int kLhsBits = LhsBitDepth<BitDepth>::kBits; |
| const int kRhsBits = RhsBitDepth<BitDepth>::kBits; |
| const std::int32_t kLhsMax = (1 << kLhsBits) - 1; |
| const std::int32_t kRhsMax = (1 << kRhsBits) - 1; |
| for (int c = 0; c < dst->cols(); c++) { |
| for (int r = 0; r < dst->rows(); r++) { |
| std::int32_t raw_xx = src_map(r, c); |
| std::int32_t raw_x1 = lhs_rank_one_update[r]; |
| std::int32_t raw_1x = rhs_rank_one_update[c]; |
| std::int32_t term_xx = |
| MultiplyByConstantFraction<255 * 255, kLhsMax * kRhsMax>(raw_xx); |
| std::int32_t term_x1 = |
| MultiplyByConstantFraction<255, kLhsMax>(raw_x1); |
| std::int32_t term_1x = |
| MultiplyByConstantFraction<255, kRhsMax>(raw_1x); |
| std::int32_t sum = term_xx + term_x1 + term_1x + term_11; |
| std::int32_t result = |
| (sum * result_mult_int + (1 << (result_shift - 1))) >> result_shift; |
| (*dst)(r, c) = result > 255 ? 255 : result < 0 ? 0 : result; |
| } |
| } |
| } |
| }; |
| |
| template <BitDepthSetting BitDepth, |
| typename ResultBlockType, typename PackedResultType> |
| struct UnpackResultImpl |
| : UnpackResultImplGeneric<BitDepth, ResultBlockType, PackedResultType> {}; |
| |
| template <BitDepthSetting BitDepth, |
| typename ResultBlockType, typename PackedResultType> |
| void UnpackResult(ResultBlockType* dst, const PackedResultType& src, int depth, |
| const std::int32_t* lhs_rank_one_update, |
| const std::int32_t* rhs_rank_one_update, |
| std::int32_t lhs_offset, std::int32_t rhs_offset, |
| std::int32_t result_offset, std::int32_t result_mult_int, |
| std::int32_t result_shift) { |
| ScopedProfilingLabel label("unpack"); |
| UnpackResultImpl<BitDepth, ResultBlockType, PackedResultType>::Unpack( |
| dst, src, depth, lhs_rank_one_update, rhs_rank_one_update, lhs_offset, |
| rhs_offset, result_offset, result_mult_int, result_shift); |
| } |
| |
| } // namespace gemmlowp |
| |
| #ifdef GEMMLOWP_NEON |
| #include "unpack_neon.h" |
| #endif |
| |
| #endif // GEMMLOWP_INTERNAL_UNPACK_H_ |