| /* |
| * Copyright 2019 The libgav1 Authors |
| * |
| * Licensed under the Apache License, Version 2.0 (the "License"); |
| * you may not use this file except in compliance with the License. |
| * You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| #ifndef LIBGAV1_SRC_UTILS_COMMON_H_ |
| #define LIBGAV1_SRC_UTILS_COMMON_H_ |
| |
| #if defined(_MSC_VER) |
| #include <intrin.h> |
| #pragma intrinsic(_BitScanForward) |
| #pragma intrinsic(_BitScanReverse) |
| #if defined(_M_X64) || defined(_M_ARM) || defined(_M_ARM64) |
| #pragma intrinsic(_BitScanReverse64) |
| #define HAVE_BITSCANREVERSE64 |
| #endif // defined(_M_X64) || defined(_M_ARM) || defined(_M_ARM64) |
| #endif // defined(_MSC_VER) |
| |
| #include <cassert> |
| #include <cstddef> |
| #include <cstdint> |
| #include <cstdlib> |
| #include <cstring> |
| #include <type_traits> |
| |
| #include "src/utils/bit_mask_set.h" |
| #include "src/utils/constants.h" |
| #include "src/utils/types.h" |
| |
| namespace libgav1 { |
| |
| // Aligns |value| to the desired |alignment|. |alignment| must be a power of 2. |
| template <typename T> |
| inline T Align(T value, T alignment) { |
| assert(alignment != 0); |
| const T alignment_mask = alignment - 1; |
| return (value + alignment_mask) & ~alignment_mask; |
| } |
| |
| // Aligns |addr| to the desired |alignment|. |alignment| must be a power of 2. |
| inline uint8_t* AlignAddr(uint8_t* const addr, const uintptr_t alignment) { |
| const auto value = reinterpret_cast<uintptr_t>(addr); |
| return reinterpret_cast<uint8_t*>(Align(value, alignment)); |
| } |
| |
| inline int32_t Clip3(int32_t value, int32_t low, int32_t high) { |
| return value < low ? low : (value > high ? high : value); |
| } |
| |
| // The following 2 templates set a block of data with uncontiguous memory to |
| // |value|. The compilers usually generate several branches to handle different |
| // cases of |columns| when inlining memset() and std::fill(), and these branches |
| // are unfortunately within the loop of |rows|. So calling these templates |
| // directly could be inefficient. It is recommended to specialize common cases |
| // of |columns|, such as 1, 2, 4, 8, 16 and 32, etc. in advance before |
| // processing the generic case of |columns|. The code size may be larger, but |
| // there would be big speed gains. |
| // Call template MemSetBlock<> when sizeof(|T|) is 1. |
| // Call template SetBlock<> when sizeof(|T|) is larger than 1. |
| template <typename T> |
| void MemSetBlock(int rows, int columns, T value, T* dst, ptrdiff_t stride) { |
| static_assert(sizeof(T) == 1, ""); |
| do { |
| memset(dst, value, columns); |
| dst += stride; |
| } while (--rows != 0); |
| } |
| |
| template <typename T> |
| void SetBlock(int rows, int columns, T value, T* dst, ptrdiff_t stride) { |
| do { |
| std::fill(dst, dst + columns, value); |
| dst += stride; |
| } while (--rows != 0); |
| } |
| |
| #if defined(__GNUC__) |
| |
| inline int CountLeadingZeros(uint32_t n) { |
| assert(n != 0); |
| return __builtin_clz(n); |
| } |
| |
| inline int CountLeadingZeros(uint64_t n) { |
| assert(n != 0); |
| return __builtin_clzll(n); |
| } |
| |
| inline int CountTrailingZeros(uint32_t n) { |
| assert(n != 0); |
| return __builtin_ctz(n); |
| } |
| |
| #elif defined(_MSC_VER) |
| |
| inline int CountLeadingZeros(uint32_t n) { |
| assert(n != 0); |
| unsigned long first_set_bit; // NOLINT(runtime/int) |
| const unsigned char bit_set = _BitScanReverse(&first_set_bit, n); |
| assert(bit_set != 0); |
| static_cast<void>(bit_set); |
| return 31 - static_cast<int>(first_set_bit); |
| } |
| |
| inline int CountLeadingZeros(uint64_t n) { |
| assert(n != 0); |
| unsigned long first_set_bit; // NOLINT(runtime/int) |
| #if defined(HAVE_BITSCANREVERSE64) |
| const unsigned char bit_set = |
| _BitScanReverse64(&first_set_bit, static_cast<unsigned __int64>(n)); |
| #else // !defined(HAVE_BITSCANREVERSE64) |
| const auto n_hi = static_cast<unsigned long>(n >> 32); // NOLINT(runtime/int) |
| if (n_hi != 0) { |
| const unsigned char bit_set = _BitScanReverse(&first_set_bit, n_hi); |
| assert(bit_set != 0); |
| static_cast<void>(bit_set); |
| return 31 - static_cast<int>(first_set_bit); |
| } |
| const unsigned char bit_set = _BitScanReverse( |
| &first_set_bit, static_cast<unsigned long>(n)); // NOLINT(runtime/int) |
| #endif // defined(HAVE_BITSCANREVERSE64) |
| assert(bit_set != 0); |
| static_cast<void>(bit_set); |
| return 63 - static_cast<int>(first_set_bit); |
| } |
| |
| #undef HAVE_BITSCANREVERSE64 |
| |
| inline int CountTrailingZeros(uint32_t n) { |
| assert(n != 0); |
| unsigned long first_set_bit; // NOLINT(runtime/int) |
| const unsigned char bit_set = _BitScanForward(&first_set_bit, n); |
| assert(bit_set != 0); |
| static_cast<void>(bit_set); |
| return static_cast<int>(first_set_bit); |
| } |
| |
| #else // !defined(__GNUC__) && !defined(_MSC_VER) |
| |
| template <const int kMSB, typename T> |
| inline int CountLeadingZeros(T n) { |
| assert(n != 0); |
| const T msb = T{1} << kMSB; |
| int count = 0; |
| while ((n & msb) == 0) { |
| ++count; |
| n <<= 1; |
| } |
| return count; |
| } |
| |
| inline int CountLeadingZeros(uint32_t n) { return CountLeadingZeros<31>(n); } |
| |
| inline int CountLeadingZeros(uint64_t n) { return CountLeadingZeros<63>(n); } |
| |
| // This is the algorithm on the left in Figure 5-23, Hacker's Delight, Second |
| // Edition, page 109. The book says: |
| // If the number of trailing 0's is expected to be small or large, then the |
| // simple loops shown in Figure 5-23 are quite fast. |
| inline int CountTrailingZeros(uint32_t n) { |
| assert(n != 0); |
| // Create a word with 1's at the positions of the trailing 0's in |n|, and |
| // 0's elsewhere (e.g., 01011000 => 00000111). |
| n = ~n & (n - 1); |
| int count = 0; |
| while (n != 0) { |
| ++count; |
| n >>= 1; |
| } |
| return count; |
| } |
| |
| #endif // defined(__GNUC__) |
| |
| inline int FloorLog2(int32_t n) { |
| assert(n > 0); |
| return 31 - CountLeadingZeros(static_cast<uint32_t>(n)); |
| } |
| |
| inline int FloorLog2(uint32_t n) { |
| assert(n > 0); |
| return 31 - CountLeadingZeros(n); |
| } |
| |
| inline int FloorLog2(int64_t n) { |
| assert(n > 0); |
| return 63 - CountLeadingZeros(static_cast<uint64_t>(n)); |
| } |
| |
| inline int FloorLog2(uint64_t n) { |
| assert(n > 0); |
| return 63 - CountLeadingZeros(n); |
| } |
| |
| inline int CeilLog2(unsigned int n) { |
| // The expression FloorLog2(n - 1) + 1 is undefined not only for n == 0 but |
| // also for n == 1, so this expression must be guarded by the n < 2 test. An |
| // alternative implementation is: |
| // return (n == 0) ? 0 : FloorLog2(n) + static_cast<int>((n & (n - 1)) != 0); |
| return (n < 2) ? 0 : FloorLog2(n - 1) + 1; |
| } |
| |
| constexpr int Ceil(int dividend, int divisor) { |
| return dividend / divisor + static_cast<int>(dividend % divisor != 0); |
| } |
| |
| inline int32_t RightShiftWithRounding(int32_t value, int bits) { |
| assert(bits >= 0); |
| return (value + ((1 << bits) >> 1)) >> bits; |
| } |
| |
| inline uint32_t RightShiftWithRounding(uint32_t value, int bits) { |
| assert(bits >= 0); |
| return (value + ((1 << bits) >> 1)) >> bits; |
| } |
| |
| // This variant is used when |value| can exceed 32 bits. Although the final |
| // result must always fit into int32_t. |
| inline int32_t RightShiftWithRounding(int64_t value, int bits) { |
| assert(bits >= 0); |
| return static_cast<int32_t>((value + ((int64_t{1} << bits) >> 1)) >> bits); |
| } |
| |
| inline int32_t RightShiftWithRoundingSigned(int32_t value, int bits) { |
| assert(bits > 0); |
| // The next line is equivalent to: |
| // return (value >= 0) ? RightShiftWithRounding(value, bits) |
| // : -RightShiftWithRounding(-value, bits); |
| return RightShiftWithRounding(value + (value >> 31), bits); |
| } |
| |
| // This variant is used when |value| can exceed 32 bits. Although the final |
| // result must always fit into int32_t. |
| inline int32_t RightShiftWithRoundingSigned(int64_t value, int bits) { |
| assert(bits > 0); |
| // The next line is equivalent to: |
| // return (value >= 0) ? RightShiftWithRounding(value, bits) |
| // : -RightShiftWithRounding(-value, bits); |
| return RightShiftWithRounding(value + (value >> 63), bits); |
| } |
| |
| constexpr int DivideBy2(int n) { return n >> 1; } |
| constexpr int DivideBy4(int n) { return n >> 2; } |
| constexpr int DivideBy8(int n) { return n >> 3; } |
| constexpr int DivideBy16(int n) { return n >> 4; } |
| constexpr int DivideBy32(int n) { return n >> 5; } |
| constexpr int DivideBy64(int n) { return n >> 6; } |
| constexpr int DivideBy128(int n) { return n >> 7; } |
| |
| // Convert |value| to unsigned before shifting to avoid undefined behavior with |
| // negative values. |
| inline int LeftShift(int value, int bits) { |
| assert(bits >= 0); |
| assert(value >= -(int64_t{1} << (31 - bits))); |
| assert(value <= (int64_t{1} << (31 - bits)) - ((bits == 0) ? 1 : 0)); |
| return static_cast<int>(static_cast<uint32_t>(value) << bits); |
| } |
| inline int MultiplyBy2(int n) { return LeftShift(n, 1); } |
| inline int MultiplyBy4(int n) { return LeftShift(n, 2); } |
| inline int MultiplyBy8(int n) { return LeftShift(n, 3); } |
| inline int MultiplyBy16(int n) { return LeftShift(n, 4); } |
| inline int MultiplyBy32(int n) { return LeftShift(n, 5); } |
| inline int MultiplyBy64(int n) { return LeftShift(n, 6); } |
| |
| constexpr int Mod32(int n) { return n & 0x1f; } |
| constexpr int Mod64(int n) { return n & 0x3f; } |
| |
| //------------------------------------------------------------------------------ |
| // Bitstream functions |
| |
| constexpr bool IsIntraFrame(FrameType type) { |
| return type == kFrameKey || type == kFrameIntraOnly; |
| } |
| |
| inline TransformClass GetTransformClass(TransformType tx_type) { |
| constexpr BitMaskSet kTransformClassVerticalMask( |
| kTransformTypeIdentityDct, kTransformTypeIdentityAdst, |
| kTransformTypeIdentityFlipadst); |
| if (kTransformClassVerticalMask.Contains(tx_type)) { |
| return kTransformClassVertical; |
| } |
| constexpr BitMaskSet kTransformClassHorizontalMask( |
| kTransformTypeDctIdentity, kTransformTypeAdstIdentity, |
| kTransformTypeFlipadstIdentity); |
| if (kTransformClassHorizontalMask.Contains(tx_type)) { |
| return kTransformClassHorizontal; |
| } |
| return kTransformClass2D; |
| } |
| |
| inline int RowOrColumn4x4ToPixel(int row_or_column4x4, Plane plane, |
| int8_t subsampling) { |
| return MultiplyBy4(row_or_column4x4) >> (plane == kPlaneY ? 0 : subsampling); |
| } |
| |
| constexpr PlaneType GetPlaneType(Plane plane) { |
| return static_cast<PlaneType>(plane != kPlaneY); |
| } |
| |
| // 5.11.44. |
| constexpr bool IsDirectionalMode(PredictionMode mode) { |
| return mode >= kPredictionModeVertical && mode <= kPredictionModeD67; |
| } |
| |
| // 5.9.3. |
| // |
| // |a| and |b| are order hints, treated as unsigned order_hint_bits-bit |
| // integers. |order_hint_shift_bits| equals (32 - order_hint_bits) % 32. |
| // order_hint_bits is at most 8, so |order_hint_shift_bits| is zero or a |
| // value between 24 and 31 (inclusive). |
| // |
| // If |order_hint_shift_bits| is zero, |a| and |b| are both zeros, and the |
| // result is zero. If |order_hint_shift_bits| is not zero, returns the |
| // signed difference |a| - |b| using "modular arithmetic". More precisely, the |
| // signed difference |a| - |b| is treated as a signed order_hint_bits-bit |
| // integer and cast to an int. The returned difference is between |
| // -(1 << (order_hint_bits - 1)) and (1 << (order_hint_bits - 1)) - 1 |
| // (inclusive). |
| // |
| // NOTE: |a| and |b| are the order_hint_bits least significant bits of the |
| // actual values. This function returns the signed difference between the |
| // actual values. The returned difference is correct as long as the actual |
| // values are not more than 1 << (order_hint_bits - 1) - 1 apart. |
| // |
| // Example: Suppose order_hint_bits is 4 and |order_hint_shift_bits| |
| // is 28. Then |a| and |b| are in the range [0, 15], and the actual values for |
| // |a| and |b| must not be more than 7 apart. (If the actual values for |a| and |
| // |b| are exactly 8 apart, this function cannot tell whether the actual value |
| // for |a| is before or after the actual value for |b|.) |
| // |
| // First, consider the order hints 2 and 6. For this simple case, we have |
| // GetRelativeDistance(2, 6, 28) = 2 - 6 = -4, and |
| // GetRelativeDistance(6, 2, 28) = 6 - 2 = 4. |
| // |
| // On the other hand, consider the order hints 2 and 14. The order hints are |
| // 12 (> 7) apart, so we need to use the actual values instead. The actual |
| // values may be 34 (= 2 mod 16) and 30 (= 14 mod 16), respectively. Therefore |
| // we have |
| // GetRelativeDistance(2, 14, 28) = 34 - 30 = 4, and |
| // GetRelativeDistance(14, 2, 28) = 30 - 34 = -4. |
| // |
| // The following comments apply only to specific CPUs' SIMD implementations, |
| // such as intrinsics code. |
| // For the 2 shift operations in this function, if the SIMD packed data is |
| // 16-bit wide, try to use |order_hint_shift_bits| - 16 as the number of bits to |
| // shift; If the SIMD packed data is 8-bit wide, try to use |
| // |order_hint_shift_bits| - 24 as as the number of bits to shift. |
| // |order_hint_shift_bits| - 16 and |order_hint_shift_bits| - 24 could be -16 or |
| // -24. In these cases diff is 0, and the behavior of left or right shifting -16 |
| // or -24 bits is defined for x86 SIMD instructions and ARM NEON instructions, |
| // and the result of shifting 0 is still 0. There is no guarantee that this |
| // behavior and result apply to other CPUs' SIMD instructions. |
| inline int GetRelativeDistance(const unsigned int a, const unsigned int b, |
| const unsigned int order_hint_shift_bits) { |
| const int diff = a - b; |
| assert(order_hint_shift_bits <= 31); |
| if (order_hint_shift_bits == 0) { |
| assert(a == 0); |
| assert(b == 0); |
| } else { |
| assert(order_hint_shift_bits >= 24); // i.e., order_hint_bits <= 8 |
| assert(a < (1u << (32 - order_hint_shift_bits))); |
| assert(b < (1u << (32 - order_hint_shift_bits))); |
| assert(diff < (1 << (32 - order_hint_shift_bits))); |
| assert(diff >= -(1 << (32 - order_hint_shift_bits))); |
| } |
| // Sign extend the result of subtracting the values. |
| // Cast to unsigned int and then left shift to avoid undefined behavior with |
| // negative values. Cast to int to do the sign extension through right shift. |
| // This requires the right shift of a signed integer be an arithmetic shift, |
| // which is true for clang, gcc, and Visual C++. |
| // These two casts do not generate extra instructions. |
| // Don't use LeftShift(diff) since a valid diff may fail its assertions. |
| // For example, GetRelativeDistance(2, 14, 28), diff equals -12 and is less |
| // than the minimum allowed value of LeftShift() which is -8. |
| // The next 3 lines are equivalent to: |
| // const int order_hint_bits = Mod32(32 - order_hint_shift_bits); |
| // const int m = (1 << order_hint_bits) >> 1; |
| // return (diff & (m - 1)) - (diff & m); |
| return static_cast<int>(static_cast<unsigned int>(diff) |
| << order_hint_shift_bits) >> |
| order_hint_shift_bits; |
| } |
| |
| // Applies |sign| (must be 0 or -1) to |value|, i.e., |
| // return (sign == 0) ? value : -value; |
| // and does so without a branch. |
| constexpr int ApplySign(int value, int sign) { return (value ^ sign) - sign; } |
| |
| // 7.9.3. (without the clamp for numerator and denominator). |
| inline void GetMvProjection(const MotionVector& mv, int numerator, |
| int division_multiplier, |
| MotionVector* projection_mv) { |
| // Allow numerator and to be 0 so that this function can be called |
| // unconditionally. When numerator is 0, |projection_mv| will be 0, and this |
| // is what we want. |
| assert(std::abs(numerator) <= kMaxFrameDistance); |
| for (int i = 0; i < 2; ++i) { |
| projection_mv->mv[i] = |
| Clip3(RightShiftWithRoundingSigned( |
| mv.mv[i] * numerator * division_multiplier, 14), |
| -kProjectionMvClamp, kProjectionMvClamp); |
| } |
| } |
| |
| // 7.9.4. |
| constexpr int Project(int value, int delta, int dst_sign) { |
| return value + ApplySign(delta / 64, dst_sign); |
| } |
| |
| inline bool IsBlockSmallerThan8x8(BlockSize size) { |
| return size < kBlock8x8 && size != kBlock4x16; |
| } |
| |
| // Returns true if the either the width or the height of the block is equal to |
| // four. |
| inline bool IsBlockDimension4(BlockSize size) { |
| return size < kBlock8x8 || size == kBlock16x4; |
| } |
| |
| // Converts bitdepth 8, 10, and 12 to array index 0, 1, and 2, respectively. |
| constexpr int BitdepthToArrayIndex(int bitdepth) { return (bitdepth - 8) >> 1; } |
| |
| // Maps a square transform to an index between [0, 4]. kTransformSize4x4 maps |
| // to 0, kTransformSize8x8 maps to 1 and so on. |
| inline int TransformSizeToSquareTransformIndex(TransformSize tx_size) { |
| assert(kTransformWidth[tx_size] == kTransformHeight[tx_size]); |
| |
| // The values of the square transform sizes happen to be in the right |
| // ranges, so we can just divide them by 4 to get the indexes. |
| static_assert( |
| std::is_unsigned<std::underlying_type<TransformSize>::type>::value, ""); |
| static_assert(kTransformSize4x4 < 4, ""); |
| static_assert(4 <= kTransformSize8x8 && kTransformSize8x8 < 8, ""); |
| static_assert(8 <= kTransformSize16x16 && kTransformSize16x16 < 12, ""); |
| static_assert(12 <= kTransformSize32x32 && kTransformSize32x32 < 16, ""); |
| static_assert(16 <= kTransformSize64x64 && kTransformSize64x64 < 20, ""); |
| return DivideBy4(tx_size); |
| } |
| |
| // Gets the corresponding Y/U/V position, to set and get filter masks |
| // in deblock filtering. |
| // Returns luma_position if it's Y plane, whose subsampling must be 0. |
| // Returns the odd position for U/V plane, if there is subsampling. |
| constexpr int GetDeblockPosition(const int luma_position, |
| const int subsampling) { |
| return luma_position | subsampling; |
| } |
| |
| // Returns the size of the residual buffer required to hold the residual values |
| // for a block or frame of size |rows| by |columns| (taking into account |
| // |subsampling_x|, |subsampling_y| and |residual_size|). |residual_size| is the |
| // number of bytes required to represent one residual value. |
| inline size_t GetResidualBufferSize(const int rows, const int columns, |
| const int subsampling_x, |
| const int subsampling_y, |
| const size_t residual_size) { |
| // The subsampling multipliers are: |
| // Both x and y are subsampled: 3 / 2. |
| // Only x or y is subsampled: 2 / 1 (which is equivalent to 4 / 2). |
| // Both x and y are not subsampled: 3 / 1 (which is equivalent to 6 / 2). |
| // So we compute the final subsampling multiplier as follows: |
| // multiplier = (2 + (4 >> subsampling_x >> subsampling_y)) / 2. |
| // Add 32 * |kResidualPaddingVertical| padding to avoid bottom boundary checks |
| // when parsing quantized coefficients. |
| const int subsampling_multiplier_num = |
| 2 + (4 >> subsampling_x >> subsampling_y); |
| const int number_elements = |
| (rows * columns * subsampling_multiplier_num) >> 1; |
| const int tx_padding = 32 * kResidualPaddingVertical; |
| return residual_size * (number_elements + tx_padding); |
| } |
| |
| // This function is equivalent to: |
| // std::min({kTransformWidthLog2[tx_size] - 2, |
| // kTransformWidthLog2[left_tx_size] - 2, |
| // 2}); |
| constexpr LoopFilterTransformSizeId GetTransformSizeIdWidth( |
| TransformSize tx_size, TransformSize left_tx_size) { |
| return static_cast<LoopFilterTransformSizeId>( |
| static_cast<int>(tx_size > kTransformSize4x16 && |
| left_tx_size > kTransformSize4x16) + |
| static_cast<int>(tx_size > kTransformSize8x32 && |
| left_tx_size > kTransformSize8x32)); |
| } |
| |
| // This is used for 7.11.3.4 Block Inter Prediction Process, to select convolve |
| // filters. |
| inline int GetFilterIndex(const int filter_index, const int length) { |
| if (length <= 4) { |
| if (filter_index == kInterpolationFilterEightTap || |
| filter_index == kInterpolationFilterEightTapSharp) { |
| return 4; |
| } |
| if (filter_index == kInterpolationFilterEightTapSmooth) { |
| return 5; |
| } |
| } |
| return filter_index; |
| } |
| |
| constexpr int SubsampledValue(int value, int subsampling) { |
| return (value + subsampling) >> subsampling; |
| } |
| |
| } // namespace libgav1 |
| |
| #endif // LIBGAV1_SRC_UTILS_COMMON_H_ |