| // Copyright 2019 The libgav1 Authors |
| // |
| // Licensed under the Apache License, Version 2.0 (the "License"); |
| // you may not use this file except in compliance with the License. |
| // You may obtain a copy of the License at |
| // |
| // http://www.apache.org/licenses/LICENSE-2.0 |
| // |
| // Unless required by applicable law or agreed to in writing, software |
| // distributed under the License is distributed on an "AS IS" BASIS, |
| // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| // See the License for the specific language governing permissions and |
| // limitations under the License. |
| |
| #include "src/utils/entropy_decoder.h" |
| |
| #include <cassert> |
| #include <cstring> |
| |
| #include "src/utils/common.h" |
| #include "src/utils/compiler_attributes.h" |
| #include "src/utils/constants.h" |
| |
| #if defined(__ARM_NEON__) || defined(__aarch64__) || \ |
| (defined(_MSC_VER) && defined(_M_ARM)) |
| #define LIBGAV1_ENTROPY_DECODER_ENABLE_NEON 1 |
| #else |
| #define LIBGAV1_ENTROPY_DECODER_ENABLE_NEON 0 |
| #endif |
| |
| #if LIBGAV1_ENTROPY_DECODER_ENABLE_NEON |
| #include <arm_neon.h> |
| #endif |
| |
| #if defined(__SSE4_1__) || defined(LIBGAV1_X86_MSVC) |
| #define LIBGAV1_ENTROPY_DECODER_ENABLE_SSE4 1 |
| #else |
| #define LIBGAV1_ENTROPY_DECODER_ENABLE_SSE4 0 |
| #endif |
| |
| #if LIBGAV1_ENTROPY_DECODER_ENABLE_SSE4 |
| #include <smmintrin.h> |
| #endif |
| |
| namespace libgav1 { |
| namespace { |
| |
// Mask with the low 8 bits cleared (0xFFFFFF00). ReadBit() applies it to
// |values_in_range_| before halving it.
constexpr uint32_t kReadBitMask = 0xFFFFFF00u;
// Value stored in |bits_| so that bits can keep being read after the end of
// the stream for a reasonably long time without attempting to refill the
// buffer.
constexpr int kLargeBitCount = 1 << 14;
// Number of low bits dropped from each CDF entry before it is scaled by the
// current range.
constexpr int kCdfPrecision = 6;
// Per-symbol term added to every scaled CDF value.
constexpr int kMinimumProbabilityPerSymbol = 4;
| |
| // This function computes the "cur" variable as specified inside the do-while |
| // loop in Section 8.2.6 of the spec. This function is monotonically |
| // decreasing as the values of index increases (note that the |cdf| array is |
| // sorted in decreasing order). |
| uint32_t ScaleCdf(uint32_t values_in_range_shifted, const uint16_t* const cdf, |
| int index, int symbol_count) { |
| return ((values_in_range_shifted * (cdf[index] >> kCdfPrecision)) >> 1) + |
| (kMinimumProbabilityPerSymbol * (symbol_count - index)); |
| } |
| |
| void UpdateCdf(uint16_t* const cdf, const int symbol_count, const int symbol) { |
| const uint16_t count = cdf[symbol_count]; |
| // rate is computed in the spec as: |
| // 3 + ( cdf[N] > 15 ) + ( cdf[N] > 31 ) + Min(FloorLog2(N), 2) |
| // In this case cdf[N] is |count|. |
| // Min(FloorLog2(N), 2) is 1 for symbol_count == {2, 3} and 2 for all |
| // symbol_count > 3. So the equation becomes: |
| // 4 + (count > 15) + (count > 31) + (symbol_count > 3). |
| // Note that the largest value for count is 32 (it is not incremented beyond |
| // 32). So using that information: |
| // count >> 4 is 0 for count from 0 to 15. |
| // count >> 4 is 1 for count from 16 to 31. |
| // count >> 4 is 2 for count == 31. |
| // Now, the equation becomes: |
| // 4 + (count >> 4) + (symbol_count > 3). |
| // Since (count >> 4) can only be 0 or 1 or 2, the addition can be replaced |
| // with bitwise or. So the final equation is: |
| // (4 | (count >> 4)) + (symbol_count > 3). |
| const int rate = (4 | (count >> 4)) + static_cast<int>(symbol_count > 3); |
| // Hints for further optimizations: |
| // |
| // 1. clang can vectorize this for loop with width 4, even though the loop |
| // contains an if-else statement. Therefore, it may be advantageous to use |
| // "i < symbol_count" as the loop condition when symbol_count is 8, 12, or 16 |
| // (a multiple of 4 that's not too small). |
| // |
| // 2. The for loop can be rewritten in the following form, which would enable |
| // clang to vectorize the loop with width 8: |
| // |
| // const int rounding = (1 << rate) - 1; |
| // for (int i = 0; i < symbol_count - 1; ++i) { |
| // const uint16_t a = (i < symbol) ? kCdfMaxProbability : rounding; |
| // cdf[i] += static_cast<int16_t>(a - cdf[i]) >> rate; |
| // } |
| // |
| // The subtraction (a - cdf[i]) relies on the overflow semantics of unsigned |
| // integer arithmetic. The result of the unsigned subtraction is cast to a |
| // signed integer and right-shifted. This requires the right shift of a |
| // signed integer be an arithmetic shift, which is true for clang, gcc, and |
| // Visual C++. |
| for (int i = 0; i < symbol_count - 1; ++i) { |
| if (i < symbol) { |
| cdf[i] += (kCdfMaxProbability - cdf[i]) >> rate; |
| } else { |
| cdf[i] -= cdf[i] >> rate; |
| } |
| } |
| cdf[symbol_count] += static_cast<uint16_t>(count < 32); |
| } |
| |
| // Define the UpdateCdfN functions. UpdateCdfN is a specialized implementation |
| // of UpdateCdf based on the fact that symbol_count == N. UpdateCdfN uses the |
| // SIMD instruction sets if available. |
| |
| #if LIBGAV1_ENTROPY_DECODER_ENABLE_NEON |
| |
| // The UpdateCdf() method contains the following for loop: |
| // |
| // for (int i = 0; i < symbol_count - 1; ++i) { |
| // if (i < symbol) { |
| // cdf[i] += (kCdfMaxProbability - cdf[i]) >> rate; |
| // } else { |
| // cdf[i] -= cdf[i] >> rate; |
| // } |
| // } |
| // |
| // It can be rewritten in the following two forms, which are amenable to SIMD |
| // implementations: |
| // |
| // const int rounding = (1 << rate) - 1; |
| // for (int i = 0; i < symbol_count - 1; ++i) { |
| // const uint16_t a = (i < symbol) ? kCdfMaxProbability : rounding; |
| // cdf[i] += static_cast<int16_t>(a - cdf[i]) >> rate; |
| // } |
| // |
| // or: |
| // |
| // const int rounding = (1 << rate) - 1; |
| // for (int i = 0; i < symbol_count - 1; ++i) { |
| // const uint16_t a = (i < symbol) ? (kCdfMaxProbability - rounding) : 0; |
| // cdf[i] -= static_cast<int16_t>(cdf[i] - a) >> rate; |
| // } |
| // |
| // The following ARM NEON implementations use the second form, which seems |
| // slightly faster. |
| // |
| // The cdf array has symbol_count + 1 elements. The first symbol_count elements |
| // are the CDF. The last element is a count that is initialized to 0 and may |
| // grow up to 32. The for loop in UpdateCdf updates the CDF in the array. Since |
| // cdf[symbol_count - 1] is always 0, the for loop does not update |
| // cdf[symbol_count - 1]. However, it would be correct to have the for loop |
| // update cdf[symbol_count - 1] anyway: since symbol_count - 1 >= symbol, the |
| // for loop would take the else branch when i is symbol_count - 1: |
| // cdf[i] -= cdf[i] >> rate; |
| // Since cdf[symbol_count - 1] is 0, cdf[symbol_count - 1] would still be 0 |
| // after the update. The ARM NEON implementations take advantage of this in the |
| // following two cases: |
| // 1. When symbol_count is 8 or 16, the vectorized code updates the first |
| // symbol_count elements in the array. |
| // 2. When symbol_count is 7, the vectorized code updates all the 8 elements in |
| // the cdf array. Since an invalid CDF value is written into cdf[7], the |
| // count in cdf[7] needs to be fixed up after the vectorized code. |
| |
| void UpdateCdf5(uint16_t* const cdf, const int symbol) { |
| uint16x4_t cdf_vec = vld1_u16(cdf); |
| const uint16_t count = cdf[5]; |
| const int rate = (4 | (count >> 4)) + 1; |
| const uint16x4_t zero = vdup_n_u16(0); |
| const uint16x4_t cdf_max_probability = |
| vdup_n_u16(kCdfMaxProbability + 1 - (1 << rate)); |
| const uint16x4_t index = vcreate_u16(0x0003000200010000); |
| const uint16x4_t symbol_vec = vdup_n_u16(symbol); |
| const uint16x4_t mask = vclt_u16(index, symbol_vec); |
| const uint16x4_t a = vbsl_u16(mask, cdf_max_probability, zero); |
| const int16x4_t diff = vreinterpret_s16_u16(vsub_u16(cdf_vec, a)); |
| const int16x4_t negative_rate = vdup_n_s16(-rate); |
| const uint16x4_t delta = vreinterpret_u16_s16(vshl_s16(diff, negative_rate)); |
| cdf_vec = vsub_u16(cdf_vec, delta); |
| vst1_u16(cdf, cdf_vec); |
| cdf[5] = count + static_cast<uint16_t>(count < 32); |
| } |
| |
| // This version works for |symbol_count| = 7, 8, or 9. |
| template <int symbol_count> |
| void UpdateCdf7To9(uint16_t* const cdf, const int symbol) { |
| static_assert(symbol_count >= 7 && symbol_count <= 9, ""); |
| uint16x8_t cdf_vec = vld1q_u16(cdf); |
| const uint16_t count = cdf[symbol_count]; |
| const int rate = (4 | (count >> 4)) + 1; |
| const uint16x8_t zero = vdupq_n_u16(0); |
| const uint16x8_t cdf_max_probability = |
| vdupq_n_u16(kCdfMaxProbability + 1 - (1 << rate)); |
| const uint16x8_t index = vcombine_u16(vcreate_u16(0x0003000200010000), |
| vcreate_u16(0x0007000600050004)); |
| const uint16x8_t symbol_vec = vdupq_n_u16(symbol); |
| const uint16x8_t mask = vcltq_u16(index, symbol_vec); |
| const uint16x8_t a = vbslq_u16(mask, cdf_max_probability, zero); |
| const int16x8_t diff = vreinterpretq_s16_u16(vsubq_u16(cdf_vec, a)); |
| const int16x8_t negative_rate = vdupq_n_s16(-rate); |
| const uint16x8_t delta = |
| vreinterpretq_u16_s16(vshlq_s16(diff, negative_rate)); |
| cdf_vec = vsubq_u16(cdf_vec, delta); |
| vst1q_u16(cdf, cdf_vec); |
| cdf[symbol_count] = count + static_cast<uint16_t>(count < 32); |
| } |
| |
| void UpdateCdf7(uint16_t* const cdf, const int symbol) { |
| UpdateCdf7To9<7>(cdf, symbol); |
| } |
| |
| void UpdateCdf8(uint16_t* const cdf, const int symbol) { |
| UpdateCdf7To9<8>(cdf, symbol); |
| } |
| |
| void UpdateCdf11(uint16_t* const cdf, const int symbol) { |
| uint16x8_t cdf_vec = vld1q_u16(cdf + 2); |
| const uint16_t count = cdf[11]; |
| cdf[11] = count + static_cast<uint16_t>(count < 32); |
| const int rate = (4 | (count >> 4)) + 1; |
| if (symbol > 1) { |
| cdf[0] += (kCdfMaxProbability - cdf[0]) >> rate; |
| cdf[1] += (kCdfMaxProbability - cdf[1]) >> rate; |
| const uint16x8_t zero = vdupq_n_u16(0); |
| const uint16x8_t cdf_max_probability = |
| vdupq_n_u16(kCdfMaxProbability + 1 - (1 << rate)); |
| const uint16x8_t symbol_vec = vdupq_n_u16(symbol); |
| const int16x8_t negative_rate = vdupq_n_s16(-rate); |
| const uint16x8_t index = vcombine_u16(vcreate_u16(0x0005000400030002), |
| vcreate_u16(0x0009000800070006)); |
| const uint16x8_t mask = vcltq_u16(index, symbol_vec); |
| const uint16x8_t a = vbslq_u16(mask, cdf_max_probability, zero); |
| const int16x8_t diff = vreinterpretq_s16_u16(vsubq_u16(cdf_vec, a)); |
| const uint16x8_t delta = |
| vreinterpretq_u16_s16(vshlq_s16(diff, negative_rate)); |
| cdf_vec = vsubq_u16(cdf_vec, delta); |
| vst1q_u16(cdf + 2, cdf_vec); |
| } else { |
| if (symbol != 0) { |
| cdf[0] += (kCdfMaxProbability - cdf[0]) >> rate; |
| cdf[1] -= cdf[1] >> rate; |
| } else { |
| cdf[0] -= cdf[0] >> rate; |
| cdf[1] -= cdf[1] >> rate; |
| } |
| const int16x8_t negative_rate = vdupq_n_s16(-rate); |
| const uint16x8_t delta = vshlq_u16(cdf_vec, negative_rate); |
| cdf_vec = vsubq_u16(cdf_vec, delta); |
| vst1q_u16(cdf + 2, cdf_vec); |
| } |
| } |
| |
| void UpdateCdf13(uint16_t* const cdf, const int symbol) { |
| uint16x8_t cdf_vec0 = vld1q_u16(cdf); |
| uint16x8_t cdf_vec1 = vld1q_u16(cdf + 4); |
| const uint16_t count = cdf[13]; |
| const int rate = (4 | (count >> 4)) + 1; |
| const uint16x8_t zero = vdupq_n_u16(0); |
| const uint16x8_t cdf_max_probability = |
| vdupq_n_u16(kCdfMaxProbability + 1 - (1 << rate)); |
| const uint16x8_t symbol_vec = vdupq_n_u16(symbol); |
| const int16x8_t negative_rate = vdupq_n_s16(-rate); |
| |
| uint16x8_t index = vcombine_u16(vcreate_u16(0x0003000200010000), |
| vcreate_u16(0x0007000600050004)); |
| uint16x8_t mask = vcltq_u16(index, symbol_vec); |
| uint16x8_t a = vbslq_u16(mask, cdf_max_probability, zero); |
| int16x8_t diff = vreinterpretq_s16_u16(vsubq_u16(cdf_vec0, a)); |
| uint16x8_t delta = vreinterpretq_u16_s16(vshlq_s16(diff, negative_rate)); |
| cdf_vec0 = vsubq_u16(cdf_vec0, delta); |
| vst1q_u16(cdf, cdf_vec0); |
| |
| index = vcombine_u16(vcreate_u16(0x0007000600050004), |
| vcreate_u16(0x000b000a00090008)); |
| mask = vcltq_u16(index, symbol_vec); |
| a = vbslq_u16(mask, cdf_max_probability, zero); |
| diff = vreinterpretq_s16_u16(vsubq_u16(cdf_vec1, a)); |
| delta = vreinterpretq_u16_s16(vshlq_s16(diff, negative_rate)); |
| cdf_vec1 = vsubq_u16(cdf_vec1, delta); |
| vst1q_u16(cdf + 4, cdf_vec1); |
| |
| cdf[13] = count + static_cast<uint16_t>(count < 32); |
| } |
| |
| void UpdateCdf16(uint16_t* const cdf, const int symbol) { |
| uint16x8_t cdf_vec = vld1q_u16(cdf); |
| const uint16_t count = cdf[16]; |
| const int rate = (4 | (count >> 4)) + 1; |
| const uint16x8_t zero = vdupq_n_u16(0); |
| const uint16x8_t cdf_max_probability = |
| vdupq_n_u16(kCdfMaxProbability + 1 - (1 << rate)); |
| const uint16x8_t symbol_vec = vdupq_n_u16(symbol); |
| const int16x8_t negative_rate = vdupq_n_s16(-rate); |
| |
| uint16x8_t index = vcombine_u16(vcreate_u16(0x0003000200010000), |
| vcreate_u16(0x0007000600050004)); |
| uint16x8_t mask = vcltq_u16(index, symbol_vec); |
| uint16x8_t a = vbslq_u16(mask, cdf_max_probability, zero); |
| int16x8_t diff = vreinterpretq_s16_u16(vsubq_u16(cdf_vec, a)); |
| uint16x8_t delta = vreinterpretq_u16_s16(vshlq_s16(diff, negative_rate)); |
| cdf_vec = vsubq_u16(cdf_vec, delta); |
| vst1q_u16(cdf, cdf_vec); |
| |
| cdf_vec = vld1q_u16(cdf + 8); |
| index = vcombine_u16(vcreate_u16(0x000b000a00090008), |
| vcreate_u16(0x000f000e000d000c)); |
| mask = vcltq_u16(index, symbol_vec); |
| a = vbslq_u16(mask, cdf_max_probability, zero); |
| diff = vreinterpretq_s16_u16(vsubq_u16(cdf_vec, a)); |
| delta = vreinterpretq_u16_s16(vshlq_s16(diff, negative_rate)); |
| cdf_vec = vsubq_u16(cdf_vec, delta); |
| vst1q_u16(cdf + 8, cdf_vec); |
| |
| cdf[16] = count + static_cast<uint16_t>(count < 32); |
| } |
| |
| #else // !LIBGAV1_ENTROPY_DECODER_ENABLE_NEON |
| |
| #if LIBGAV1_ENTROPY_DECODER_ENABLE_SSE4 |
| |
| inline __m128i LoadLo8(const void* a) { |
| return _mm_loadl_epi64(static_cast<const __m128i*>(a)); |
| } |
| |
| inline __m128i LoadUnaligned16(const void* a) { |
| return _mm_loadu_si128(static_cast<const __m128i*>(a)); |
| } |
| |
| inline void StoreLo8(void* a, const __m128i v) { |
| _mm_storel_epi64(static_cast<__m128i*>(a), v); |
| } |
| |
| inline void StoreUnaligned16(void* a, const __m128i v) { |
| _mm_storeu_si128(static_cast<__m128i*>(a), v); |
| } |
| |
| void UpdateCdf5(uint16_t* const cdf, const int symbol) { |
| __m128i cdf_vec = LoadLo8(cdf); |
| const uint16_t count = cdf[5]; |
| const int rate = (4 | (count >> 4)) + 1; |
| const __m128i zero = _mm_setzero_si128(); |
| const __m128i cdf_max_probability = _mm_shufflelo_epi16( |
| _mm_cvtsi32_si128(kCdfMaxProbability + 1 - (1 << rate)), 0); |
| const __m128i index = _mm_set_epi32(0x0, 0x0, 0x00030002, 0x00010000); |
| const __m128i symbol_vec = _mm_shufflelo_epi16(_mm_cvtsi32_si128(symbol), 0); |
| const __m128i mask = _mm_cmplt_epi16(index, symbol_vec); |
| const __m128i a = _mm_blendv_epi8(zero, cdf_max_probability, mask); |
| const __m128i diff = _mm_sub_epi16(cdf_vec, a); |
| const __m128i delta = _mm_sra_epi16(diff, _mm_cvtsi32_si128(rate)); |
| cdf_vec = _mm_sub_epi16(cdf_vec, delta); |
| StoreLo8(cdf, cdf_vec); |
| cdf[5] = count + static_cast<uint16_t>(count < 32); |
| } |
| |
| // This version works for |symbol_count| = 7, 8, or 9. |
| template <int symbol_count> |
| void UpdateCdf7To9(uint16_t* const cdf, const int symbol) { |
| static_assert(symbol_count >= 7 && symbol_count <= 9, ""); |
| __m128i cdf_vec = LoadUnaligned16(cdf); |
| const uint16_t count = cdf[symbol_count]; |
| const int rate = (4 | (count >> 4)) + 1; |
| const __m128i zero = _mm_setzero_si128(); |
| const __m128i cdf_max_probability = |
| _mm_set1_epi16(kCdfMaxProbability + 1 - (1 << rate)); |
| const __m128i index = |
| _mm_set_epi32(0x00070006, 0x00050004, 0x00030002, 0x00010000); |
| const __m128i symbol_vec = _mm_set1_epi16(symbol); |
| const __m128i mask = _mm_cmplt_epi16(index, symbol_vec); |
| const __m128i a = _mm_blendv_epi8(zero, cdf_max_probability, mask); |
| const __m128i diff = _mm_sub_epi16(cdf_vec, a); |
| const __m128i delta = _mm_sra_epi16(diff, _mm_cvtsi32_si128(rate)); |
| cdf_vec = _mm_sub_epi16(cdf_vec, delta); |
| StoreUnaligned16(cdf, cdf_vec); |
| cdf[symbol_count] = count + static_cast<uint16_t>(count < 32); |
| } |
| |
| void UpdateCdf7(uint16_t* const cdf, const int symbol) { |
| UpdateCdf7To9<7>(cdf, symbol); |
| } |
| |
| void UpdateCdf8(uint16_t* const cdf, const int symbol) { |
| UpdateCdf7To9<8>(cdf, symbol); |
| } |
| |
| void UpdateCdf11(uint16_t* const cdf, const int symbol) { |
| __m128i cdf_vec = LoadUnaligned16(cdf + 2); |
| const uint16_t count = cdf[11]; |
| cdf[11] = count + static_cast<uint16_t>(count < 32); |
| const int rate = (4 | (count >> 4)) + 1; |
| if (symbol > 1) { |
| cdf[0] += (kCdfMaxProbability - cdf[0]) >> rate; |
| cdf[1] += (kCdfMaxProbability - cdf[1]) >> rate; |
| const __m128i zero = _mm_setzero_si128(); |
| const __m128i cdf_max_probability = |
| _mm_set1_epi16(kCdfMaxProbability + 1 - (1 << rate)); |
| const __m128i index = |
| _mm_set_epi32(0x00090008, 0x00070006, 0x00050004, 0x00030002); |
| const __m128i symbol_vec = _mm_set1_epi16(symbol); |
| const __m128i mask = _mm_cmplt_epi16(index, symbol_vec); |
| const __m128i a = _mm_blendv_epi8(zero, cdf_max_probability, mask); |
| const __m128i diff = _mm_sub_epi16(cdf_vec, a); |
| const __m128i delta = _mm_sra_epi16(diff, _mm_cvtsi32_si128(rate)); |
| cdf_vec = _mm_sub_epi16(cdf_vec, delta); |
| StoreUnaligned16(cdf + 2, cdf_vec); |
| } else { |
| if (symbol != 0) { |
| cdf[0] += (kCdfMaxProbability - cdf[0]) >> rate; |
| cdf[1] -= cdf[1] >> rate; |
| } else { |
| cdf[0] -= cdf[0] >> rate; |
| cdf[1] -= cdf[1] >> rate; |
| } |
| const __m128i delta = _mm_sra_epi16(cdf_vec, _mm_cvtsi32_si128(rate)); |
| cdf_vec = _mm_sub_epi16(cdf_vec, delta); |
| StoreUnaligned16(cdf + 2, cdf_vec); |
| } |
| } |
| |
| void UpdateCdf13(uint16_t* const cdf, const int symbol) { |
| __m128i cdf_vec0 = LoadUnaligned16(cdf); |
| __m128i cdf_vec1 = LoadUnaligned16(cdf + 4); |
| const uint16_t count = cdf[13]; |
| const int rate = (4 | (count >> 4)) + 1; |
| const __m128i zero = _mm_setzero_si128(); |
| const __m128i cdf_max_probability = |
| _mm_set1_epi16(kCdfMaxProbability + 1 - (1 << rate)); |
| const __m128i symbol_vec = _mm_set1_epi16(symbol); |
| |
| const __m128i index = |
| _mm_set_epi32(0x00070006, 0x00050004, 0x00030002, 0x00010000); |
| const __m128i mask = _mm_cmplt_epi16(index, symbol_vec); |
| const __m128i a = _mm_blendv_epi8(zero, cdf_max_probability, mask); |
| const __m128i diff = _mm_sub_epi16(cdf_vec0, a); |
| const __m128i delta = _mm_sra_epi16(diff, _mm_cvtsi32_si128(rate)); |
| cdf_vec0 = _mm_sub_epi16(cdf_vec0, delta); |
| StoreUnaligned16(cdf, cdf_vec0); |
| |
| const __m128i index1 = |
| _mm_set_epi32(0x000b000a, 0x00090008, 0x00070006, 0x00050004); |
| const __m128i mask1 = _mm_cmplt_epi16(index1, symbol_vec); |
| const __m128i a1 = _mm_blendv_epi8(zero, cdf_max_probability, mask1); |
| const __m128i diff1 = _mm_sub_epi16(cdf_vec1, a1); |
| const __m128i delta1 = _mm_sra_epi16(diff1, _mm_cvtsi32_si128(rate)); |
| cdf_vec1 = _mm_sub_epi16(cdf_vec1, delta1); |
| StoreUnaligned16(cdf + 4, cdf_vec1); |
| |
| cdf[13] = count + static_cast<uint16_t>(count < 32); |
| } |
| |
| void UpdateCdf16(uint16_t* const cdf, const int symbol) { |
| __m128i cdf_vec0 = LoadUnaligned16(cdf); |
| const uint16_t count = cdf[16]; |
| const int rate = (4 | (count >> 4)) + 1; |
| const __m128i zero = _mm_setzero_si128(); |
| const __m128i cdf_max_probability = |
| _mm_set1_epi16(kCdfMaxProbability + 1 - (1 << rate)); |
| const __m128i symbol_vec = _mm_set1_epi16(symbol); |
| |
| const __m128i index = |
| _mm_set_epi32(0x00070006, 0x00050004, 0x00030002, 0x00010000); |
| const __m128i mask = _mm_cmplt_epi16(index, symbol_vec); |
| const __m128i a = _mm_blendv_epi8(zero, cdf_max_probability, mask); |
| const __m128i diff = _mm_sub_epi16(cdf_vec0, a); |
| const __m128i delta = _mm_sra_epi16(diff, _mm_cvtsi32_si128(rate)); |
| cdf_vec0 = _mm_sub_epi16(cdf_vec0, delta); |
| StoreUnaligned16(cdf, cdf_vec0); |
| |
| __m128i cdf_vec1 = LoadUnaligned16(cdf + 8); |
| const __m128i index1 = |
| _mm_set_epi32(0x000f000e, 0x000d000c, 0x000b000a, 0x00090008); |
| const __m128i mask1 = _mm_cmplt_epi16(index1, symbol_vec); |
| const __m128i a1 = _mm_blendv_epi8(zero, cdf_max_probability, mask1); |
| const __m128i diff1 = _mm_sub_epi16(cdf_vec1, a1); |
| const __m128i delta1 = _mm_sra_epi16(diff1, _mm_cvtsi32_si128(rate)); |
| cdf_vec1 = _mm_sub_epi16(cdf_vec1, delta1); |
| StoreUnaligned16(cdf + 8, cdf_vec1); |
| |
| cdf[16] = count + static_cast<uint16_t>(count < 32); |
| } |
| |
| #else // !LIBGAV1_ENTROPY_DECODER_ENABLE_SSE4 |
| |
| void UpdateCdf5(uint16_t* const cdf, const int symbol) { |
| UpdateCdf(cdf, 5, symbol); |
| } |
| |
| void UpdateCdf7(uint16_t* const cdf, const int symbol) { |
| UpdateCdf(cdf, 7, symbol); |
| } |
| |
| void UpdateCdf8(uint16_t* const cdf, const int symbol) { |
| UpdateCdf(cdf, 8, symbol); |
| } |
| |
| void UpdateCdf11(uint16_t* const cdf, const int symbol) { |
| UpdateCdf(cdf, 11, symbol); |
| } |
| |
| void UpdateCdf13(uint16_t* const cdf, const int symbol) { |
| UpdateCdf(cdf, 13, symbol); |
| } |
| |
| void UpdateCdf16(uint16_t* const cdf, const int symbol) { |
| UpdateCdf(cdf, 16, symbol); |
| } |
| |
| #endif // LIBGAV1_ENTROPY_DECODER_ENABLE_SSE4 |
| #endif // LIBGAV1_ENTROPY_DECODER_ENABLE_NEON |
| |
| } // namespace |
| |
| #if !LIBGAV1_CXX17 |
| constexpr int DaalaBitReader::kWindowSize; // Out-of-class definition of the static member; compiled only when LIBGAV1_CXX17 is not set. |
| #endif |
| |
| DaalaBitReader::DaalaBitReader(const uint8_t* data, size_t size, |
| bool allow_update_cdf) |
| : data_(data), |
| size_(size), |
| data_index_(0), |
| allow_update_cdf_(allow_update_cdf) { |
| window_diff_ = (WindowSize{1} << (kWindowSize - 1)) - 1; |
| values_in_range_ = kCdfMaxProbability; |
| bits_ = -15; |
| PopulateBits(); |
| } |
| |
| // This is similar to the ReadSymbol() implementation but it is optimized based |
| // on the following facts: |
| // * The probability is fixed at half. So some multiplications can be replaced |
| // with bit operations. |
| // * Symbol count is fixed at 2. |
| int DaalaBitReader::ReadBit() { |
| const uint32_t curr = |
| ((values_in_range_ & kReadBitMask) >> 1) + kMinimumProbabilityPerSymbol; |
| const WindowSize zero_threshold = static_cast<WindowSize>(curr) |
| << (kWindowSize - 16); |
| int bit = 1; |
| if (window_diff_ >= zero_threshold) { |
| values_in_range_ -= curr; |
| window_diff_ -= zero_threshold; |
| bit = 0; |
| } else { |
| values_in_range_ = curr; |
| } |
| NormalizeRange(); |
| return bit; |
| } |
| |
| int64_t DaalaBitReader::ReadLiteral(int num_bits) { |
| assert(num_bits <= 32); |
| assert(num_bits > 0); |
| uint32_t literal = 0; |
| int bit = num_bits - 1; |
| do { |
| // ARM can combine a shift operation with a constant number of bits with |
| // some other operations, such as the OR operation. |
| // Here is an ARM disassembly example: |
| // orr w1, w0, w1, lsl #1 |
| // which left shifts register w1 by 1 bit and OR the shift result with |
| // register w0. |
| // The next 2 lines are equivalent to: |
| // literal |= static_cast<uint32_t>(ReadBit()) << bit; |
| literal <<= 1; |
| literal |= static_cast<uint32_t>(ReadBit()); |
| } while (--bit >= 0); |
| return literal; |
| } |
| |
| int DaalaBitReader::ReadSymbol(uint16_t* const cdf, int symbol_count) { |
| const int symbol = ReadSymbolImpl(cdf, symbol_count); |
| if (allow_update_cdf_) { |
| UpdateCdf(cdf, symbol_count, symbol); |
| } |
| return symbol; |
| } |
| |
| bool DaalaBitReader::ReadSymbol(uint16_t* cdf) { |
| const bool symbol = ReadSymbolImpl(cdf) != 0; |
| if (allow_update_cdf_) { |
| const uint16_t count = cdf[2]; |
| // rate is computed in the spec as: |
| // 3 + ( cdf[N] > 15 ) + ( cdf[N] > 31 ) + Min(FloorLog2(N), 2) |
| // In this case N is 2 and cdf[N] is |count|. So the equation becomes: |
| // 4 + (count > 15) + (count > 31) |
| // Note that the largest value for count is 32 (it is not incremented beyond |
| // 32). So using that information: |
| // count >> 4 is 0 for count from 0 to 15. |
| // count >> 4 is 1 for count from 16 to 31. |
| // count >> 4 is 2 for count == 32. |
| // Now, the equation becomes: |
| // 4 + (count >> 4). |
| // Since (count >> 4) can only be 0 or 1 or 2, the addition can be replaced |
| // with bitwise or. So the final equation is: |
| // 4 | (count >> 4). |
| const int rate = 4 | (count >> 4); |
| if (symbol) { |
| cdf[0] += (kCdfMaxProbability - cdf[0]) >> rate; |
| } else { |
| cdf[0] -= cdf[0] >> rate; |
| } |
| cdf[2] += static_cast<uint16_t>(count < 32); |
| } |
| return symbol; |
| } |
| |
| bool DaalaBitReader::ReadSymbolWithoutCdfUpdate(uint16_t* cdf) { |
| return ReadSymbolImpl(cdf) != 0; |
| } |
| |
| template <int symbol_count> |
| int DaalaBitReader::ReadSymbol(uint16_t* const cdf) { |
| static_assert(symbol_count >= 3 && symbol_count <= 16, ""); |
| if (symbol_count == 4) { |
| return ReadSymbol4(cdf); |
| } |
| int symbol; |
| if (symbol_count == 8) { |
| symbol = ReadSymbolImpl8(cdf); |
| } else if (symbol_count <= 13) { |
| symbol = ReadSymbolImpl(cdf, symbol_count); |
| } else { |
| symbol = ReadSymbolImplBinarySearch(cdf, symbol_count); |
| } |
| if (allow_update_cdf_) { |
| if (symbol_count == 5) { |
| UpdateCdf5(cdf, symbol); |
| } else if (symbol_count == 7) { |
| UpdateCdf7(cdf, symbol); |
| } else if (symbol_count == 8) { |
| UpdateCdf8(cdf, symbol); |
| } else if (symbol_count == 11) { |
| UpdateCdf11(cdf, symbol); |
| } else if (symbol_count == 13) { |
| UpdateCdf13(cdf, symbol); |
| } else if (symbol_count == 16) { |
| UpdateCdf16(cdf, symbol); |
| } else { |
| UpdateCdf(cdf, symbol_count, symbol); |
| } |
| } |
| return symbol; |
| } |
| |
| int DaalaBitReader::ReadSymbolImpl(const uint16_t* const cdf, |
| int symbol_count) { |
| assert(cdf[symbol_count - 1] == 0); |
| --symbol_count; |
| uint32_t curr = values_in_range_; |
| int symbol = -1; |
| uint32_t prev; |
| const auto symbol_value = |
| static_cast<uint32_t>(window_diff_ >> (kWindowSize - 16)); |
| uint32_t delta = kMinimumProbabilityPerSymbol * symbol_count; |
| // Search through the |cdf| array to determine where the scaled cdf value and |
| // |symbol_value| cross over. |
| do { |
| prev = curr; |
| curr = (((values_in_range_ >> 8) * (cdf[++symbol] >> kCdfPrecision)) >> 1) + |
| delta; |
| delta -= kMinimumProbabilityPerSymbol; |
| } while (symbol_value < curr); |
| values_in_range_ = prev - curr; |
| window_diff_ -= static_cast<WindowSize>(curr) << (kWindowSize - 16); |
| NormalizeRange(); |
| return symbol; |
| } |
| |
| int DaalaBitReader::ReadSymbolImplBinarySearch(const uint16_t* const cdf, |
| int symbol_count) { |
| assert(cdf[symbol_count - 1] == 0); |
| assert(symbol_count > 1 && symbol_count <= 16); |
| --symbol_count; |
| const auto symbol_value = |
| static_cast<uint32_t>(window_diff_ >> (kWindowSize - 16)); |
| // Search through the |cdf| array to determine where the scaled cdf value and |
| // |symbol_value| cross over. Since the CDFs are sorted, we can use binary |
| // search to do this. Let |symbol| be the index of the first |cdf| array |
| // entry whose scaled cdf value is less than or equal to |symbol_value|. The |
| // binary search maintains the invariant: |
| // low <= symbol <= high + 1 |
| // and terminates when low == high + 1. |
| int low = 0; |
| int high = symbol_count - 1; |
| // The binary search maintains the invariants that |prev| is the scaled cdf |
| // value for low - 1 and |curr| is the scaled cdf value for high + 1. (By |
| // convention, the scaled cdf value for -1 is values_in_range_.) When the |
| // binary search terminates, |prev| is the scaled cdf value for symbol - 1 |
| // and |curr| is the scaled cdf value for |symbol|. |
| uint32_t prev = values_in_range_; |
| uint32_t curr = 0; |
| const uint32_t values_in_range_shifted = values_in_range_ >> 8; |
| do { |
| const int mid = DivideBy2(low + high); |
| const uint32_t scaled_cdf = |
| ScaleCdf(values_in_range_shifted, cdf, mid, symbol_count); |
| if (symbol_value < scaled_cdf) { |
| low = mid + 1; |
| prev = scaled_cdf; |
| } else { |
| high = mid - 1; |
| curr = scaled_cdf; |
| } |
| } while (low <= high); |
| assert(low == high + 1); |
| // At this point, |low| is the symbol that has been decoded. |
| values_in_range_ = prev - curr; |
| window_diff_ -= static_cast<WindowSize>(curr) << (kWindowSize - 16); |
| NormalizeRange(); |
| return low; |
| } |
| |
| int DaalaBitReader::ReadSymbolImpl(const uint16_t* const cdf) { |
| assert(cdf[1] == 0); |
| const auto symbol_value = |
| static_cast<uint32_t>(window_diff_ >> (kWindowSize - 16)); |
| const uint32_t curr = ScaleCdf(values_in_range_ >> 8, cdf, 0, 1); |
| const int symbol = static_cast<int>(symbol_value < curr); |
| if (symbol == 1) { |
| values_in_range_ = curr; |
| } else { |
| values_in_range_ -= curr; |
| window_diff_ -= static_cast<WindowSize>(curr) << (kWindowSize - 16); |
| } |
| NormalizeRange(); |
| return symbol; |
| } |
| |
| // Equivalent to ReadSymbol(cdf, 4), with the ReadSymbolImpl and UpdateCdf |
| // calls inlined. |
| int DaalaBitReader::ReadSymbol4(uint16_t* const cdf) { |
| assert(cdf[3] == 0); |
| uint32_t curr = values_in_range_; |
| uint32_t prev; |
| const auto symbol_value = |
| static_cast<uint32_t>(window_diff_ >> (kWindowSize - 16)); |
| uint32_t delta = kMinimumProbabilityPerSymbol * 3; |
| const uint32_t values_in_range_shifted = values_in_range_ >> 8; |
| |
| // Search through the |cdf| array to determine where the scaled cdf value and |
| // |symbol_value| cross over. If allow_update_cdf_ is true, update the |cdf| |
| // array. |
| // |
| // The original code is: |
| // |
| // int symbol = -1; |
| // do { |
| // prev = curr; |
| // curr = |
| // ((values_in_range_shifted * (cdf[++symbol] >> kCdfPrecision)) >> 1) |
| // + delta; |
| // delta -= kMinimumProbabilityPerSymbol; |
| // } while (symbol_value < curr); |
| // if (allow_update_cdf_) { |
| // UpdateCdf(cdf, 4, symbol); |
| // } |
| // |
| // The do-while loop is unrolled with four iterations, and the UpdateCdf call |
| // is inlined and merged into the four iterations. |
| int symbol = 0; |
| // Iteration 0. |
| prev = curr; |
| curr = |
| ((values_in_range_shifted * (cdf[symbol] >> kCdfPrecision)) >> 1) + delta; |
| if (symbol_value >= curr) { |
| // symbol == 0. |
| if (allow_update_cdf_) { |
| // Inlined version of UpdateCdf(cdf, 4, /*symbol=*/0). |
| const uint16_t count = cdf[4]; |
| cdf[4] += static_cast<uint16_t>(count < 32); |
| const int rate = (4 | (count >> 4)) + 1; |
| #if LIBGAV1_ENTROPY_DECODER_ENABLE_NEON |
| // 1. On Motorola Moto G5 Plus (running 32-bit Android 8.1.0), the ARM |
| // NEON code is slower. Consider using the C version if __arm__ is |
| // defined. |
| // 2. The ARM NEON code (compiled for arm64) is slightly slower on |
| // Samsung Galaxy S8+ (SM-G955FD). |
| uint16x4_t cdf_vec = vld1_u16(cdf); |
| const int16x4_t negative_rate = vdup_n_s16(-rate); |
| const uint16x4_t delta = vshl_u16(cdf_vec, negative_rate); |
| cdf_vec = vsub_u16(cdf_vec, delta); |
| vst1_u16(cdf, cdf_vec); |
| #elif LIBGAV1_ENTROPY_DECODER_ENABLE_SSE4 |
| __m128i cdf_vec = LoadLo8(cdf); |
| const __m128i delta = _mm_sra_epi16(cdf_vec, _mm_cvtsi32_si128(rate)); |
| cdf_vec = _mm_sub_epi16(cdf_vec, delta); |
| StoreLo8(cdf, cdf_vec); |
| #else // !LIBGAV1_ENTROPY_DECODER_ENABLE_SSE4 |
| cdf[0] -= cdf[0] >> rate; |
| cdf[1] -= cdf[1] >> rate; |
| cdf[2] -= cdf[2] >> rate; |
| #endif |
| } |
| goto found; |
| } |
| ++symbol; |
| delta -= kMinimumProbabilityPerSymbol; |
| // Iteration 1. |
| prev = curr; |
| curr = |
| ((values_in_range_shifted * (cdf[symbol] >> kCdfPrecision)) >> 1) + delta; |
| if (symbol_value >= curr) { |
| // symbol == 1. |
| if (allow_update_cdf_) { |
| // Inlined version of UpdateCdf(cdf, 4, /*symbol=*/1). |
| const uint16_t count = cdf[4]; |
| cdf[4] += static_cast<uint16_t>(count < 32); |
| const int rate = (4 | (count >> 4)) + 1; |
| cdf[0] += (kCdfMaxProbability - cdf[0]) >> rate; |
| cdf[1] -= cdf[1] >> rate; |
| cdf[2] -= cdf[2] >> rate; |
| } |
| goto found; |
| } |
| ++symbol; |
| delta -= kMinimumProbabilityPerSymbol; |
| // Iteration 2. |
| prev = curr; |
| curr = |
| ((values_in_range_shifted * (cdf[symbol] >> kCdfPrecision)) >> 1) + delta; |
| if (symbol_value >= curr) { |
| // symbol == 2. |
| if (allow_update_cdf_) { |
| // Inlined version of UpdateCdf(cdf, 4, /*symbol=*/2). |
| const uint16_t count = cdf[4]; |
| cdf[4] += static_cast<uint16_t>(count < 32); |
| const int rate = (4 | (count >> 4)) + 1; |
| cdf[0] += (kCdfMaxProbability - cdf[0]) >> rate; |
| cdf[1] += (kCdfMaxProbability - cdf[1]) >> rate; |
| cdf[2] -= cdf[2] >> rate; |
| } |
| goto found; |
| } |
| ++symbol; |
| // |delta| is 0 for the last iteration. |
| // Iteration 3. |
| prev = curr; |
| // Since cdf[3] is 0 and |delta| is 0, |curr| is also 0. |
| curr = 0; |
| // symbol == 3. |
| if (allow_update_cdf_) { |
| // Inlined version of UpdateCdf(cdf, 4, /*symbol=*/3). |
| const uint16_t count = cdf[4]; |
| cdf[4] += static_cast<uint16_t>(count < 32); |
| const int rate = (4 | (count >> 4)) + 1; |
| #if LIBGAV1_ENTROPY_DECODER_ENABLE_NEON |
| // On Motorola Moto G5 Plus (running 32-bit Android 8.1.0), the ARM NEON |
| // code is a tiny bit slower. Consider using the C version if __arm__ is |
| // defined. |
| uint16x4_t cdf_vec = vld1_u16(cdf); |
| const uint16x4_t cdf_max_probability = vdup_n_u16(kCdfMaxProbability); |
| const int16x4_t diff = |
| vreinterpret_s16_u16(vsub_u16(cdf_max_probability, cdf_vec)); |
| const int16x4_t negative_rate = vdup_n_s16(-rate); |
| const uint16x4_t delta = |
| vreinterpret_u16_s16(vshl_s16(diff, negative_rate)); |
| cdf_vec = vadd_u16(cdf_vec, delta); |
| vst1_u16(cdf, cdf_vec); |
| cdf[3] = 0; |
| #elif LIBGAV1_ENTROPY_DECODER_ENABLE_SSE4 |
| __m128i cdf_vec = LoadLo8(cdf); |
| const __m128i cdf_max_probability = |
| _mm_shufflelo_epi16(_mm_cvtsi32_si128(kCdfMaxProbability), 0); |
| const __m128i diff = _mm_sub_epi16(cdf_max_probability, cdf_vec); |
| const __m128i delta = _mm_sra_epi16(diff, _mm_cvtsi32_si128(rate)); |
| cdf_vec = _mm_add_epi16(cdf_vec, delta); |
| StoreLo8(cdf, cdf_vec); |
| cdf[3] = 0; |
| #else // !LIBGAV1_ENTROPY_DECODER_ENABLE_SSE4 |
| cdf[0] += (kCdfMaxProbability - cdf[0]) >> rate; |
| cdf[1] += (kCdfMaxProbability - cdf[1]) >> rate; |
| cdf[2] += (kCdfMaxProbability - cdf[2]) >> rate; |
| #endif |
| } |
| found: |
| // End of unrolled do-while loop. |
| |
| values_in_range_ = prev - curr; |
| window_diff_ -= static_cast<WindowSize>(curr) << (kWindowSize - 16); |
| NormalizeRange(); |
| return symbol; |
| } |
| |
| int DaalaBitReader::ReadSymbolImpl8(const uint16_t* const cdf) { |
| assert(cdf[7] == 0); |
| uint32_t curr = values_in_range_; |
| uint32_t prev; |
| const auto symbol_value = |
| static_cast<uint32_t>(window_diff_ >> (kWindowSize - 16)); |
| uint32_t delta = kMinimumProbabilityPerSymbol * 7; |
| // Search through the |cdf| array to determine where the scaled cdf value and |
| // |symbol_value| cross over. |
| // |
| // The original code is: |
| // |
| // int symbol = -1; |
| // do { |
| // prev = curr; |
| // curr = |
| // (((values_in_range_ >> 8) * (cdf[++symbol] >> kCdfPrecision)) >> 1) |
| // + delta; |
| // delta -= kMinimumProbabilityPerSymbol; |
| // } while (symbol_value < curr); |
| // |
| // The do-while loop is unrolled with eight iterations. |
| int symbol = 0; |
| |
| #define READ_SYMBOL_ITERATION \ |
| prev = curr; \ |
| curr = (((values_in_range_ >> 8) * (cdf[symbol] >> kCdfPrecision)) >> 1) + \ |
| delta; \ |
| if (symbol_value >= curr) goto found; \ |
| ++symbol; \ |
| delta -= kMinimumProbabilityPerSymbol |
| |
| READ_SYMBOL_ITERATION; // Iteration 0. |
| READ_SYMBOL_ITERATION; // Iteration 1. |
| READ_SYMBOL_ITERATION; // Iteration 2. |
| READ_SYMBOL_ITERATION; // Iteration 3. |
| READ_SYMBOL_ITERATION; // Iteration 4. |
| READ_SYMBOL_ITERATION; // Iteration 5. |
| |
| // The last two iterations can be simplified, so they don't use the |
| // READ_SYMBOL_ITERATION macro. |
| #undef READ_SYMBOL_ITERATION |
| |
| // Iteration 6. |
| prev = curr; |
| curr = |
| (((values_in_range_ >> 8) * (cdf[symbol] >> kCdfPrecision)) >> 1) + delta; |
| if (symbol_value >= curr) goto found; // symbol == 6. |
| ++symbol; |
| // |delta| is 0 for the last iteration. |
| // Iteration 7. |
| prev = curr; |
| // Since cdf[7] is 0 and |delta| is 0, |curr| is also 0. |
| curr = 0; |
| // symbol == 7. |
| found: |
| // End of unrolled do-while loop. |
| |
| values_in_range_ = prev - curr; |
| window_diff_ -= static_cast<WindowSize>(curr) << (kWindowSize - 16); |
| NormalizeRange(); |
| return symbol; |
| } |
| |
| void DaalaBitReader::PopulateBits() { |
| #if defined(__aarch64__) |
| // Fast path: read eight bytes and add the first six bytes to window_diff_. |
| // This fast path makes the following assumptions. |
| // 1. We assume that unaligned load of uint64_t is fast. |
| // 2. When there are enough bytes in data_, the for loop below reads 6 or 7 |
| // bytes depending on the value of bits_. This fast path always reads 6 |
| // bytes, which results in more calls to PopulateBits(). We assume that |
| // making more calls to a faster PopulateBits() is overall a win. |
| // NOTE: Although this fast path could also be used on x86_64, it hurts |
| // performance (measured on Lenovo ThinkStation P920 running Linux). (The |
| // reason is still unknown.) Therefore this fast path is only used on arm64. |
| static_assert(kWindowSize == 64, ""); |
| if (size_ - data_index_ >= 8) { |
| uint64_t value; |
| // arm64 supports unaligned loads, so this memcpy call is compiled to a |
| // single ldr instruction. |
| memcpy(&value, &data_[data_index_], 8); |
| #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ |
| value = __builtin_bswap64(value); |
| #endif |
| value &= 0xffffffffffff0000; |
| window_diff_ ^= static_cast<WindowSize>(value) >> (bits_ + 16); |
| data_index_ += 6; |
| bits_ += 6 * 8; |
| return; |
| } |
| #endif |
| |
| size_t data_index = data_index_; |
| int bits = bits_; |
| WindowSize window_diff = window_diff_; |
| |
| int shift = kWindowSize - 9 - (bits + 15); |
| // The fast path above, if compiled, would cause clang 8.0.7 to vectorize |
| // this loop. Since -15 <= bits_ <= -1, this loop has at most 6 or 7 |
| // iterations when WindowSize is 64 bits. So it is not profitable to |
| // vectorize this loop. Note that clang 8.0.7 does not vectorize this loop if |
| // the fast path above is not compiled. |
| |
| #ifdef __clang__ |
| #pragma clang loop vectorize(disable) interleave(disable) |
| #endif |
| for (; shift >= 0 && data_index < size_; shift -= 8) { |
| window_diff ^= static_cast<WindowSize>(data_[data_index++]) << shift; |
| bits += 8; |
| } |
| if (data_index >= size_) { |
| bits = kLargeBitCount; |
| } |
| |
| data_index_ = data_index; |
| bits_ = bits; |
| window_diff_ = window_diff; |
| } |
| |
| void DaalaBitReader::NormalizeRange() { |
| const int bits_used = 15 - FloorLog2(values_in_range_); |
| bits_ -= bits_used; |
| window_diff_ = ((window_diff_ + 1) << bits_used) - 1; |
| values_in_range_ <<= bits_used; |
| if (bits_ < 0) PopulateBits(); |
| } |
| |
| // Explicit instantiations. |
| template int DaalaBitReader::ReadSymbol<3>(uint16_t* cdf); |
| template int DaalaBitReader::ReadSymbol<4>(uint16_t* cdf); |
| template int DaalaBitReader::ReadSymbol<5>(uint16_t* cdf); |
| template int DaalaBitReader::ReadSymbol<7>(uint16_t* cdf); |
| template int DaalaBitReader::ReadSymbol<8>(uint16_t* cdf); |
| template int DaalaBitReader::ReadSymbol<10>(uint16_t* cdf); |
| template int DaalaBitReader::ReadSymbol<11>(uint16_t* cdf); |
| template int DaalaBitReader::ReadSymbol<13>(uint16_t* cdf); |
| template int DaalaBitReader::ReadSymbol<14>(uint16_t* cdf); |
| template int DaalaBitReader::ReadSymbol<16>(uint16_t* cdf); |
| |
| } // namespace libgav1 |