| // Copyright 2019 The libgav1 Authors |
| // |
| // Licensed under the Apache License, Version 2.0 (the "License"); |
| // you may not use this file except in compliance with the License. |
| // You may obtain a copy of the License at |
| // |
| // http://www.apache.org/licenses/LICENSE-2.0 |
| // |
| // Unless required by applicable law or agreed to in writing, software |
| // distributed under the License is distributed on an "AS IS" BASIS, |
| // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| // See the License for the specific language governing permissions and |
| // limitations under the License. |
| |
| #include "src/dsp/loop_restoration.h" |
| |
| #include <algorithm> |
| #include <cassert> |
| #include <cstddef> |
| #include <cstdint> |
| |
| #include "src/dsp/common.h" |
| #include "src/dsp/dsp.h" |
| #include "src/utils/common.h" |
| #include "src/utils/constants.h" |
| |
| namespace libgav1 { |
| namespace dsp { |
| |
| // Section 7.17.3. |
| // a2: range [1, 256]. |
| // if (z >= 255) |
| // a2 = 256; |
| // else if (z == 0) |
| // a2 = 1; |
| // else |
| // a2 = ((z << kSgrProjSgrBits) + (z >> 1)) / (z + 1); |
| // ma = 256 - a2; |
| const uint8_t kSgrMaLookup[256] = { |
| 255, 128, 85, 64, 51, 43, 37, 32, 28, 26, 23, 21, 20, 18, 17, 16, 15, 14, |
| 13, 13, 12, 12, 11, 11, 10, 10, 9, 9, 9, 9, 8, 8, 8, 8, 7, 7, |
| 7, 7, 7, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5, 5, 5, 5, 5, |
| 5, 5, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, |
| 4, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, |
| 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2, |
| 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, |
| 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, |
| 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, |
| 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, |
| 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, |
| 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, |
| 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, |
| 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, |
| 1, 1, 1, 0}; |
| |
| namespace { |
| |
| template <int bitdepth, typename Pixel> |
| inline void WienerHorizontal(const Pixel* source, const ptrdiff_t source_stride, |
| const int width, const int height, |
| const int16_t* const filter, |
| const int number_zero_coefficients, |
| int16_t** wiener_buffer) { |
| constexpr int kCenterTap = kWienerFilterTaps / 2; |
| constexpr int kRoundBitsHorizontal = (bitdepth == 12) |
| ? kInterRoundBitsHorizontal12bpp |
| : kInterRoundBitsHorizontal; |
| constexpr int offset = |
| 1 << (bitdepth + kWienerFilterBits - kRoundBitsHorizontal - 1); |
| constexpr int limit = (offset << 2) - 1; |
| int y = height; |
| do { |
| int x = 0; |
| do { |
| // sum fits into 16 bits only when bitdepth = 8. |
| int sum = 0; |
| for (int k = number_zero_coefficients; k < kCenterTap; ++k) { |
| sum += |
| filter[k] * (source[x + k] + source[x + kWienerFilterTaps - 1 - k]); |
| } |
| sum += filter[kCenterTap] * source[x + kCenterTap]; |
| const int rounded_sum = RightShiftWithRounding(sum, kRoundBitsHorizontal); |
| (*wiener_buffer)[x] = Clip3(rounded_sum, -offset, limit - offset); |
| } while (++x != width); |
| source += source_stride; |
| *wiener_buffer += width; |
| } while (--y != 0); |
| } |
| |
| template <int bitdepth, typename Pixel> |
| inline void WienerVertical(const int16_t* wiener_buffer, const int width, |
| const int height, const int16_t* const filter, |
| const int number_zero_coefficients, void* const dest, |
| const ptrdiff_t dest_stride) { |
| constexpr int kCenterTap = kWienerFilterTaps / 2; |
| constexpr int kRoundBitsVertical = |
| (bitdepth == 12) ? kInterRoundBitsVertical12bpp : kInterRoundBitsVertical; |
| auto* dst = static_cast<Pixel*>(dest); |
| int y = height; |
| do { |
| int x = 0; |
| do { |
| // sum needs 32 bits. |
| int sum = 0; |
| for (int k = number_zero_coefficients; k < kCenterTap; ++k) { |
| sum += filter[k] * |
| (wiener_buffer[k * width + x] + |
| wiener_buffer[(kWienerFilterTaps - 1 - k) * width + x]); |
| } |
| sum += filter[kCenterTap] * wiener_buffer[kCenterTap * width + x]; |
| const int rounded_sum = RightShiftWithRounding(sum, kRoundBitsVertical); |
| dst[x] = static_cast<Pixel>(Clip3(rounded_sum, 0, (1 << bitdepth) - 1)); |
| } while (++x != width); |
| wiener_buffer += width; |
| dst += dest_stride; |
| } while (--y != 0); |
| } |
| |
| // Note: bit range for wiener filter. |
| // Wiener filter process first applies horizontal filtering to input pixels, |
| // followed by rounding with predefined bits (dependent on bitdepth). |
| // Then vertical filtering is applied, followed by rounding (dependent on |
| // bitdepth). |
| // The process is the same as convolution: |
| // <input> --> <horizontal filter> --> <rounding 0> --> <vertical filter> |
| // --> <rounding 1> |
| // By design: |
| // (a). horizontal/vertical filtering adds 7 bits to input. |
| // (b). The output of first rounding fits into 16 bits. |
| // (c). The output of second rounding fits into 16 bits. |
| // If input bitdepth > 8, the accumulator of the horizontal filter is larger |
| // than 16 bit and smaller than 32 bits. |
| // The accumulator of the vertical filter is larger than 16 bits and smaller |
| // than 32 bits. |
| // Note: range of wiener filter coefficients. |
| // Wiener filter coefficients are symmetric, and their sum is 1 (128). |
| // The range of each coefficient: |
| // filter[0] = filter[6], 4 bits, min = -5, max = 10. |
| // filter[1] = filter[5], 5 bits, min = -23, max = 8. |
| // filter[2] = filter[4], 6 bits, min = -17, max = 46. |
| // filter[3] = 128 - 2 * (filter[0] + filter[1] + filter[2]). |
| // The difference from libaom is that in libaom: |
| // filter[3] = 0 - 2 * (filter[0] + filter[1] + filter[2]). |
| // Thus in libaom's computation, an offset of 128 is needed for filter[3]. |
| template <int bitdepth, typename Pixel> |
| void WienerFilter_C(const void* const source, void* const dest, |
| const RestorationUnitInfo& restoration_info, |
| ptrdiff_t source_stride, ptrdiff_t dest_stride, int width, |
| int height, RestorationBuffer* const restoration_buffer) { |
| constexpr int kCenterTap = kWienerFilterTaps / 2; |
| const int16_t* const number_leading_zero_coefficients = |
| restoration_info.wiener_info.number_leading_zero_coefficients; |
| const int number_rows_to_skip = std::max( |
| static_cast<int>(number_leading_zero_coefficients[WienerInfo::kVertical]), |
| 1); |
| int16_t* const wiener_buffer_org = restoration_buffer->wiener_buffer; |
| |
| // horizontal filtering. |
| const int height_horizontal = |
| height + kWienerFilterTaps - 1 - 2 * number_rows_to_skip; |
| const int16_t* const filter_horizontal = |
| restoration_info.wiener_info.filter[WienerInfo::kHorizontal]; |
| const auto* src = static_cast<const Pixel*>(source); |
| src -= (kCenterTap - number_rows_to_skip) * source_stride + kCenterTap; |
| auto* wiener_buffer = wiener_buffer_org + number_rows_to_skip * width; |
| |
| if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 0) { |
| WienerHorizontal<bitdepth, Pixel>(src, source_stride, width, |
| height_horizontal, filter_horizontal, 0, |
| &wiener_buffer); |
| } else if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 1) { |
| WienerHorizontal<bitdepth, Pixel>(src, source_stride, width, |
| height_horizontal, filter_horizontal, 1, |
| &wiener_buffer); |
| } else if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 2) { |
| WienerHorizontal<bitdepth, Pixel>(src, source_stride, width, |
| height_horizontal, filter_horizontal, 2, |
| &wiener_buffer); |
| } else { |
| assert(number_leading_zero_coefficients[WienerInfo::kHorizontal] == 3); |
| WienerHorizontal<bitdepth, Pixel>(src, source_stride, width, |
| height_horizontal, filter_horizontal, 3, |
| &wiener_buffer); |
| } |
| |
| // vertical filtering. |
| const int16_t* const filter_vertical = |
| restoration_info.wiener_info.filter[WienerInfo::kVertical]; |
| if (number_leading_zero_coefficients[WienerInfo::kVertical] == 0) { |
| // Because the top row of |source| is a duplicate of the second row, and the |
| // bottom row of |source| is a duplicate of its above row, we can duplicate |
| // the top and bottom row of |wiener_buffer| accordingly. |
| memcpy(wiener_buffer, wiener_buffer - width, |
| sizeof(*wiener_buffer) * width); |
| memcpy(wiener_buffer_org, wiener_buffer_org + width, |
| sizeof(*wiener_buffer) * width); |
| WienerVertical<bitdepth, Pixel>(wiener_buffer_org, width, height, |
| filter_vertical, 0, dest, dest_stride); |
| } else if (number_leading_zero_coefficients[WienerInfo::kVertical] == 1) { |
| WienerVertical<bitdepth, Pixel>(wiener_buffer_org, width, height, |
| filter_vertical, 1, dest, dest_stride); |
| } else if (number_leading_zero_coefficients[WienerInfo::kVertical] == 2) { |
| WienerVertical<bitdepth, Pixel>(wiener_buffer_org, width, height, |
| filter_vertical, 2, dest, dest_stride); |
| } else { |
| assert(number_leading_zero_coefficients[WienerInfo::kVertical] == 3); |
| WienerVertical<bitdepth, Pixel>(wiener_buffer_org, width, height, |
| filter_vertical, 3, dest, dest_stride); |
| } |
| } |
| |
| //------------------------------------------------------------------------------ |
| // SGR |
| |
| template <typename Pixel, int size> |
| LIBGAV1_ALWAYS_INLINE void BoxSum(const Pixel* src, const ptrdiff_t src_stride, |
| const int height, const int width, |
| uint16_t* sums, uint32_t* square_sums, |
| const ptrdiff_t sum_stride) { |
| int y = height; |
| do { |
| uint32_t sum = 0; |
| uint32_t square_sum = 0; |
| for (int dx = 0; dx < size; ++dx) { |
| const Pixel source = src[dx]; |
| sum += source; |
| square_sum += source * source; |
| } |
| sums[0] = sum; |
| square_sums[0] = square_sum; |
| int x = 1; |
| do { |
| const Pixel source0 = src[x - 1]; |
| const Pixel source1 = src[x - 1 + size]; |
| sum -= source0; |
| sum += source1; |
| square_sum -= source0 * source0; |
| square_sum += source1 * source1; |
| sums[x] = sum; |
| square_sums[x] = square_sum; |
| } while (++x != width); |
| src += src_stride; |
| sums += sum_stride; |
| square_sums += sum_stride; |
| } while (--y != 0); |
| } |
| |
| template <typename Pixel> |
| LIBGAV1_ALWAYS_INLINE void BoxSum(const Pixel* src, const ptrdiff_t src_stride, |
| const int height, const int width, |
| uint16_t* sum3, uint16_t* sum5, |
| uint32_t* square_sum3, uint32_t* square_sum5, |
| const ptrdiff_t sum_stride) { |
| int y = height; |
| do { |
| uint32_t sum = 0; |
| uint32_t square_sum = 0; |
| for (int dx = 0; dx < 4; ++dx) { |
| const Pixel source = src[dx]; |
| sum += source; |
| square_sum += source * source; |
| } |
| int x = 0; |
| do { |
| const Pixel source0 = src[x]; |
| const Pixel source1 = src[x + 4]; |
| sum -= source0; |
| square_sum -= source0 * source0; |
| sum3[x] = sum; |
| square_sum3[x] = square_sum; |
| sum += source1; |
| square_sum += source1 * source1; |
| sum5[x] = sum + source0; |
| square_sum5[x] = square_sum + source0 * source0; |
| } while (++x != width); |
| src += src_stride; |
| sum3 += sum_stride; |
| sum5 += sum_stride; |
| square_sum3 += sum_stride; |
| square_sum5 += sum_stride; |
| } while (--y != 0); |
| } |
| |
| template <int bitdepth, int n> |
| inline void CalculateIntermediate(const uint32_t s, uint32_t a, |
| const uint32_t b, uint8_t* const ma_ptr, |
| uint32_t* const b_ptr) { |
| // a: before shift, max is 25 * (2^(bitdepth) - 1) * (2^(bitdepth) - 1). |
| // since max bitdepth = 12, max < 2^31. |
| // after shift, a < 2^16 * n < 2^22 regardless of bitdepth |
| a = RightShiftWithRounding(a, (bitdepth - 8) << 1); |
| // b: max is 25 * (2^(bitdepth) - 1). If bitdepth = 12, max < 2^19. |
| // d < 2^8 * n < 2^14 regardless of bitdepth |
| const uint32_t d = RightShiftWithRounding(b, bitdepth - 8); |
| // p: Each term in calculating p = a * n - b * b is < 2^16 * n^2 < 2^28, |
| // and p itself satisfies p < 2^14 * n^2 < 2^26. |
| // This bound on p is due to: |
| // https://en.wikipedia.org/wiki/Popoviciu's_inequality_on_variances |
| // Note: Sometimes, in high bitdepth, we can end up with a*n < b*b. |
| // This is an artifact of rounding, and can only happen if all pixels |
| // are (almost) identical, so in this case we saturate to p=0. |
| const uint32_t p = (a * n < d * d) ? 0 : a * n - d * d; |
| // p * s < (2^14 * n^2) * round(2^20 / (n^2 * scale)) < 2^34 / scale < |
| // 2^32 as long as scale >= 4. So p * s fits into a uint32_t, and z < 2^12 |
| // (this holds even after accounting for the rounding in s) |
| const uint32_t z = RightShiftWithRounding(p * s, kSgrProjScaleBits); |
| // ma: range [0, 255]. |
| const uint32_t ma = kSgrMaLookup[std::min(z, 255u)]; |
| const uint32_t one_over_n = ((1 << kSgrProjReciprocalBits) + (n >> 1)) / n; |
| // ma < 2^8, b < 2^(bitdepth) * n, |
| // one_over_n = round(2^12 / n) |
| // => the product here is < 2^(20 + bitdepth) <= 2^32, |
| // and b is set to a value < 2^(8 + bitdepth). |
| // This holds even with the rounding in one_over_n and in the overall result, |
| // as long as ma is strictly less than 2^8. |
| const uint32_t b2 = ma * b * one_over_n; |
| *ma_ptr = ma; |
| *b_ptr = RightShiftWithRounding(b2, kSgrProjReciprocalBits); |
| } |
| |
| template <typename T> |
| inline uint32_t Sum343(const T* const src) { |
| return 3 * (src[0] + src[2]) + 4 * src[1]; |
| } |
| |
| template <typename T> |
| inline uint32_t Sum444(const T* const src) { |
| return 4 * (src[0] + src[1] + src[2]); |
| } |
| |
| template <typename T> |
| inline uint32_t Sum565(const T* const src) { |
| return 5 * (src[0] + src[2]) + 6 * src[1]; |
| } |
| |
| template <int bitdepth> |
| LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess5( |
| const uint16_t* const sum5[5], const uint32_t* const square_sum5[5], |
| const int width, const uint32_t s, SgrBuffer* const sgr_buffer, |
| uint16_t* const ma565, uint32_t* const b565) { |
| int x = 0; |
| do { |
| uint32_t a = 0; |
| uint32_t b = 0; |
| for (int dy = 0; dy < 5; ++dy) { |
| a += square_sum5[dy][x]; |
| b += sum5[dy][x]; |
| } |
| CalculateIntermediate<bitdepth, 25>(s, a, b, sgr_buffer->ma + x, |
| sgr_buffer->b + x); |
| } while (++x != width + 2); |
| x = 0; |
| do { |
| ma565[x] = Sum565(sgr_buffer->ma + x); |
| b565[x] = Sum565(sgr_buffer->b + x); |
| } while (++x != width); |
| } |
| |
| template <int bitdepth> |
| LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess3( |
| const uint16_t* const sum3[3], const uint32_t* const square_sum3[3], |
| const int width, const uint32_t s, const bool calculate444, |
| SgrBuffer* const sgr_buffer, uint16_t* const ma343, uint32_t* const b343, |
| uint16_t* const ma444, uint32_t* const b444) { |
| int x = 0; |
| do { |
| uint32_t a = 0; |
| uint32_t b = 0; |
| for (int dy = 0; dy < 3; ++dy) { |
| a += square_sum3[dy][x]; |
| b += sum3[dy][x]; |
| } |
| CalculateIntermediate<bitdepth, 9>(s, a, b, sgr_buffer->ma + x, |
| sgr_buffer->b + x); |
| } while (++x != width + 2); |
| x = 0; |
| do { |
| ma343[x] = Sum343(sgr_buffer->ma + x); |
| b343[x] = Sum343(sgr_buffer->b + x); |
| } while (++x != width); |
| if (calculate444) { |
| x = 0; |
| do { |
| ma444[x] = Sum444(sgr_buffer->ma + x); |
| b444[x] = Sum444(sgr_buffer->b + x); |
| } while (++x != width); |
| } |
| } |
| |
| template <typename Pixel> |
| inline int CalculateFilteredOutput(const Pixel src, const uint32_t ma, |
| const uint32_t b, const int shift) { |
| const int32_t v = b - ma * src; |
| return RightShiftWithRounding(v, |
| kSgrProjSgrBits + shift - kSgrProjRestoreBits); |
| } |
| |
| template <typename Pixel> |
| inline void BoxFilterPass(const Pixel src0, const Pixel src1, |
| const uint16_t* const ma565[2], |
| const uint32_t* const b565[2], const ptrdiff_t x, |
| int p[2]) { |
| p[0] = CalculateFilteredOutput<Pixel>(src0, ma565[0][x] + ma565[1][x], |
| b565[0][x] + b565[1][x], 5); |
| p[1] = CalculateFilteredOutput<Pixel>(src1, ma565[1][x], b565[1][x], 4); |
| } |
| |
| template <typename Pixel> |
| inline int BoxFilterPass2(const Pixel src, const uint16_t* const ma343[3], |
| const uint16_t* const ma444, |
| const uint32_t* const b343[3], |
| const uint32_t* const b444, const ptrdiff_t x) { |
| const uint32_t ma = ma343[0][x] + ma444[x] + ma343[2][x]; |
| const uint32_t b = b343[0][x] + b444[x] + b343[2][x]; |
| return CalculateFilteredOutput<Pixel>(src, ma, b, 5); |
| } |
| |
| template <int bitdepth, typename Pixel> |
| inline Pixel SelfGuidedFinal(const int src, const int v) { |
| // if radius_pass_0 == 0 and radius_pass_1 == 0, the range of v is: |
| // bits(u) + bits(w0/w1/w2) + 2 = bitdepth + 13. |
| // Then, range of s is bitdepth + 2. This is a rough estimation, taking the |
| // maximum value of each element. |
| const int s = src + RightShiftWithRounding( |
| v, kSgrProjRestoreBits + kSgrProjPrecisionBits); |
| return static_cast<Pixel>(Clip3(s, 0, (1 << bitdepth) - 1)); |
| } |
| |
| template <int bitdepth, typename Pixel> |
| inline Pixel SelfGuidedDoubleMultiplier(const int src, const int filter0, |
| const int filter1, const int16_t w0, |
| const int16_t w2) { |
| const int v = w0 * filter0 + w2 * filter1; |
| return SelfGuidedFinal<bitdepth, Pixel>(src, v); |
| } |
| |
| template <int bitdepth, typename Pixel> |
| inline Pixel SelfGuidedSingleMultiplier(const int src, const int filter, |
| const int16_t w0) { |
| const int v = w0 * filter; |
| return SelfGuidedFinal<bitdepth, Pixel>(src, v); |
| } |
| |
| template <typename T> |
| void Circulate3PointersBy1(T* p[3]) { |
| T* const p0 = p[0]; |
| p[0] = p[1]; |
| p[1] = p[2]; |
| p[2] = p0; |
| } |
| |
| template <typename T> |
| void Circulate4PointersBy2(T* p[4]) { |
| std::swap(p[0], p[2]); |
| std::swap(p[1], p[3]); |
| } |
| |
| template <typename T> |
| void Circulate5PointersBy2(T* p[5]) { |
| T* const p0 = p[0]; |
| T* const p1 = p[1]; |
| p[0] = p[2]; |
| p[1] = p[3]; |
| p[2] = p[4]; |
| p[3] = p0; |
| p[4] = p1; |
| } |
| |
| template <int bitdepth, typename Pixel> |
| inline void BoxFilterProcess(const RestorationUnitInfo& restoration_info, |
| const Pixel* src, const ptrdiff_t src_stride, |
| const int width, const int height, |
| SgrBuffer* const sgr_buffer, Pixel* dst, |
| const ptrdiff_t dst_stride) { |
| const auto temp_stride = Align<ptrdiff_t>(width, 8); |
| const ptrdiff_t sum_stride = temp_stride + 8; |
| const int sgr_proj_index = restoration_info.sgr_proj_info.index; |
| const uint16_t* const scales = kSgrScaleParameter[sgr_proj_index]; // < 2^12. |
| const int16_t w0 = restoration_info.sgr_proj_info.multiplier[0]; |
| const int16_t w1 = restoration_info.sgr_proj_info.multiplier[1]; |
| const int16_t w2 = (1 << kSgrProjPrecisionBits) - w0 - w1; |
| uint16_t *sum3[4], *sum5[5], *ma343[4], *ma444[3], *ma565[2]; |
| uint32_t *square_sum3[4], *square_sum5[5], *b343[4], *b444[3], *b565[2]; |
| sum3[0] = sgr_buffer->sum3; |
| square_sum3[0] = sgr_buffer->square_sum3; |
| ma343[0] = sgr_buffer->ma343; |
| b343[0] = sgr_buffer->b343; |
| for (int i = 1; i <= 3; ++i) { |
| sum3[i] = sum3[i - 1] + sum_stride; |
| square_sum3[i] = square_sum3[i - 1] + sum_stride; |
| ma343[i] = ma343[i - 1] + temp_stride; |
| b343[i] = b343[i - 1] + temp_stride; |
| } |
| sum5[0] = sgr_buffer->sum5; |
| square_sum5[0] = sgr_buffer->square_sum5; |
| for (int i = 1; i <= 4; ++i) { |
| sum5[i] = sum5[i - 1] + sum_stride; |
| square_sum5[i] = square_sum5[i - 1] + sum_stride; |
| } |
| ma444[0] = sgr_buffer->ma444; |
| b444[0] = sgr_buffer->b444; |
| for (int i = 1; i <= 2; ++i) { |
| ma444[i] = ma444[i - 1] + temp_stride; |
| b444[i] = b444[i - 1] + temp_stride; |
| } |
| ma565[0] = sgr_buffer->ma565; |
| ma565[1] = ma565[0] + temp_stride; |
| b565[0] = sgr_buffer->b565; |
| b565[1] = b565[0] + temp_stride; |
| assert(scales[0] != 0); |
| assert(scales[1] != 0); |
| BoxSum<Pixel>(src - 2 * src_stride - 3, src_stride, 4, width + 2, sum3[0], |
| sum5[1], square_sum3[0], square_sum5[1], sum_stride); |
| memcpy(sum5[0], sum5[1], sizeof(**sum5) * sum_stride); |
| memcpy(square_sum5[0], square_sum5[1], sizeof(**square_sum5) * sum_stride); |
| BoxFilterPreProcess5<bitdepth>(sum5, square_sum5, width, scales[0], |
| sgr_buffer, ma565[0], b565[0]); |
| BoxFilterPreProcess3<bitdepth>(sum3, square_sum3, width, scales[1], false, |
| sgr_buffer, ma343[0], b343[0], nullptr, |
| nullptr); |
| BoxFilterPreProcess3<bitdepth>(sum3 + 1, square_sum3 + 1, width, scales[1], |
| true, sgr_buffer, ma343[1], b343[1], ma444[0], |
| b444[0]); |
| for (int y = height >> 1; y != 0; --y) { |
| Circulate4PointersBy2<uint16_t>(sum3); |
| Circulate4PointersBy2<uint32_t>(square_sum3); |
| Circulate5PointersBy2<uint16_t>(sum5); |
| Circulate5PointersBy2<uint32_t>(square_sum5); |
| BoxSum<Pixel>(src + 2 * src_stride - 3, src_stride, 1, width + 2, sum3[2], |
| sum5[3], square_sum3[2], square_sum5[3], sum_stride); |
| BoxSum<Pixel>(src + 3 * src_stride - 3, src_stride, 1, width + 2, sum3[3], |
| sum5[4], square_sum3[3], square_sum5[4], sum_stride); |
| BoxFilterPreProcess5<bitdepth>(sum5, square_sum5, width, scales[0], |
| sgr_buffer, ma565[1], b565[1]); |
| BoxFilterPreProcess3<bitdepth>(sum3, square_sum3, width, scales[1], true, |
| sgr_buffer, ma343[2], b343[2], ma444[1], |
| b444[1]); |
| BoxFilterPreProcess3<bitdepth>(sum3 + 1, square_sum3 + 1, width, scales[1], |
| true, sgr_buffer, ma343[3], b343[3], |
| ma444[2], b444[2]); |
| int x = 0; |
| do { |
| int p[2][2]; |
| BoxFilterPass<Pixel>(src[x], src[src_stride + x], ma565, b565, x, p[0]); |
| p[1][0] = |
| BoxFilterPass2<Pixel>(src[x], ma343, ma444[0], b343, b444[0], x); |
| p[1][1] = BoxFilterPass2<Pixel>(src[src_stride + x], ma343 + 1, ma444[1], |
| b343 + 1, b444[1], x); |
| dst[x] = SelfGuidedDoubleMultiplier<bitdepth, Pixel>(src[x], p[0][0], |
| p[1][0], w0, w2); |
| dst[dst_stride + x] = SelfGuidedDoubleMultiplier<bitdepth, Pixel>( |
| src[src_stride + x], p[0][1], p[1][1], w0, w2); |
| } while (++x != width); |
| src += 2 * src_stride; |
| dst += 2 * dst_stride; |
| Circulate4PointersBy2<uint16_t>(ma343); |
| Circulate4PointersBy2<uint32_t>(b343); |
| std::swap(ma444[0], ma444[2]); |
| std::swap(b444[0], b444[2]); |
| std::swap(ma565[0], ma565[1]); |
| std::swap(b565[0], b565[1]); |
| } |
| if ((height & 1) != 0) { |
| Circulate4PointersBy2<uint16_t>(sum3); |
| Circulate4PointersBy2<uint32_t>(square_sum3); |
| Circulate5PointersBy2<uint16_t>(sum5); |
| Circulate5PointersBy2<uint32_t>(square_sum5); |
| BoxSum<Pixel>(src + 2 * src_stride - 3, src_stride, 1, width + 2, sum3[2], |
| sum5[3], square_sum3[2], square_sum5[3], sum_stride); |
| memcpy(sum5[4], sum5[3], sizeof(**sum5) * sum_stride); |
| memcpy(square_sum5[4], square_sum5[3], sizeof(**square_sum5) * sum_stride); |
| BoxFilterPreProcess5<bitdepth>(sum5, square_sum5, width, scales[0], |
| sgr_buffer, ma565[1], b565[1]); |
| BoxFilterPreProcess3<bitdepth>(sum3, square_sum3, width, scales[1], false, |
| sgr_buffer, ma343[2], b343[2], nullptr, |
| nullptr); |
| int x = 0; |
| do { |
| const int p0 = CalculateFilteredOutput<Pixel>( |
| src[x], ma565[0][x] + ma565[1][x], b565[0][x] + b565[1][x], 5); |
| const int p1 = |
| BoxFilterPass2<Pixel>(src[x], ma343, ma444[0], b343, b444[0], x); |
| dst[x] = |
| SelfGuidedDoubleMultiplier<bitdepth, Pixel>(src[x], p0, p1, w0, w2); |
| } while (++x != width); |
| } |
| } |
| |
| template <int bitdepth, typename Pixel> |
| inline void BoxFilterProcessPass1(const RestorationUnitInfo& restoration_info, |
| const Pixel* src, const ptrdiff_t src_stride, |
| const int width, const int height, |
| SgrBuffer* const sgr_buffer, Pixel* dst, |
| const ptrdiff_t dst_stride) { |
| const auto temp_stride = Align<ptrdiff_t>(width, 8); |
| const ptrdiff_t sum_stride = temp_stride + 8; |
| const int sgr_proj_index = restoration_info.sgr_proj_info.index; |
| const uint32_t s = kSgrScaleParameter[sgr_proj_index][0]; // s < 2^12. |
| const int16_t w0 = restoration_info.sgr_proj_info.multiplier[0]; |
| uint16_t *sum5[5], *ma565[2]; |
| uint32_t *square_sum5[5], *b565[2]; |
| sum5[0] = sgr_buffer->sum5; |
| square_sum5[0] = sgr_buffer->square_sum5; |
| for (int i = 1; i <= 4; ++i) { |
| sum5[i] = sum5[i - 1] + sum_stride; |
| square_sum5[i] = square_sum5[i - 1] + sum_stride; |
| } |
| ma565[0] = sgr_buffer->ma565; |
| ma565[1] = ma565[0] + temp_stride; |
| b565[0] = sgr_buffer->b565; |
| b565[1] = b565[0] + temp_stride; |
| assert(s != 0); |
| BoxSum<Pixel, 5>(src - 2 * src_stride - 3, src_stride, 4, width + 2, sum5[1], |
| square_sum5[1], sum_stride); |
| memcpy(sum5[0], sum5[1], sizeof(**sum5) * sum_stride); |
| memcpy(square_sum5[0], square_sum5[1], sizeof(**square_sum5) * sum_stride); |
| BoxFilterPreProcess5<bitdepth>(sum5, square_sum5, width, s, sgr_buffer, |
| ma565[0], b565[0]); |
| for (int y = height >> 1; y != 0; --y) { |
| Circulate5PointersBy2<uint16_t>(sum5); |
| Circulate5PointersBy2<uint32_t>(square_sum5); |
| BoxSum<Pixel, 5>(src + 2 * src_stride - 3, src_stride, 1, width + 2, |
| sum5[3], square_sum5[3], sum_stride); |
| BoxSum<Pixel, 5>(src + 3 * src_stride - 3, src_stride, 1, width + 2, |
| sum5[4], square_sum5[4], sum_stride); |
| BoxFilterPreProcess5<bitdepth>(sum5, square_sum5, width, s, sgr_buffer, |
| ma565[1], b565[1]); |
| int x = 0; |
| do { |
| int p[2]; |
| BoxFilterPass<Pixel>(src[x], src[src_stride + x], ma565, b565, x, p); |
| dst[x] = SelfGuidedSingleMultiplier<bitdepth, Pixel>(src[x], p[0], w0); |
| dst[dst_stride + x] = SelfGuidedSingleMultiplier<bitdepth, Pixel>( |
| src[src_stride + x], p[1], w0); |
| } while (++x != width); |
| src += 2 * src_stride; |
| dst += 2 * dst_stride; |
| std::swap(ma565[0], ma565[1]); |
| std::swap(b565[0], b565[1]); |
| } |
| if ((height & 1) != 0) { |
| Circulate5PointersBy2<uint16_t>(sum5); |
| Circulate5PointersBy2<uint32_t>(square_sum5); |
| BoxSum<Pixel, 5>(src + 2 * src_stride - 3, src_stride, 1, width + 2, |
| sum5[3], square_sum5[3], sum_stride); |
| memcpy(sum5[4], sum5[3], sizeof(**sum5) * sum_stride); |
| memcpy(square_sum5[4], square_sum5[3], sizeof(**square_sum5) * sum_stride); |
| BoxFilterPreProcess5<bitdepth>(sum5, square_sum5, width, s, sgr_buffer, |
| ma565[1], b565[1]); |
| int x = 0; |
| do { |
| const int p = CalculateFilteredOutput<Pixel>( |
| src[x], ma565[0][x] + ma565[1][x], b565[0][x] + b565[1][x], 5); |
| dst[x] = SelfGuidedSingleMultiplier<bitdepth, Pixel>(src[x], p, w0); |
| } while (++x != width); |
| } |
| } |
| |
| template <int bitdepth, typename Pixel> |
| inline void BoxFilterProcessPass2(const RestorationUnitInfo& restoration_info, |
| const Pixel* src, const ptrdiff_t src_stride, |
| const int width, const int height, |
| SgrBuffer* const sgr_buffer, Pixel* dst, |
| const ptrdiff_t dst_stride) { |
| assert(restoration_info.sgr_proj_info.multiplier[0] == 0); |
| const auto temp_stride = Align<ptrdiff_t>(width, 8); |
| const ptrdiff_t sum_stride = temp_stride + 8; |
| const int16_t w1 = restoration_info.sgr_proj_info.multiplier[1]; |
| const int16_t w0 = (1 << kSgrProjPrecisionBits) - w1; |
| const int sgr_proj_index = restoration_info.sgr_proj_info.index; |
| const uint32_t s = kSgrScaleParameter[sgr_proj_index][1]; // s < 2^12. |
| uint16_t *sum3[3], *ma343[3], *ma444[2]; |
| uint32_t *square_sum3[3], *b343[3], *b444[2]; |
| sum3[0] = sgr_buffer->sum3; |
| square_sum3[0] = sgr_buffer->square_sum3; |
| ma343[0] = sgr_buffer->ma343; |
| b343[0] = sgr_buffer->b343; |
| for (int i = 1; i <= 2; ++i) { |
| sum3[i] = sum3[i - 1] + sum_stride; |
| square_sum3[i] = square_sum3[i - 1] + sum_stride; |
| ma343[i] = ma343[i - 1] + temp_stride; |
| b343[i] = b343[i - 1] + temp_stride; |
| } |
| ma444[0] = sgr_buffer->ma444; |
| ma444[1] = ma444[0] + temp_stride; |
| b444[0] = sgr_buffer->b444; |
| b444[1] = b444[0] + temp_stride; |
| assert(s != 0); |
| BoxSum<Pixel, 3>(src - 2 * src_stride - 2, src_stride, 3, width + 2, sum3[0], |
| square_sum3[0], sum_stride); |
| BoxFilterPreProcess3<bitdepth>(sum3, square_sum3, width, s, false, sgr_buffer, |
| ma343[0], b343[0], nullptr, nullptr); |
| Circulate3PointersBy1<uint16_t>(sum3); |
| Circulate3PointersBy1<uint32_t>(square_sum3); |
| BoxSum<Pixel, 3>(src + src_stride - 2, src_stride, 1, width + 2, sum3[2], |
| square_sum3[2], sum_stride); |
| BoxFilterPreProcess3<bitdepth>(sum3, square_sum3, width, s, true, sgr_buffer, |
| ma343[1], b343[1], ma444[0], b444[0]); |
| int y = height; |
| do { |
| Circulate3PointersBy1<uint16_t>(sum3); |
| Circulate3PointersBy1<uint32_t>(square_sum3); |
| BoxSum<Pixel, 3>(src + 2 * src_stride - 2, src_stride, 1, width + 2, |
| sum3[2], square_sum3[2], sum_stride); |
| BoxFilterPreProcess3<bitdepth>(sum3, square_sum3, width, s, true, |
| sgr_buffer, ma343[2], b343[2], ma444[1], |
| b444[1]); |
| int x = 0; |
| do { |
| const int p = |
| BoxFilterPass2<Pixel>(src[x], ma343, ma444[0], b343, b444[0], x); |
| dst[x] = SelfGuidedSingleMultiplier<bitdepth, Pixel>(src[x], p, w0); |
| } while (++x != width); |
| src += src_stride; |
| dst += dst_stride; |
| Circulate3PointersBy1<uint16_t>(ma343); |
| Circulate3PointersBy1<uint32_t>(b343); |
| std::swap(ma444[0], ma444[1]); |
| std::swap(b444[0], b444[1]); |
| } while (--y != 0); |
| } |
| |
| template <int bitdepth, typename Pixel> |
| void SelfGuidedFilter_C(const void* const source, void* const dest, |
| const RestorationUnitInfo& restoration_info, |
| ptrdiff_t source_stride, ptrdiff_t dest_stride, |
| int width, int height, |
| RestorationBuffer* const restoration_buffer) { |
| const int index = restoration_info.sgr_proj_info.index; |
| const int radius_pass_0 = kSgrProjParams[index][0]; // 2 or 0 |
| const int radius_pass_1 = kSgrProjParams[index][2]; // 1 or 0 |
| const auto* src = static_cast<const Pixel*>(source); |
| auto* dst = static_cast<Pixel*>(dest); |
| SgrBuffer* const sgr_buffer = &restoration_buffer->sgr_buffer; |
| if (radius_pass_1 == 0) { |
| // |radius_pass_0| and |radius_pass_1| cannot both be 0, so we have the |
| // following assertion. |
| assert(radius_pass_0 != 0); |
| BoxFilterProcessPass1<bitdepth, Pixel>(restoration_info, src, source_stride, |
| width, height, sgr_buffer, dst, |
| dest_stride); |
| } else if (radius_pass_0 == 0) { |
| BoxFilterProcessPass2<bitdepth, Pixel>(restoration_info, src, source_stride, |
| width, height, sgr_buffer, dst, |
| dest_stride); |
| } else { |
| BoxFilterProcess<bitdepth, Pixel>(restoration_info, src, source_stride, |
| width, height, sgr_buffer, dst, |
| dest_stride); |
| } |
| } |
| |
| void Init8bpp() { |
| Dsp* const dsp = dsp_internal::GetWritableDspTable(8); |
| assert(dsp != nullptr); |
| #if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS |
| dsp->loop_restorations[0] = WienerFilter_C<8, uint8_t>; |
| dsp->loop_restorations[1] = SelfGuidedFilter_C<8, uint8_t>; |
| #else // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS |
| static_cast<void>(dsp); |
| #ifndef LIBGAV1_Dsp8bpp_WienerFilter |
| dsp->loop_restorations[0] = WienerFilter_C<8, uint8_t>; |
| #endif |
| #ifndef LIBGAV1_Dsp8bpp_SelfGuidedFilter |
| dsp->loop_restorations[1] = SelfGuidedFilter_C<8, uint8_t>; |
| #endif |
| #endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS |
| } |
| |
| #if LIBGAV1_MAX_BITDEPTH >= 10 |
| |
| void Init10bpp() { |
| Dsp* const dsp = dsp_internal::GetWritableDspTable(10); |
| assert(dsp != nullptr); |
| #if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS |
| dsp->loop_restorations[0] = WienerFilter_C<10, uint16_t>; |
| dsp->loop_restorations[1] = SelfGuidedFilter_C<10, uint16_t>; |
| #else // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS |
| static_cast<void>(dsp); |
| #ifndef LIBGAV1_Dsp10bpp_WienerFilter |
| dsp->loop_restorations[0] = WienerFilter_C<10, uint16_t>; |
| #endif |
| #ifndef LIBGAV1_Dsp10bpp_SelfGuidedFilter |
| dsp->loop_restorations[1] = SelfGuidedFilter_C<10, uint16_t>; |
| #endif |
| #endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS |
| } |
| |
| #endif // LIBGAV1_MAX_BITDEPTH >= 10 |
| } // namespace |
| |
| void LoopRestorationInit_C() { |
| Init8bpp(); |
| #if LIBGAV1_MAX_BITDEPTH >= 10 |
| Init10bpp(); |
| #endif |
| } |
| |
| } // namespace dsp |
| } // namespace libgav1 |