libgav1/src/dsp/loop_restoration.cc - platform/external/libgav1 - Git at Google

 #include "src/dsp/loop_restoration.h"

 #include <algorithm>  // std::max
 #include <cassert>
 #include <cstddef>
 #include <cstdint>

 #include "src/dsp/common.h"
 #include "src/dsp/dsp.h"
 #include "src/utils/common.h"
 #include "src/utils/constants.h"

 namespace libgav1 {
 namespace dsp {
 namespace {

 // Precision of a division table (mtable)
 constexpr int kSgrProjScaleBits = 20;
 constexpr int kSgrProjReciprocalBits = 12;
 // Core selfguided restoration precision bits.
 constexpr int kSgrProjSgrBits = 8;
 // Precision bits of generated values higher than source before projection.
 constexpr int kSgrProjRestoreBits = 4;

 template <int bitdepth, typename Pixel>
 struct LoopRestorationFuncs_C {
   LoopRestorationFuncs_C() = delete;

   // |stride| for SelfGuidedFilter and WienerFilter is given in bytes.
   static void SelfGuidedFilter(const void* source, void* dest,
                                const RestorationUnitInfo& restoration_info,
                                ptrdiff_t source_stride, ptrdiff_t dest_stride,
                                int width, int height,
                                RestorationBuffer* buffer);
   static void WienerFilter(const void* source, void* dest,
                            const RestorationUnitInfo& restoration_info,
                            ptrdiff_t source_stride, ptrdiff_t dest_stride,
                            int width, int height, RestorationBuffer* buffer);
   // |stride| for box filter processing is in Pixels.
   static void BoxFilterPreProcess(const RestorationUnitInfo& restoration_info,
                                   const Pixel* src, ptrdiff_t stride, int width,
                                   int height, int pass,
                                   RestorationBuffer* buffer);
   static void BoxFilterProcess(const RestorationUnitInfo& restoration_info,
                                const Pixel* src, ptrdiff_t stride, int width,
                                int height, RestorationBuffer* buffer);
 };

 // Note: range of wiener filter coefficients.
 // Wiener filter coefficients are symmetric, and their sum is 1 (128).
 // The range of each coefficient:
 // filter[0] = filter[6], 4 bits, min = -5, max = 10.
 // filter[1] = filter[5], 5 bits, min = -23, max = 8.
 // filter[2] = filter[4], 6 bits, min = -17, max = 46.
 // filter[3] = 128 - (filter[0] + filter[1] + filter[2]) * 2.
 // The difference from libaom is that in libaom:
 // filter[3] = 0 - (filter[0] + filter[1] + filter[2]) * 2.
 // Thus in libaom's computation, an offset of 128 is needed for filter[3].
 inline void PopulateWienerCoefficients(
     const RestorationUnitInfo& restoration_info, int direction,
     int16_t* const filter) {
   filter[3] = 128;
   for (int i = 0; i < 3; ++i) {
     const int16_t coeff = restoration_info.wiener_info.filter[direction][i];
     filter[i] = coeff;
     filter[6 - i] = coeff;
     filter[3] -= MultiplyBy2(coeff);
   }
 }

 // Note: bit range for wiener filter.
 // Wiener filter process first applies horizontal filtering to input pixels,
 // followed by rounding with predefined bits (dependent on bitdepth).
 // Then vertical filtering is applied, followed by rounding (dependent on
 // bitdepth).
 // The process is the same as convolution:
 // <input> --> <horizontal filter> --> <rounding 0> --> <vertical filter>
 // --> <rounding 1>
 // By design:
 // (a). horizontal/vertical filtering adds 7 bits to input.
 // (b). The output of first rounding fits into 16 bits.
 // (c). The output of second rounding fits into 16 bits.
 // If input bitdepth > 8, the accumulator of the horizontal filter is larger
 // than 16 bit and smaller than 32 bits.
 // The accumulator of the vertical filter is larger than 16 bits and smaller
 // than 32 bits.
 template <int bitdepth, typename Pixel>
 void LoopRestorationFuncs_C<bitdepth, Pixel>::WienerFilter(
     const void* const source, void* const dest,
     const RestorationUnitInfo& restoration_info, ptrdiff_t source_stride,
     ptrdiff_t dest_stride, int width, int height,
     RestorationBuffer* const buffer) {
   constexpr int kRoundBitsHorizontal = (bitdepth == 12)
                                            ? kInterRoundBitsHorizontal12bpp
                                            : kInterRoundBitsHorizontal;
   constexpr int kRoundBitsVertical =
       (bitdepth == 12) ? kInterRoundBitsVertical12bpp : kInterRoundBitsVertical;
   int16_t filter[kSubPixelTaps - 1];
   const int limit =
       (1 << (bitdepth + 1 + kWienerFilterBits - kRoundBitsHorizontal)) - 1;
   const auto* src = static_cast<const Pixel*>(source);
   auto* dst = static_cast<Pixel*>(dest);
   source_stride /= sizeof(Pixel);
   dest_stride /= sizeof(Pixel);
   const ptrdiff_t buffer_stride = buffer->wiener_buffer_stride;
   auto* wiener_buffer = buffer->wiener_buffer;
   // horizontal filtering.
   PopulateWienerCoefficients(restoration_info, WienerInfo::kHorizontal, filter);
   const int center_tap = 3;
   src -= center_tap * source_stride + center_tap;
   const int horizontal_rounding = 1 << (bitdepth + kWienerFilterBits - 1);
   for (int y = 0; y < height + kSubPixelTaps - 2; ++y) {
     for (int x = 0; x < width; ++x) {
       // sum fits into 16 bits only when bitdepth = 8.
       int sum = horizontal_rounding;
       for (int k = 0; k < kSubPixelTaps - 1; ++k) {
         sum += filter[k] * src[x + k];
       }
       const int rounded_sum = RightShiftWithRounding(sum, kRoundBitsHorizontal);
       wiener_buffer[x] = static_cast<uint16_t>(Clip3(rounded_sum, 0, limit));
     }
     src += source_stride;
     wiener_buffer += buffer_stride;
   }
   wiener_buffer = buffer->wiener_buffer;
   // vertical filtering.
   PopulateWienerCoefficients(restoration_info, WienerInfo::kVertical, filter);
   const int vertical_rounding = -(1 << (bitdepth + kRoundBitsVertical - 1));
   for (int y = 0; y < height; ++y) {
     for (int x = 0; x < width; ++x) {
       // sum needs 32 bits.
       int sum = vertical_rounding;
       for (int k = 0; k < kSubPixelTaps - 1; ++k) {
         sum += filter[k] * wiener_buffer[k * buffer_stride + x];
       }
       const int rounded_sum = RightShiftWithRounding(sum, kRoundBitsVertical);
       dst[x] = static_cast<Pixel>(Clip3(rounded_sum, 0, (1 << bitdepth) - 1));
     }
     dst += dest_stride;
     wiener_buffer += buffer_stride;
   }
 }

 template <int bitdepth, typename Pixel>
 void LoopRestorationFuncs_C<bitdepth, Pixel>::BoxFilterPreProcess(
     const RestorationUnitInfo& restoration_info, const Pixel* const src,
     ptrdiff_t stride, int width, int height, int pass,
     RestorationBuffer* const buffer) {
   const int sgr_proj_index = restoration_info.sgr_proj_info.index;
   const uint8_t radius = kSgrProjParams[sgr_proj_index][pass * 2];
   assert(radius != 0);
   const uint32_t n = (2 * radius + 1) * (2 * radius + 1);
   // const uint8_t scale = kSgrProjParams[sgr_proj_index][pass * 2 + 1];
   // n2_with_scale: max value < 2^16. min value is 4.
   // const uint32_t n2_with_scale = n * n * scale;
   // s: max value < 2^12.
   // const uint32_t s =
   // ((1 << kSgrProjScaleBits) + (n2_with_scale >> 1)) / n2_with_scale;
   const uint32_t s = kSgrScaleParameter[sgr_proj_index][pass];
   assert(s != 0);
   const ptrdiff_t array_stride = buffer->box_filter_process_intermediate_stride;
   // The size of the intermediate result buffer is the size of the filter area
   // plus horizontal (3) and vertical (3) padding. The processing start point
   // is the filter area start point -1 row and -1 column. Therefore we need to
   // set offset and use the intermediate_result as the start point for
   // processing.
   const ptrdiff_t intermediate_buffer_offset =
       kRestorationBorder * array_stride + kRestorationBorder;
   uint32_t* intermediate_result[2] = {
       buffer->box_filter_process_intermediate[0] + intermediate_buffer_offset -
           array_stride,
       buffer->box_filter_process_intermediate[1] + intermediate_buffer_offset -
           array_stride};
   // Calculate intermediate results, including one-pixel border, for example,
   // if unit size is 64x64, we calculate 66x66 pixels.
   for (int y = -1; y <= height; ++y) {
     if (pass == 0 && ((y & 1) == 0)) {
       intermediate_result[0] += array_stride;
       intermediate_result[1] += array_stride;
       continue;
     }
     for (int x = -1; x <= width; ++x) {
       uint32_t a = 0;
       uint32_t b = 0;
       for (int dy = -radius; dy <= radius; ++dy) {
         for (int dx = -radius; dx <= radius; ++dx) {
           const Pixel source = src[(y + dy) * stride + (x + dx)];
           // TODO(chengchen): Use boxsum for fast calculation.
           a += source * source;
           b += source;
         }
       }
       // a: before shift, max is 25 * (2^(bitdepth) - 1) * (2^(bitdepth) - 1).
       // since max bitdepth = 12, max < 2^31.
       // after shift, a < 2^16 * n < 2^22 regardless of bitdepth
       a = RightShiftWithRounding(a, (bitdepth - 8) << 1);
       // b: max is 25 * (2^(bitdepth) - 1). If bitdepth = 12, max < 2^19.
       // d < 2^8 * n < 2^14 regardless of bitdepth
       const uint32_t d = RightShiftWithRounding(b, bitdepth - 8);
       // p: Each term in calculating p = a * n - b * b is < 2^16 * n^2 < 2^28,
       // and p itself satisfies p < 2^14 * n^2 < 2^26.
       // This bound on p is due to:
       // https://en.wikipedia.org/wiki/Popoviciu's_inequality_on_variances
       // Note: Sometimes, in high bitdepth, we can end up with a*n < b*b.
       // This is an artifact of rounding, and can only happen if all pixels
       // are (almost) identical, so in this case we saturate to p=0.
       const uint32_t p = (a * n < d * d) ? 0 : a * n - d * d;
       // p * s < (2^14 * n^2) * round(2^20 / (n^2 * scale)) < 2^34 / scale <
       // 2^32 as long as scale >= 4. So p * s fits into a uint32_t, and z < 2^12
       // (this holds even after accounting for the rounding in s)
       const uint32_t z = RightShiftWithRounding(p * s, kSgrProjScaleBits);
       // a2: range [1, 256].
       uint32_t a2;
       if (z >= 255) {
         a2 = 256;
       } else if (z == 0) {
         a2 = 1;
       } else {
         a2 = ((z << kSgrProjSgrBits) + (z >> 1)) / (z + 1);
       }
       const uint32_t one_over_n =
           ((1 << kSgrProjReciprocalBits) + (n >> 1)) / n;
       // (kSgrProjSgrBits - a2) < 2^8, b < 2^(bitdepth) * n,
       // one_over_n = round(2^12 / n)
       // => the product here is < 2^(20 + bitdepth) <= 2^32,
       // and b is set to a value < 2^(8 + bitdepth).
       // This holds even with the rounding in one_over_n and in the overall
       // result, as long as (kSgrProjSgrBits - a2) is strictly less than 2^8.
       const uint32_t b2 = ((1 << kSgrProjSgrBits) - a2) * b * one_over_n;
       intermediate_result[0][x] = a2;
       intermediate_result[1][x] =
           RightShiftWithRounding(b2, kSgrProjReciprocalBits);
     }
     intermediate_result[0] += array_stride;
     intermediate_result[1] += array_stride;
   }
 }

 template <int bitdepth, typename Pixel>
 void LoopRestorationFuncs_C<bitdepth, Pixel>::BoxFilterProcess(
     const RestorationUnitInfo& restoration_info, const Pixel* src,
     ptrdiff_t stride, int width, int height, RestorationBuffer* const buffer) {
   const int sgr_proj_index = restoration_info.sgr_proj_info.index;
   for (int pass = 0; pass < 2; ++pass) {
     const uint8_t radius = kSgrProjParams[sgr_proj_index][pass * 2];
     const Pixel* src_ptr = src;
     if (radius == 0) continue;
     LoopRestorationFuncs_C<bitdepth, Pixel>::BoxFilterPreProcess(
         restoration_info, src_ptr, stride, width, height, pass, buffer);

     int* filtered_output = buffer->box_filter_process_output[pass];
     const ptrdiff_t filtered_output_stride =
         buffer->box_filter_process_output_stride;
     const ptrdiff_t intermediate_stride =
         buffer->box_filter_process_intermediate_stride;
     // Set intermediate buffer start point to the actual start point of
     // filtering.
     const ptrdiff_t intermediate_buffer_offset =
         kRestorationBorder * intermediate_stride + kRestorationBorder;
     for (int y = 0; y < height; ++y) {
       const int shift = (pass == 0 && (y & 1) != 0) ? 4 : 5;
       uint32_t* const array_start[2] = {
           buffer->box_filter_process_intermediate[0] +
               intermediate_buffer_offset + y * intermediate_stride,
           buffer->box_filter_process_intermediate[1] +
               intermediate_buffer_offset + y * intermediate_stride};
       for (int x = 0; x < width; ++x) {
         uint32_t a = 0;
         uint32_t b = 0;
         uint32_t* intermediate_result[2] = {
             array_start[0] - intermediate_stride,
             array_start[1] - intermediate_stride};
         for (int dy = -1; dy <= 1; ++dy) {
           for (int dx = -1; dx <= 1; ++dx) {
             int weight;
             if (pass == 0) {
               if (((y + dy) & 1) != 0) {
                 weight = (dx == 0) ? 6 : 5;
               } else {
                 continue;
               }
             } else {
               weight = ((dx & dy) == 0) ? 4 : 3;
             }
             // intermediate_result[0]: range [1, 256].
             // intermediate_result[1] < 2^20.
             a += weight * intermediate_result[0][x + dx];
             b += weight * intermediate_result[1][x + dx];
           }
           intermediate_result[0] += intermediate_stride;
           intermediate_result[1] += intermediate_stride;
         }
         // v < 2^32. All intermediate calculations are positive.
         const uint32_t v = a * src_ptr[x] + b;
         filtered_output[x] = RightShiftWithRounding(
             v, kSgrProjSgrBits + shift - kSgrProjRestoreBits);
       }
       src_ptr += stride;
       filtered_output += filtered_output_stride;
     }
   }
 }

 // Assume box_filter_process_output[2] are allocated before calling
 // this function. Their sizes are width * height, stride equals width.
 template <int bitdepth, typename Pixel>
 void LoopRestorationFuncs_C<bitdepth, Pixel>::SelfGuidedFilter(
     const void* const source, void* const dest,
     const RestorationUnitInfo& restoration_info, ptrdiff_t source_stride,
     ptrdiff_t dest_stride, int width, int height,
     RestorationBuffer* const buffer) {
   const int w0 = restoration_info.sgr_proj_info.multiplier[0];
   const int w1 = restoration_info.sgr_proj_info.multiplier[1];
   const int w2 = (1 << kSgrProjPrecisionBits) - w0 - w1;
   const int index = restoration_info.sgr_proj_info.index;
   const uint8_t r0 = kSgrProjParams[index][0];
   const uint8_t r1 = kSgrProjParams[index][2];
   const ptrdiff_t array_stride = buffer->box_filter_process_output_stride;
   int* box_filter_process_output[2] = {buffer->box_filter_process_output[0],
                                        buffer->box_filter_process_output[1]};
   const auto* src = static_cast<const Pixel*>(source);
   auto* dst = static_cast<Pixel*>(dest);
   source_stride /= sizeof(Pixel);
   dest_stride /= sizeof(Pixel);
   LoopRestorationFuncs_C<bitdepth, Pixel>::BoxFilterProcess(
       restoration_info, src, source_stride, width, height, buffer);
   for (int y = 0; y < height; ++y) {
     for (int x = 0; x < width; ++x) {
       const int u = src[x] << kSgrProjRestoreBits;
       int v = w1 * u;
       if (r0 != 0) {
         v += w0 * box_filter_process_output[0][x];
       } else {
         v += w0 * u;
       }
       if (r1 != 0) {
         v += w2 * box_filter_process_output[1][x];
       } else {
         v += w2 * u;
       }
       // if r0 == 0 and r1 == 0, the range of v is:
       // bits(u) + bits(w0/w1/w2) + 2 = bitdepth + 13.
       // Then, range of s is bitdepth + 2. This is a rough estimation, taking
       // the maximum value of each element.
       const int s = RightShiftWithRounding(
           v, kSgrProjRestoreBits + kSgrProjPrecisionBits);
       dst[x] = static_cast<Pixel>(Clip3(s, 0, (1 << bitdepth) - 1));
     }
     src += source_stride;
     dst += dest_stride;
     box_filter_process_output[0] += array_stride;
     box_filter_process_output[1] += array_stride;
   }
 }

 void Init8bpp() {
   Dsp* const dsp = dsp_internal::GetWritableDspTable(8);
   assert(dsp != nullptr);
 #if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
   dsp->loop_restorations[0] = LoopRestorationFuncs_C<8, uint8_t>::WienerFilter;
   dsp->loop_restorations[1] =
       LoopRestorationFuncs_C<8, uint8_t>::SelfGuidedFilter;
 #else  // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
   static_cast<void>(dsp);
 #ifndef LIBGAV1_Dsp8bpp_WienerFilter
   dsp->loop_restorations[0] = LoopRestorationFuncs_C<8, uint8_t>::WienerFilter;
 #endif
 #ifndef LIBGAV1_Dsp8bpp_SelfGuidedFilter
   dsp->loop_restorations[1] =
       LoopRestorationFuncs_C<8, uint8_t>::SelfGuidedFilter;
 #endif
 #endif  // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
 }

 #if LIBGAV1_MAX_BITDEPTH >= 10

 void Init10bpp() {
   Dsp* const dsp = dsp_internal::GetWritableDspTable(10);
   assert(dsp != nullptr);
 #if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
   dsp->loop_restorations[0] =
       LoopRestorationFuncs_C<10, uint16_t>::WienerFilter;
   dsp->loop_restorations[1] =
       LoopRestorationFuncs_C<10, uint16_t>::SelfGuidedFilter;
 #else  // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
   static_cast<void>(dsp);
 #ifndef LIBGAV1_Dsp10bpp_WienerFilter
   dsp->loop_restorations[0] =
       LoopRestorationFuncs_C<10, uint16_t>::WienerFilter;
 #endif
 #ifndef LIBGAV1_Dsp10bpp_SelfGuidedFilter
   dsp->loop_restorations[1] =
       LoopRestorationFuncs_C<10, uint16_t>::SelfGuidedFilter;
 #endif
 #endif  // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
 }

 #endif  // LIBGAV1_MAX_BITDEPTH >= 10
 }  // namespace

 void LoopRestorationInit_C() {
   Init8bpp();
 #if LIBGAV1_MAX_BITDEPTH >= 10
   Init10bpp();
 #endif
   // Local functions that may be unused depending on the optimizations
   // available.
   static_cast<void>(PopulateWienerCoefficients);
 }

 }  // namespace dsp
 }  // namespace libgav1
	#include "src/dsp/loop_restoration.h"

	#include <algorithm> // std::max
	#include <cassert>
	#include <cstddef>
	#include <cstdint>

	#include "src/dsp/common.h"
	#include "src/dsp/dsp.h"
	#include "src/utils/common.h"
	#include "src/utils/constants.h"

	namespace libgav1 {
	namespace dsp {
	namespace {

	// Precision of a division table (mtable)
	constexpr int kSgrProjScaleBits = 20;
	constexpr int kSgrProjReciprocalBits = 12;
	// Core selfguided restoration precision bits.
	constexpr int kSgrProjSgrBits = 8;
	// Precision bits of generated values higher than source before projection.
	constexpr int kSgrProjRestoreBits = 4;

	template <int bitdepth, typename Pixel>
	struct LoopRestorationFuncs_C {
	LoopRestorationFuncs_C() = delete;

	// \|stride\| for SelfGuidedFilter and WienerFilter is given in bytes.
	static void SelfGuidedFilter(const void* source, void* dest,
	const RestorationUnitInfo& restoration_info,
	ptrdiff_t source_stride, ptrdiff_t dest_stride,
	int width, int height,
	RestorationBuffer* buffer);
	static void WienerFilter(const void* source, void* dest,
	const RestorationUnitInfo& restoration_info,
	ptrdiff_t source_stride, ptrdiff_t dest_stride,
	int width, int height, RestorationBuffer* buffer);
	// \|stride\| for box filter processing is in Pixels.
	static void BoxFilterPreProcess(const RestorationUnitInfo& restoration_info,
	const Pixel* src, ptrdiff_t stride, int width,
	int height, int pass,
	RestorationBuffer* buffer);
	static void BoxFilterProcess(const RestorationUnitInfo& restoration_info,
	const Pixel* src, ptrdiff_t stride, int width,
	int height, RestorationBuffer* buffer);
	};

	// Note: range of wiener filter coefficients.
	// Wiener filter coefficients are symmetric, and their sum is 1 (128).
	// The range of each coefficient:
	// filter[0] = filter[6], 4 bits, min = -5, max = 10.
	// filter[1] = filter[5], 5 bits, min = -23, max = 8.
	// filter[2] = filter[4], 6 bits, min = -17, max = 46.
	// filter[3] = 128 - (filter[0] + filter[1] + filter[2]) * 2.
	// The difference from libaom is that in libaom:
	// filter[3] = 0 - (filter[0] + filter[1] + filter[2]) * 2.
	// Thus in libaom's computation, an offset of 128 is needed for filter[3].
	inline void PopulateWienerCoefficients(
	const RestorationUnitInfo& restoration_info, int direction,
	int16_t* const filter) {
	filter[3] = 128;
	for (int i = 0; i < 3; ++i) {
	const int16_t coeff = restoration_info.wiener_info.filter[direction][i];
	filter[i] = coeff;
	filter[6 - i] = coeff;
	filter[3] -= MultiplyBy2(coeff);
	}
	}

	// Note: bit range for wiener filter.
	// Wiener filter process first applies horizontal filtering to input pixels,
	// followed by rounding with predefined bits (dependent on bitdepth).
	// Then vertical filtering is applied, followed by rounding (dependent on
	// bitdepth).
	// The process is the same as convolution:
	// <input> --> <horizontal filter> --> <rounding 0> --> <vertical filter>
	// --> <rounding 1>
	// By design:
	// (a). horizontal/vertical filtering adds 7 bits to input.
	// (b). The output of first rounding fits into 16 bits.
	// (c). The output of second rounding fits into 16 bits.
	// If input bitdepth > 8, the accumulator of the horizontal filter is larger
	// than 16 bit and smaller than 32 bits.
	// The accumulator of the vertical filter is larger than 16 bits and smaller
	// than 32 bits.
	template <int bitdepth, typename Pixel>
	void LoopRestorationFuncs_C<bitdepth, Pixel>::WienerFilter(
	const void* const source, void* const dest,
	const RestorationUnitInfo& restoration_info, ptrdiff_t source_stride,
	ptrdiff_t dest_stride, int width, int height,
	RestorationBuffer* const buffer) {
	constexpr int kRoundBitsHorizontal = (bitdepth == 12)
	? kInterRoundBitsHorizontal12bpp
	: kInterRoundBitsHorizontal;
	constexpr int kRoundBitsVertical =
	(bitdepth == 12) ? kInterRoundBitsVertical12bpp : kInterRoundBitsVertical;
	int16_t filter[kSubPixelTaps - 1];
	const int limit =
	(1 << (bitdepth + 1 + kWienerFilterBits - kRoundBitsHorizontal)) - 1;
	const auto* src = static_cast<const Pixel*>(source);
	auto* dst = static_cast<Pixel*>(dest);
	source_stride /= sizeof(Pixel);
	dest_stride /= sizeof(Pixel);
	const ptrdiff_t buffer_stride = buffer->wiener_buffer_stride;
	auto* wiener_buffer = buffer->wiener_buffer;
	// horizontal filtering.
	PopulateWienerCoefficients(restoration_info, WienerInfo::kHorizontal, filter);
	const int center_tap = 3;
	src -= center_tap * source_stride + center_tap;
	const int horizontal_rounding = 1 << (bitdepth + kWienerFilterBits - 1);
	for (int y = 0; y < height + kSubPixelTaps - 2; ++y) {
	for (int x = 0; x < width; ++x) {
	// sum fits into 16 bits only when bitdepth = 8.
	int sum = horizontal_rounding;
	for (int k = 0; k < kSubPixelTaps - 1; ++k) {
	sum += filter[k] * src[x + k];
	}
	const int rounded_sum = RightShiftWithRounding(sum, kRoundBitsHorizontal);
	wiener_buffer[x] = static_cast<uint16_t>(Clip3(rounded_sum, 0, limit));
	}
	src += source_stride;
	wiener_buffer += buffer_stride;
	}
	wiener_buffer = buffer->wiener_buffer;
	// vertical filtering.
	PopulateWienerCoefficients(restoration_info, WienerInfo::kVertical, filter);
	const int vertical_rounding = -(1 << (bitdepth + kRoundBitsVertical - 1));
	for (int y = 0; y < height; ++y) {
	for (int x = 0; x < width; ++x) {
	// sum needs 32 bits.
	int sum = vertical_rounding;
	for (int k = 0; k < kSubPixelTaps - 1; ++k) {
	sum += filter[k] * wiener_buffer[k * buffer_stride + x];
	}
	const int rounded_sum = RightShiftWithRounding(sum, kRoundBitsVertical);
	dst[x] = static_cast<Pixel>(Clip3(rounded_sum, 0, (1 << bitdepth) - 1));
	}
	dst += dest_stride;
	wiener_buffer += buffer_stride;
	}
	}

	template <int bitdepth, typename Pixel>
	void LoopRestorationFuncs_C<bitdepth, Pixel>::BoxFilterPreProcess(
	const RestorationUnitInfo& restoration_info, const Pixel* const src,
	ptrdiff_t stride, int width, int height, int pass,
	RestorationBuffer* const buffer) {
	const int sgr_proj_index = restoration_info.sgr_proj_info.index;
	const uint8_t radius = kSgrProjParams[sgr_proj_index][pass * 2];
	assert(radius != 0);
	const uint32_t n = (2 * radius + 1) * (2 * radius + 1);
	// const uint8_t scale = kSgrProjParams[sgr_proj_index][pass * 2 + 1];
	// n2_with_scale: max value < 2^16. min value is 4.
	// const uint32_t n2_with_scale = n * n * scale;
	// s: max value < 2^12.
	// const uint32_t s =
	// ((1 << kSgrProjScaleBits) + (n2_with_scale >> 1)) / n2_with_scale;
	const uint32_t s = kSgrScaleParameter[sgr_proj_index][pass];
	assert(s != 0);
	const ptrdiff_t array_stride = buffer->box_filter_process_intermediate_stride;
	// The size of the intermediate result buffer is the size of the filter area
	// plus horizontal (3) and vertical (3) padding. The processing start point
	// is the filter area start point -1 row and -1 column. Therefore we need to
	// set offset and use the intermediate_result as the start point for
	// processing.
	const ptrdiff_t intermediate_buffer_offset =
	kRestorationBorder * array_stride + kRestorationBorder;
	uint32_t* intermediate_result[2] = {
	buffer->box_filter_process_intermediate[0] + intermediate_buffer_offset -
	array_stride,
	buffer->box_filter_process_intermediate[1] + intermediate_buffer_offset -
	array_stride};
	// Calculate intermediate results, including one-pixel border, for example,
	// if unit size is 64x64, we calculate 66x66 pixels.
	for (int y = -1; y <= height; ++y) {
	if (pass == 0 && ((y & 1) == 0)) {
	intermediate_result[0] += array_stride;
	intermediate_result[1] += array_stride;
	continue;
	}
	for (int x = -1; x <= width; ++x) {
	uint32_t a = 0;
	uint32_t b = 0;
	for (int dy = -radius; dy <= radius; ++dy) {
	for (int dx = -radius; dx <= radius; ++dx) {
	const Pixel source = src[(y + dy) * stride + (x + dx)];
	// TODO(chengchen): Use boxsum for fast calculation.
	a += source * source;
	b += source;
	}
	}
	// a: before shift, max is 25 * (2^(bitdepth) - 1) * (2^(bitdepth) - 1).
	// since max bitdepth = 12, max < 2^31.
	// after shift, a < 2^16 * n < 2^22 regardless of bitdepth
	a = RightShiftWithRounding(a, (bitdepth - 8) << 1);
	// b: max is 25 * (2^(bitdepth) - 1). If bitdepth = 12, max < 2^19.
	// d < 2^8 * n < 2^14 regardless of bitdepth
	const uint32_t d = RightShiftWithRounding(b, bitdepth - 8);
	// p: Each term in calculating p = a * n - b * b is < 2^16 * n^2 < 2^28,
	// and p itself satisfies p < 2^14 * n^2 < 2^26.
	// This bound on p is due to:
	// https://en.wikipedia.org/wiki/Popoviciu's_inequality_on_variances
	// Note: Sometimes, in high bitdepth, we can end up with an < bb.
	// This is an artifact of rounding, and can only happen if all pixels
	// are (almost) identical, so in this case we saturate to p=0.
	const uint32_t p = (a * n < d * d) ? 0 : a * n - d * d;
	// p * s < (2^14 * n^2) * round(2^20 / (n^2 * scale)) < 2^34 / scale <
	// 2^32 as long as scale >= 4. So p * s fits into a uint32_t, and z < 2^12
	// (this holds even after accounting for the rounding in s)
	const uint32_t z = RightShiftWithRounding(p * s, kSgrProjScaleBits);
	// a2: range [1, 256].
	uint32_t a2;
	if (z >= 255) {
	a2 = 256;
	} else if (z == 0) {
	a2 = 1;
	} else {
	a2 = ((z << kSgrProjSgrBits) + (z >> 1)) / (z + 1);
	}
	const uint32_t one_over_n =
	((1 << kSgrProjReciprocalBits) + (n >> 1)) / n;
	// (kSgrProjSgrBits - a2) < 2^8, b < 2^(bitdepth) * n,
	// one_over_n = round(2^12 / n)
	// => the product here is < 2^(20 + bitdepth) <= 2^32,
	// and b is set to a value < 2^(8 + bitdepth).
	// This holds even with the rounding in one_over_n and in the overall
	// result, as long as (kSgrProjSgrBits - a2) is strictly less than 2^8.
	const uint32_t b2 = ((1 << kSgrProjSgrBits) - a2) * b * one_over_n;
	intermediate_result[0][x] = a2;
	intermediate_result[1][x] =
	RightShiftWithRounding(b2, kSgrProjReciprocalBits);
	}
	intermediate_result[0] += array_stride;
	intermediate_result[1] += array_stride;
	}
	}

	template <int bitdepth, typename Pixel>
	void LoopRestorationFuncs_C<bitdepth, Pixel>::BoxFilterProcess(
	const RestorationUnitInfo& restoration_info, const Pixel* src,
	ptrdiff_t stride, int width, int height, RestorationBuffer* const buffer) {
	const int sgr_proj_index = restoration_info.sgr_proj_info.index;
	for (int pass = 0; pass < 2; ++pass) {
	const uint8_t radius = kSgrProjParams[sgr_proj_index][pass * 2];
	const Pixel* src_ptr = src;
	if (radius == 0) continue;
	LoopRestorationFuncs_C<bitdepth, Pixel>::BoxFilterPreProcess(
	restoration_info, src_ptr, stride, width, height, pass, buffer);

	int* filtered_output = buffer->box_filter_process_output[pass];
	const ptrdiff_t filtered_output_stride =
	buffer->box_filter_process_output_stride;
	const ptrdiff_t intermediate_stride =
	buffer->box_filter_process_intermediate_stride;
	// Set intermediate buffer start point to the actual start point of
	// filtering.
	const ptrdiff_t intermediate_buffer_offset =
	kRestorationBorder * intermediate_stride + kRestorationBorder;
	for (int y = 0; y < height; ++y) {
	const int shift = (pass == 0 && (y & 1) != 0) ? 4 : 5;
	uint32_t* const array_start[2] = {
	buffer->box_filter_process_intermediate[0] +
	intermediate_buffer_offset + y * intermediate_stride,
	buffer->box_filter_process_intermediate[1] +
	intermediate_buffer_offset + y * intermediate_stride};
	for (int x = 0; x < width; ++x) {
	uint32_t a = 0;
	uint32_t b = 0;
	uint32_t* intermediate_result[2] = {
	array_start[0] - intermediate_stride,
	array_start[1] - intermediate_stride};
	for (int dy = -1; dy <= 1; ++dy) {
	for (int dx = -1; dx <= 1; ++dx) {
	int weight;
	if (pass == 0) {
	if (((y + dy) & 1) != 0) {
	weight = (dx == 0) ? 6 : 5;
	} else {
	continue;
	}
	} else {
	weight = ((dx & dy) == 0) ? 4 : 3;
	}
	// intermediate_result[0]: range [1, 256].
	// intermediate_result[1] < 2^20.
	a += weight * intermediate_result[0][x + dx];
	b += weight * intermediate_result[1][x + dx];
	}
	intermediate_result[0] += intermediate_stride;
	intermediate_result[1] += intermediate_stride;
	}
	// v < 2^32. All intermediate calculations are positive.
	const uint32_t v = a * src_ptr[x] + b;
	filtered_output[x] = RightShiftWithRounding(
	v, kSgrProjSgrBits + shift - kSgrProjRestoreBits);
	}
	src_ptr += stride;
	filtered_output += filtered_output_stride;
	}
	}
	}

	// Assume box_filter_process_output[2] are allocated before calling
	// this function. Their sizes are width * height, stride equals width.
	template <int bitdepth, typename Pixel>
	void LoopRestorationFuncs_C<bitdepth, Pixel>::SelfGuidedFilter(
	const void* const source, void* const dest,
	const RestorationUnitInfo& restoration_info, ptrdiff_t source_stride,
	ptrdiff_t dest_stride, int width, int height,
	RestorationBuffer* const buffer) {
	const int w0 = restoration_info.sgr_proj_info.multiplier[0];
	const int w1 = restoration_info.sgr_proj_info.multiplier[1];
	const int w2 = (1 << kSgrProjPrecisionBits) - w0 - w1;
	const int index = restoration_info.sgr_proj_info.index;
	const uint8_t r0 = kSgrProjParams[index][0];
	const uint8_t r1 = kSgrProjParams[index][2];
	const ptrdiff_t array_stride = buffer->box_filter_process_output_stride;
	int* box_filter_process_output[2] = {buffer->box_filter_process_output[0],
	buffer->box_filter_process_output[1]};
	const auto* src = static_cast<const Pixel*>(source);
	auto* dst = static_cast<Pixel*>(dest);
	source_stride /= sizeof(Pixel);
	dest_stride /= sizeof(Pixel);
	LoopRestorationFuncs_C<bitdepth, Pixel>::BoxFilterProcess(
	restoration_info, src, source_stride, width, height, buffer);
	for (int y = 0; y < height; ++y) {
	for (int x = 0; x < width; ++x) {
	const int u = src[x] << kSgrProjRestoreBits;
	int v = w1 * u;
	if (r0 != 0) {
	v += w0 * box_filter_process_output[0][x];
	} else {
	v += w0 * u;
	}
	if (r1 != 0) {
	v += w2 * box_filter_process_output[1][x];
	} else {
	v += w2 * u;
	}
	// if r0 == 0 and r1 == 0, the range of v is:
	// bits(u) + bits(w0/w1/w2) + 2 = bitdepth + 13.
	// Then, range of s is bitdepth + 2. This is a rough estimation, taking
	// the maximum value of each element.
	const int s = RightShiftWithRounding(
	v, kSgrProjRestoreBits + kSgrProjPrecisionBits);
	dst[x] = static_cast<Pixel>(Clip3(s, 0, (1 << bitdepth) - 1));
	}
	src += source_stride;
	dst += dest_stride;
	box_filter_process_output[0] += array_stride;
	box_filter_process_output[1] += array_stride;
	}
	}

	void Init8bpp() {
	Dsp* const dsp = dsp_internal::GetWritableDspTable(8);
	assert(dsp != nullptr);
	#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
	dsp->loop_restorations[0] = LoopRestorationFuncs_C<8, uint8_t>::WienerFilter;
	dsp->loop_restorations[1] =
	LoopRestorationFuncs_C<8, uint8_t>::SelfGuidedFilter;
	#else // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
	static_cast<void>(dsp);
	#ifndef LIBGAV1_Dsp8bpp_WienerFilter
	dsp->loop_restorations[0] = LoopRestorationFuncs_C<8, uint8_t>::WienerFilter;
	#endif
	#ifndef LIBGAV1_Dsp8bpp_SelfGuidedFilter
	dsp->loop_restorations[1] =
	LoopRestorationFuncs_C<8, uint8_t>::SelfGuidedFilter;
	#endif
	#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
	}

	#if LIBGAV1_MAX_BITDEPTH >= 10

	void Init10bpp() {
	Dsp* const dsp = dsp_internal::GetWritableDspTable(10);
	assert(dsp != nullptr);
	#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
	dsp->loop_restorations[0] =
	LoopRestorationFuncs_C<10, uint16_t>::WienerFilter;
	dsp->loop_restorations[1] =
	LoopRestorationFuncs_C<10, uint16_t>::SelfGuidedFilter;
	#else // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
	static_cast<void>(dsp);
	#ifndef LIBGAV1_Dsp10bpp_WienerFilter
	dsp->loop_restorations[0] =
	LoopRestorationFuncs_C<10, uint16_t>::WienerFilter;
	#endif
	#ifndef LIBGAV1_Dsp10bpp_SelfGuidedFilter
	dsp->loop_restorations[1] =
	LoopRestorationFuncs_C<10, uint16_t>::SelfGuidedFilter;
	#endif
	#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
	}

	#endif // LIBGAV1_MAX_BITDEPTH >= 10
	} // namespace

	void LoopRestorationInit_C() {
	Init8bpp();
	#if LIBGAV1_MAX_BITDEPTH >= 10
	Init10bpp();
	#endif
	// Local functions that may be unused depending on the optimizations
	// available.
	static_cast<void>(PopulateWienerCoefficients);
	}

	} // namespace dsp
	} // namespace libgav1