| // Copyright 2019 The libgav1 Authors |
| // |
| // Licensed under the Apache License, Version 2.0 (the "License"); |
| // you may not use this file except in compliance with the License. |
| // You may obtain a copy of the License at |
| // |
| // http://www.apache.org/licenses/LICENSE-2.0 |
| // |
| // Unless required by applicable law or agreed to in writing, software |
| // distributed under the License is distributed on an "AS IS" BASIS, |
| // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| // See the License for the specific language governing permissions and |
| // limitations under the License. |
| |
| #include "src/dsp/dsp.h" |
| #include "src/dsp/loop_restoration.h" |
| |
| #if LIBGAV1_ENABLE_NEON |
| |
| #include <arm_neon.h> |
| |
| #include <cassert> |
| #include <cstddef> |
| #include <cstdint> |
| |
| #include "src/dsp/arm/common_neon.h" |
| #include "src/utils/constants.h" |
| |
| namespace libgav1 { |
| namespace dsp { |
| namespace low_bitdepth { |
| namespace { |
| |
| // Wiener |
| inline void PopulateWienerCoefficients( |
| const RestorationUnitInfo& restoration_info, const int direction, |
| int16_t* const filter) { |
| // In order to keep the horizontal pass intermediate values within 16 bits, |
| // we initialize |filter[3]| to 0 instead of 128. |
| filter[3] = 0; |
| for (int i = 0; i < 3; ++i) { |
| const int16_t coeff = restoration_info.wiener_info.filter[direction][i]; |
| filter[i] = coeff; |
| filter[6 - i] = coeff; |
| filter[3] -= coeff * 2; |
| } |
| } |
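| |
| // A worked example with hypothetical (purely illustrative) coefficients: |
| // |wiener_info.filter[direction]| = {1, -5, 17} expands to |
| // filter = {1, -5, 17, -26, 17, -5, 1}, whose taps sum to 0 rather than 128 |
| // because of the |filter[3]| offset above. HorizontalSum() and the vertical |
| // pass each compensate for the missing 128. |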
| |
| inline int16x8_t HorizontalSum(const uint8x8_t a[7], int16_t filter[7]) { |
| int16x8_t sum = vmulq_n_s16(vreinterpretq_s16_u16(vmovl_u8(a[0])), filter[0]); |
| sum = vaddq_s16( |
| sum, vmulq_n_s16(vreinterpretq_s16_u16(vmovl_u8(a[1])), filter[1])); |
| sum = vaddq_s16( |
| sum, vmulq_n_s16(vreinterpretq_s16_u16(vmovl_u8(a[2])), filter[2])); |
| sum = vaddq_s16( |
| sum, vmulq_n_s16(vreinterpretq_s16_u16(vmovl_u8(a[3])), filter[3])); |
| sum = vaddq_s16( |
| sum, vmulq_n_s16(vreinterpretq_s16_u16(vmovl_u8(a[4])), filter[4])); |
| sum = vaddq_s16( |
| sum, vmulq_n_s16(vreinterpretq_s16_u16(vmovl_u8(a[5])), filter[5])); |
| sum = vaddq_s16( |
| sum, vmulq_n_s16(vreinterpretq_s16_u16(vmovl_u8(a[6])), filter[6])); |
| |
| sum = vrshrq_n_s16(sum, kInterRoundBitsHorizontal); |
| |
| // Delaying |horizontal_rounding| until after downshifting allows the sum to |
| // stay in 16 bits. |
| // |horizontal_rounding| = 1 << (bitdepth + kWienerFilterBits - 1) |
| // 1 << ( 8 + 7 - 1) |
| // Shifted right by |kInterRoundBitsHorizontal| it works out to |
| // (1 << 14) >> 3 == 1 << 11. |
| sum = vaddq_s16(sum, vdupq_n_s16(1 << 11)); |
| |
| // Just like |horizontal_rounding|, adding |filter[3]| at this point allows |
| // the sum to stay in 16 bits. |
| // |filter[3]| was initialized to 0 instead of 128, so the sum above is |
| // missing a |src[3]| * 128 term. After the downshift that term is |
| // |src[3]| * 128 >> 3 == |src[3]| << 4. Restore it here: |
| sum = vaddq_s16(sum, vreinterpretq_s16_u16(vshll_n_u8(a[3], 4))); |
| |
| // Saturate to |
| // [0, |
| // (1 << (bitdepth + 1 + kWienerFilterBits - kInterRoundBitsHorizontal)) - 1] |
| // (1 << ( 8 + 1 + 7 - 3)) - 1 == 8191 |
| sum = vminq_s16(sum, vdupq_n_s16((1 << 13) - 1)); |
| sum = vmaxq_s16(sum, vdupq_n_s16(0)); |
| return sum; |
| } |
| |
| template <int min_width> |
| inline void VerticalSum(const int16_t* src_base, const ptrdiff_t src_stride, |
| uint8_t* dst_base, const ptrdiff_t dst_stride, |
| const int16x4_t filter[7], const int width, |
| const int height) { |
| static_assert(min_width == 4 || min_width == 8, ""); |
| // -(1 << (bitdepth + kInterRoundBitsVertical - 1)) |
| // -(1 << ( 8 + 11 - 1)) |
| constexpr int vertical_rounding = -(1 << 18); |
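| // Equivalently, this cancels the |horizontal_rounding| bias carried in from |
| // the horizontal pass: each input value carries a bias of 1 << 11 and the |
| // seven vertical taps sum to 128 (after the caller adds 128 to |filter[3]|), |
| // so the accumulated bias per output is (1 << 11) * 128 == 1 << 18. |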
| if (min_width == 8) { |
| int x = 0; |
| do { |
| const int16_t* src = src_base + x; |
| uint8_t* dst = dst_base + x; |
| int16x8_t a[7]; |
| a[0] = vld1q_s16(src); |
| src += src_stride; |
| a[1] = vld1q_s16(src); |
| src += src_stride; |
| a[2] = vld1q_s16(src); |
| src += src_stride; |
| a[3] = vld1q_s16(src); |
| src += src_stride; |
| a[4] = vld1q_s16(src); |
| src += src_stride; |
| a[5] = vld1q_s16(src); |
| src += src_stride; |
| |
| int y = 0; |
| do { |
| a[6] = vld1q_s16(src); |
| src += src_stride; |
| |
| int32x4_t sum_lo = vdupq_n_s32(vertical_rounding); |
| sum_lo = vmlal_s16(sum_lo, vget_low_s16(a[0]), filter[0]); |
| sum_lo = vmlal_s16(sum_lo, vget_low_s16(a[1]), filter[1]); |
| sum_lo = vmlal_s16(sum_lo, vget_low_s16(a[2]), filter[2]); |
| sum_lo = vmlal_s16(sum_lo, vget_low_s16(a[3]), filter[3]); |
| sum_lo = vmlal_s16(sum_lo, vget_low_s16(a[4]), filter[4]); |
| sum_lo = vmlal_s16(sum_lo, vget_low_s16(a[5]), filter[5]); |
| sum_lo = vmlal_s16(sum_lo, vget_low_s16(a[6]), filter[6]); |
| uint16x4_t sum_lo_16 = vqrshrun_n_s32(sum_lo, 11); |
| |
| int32x4_t sum_hi = vdupq_n_s32(vertical_rounding); |
| sum_hi = vmlal_s16(sum_hi, vget_high_s16(a[0]), filter[0]); |
| sum_hi = vmlal_s16(sum_hi, vget_high_s16(a[1]), filter[1]); |
| sum_hi = vmlal_s16(sum_hi, vget_high_s16(a[2]), filter[2]); |
| sum_hi = vmlal_s16(sum_hi, vget_high_s16(a[3]), filter[3]); |
| sum_hi = vmlal_s16(sum_hi, vget_high_s16(a[4]), filter[4]); |
| sum_hi = vmlal_s16(sum_hi, vget_high_s16(a[5]), filter[5]); |
| sum_hi = vmlal_s16(sum_hi, vget_high_s16(a[6]), filter[6]); |
| uint16x4_t sum_hi_16 = vqrshrun_n_s32(sum_hi, 11); |
| |
| vst1_u8(dst, vqmovn_u16(vcombine_u16(sum_lo_16, sum_hi_16))); |
| dst += dst_stride; |
| |
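| // Slide the seven-row window down one row so only a single new row needs |
| // to be loaded on the next iteration. |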
| a[0] = a[1]; |
| a[1] = a[2]; |
| a[2] = a[3]; |
| a[3] = a[4]; |
| a[4] = a[5]; |
| a[5] = a[6]; |
| } while (++y < height); |
| x += 8; |
| } while (x < width); |
| } else if (min_width == 4) { |
| const int16_t* src = src_base; |
| uint8_t* dst = dst_base; |
| int16x4_t a[7]; |
| a[0] = vld1_s16(src); |
| src += src_stride; |
| a[1] = vld1_s16(src); |
| src += src_stride; |
| a[2] = vld1_s16(src); |
| src += src_stride; |
| a[3] = vld1_s16(src); |
| src += src_stride; |
| a[4] = vld1_s16(src); |
| src += src_stride; |
| a[5] = vld1_s16(src); |
| src += src_stride; |
| |
| int y = 0; |
| do { |
| a[6] = vld1_s16(src); |
| src += src_stride; |
| |
| int32x4_t sum = vdupq_n_s32(vertical_rounding); |
| sum = vmlal_s16(sum, a[0], filter[0]); |
| sum = vmlal_s16(sum, a[1], filter[1]); |
| sum = vmlal_s16(sum, a[2], filter[2]); |
| sum = vmlal_s16(sum, a[3], filter[3]); |
| sum = vmlal_s16(sum, a[4], filter[4]); |
| sum = vmlal_s16(sum, a[5], filter[5]); |
| sum = vmlal_s16(sum, a[6], filter[6]); |
| uint16x4_t sum_16 = vqrshrun_n_s32(sum, 11); |
| |
| StoreLo4(dst, vqmovn_u16(vcombine_u16(sum_16, sum_16))); |
| dst += dst_stride; |
| |
| a[0] = a[1]; |
| a[1] = a[2]; |
| a[2] = a[3]; |
| a[3] = a[4]; |
| a[4] = a[5]; |
| a[5] = a[6]; |
| } while (++y < height); |
| } |
| } |
| |
| void WienerFilter_NEON(const void* const source, void* const dest, |
| const RestorationUnitInfo& restoration_info, |
| const ptrdiff_t source_stride, |
| const ptrdiff_t dest_stride, const int width, |
| const int height, RestorationBuffer* const buffer) { |
| int16_t filter[kSubPixelTaps - 1]; |
| const auto* src = static_cast<const uint8_t*>(source); |
| auto* dst = static_cast<uint8_t*>(dest); |
| // It should be possible to set this to |width|. |
| ptrdiff_t buffer_stride = buffer->wiener_buffer_stride; |
| // Casting once here saves a lot of vreinterpret() calls. The values are |
| // saturated to 13 bits before storing. |
| int16_t* wiener_buffer = reinterpret_cast<int16_t*>(buffer->wiener_buffer); |
| |
| // Horizontal filtering. |
| PopulateWienerCoefficients(restoration_info, WienerInfo::kHorizontal, filter); |
| // The taps have a radius of 3. Adjust |src| so we start reading with the top |
| // left value. |
| const int center_tap = 3; |
| src -= center_tap * source_stride + center_tap; |
| int y = 0; |
| do { |
| int x = 0; |
| do { |
| // This is just as fast as an 8x8 transpose but avoids over-reading extra |
| // rows. It always over-reads by at least 1 value. On small widths (4xH) |
| // it over-reads by 9 values. |
| const uint8x16_t src_v = vld1q_u8(src + x); |
| uint8x8_t b[7]; |
| b[0] = vget_low_u8(src_v); |
| b[1] = vget_low_u8(vextq_u8(src_v, src_v, 1)); |
| b[2] = vget_low_u8(vextq_u8(src_v, src_v, 2)); |
| b[3] = vget_low_u8(vextq_u8(src_v, src_v, 3)); |
| b[4] = vget_low_u8(vextq_u8(src_v, src_v, 4)); |
| b[5] = vget_low_u8(vextq_u8(src_v, src_v, 5)); |
| b[6] = vget_low_u8(vextq_u8(src_v, src_v, 6)); |
| |
| int16x8_t sum = HorizontalSum(b, filter); |
| |
| vst1q_s16(wiener_buffer + x, sum); |
| x += 8; |
| } while (x < width); |
| src += source_stride; |
| wiener_buffer += buffer_stride; |
| } while (++y < height + kSubPixelTaps - 2); |
| |
| // Vertical filtering. |
| wiener_buffer = reinterpret_cast<int16_t*>(buffer->wiener_buffer); |
| PopulateWienerCoefficients(restoration_info, WienerInfo::kVertical, filter); |
| // Add 128 to |filter[3]| to undo the adjustment made for the horizontal |
| // filtering. This pass starts with 13-bit values, so the sums cannot be |
| // kept in 16 bits; accumulate in 32 bits instead. |
| const int16x4_t filter_v[7] = { |
| vdup_n_s16(filter[0]), vdup_n_s16(filter[1]), vdup_n_s16(filter[2]), |
| vdup_n_s16(filter[3] + 128), vdup_n_s16(filter[4]), vdup_n_s16(filter[5]), |
| vdup_n_s16(filter[6])}; |
| |
| if (width == 4) { |
| VerticalSum<4>(wiener_buffer, buffer_stride, dst, dest_stride, filter_v, |
| width, height); |
| } else { |
| VerticalSum<8>(wiener_buffer, buffer_stride, dst, dest_stride, filter_v, |
| width, height); |
| } |
| } |
| |
| // SGR |
| inline uint16x4_t Sum3Horizontal(const uint16x8_t a) { |
| const uint16x4_t sum = |
| vadd_u16(vget_low_u16(a), vext_u16(vget_low_u16(a), vget_high_u16(a), 1)); |
| return vadd_u16(sum, vext_u16(vget_low_u16(a), vget_high_u16(a), 2)); |
| } |
| |
| inline uint32x4_t Sum3Horizontal(const uint32x4x2_t a) { |
| const uint32x4_t sum = vaddq_u32(a.val[0], vextq_u32(a.val[0], a.val[1], 1)); |
| return vaddq_u32(sum, vextq_u32(a.val[0], a.val[1], 2)); |
| } |
| |
| inline uint16x8_t Sum3Vertical(const uint8x8_t a[3]) { |
| const uint16x8_t sum = vaddl_u8(a[0], a[1]); |
| return vaddw_u8(sum, a[2]); |
| } |
| |
| inline uint32x4x2_t Sum3Vertical(const uint16x8_t a[3]) { |
| uint32x4_t sum_a = vaddl_u16(vget_low_u16(a[0]), vget_low_u16(a[1])); |
| sum_a = vaddw_u16(sum_a, vget_low_u16(a[2])); |
| uint32x4_t sum_b = vaddl_u16(vget_high_u16(a[0]), vget_high_u16(a[1])); |
| sum_b = vaddw_u16(sum_b, vget_high_u16(a[2])); |
| return {sum_a, sum_b}; |
| } |
| |
| inline uint16x4_t Sum5Horizontal(const uint16x8_t a) { |
| uint16x4_t sum = |
| vadd_u16(vget_low_u16(a), vext_u16(vget_low_u16(a), vget_high_u16(a), 1)); |
| sum = vadd_u16(sum, vext_u16(vget_low_u16(a), vget_high_u16(a), 2)); |
| sum = vadd_u16(sum, vext_u16(vget_low_u16(a), vget_high_u16(a), 3)); |
| return vadd_u16(sum, vget_high_u16(a)); |
| } |
| |
| inline uint32x4_t Sum5Horizontal(const uint32x4x2_t a) { |
| uint32x4_t sum = vaddq_u32(a.val[0], vextq_u32(a.val[0], a.val[1], 1)); |
| sum = vaddq_u32(sum, vextq_u32(a.val[0], a.val[1], 2)); |
| sum = vaddq_u32(sum, vextq_u32(a.val[0], a.val[1], 3)); |
| return vaddq_u32(sum, a.val[1]); |
| } |
| |
| inline uint16x8_t Sum5Vertical(const uint8x8_t a[5]) { |
| uint16x8_t sum = vaddl_u8(a[0], a[1]); |
| sum = vaddq_u16(sum, vaddl_u8(a[2], a[3])); |
| return vaddw_u8(sum, a[4]); |
| } |
| |
| inline uint32x4x2_t Sum5Vertical(const uint16x8_t a[5]) { |
| uint32x4_t sum_a = vaddl_u16(vget_low_u16(a[0]), vget_low_u16(a[1])); |
| sum_a = vaddq_u32(sum_a, vaddl_u16(vget_low_u16(a[2]), vget_low_u16(a[3]))); |
| sum_a = vaddw_u16(sum_a, vget_low_u16(a[4])); |
| uint32x4_t sum_b = vaddl_u16(vget_high_u16(a[0]), vget_high_u16(a[1])); |
| sum_b = vaddq_u32(sum_b, vaddl_u16(vget_high_u16(a[2]), vget_high_u16(a[3]))); |
| sum_b = vaddw_u16(sum_b, vget_high_u16(a[4])); |
| return {sum_a, sum_b}; |
| } |
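| |
| // A full 3x3 or 5x5 box sum over four adjacent positions is formed by |
| // composing the vertical and horizontal helpers above, e.g. |
| // Sum5Horizontal(Sum5Vertical(row)). |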
| |
| constexpr int kSgrProjScaleBits = 20; |
| constexpr int kSgrProjRestoreBits = 4; |
| constexpr int kSgrProjSgrBits = 8; |
| constexpr int kSgrProjReciprocalBits = 12; |
| |
| constexpr int kIntermediateStride = 68; |
| constexpr int kIntermediateHeight = 66; |
| |
| // a2 = ((z << kSgrProjSgrBits) + (z >> 1)) / (z + 1); |
| constexpr uint16_t kA2Lookup[256] = { |
| 1, 128, 171, 192, 205, 213, 219, 224, 228, 230, 233, 235, 236, 238, 239, |
| 240, 241, 242, 243, 243, 244, 244, 245, 245, 246, 246, 247, 247, 247, 247, |
| 248, 248, 248, 248, 249, 249, 249, 249, 249, 250, 250, 250, 250, 250, 250, |
| 250, 251, 251, 251, 251, 251, 251, 251, 251, 251, 251, 252, 252, 252, 252, |
| 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 253, 253, |
| 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, |
| 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 254, 254, 254, |
| 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, |
| 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, |
| 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, |
| 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, |
| 254, 254, 254, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, |
| 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, |
| 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, |
| 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, |
| 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, |
| 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, |
| 256}; |
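| // Spot-checking the formula above: z == 1 gives ((1 << 8) + 0) / 2 == 128 |
| // and z == 2 gives ((2 << 8) + 1) / 3 == 171, matching the table. The |
| // endpoint entries (1 for z == 0 and 256 for z == 255) deviate from the |
| // formula. |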
| |
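| // The first pass weights the |a2|/|b2| values 5-6-5 horizontally: |
| // Sum565(a)[i] = 5 * a[i] + 6 * a[i + 1] + 5 * a[i + 2] |
| // Even output rows add two such sums (total weight 32); odd rows use one |
| // sum with one smaller shift in CalculateFilteredOutput(). |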
| inline uint16x4_t Sum565(const uint16x8_t a) { |
| // Multiply everything by 4. |
| const uint16x8_t x4 = vshlq_n_u16(a, 2); |
| // Everything * 5 |
| const uint16x8_t x5 = vaddq_u16(x4, a); |
| // The middle elements get added once more |
| const uint16x4_t middle = vext_u16(vget_low_u16(a), vget_high_u16(a), 1); |
| return vadd_u16(middle, Sum3Horizontal(x5)); |
| } |
| |
| inline uint32x4_t Sum3HorizontalW(const uint32x4_t a, const uint32x4_t b) { |
| const uint32x4_t sum = vaddq_u32(a, vextq_u32(a, b, 1)); |
| return vaddq_u32(sum, vcombine_u32(vget_high_u32(a), vget_low_u32(b))); |
| } |
| |
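| // Widened (32-bit) version of Sum565() for the |b2| values. |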
| inline uint32x4_t Sum565W(const uint16x8_t a) { |
| // Multiply everything by 4. |b2| values can be up to 65088, which means we |
| // have to step up to 32 bits immediately. |
| const uint32x4_t x4_lo = vshll_n_u16(vget_low_u16(a), 2); |
| const uint32x4_t x4_hi = vshll_n_u16(vget_high_u16(a), 2); |
| // Everything * 5 |
| const uint32x4_t x5_lo = vaddw_u16(x4_lo, vget_low_u16(a)); |
| const uint32x4_t x5_hi = vaddw_u16(x4_hi, vget_high_u16(a)); |
| // The middle elements get added once more |
| const uint16x4_t middle = vext_u16(vget_low_u16(a), vget_high_u16(a), 1); |
| return vaddw_u16(Sum3HorizontalW(x5_lo, x5_hi), middle); |
| } |
| |
| template <int n> |
| inline uint16x4_t CalculateA2(const uint32x4_t sum_sq, const uint16x4_t sum, |
| const uint32_t s, const uint16x4_t v_255) { |
| // a = |sum_sq| |
| // d = |sum| |
| // p = (a * n < d * d) ? 0 : a * n - d * d; |
| const uint32x4_t dxd = vmull_u16(sum, sum); |
| const uint32x4_t axn = vmulq_n_u32(sum_sq, n); |
| // Ensure |p| does not underflow by using saturating subtraction. |
| const uint32x4_t p = vqsubq_u32(axn, dxd); |
| |
| // z = RightShiftWithRounding(p * s, kSgrProjScaleBits); |
| const uint32x4_t pxs = vmulq_n_u32(p, s); |
| // vrshrn_n_u32() (narrowing shift) can only shift by up to 16 because the |
| // immediate is limited by the narrowed element width, and kSgrProjScaleBits |
| // is 20, so shift in place first and narrow afterwards. |
| const uint32x4_t shifted = vrshrq_n_u32(pxs, kSgrProjScaleBits); |
| // Narrow |shifted| so we can use a D register for v_255. |
| const uint16x4_t z = vmin_u16(v_255, vmovn_u32(shifted)); |
| |
| // a2 = ((z << kSgrProjSgrBits) + (z >> 1)) / (z + 1); |
| const uint16_t lookup[4] = { |
| kA2Lookup[vget_lane_u16(z, 0)], kA2Lookup[vget_lane_u16(z, 1)], |
| kA2Lookup[vget_lane_u16(z, 2)], kA2Lookup[vget_lane_u16(z, 3)]}; |
| return vld1_u16(lookup); |
| } |
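| |
| // For intuition: in a perfectly flat region all n pixels are equal, so |
| // |sum_sq| * n == |sum| * |sum|, p == 0, z == 0 and a2 == kA2Lookup[0] == 1. |
| // The filtered output then comes almost entirely from the box mean via |b2|. |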
| |
| inline uint16x4_t CalculateB2Shifted(const uint16x4_t a2, const uint16x4_t sum, |
| const uint32_t one_over_n) { |
| // b2 = ((1 << kSgrProjSgrBits) - a2) * b * one_over_n |
| // 1 << kSgrProjSgrBits = 256 |
| // |a2| = [1, 256] |
| // |sgrMa2| max value = 255 |
| const uint16x4_t sgrMa2 = vsub_u16(vdup_n_u16(1 << kSgrProjSgrBits), a2); |
| // |sum| is a box sum with radius 1 or 2. |
| // For the first pass radius is 2. Maximum value is 5x5x255 = 6375. |
| // For the second pass radius is 1. Maximum value is 3x3x255 = 2295. |
| // |one_over_n| = ((1 << kSgrProjReciprocalBits) + (n >> 1)) / n |
| // When radius is 2 |n| is 25. |one_over_n| is 164. |
| // When radius is 1 |n| is 9. |one_over_n| is 455. |
| const uint32x4_t b2 = vmulq_n_u32(vmull_u16(sgrMa2, sum), one_over_n); |
| // static_cast<int>(RightShiftWithRounding(b2, kSgrProjReciprocalBits)); |
| // |kSgrProjReciprocalBits| is 12. |
| // Radius 2: 255 * 6375 * 164 >> 12 = 65088 (16 bits). |
| // Radius 1: 255 * 2295 * 455 >> 12 = 65009 (16 bits). |
| return vrshrn_n_u32(b2, kSgrProjReciprocalBits); |
| } |
| |
| // RightShiftWithRounding( |
| // (a * src_ptr[x] + b), kSgrProjSgrBits + shift - kSgrProjRestoreBits); |
| template <int shift> |
| inline uint16x4_t CalculateFilteredOutput(const uint16x4_t a, |
| const uint32x4_t b, |
| const uint16x4_t src) { |
| // a: 256 * 32 = 8192 (14 bits) |
| // b: 65088 * 32 = 2082816 (21 bits) |
| const uint32x4_t axsrc = vmull_u16(a, src); |
| // v: 8192 * 255 + 2082816 = 4171876 (22 bits) |
| const uint32x4_t v = vaddq_u32(axsrc, b); |
| |
| // kSgrProjSgrBits = 8 |
| // kSgrProjRestoreBits = 4 |
| // shift = 4 or 5 |
| // v >> 8 or 9 |
| // 22 bits >> 8 = 14 bits |
| return vrshrn_n_u32(v, kSgrProjSgrBits + shift - kSgrProjRestoreBits); |
| } |
| |
| inline void BoxFilterProcess_FirstPass(const uint8_t* const src, |
| const ptrdiff_t stride, const int width, |
| const int height, const uint32_t s, |
| uint16_t* const buf) { |
| // Number of elements in the box being summed. |
| const uint32_t n = 25; |
| const uint32_t one_over_n = ((1 << kSgrProjReciprocalBits) + (n >> 1)) / n; |
| |
| const uint16x4_t v_255 = vdup_n_u16(255); |
| |
| // We have combined PreProcess and Process for the first pass by storing |
| // intermediate values in the |a2| region. The values stored are one vertical |
| // column of interleaved |a2| and |b2| values, 8 per processed row. Because |
| // PreProcess only generates output for every other row, this consumes |
| // roughly 4 * |height| values rather than 8 * |height|. When processing the |
| // next column we write the new scratch values right after reading the |
| // previously saved ones. |
| uint16_t* const temp = buf + kIntermediateStride * kIntermediateHeight; |
| |
| // The PreProcess phase calculates a 5x5 box sum for every other row. |
| // |
| // PreProcess and Process have been combined into the same step. We need 8 |
| // input values to generate 4 output values for PreProcess: |
| // 0 1 2 3 4 5 6 7 |
| // 2 = 0 + 1 + 2 + 3 + 4 |
| // 3 = 1 + 2 + 3 + 4 + 5 |
| // 4 = 2 + 3 + 4 + 5 + 6 |
| // 5 = 3 + 4 + 5 + 6 + 7 |
| // |
| // and then we need 6 input values to generate 4 output values for Process: |
| // 0 1 2 3 4 5 |
| // 1 = 0 + 1 + 2 |
| // 2 = 1 + 2 + 3 |
| // 3 = 2 + 3 + 4 |
| // 4 = 3 + 4 + 5 |
| // |
| // To avoid re-calculating PreProcess values over and over again we will do a |
| // single column of 4 output values and store them interleaved in |temp|. Next |
| // we will start the second column. When 2 rows have been calculated we can |
| // calculate Process and output those into the top of |buf|. |
| |
| // The first phase needs context with a radius of 2 and the second phase |
| // needs context with a radius of 1. This means we start at (-3, -3). |
| const uint8_t* const src_pre_process = src - 3 - 3 * stride; |
| |
| // Calculate and store a single column. Scope so we can re-use the variable |
| // names for the next step. |
| { |
| const uint8_t* column = src_pre_process; |
| uint16_t* temp_column = temp; |
| |
| uint8x8_t row[5]; |
| row[0] = vld1_u8(column); |
| column += stride; |
| row[1] = vld1_u8(column); |
| column += stride; |
| row[2] = vld1_u8(column); |
| column += stride; |
| |
| uint16x8_t row_sq[5]; |
| row_sq[0] = vmull_u8(row[0], row[0]); |
| row_sq[1] = vmull_u8(row[1], row[1]); |
| row_sq[2] = vmull_u8(row[2], row[2]); |
| |
| int y = -1; |
| do { |
| row[3] = vld1_u8(column); |
| column += stride; |
| row[4] = vld1_u8(column); |
| column += stride; |
| |
| row_sq[3] = vmull_u8(row[3], row[3]); |
| row_sq[4] = vmull_u8(row[4], row[4]); |
| |
| const uint16x4_t sum = Sum5Horizontal(Sum5Vertical(row)); |
| const uint32x4_t sum_sq = Sum5Horizontal(Sum5Vertical(row_sq)); |
| |
| const uint16x4_t a2 = CalculateA2<n>(sum_sq, sum, s, v_255); |
| const uint16x4_t b2 = CalculateB2Shifted(a2, sum, one_over_n); |
| |
| vst1q_u16(temp_column, vcombine_u16(a2, b2)); |
| temp_column += 8; |
| |
| row[0] = row[2]; |
| row[1] = row[3]; |
| row[2] = row[4]; |
| |
| row_sq[0] = row_sq[2]; |
| row_sq[1] = row_sq[3]; |
| row_sq[2] = row_sq[4]; |
| y += 2; |
| } while (y < height + 1); |
| } |
| |
| int x = 0; |
| do { |
| // |src_pre_process| is X but we already processed the first column of 4 |
| // values, so we want to start at Y and increment from there. |
| // X s s s Y s s |
| // s s s s s s s |
| // s s i i i i i |
| // s s i o o o o |
| // s s i o o o o |
| const uint8_t* column = src_pre_process + x + 4; |
| |
| uint8x8_t row[5]; |
| row[0] = vld1_u8(column); |
| column += stride; |
| row[1] = vld1_u8(column); |
| column += stride; |
| row[2] = vld1_u8(column); |
| column += stride; |
| row[3] = vld1_u8(column); |
| column += stride; |
| row[4] = vld1_u8(column); |
| column += stride; |
| |
| uint16x8_t row_sq[5]; |
| row_sq[0] = vmull_u8(row[0], row[0]); |
| row_sq[1] = vmull_u8(row[1], row[1]); |
| row_sq[2] = vmull_u8(row[2], row[2]); |
| row_sq[3] = vmull_u8(row[3], row[3]); |
| row_sq[4] = vmull_u8(row[4], row[4]); |
| |
| // Seed the loop with one line of output. Then, inside the loop, for each |
| // iteration we can output one even row and one odd row and carry the new |
| // line to the next iteration. In the diagram below 'i' values are |
| // intermediate values from the first step and '-' values are empty. |
| // iiii |
| // ---- > even row |
| // iiii - odd row |
| // ---- > even row |
| // iiii |
| uint16_t* temp_column = temp; |
| uint16x4_t sum565_a0; |
| uint32x4_t sum565_b0; |
| { |
| const uint16x4_t sum = Sum5Horizontal(Sum5Vertical(row)); |
| const uint32x4_t sum_sq = Sum5Horizontal(Sum5Vertical(row_sq)); |
| |
| const uint16x4_t a2 = CalculateA2<n>(sum_sq, sum, s, v_255); |
| const uint16x4_t b2 = CalculateB2Shifted(a2, sum, one_over_n); |
| |
| // Exchange the previously calculated |a2| and |b2| values. |
| const uint16x8_t a2_b2 = vld1q_u16(temp_column); |
| vst1q_u16(temp_column, vcombine_u16(a2, b2)); |
| temp_column += 8; |
| |
| // Pass 1 Process. These are the only values we need to propagate between |
| // rows. |
| sum565_a0 = Sum565(vcombine_u16(vget_low_u16(a2_b2), a2)); |
| sum565_b0 = Sum565W(vcombine_u16(vget_high_u16(a2_b2), b2)); |
| } |
| |
| row[0] = row[2]; |
| row[1] = row[3]; |
| row[2] = row[4]; |
| |
| row_sq[0] = row_sq[2]; |
| row_sq[1] = row_sq[3]; |
| row_sq[2] = row_sq[4]; |
| |
| const uint8_t* src_ptr = src + x; |
| uint16_t* out_buf = buf + x; |
| |
| // Calculate one output line. Add in the line from the previous iteration |
| // and output one even row. Sum the new line and output the odd row. Carry |
| // the new line into the next iteration. |
| int y = 0; |
| do { |
| row[3] = vld1_u8(column); |
| column += stride; |
| row[4] = vld1_u8(column); |
| column += stride; |
| |
| row_sq[3] = vmull_u8(row[3], row[3]); |
| row_sq[4] = vmull_u8(row[4], row[4]); |
| |
| const uint16x4_t sum = Sum5Horizontal(Sum5Vertical(row)); |
| const uint32x4_t sum_sq = Sum5Horizontal(Sum5Vertical(row_sq)); |
| |
| const uint16x4_t a2 = CalculateA2<n>(sum_sq, sum, s, v_255); |
| const uint16x4_t b2 = CalculateB2Shifted(a2, sum, one_over_n); |
| |
| const uint16x8_t a2_b2 = vld1q_u16(temp_column); |
| vst1q_u16(temp_column, vcombine_u16(a2, b2)); |
| temp_column += 8; |
| |
| uint16x4_t sum565_a1 = Sum565(vcombine_u16(vget_low_u16(a2_b2), a2)); |
| uint32x4_t sum565_b1 = Sum565W(vcombine_u16(vget_high_u16(a2_b2), b2)); |
| |
| const uint8x8_t src_u8 = vld1_u8(src_ptr); |
| src_ptr += stride; |
| const uint16x4_t src_u16 = vget_low_u16(vmovl_u8(src_u8)); |
| |
| const uint16x4_t output = |
| CalculateFilteredOutput<5>(vadd_u16(sum565_a0, sum565_a1), |
| vaddq_u32(sum565_b0, sum565_b1), src_u16); |
| |
| vst1_u16(out_buf, output); |
| out_buf += kIntermediateStride; |
| |
| const uint8x8_t src0_u8 = vld1_u8(src_ptr); |
| src_ptr += stride; |
| const uint16x4_t src0_u16 = vget_low_u16(vmovl_u8(src0_u8)); |
| |
| const uint16x4_t output1 = |
| CalculateFilteredOutput<4>(sum565_a1, sum565_b1, src0_u16); |
| vst1_u16(out_buf, output1); |
| out_buf += kIntermediateStride; |
| |
| row[0] = row[2]; |
| row[1] = row[3]; |
| row[2] = row[4]; |
| |
| row_sq[0] = row_sq[2]; |
| row_sq[1] = row_sq[3]; |
| row_sq[2] = row_sq[4]; |
| |
| sum565_a0 = sum565_a1; |
| sum565_b0 = sum565_b1; |
| y += 2; |
| } while (y < height); |
| x += 4; |
| } while (x < width); |
| } |
| |
| inline void BoxFilterPreProcess_SecondPass(const uint8_t* const src, |
| const ptrdiff_t stride, |
| const int width, const int height, |
| const uint32_t s, |
| uint16_t* const a2) { |
| // Number of elements in the box being summed. |
| const uint32_t n = 9; |
| const uint32_t one_over_n = ((1 << kSgrProjReciprocalBits) + (n >> 1)) / n; |
| |
| const uint16x4_t v_255 = vdup_n_u16(255); |
| |
| // Calculate intermediate results, including a one-pixel border; for example, |
| // if the unit size is 64x64 we calculate 66x66 pixels. Because the vectors |
| // process blocks of 4 we actually get 68 values. This doesn't appear to be |
| // causing problems yet but it might. |
| const uint8_t* const src_top_left_corner = src - 1 - 2 * stride; |
| int x = -1; |
| do { |
| const uint8_t* column = src_top_left_corner + x; |
| uint16_t* a2_column = a2 + (x + 1); |
| uint8x8_t row[3]; |
| row[0] = vld1_u8(column); |
| column += stride; |
| row[1] = vld1_u8(column); |
| column += stride; |
| |
| uint16x8_t row_sq[3]; |
| row_sq[0] = vmull_u8(row[0], row[0]); |
| row_sq[1] = vmull_u8(row[1], row[1]); |
| |
| int y = -1; |
| do { |
| row[2] = vld1_u8(column); |
| column += stride; |
| |
| row_sq[2] = vmull_u8(row[2], row[2]); |
| |
| const uint16x4_t sum = Sum3Horizontal(Sum3Vertical(row)); |
| const uint32x4_t sum_sq = Sum3Horizontal(Sum3Vertical(row_sq)); |
| |
| const uint16x4_t a2_v = CalculateA2<n>(sum_sq, sum, s, v_255); |
| |
| vst1_u16(a2_column, a2_v); |
| a2_column += kIntermediateStride; |
| |
| vst1_u16(a2_column, CalculateB2Shifted(a2_v, sum, one_over_n)); |
| a2_column += kIntermediateStride; |
| |
| row[0] = row[1]; |
| row[1] = row[2]; |
| |
| row_sq[0] = row_sq[1]; |
| row_sq[1] = row_sq[2]; |
| } while (++y < height + 1); |
| x += 4; |
| } while (x < width + 1); |
| } |
| |
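| // The second pass weights the 3x3 neighborhood of |a2|/|b2| values |
| // {3, 4, 3} on the outer rows and {4, 4, 4} on the middle row (total weight |
| // 32, matching the first pass). The helpers below compute one row each: |
| // Sum444(a)[i] = 4 * (a[i] + a[i + 1] + a[i + 2]) |
| // Sum343(a)[i] = 3 * a[i] + 4 * a[i + 1] + 3 * a[i + 2] |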
| inline uint16x4_t Sum444(const uint16x8_t a) { |
| return Sum3Horizontal(vshlq_n_u16(a, 2)); |
| } |
| |
| inline uint32x4_t Sum444W(const uint16x8_t a) { |
| return Sum3HorizontalW(vshll_n_u16(vget_low_u16(a), 2), |
| vshll_n_u16(vget_high_u16(a), 2)); |
| } |
| |
| inline uint16x4_t Sum343(const uint16x8_t a) { |
| const uint16x4_t middle = vext_u16(vget_low_u16(a), vget_high_u16(a), 1); |
| const uint16x4_t sum = Sum3Horizontal(a); |
| return vadd_u16(vadd_u16(vadd_u16(sum, sum), sum), middle); |
| } |
| |
| inline uint32x4_t Sum343W(const uint16x8_t a) { |
| const uint16x4_t middle = vext_u16(vget_low_u16(a), vget_high_u16(a), 1); |
| const uint32x4_t sum = |
| Sum3HorizontalW(vmovl_u16(vget_low_u16(a)), vmovl_u16(vget_high_u16(a))); |
| return vaddw_u16(vaddq_u32(vaddq_u32(sum, sum), sum), middle); |
| } |
| |
| inline void BoxFilterProcess_SecondPass(const uint8_t* src, |
| const ptrdiff_t stride, const int width, |
| const int height, const uint32_t s, |
| uint16_t* const intermediate_buffer) { |
| uint16_t* const a2 = |
| intermediate_buffer + kIntermediateStride * kIntermediateHeight; |
| |
| BoxFilterPreProcess_SecondPass(src, stride, width, height, s, a2); |
| |
| int x = 0; |
| do { |
| uint16_t* a2_ptr = a2 + x; |
| const uint8_t* src_ptr = src + x; |
| // |filtered_output| must match how |a2| values are read since they are |
| // written out over the |a2| values which have already been used. |
| uint16_t* filtered_output = a2_ptr; |
| |
| uint16x4_t sum343_a[2], sum444_a; |
| uint32x4_t sum343_b[2], sum444_b; |
| |
| sum343_a[0] = Sum343(vld1q_u16(a2_ptr)); |
| a2_ptr += kIntermediateStride; |
| |
| sum343_b[0] = Sum343W(vld1q_u16(a2_ptr)); |
| a2_ptr += kIntermediateStride; |
| |
| const uint16x8_t a_1 = vld1q_u16(a2_ptr); |
| a2_ptr += kIntermediateStride; |
| sum343_a[1] = Sum343(a_1); |
| sum444_a = Sum444(a_1); |
| |
| const uint16x8_t b_1 = vld1q_u16(a2_ptr); |
| a2_ptr += kIntermediateStride; |
| sum343_b[1] = Sum343W(b_1); |
| sum444_b = Sum444W(b_1); |
| |
| int y = 0; |
| do { |
| const uint16x8_t a_2 = vld1q_u16(a2_ptr); |
| a2_ptr += kIntermediateStride; |
| |
| const uint16x4_t sum343_a2 = Sum343(a_2); |
| const uint16x4_t a_v = |
| vadd_u16(vadd_u16(sum343_a[0], sum444_a), sum343_a2); |
| sum444_a = Sum444(a_2); |
| sum343_a[0] = sum343_a[1]; |
| sum343_a[1] = sum343_a2; |
| |
| const uint16x8_t b_2 = vld1q_u16(a2_ptr); |
| a2_ptr += kIntermediateStride; |
| |
| const uint32x4_t sum343_b2 = Sum343W(b_2); |
| const uint32x4_t b_v = |
| vaddq_u32(vaddq_u32(sum343_b[0], sum444_b), sum343_b2); |
| sum444_b = Sum444W(b_2); |
| sum343_b[0] = sum343_b[1]; |
| sum343_b[1] = sum343_b2; |
| |
| // Load 8 values and discard 4. |
| const uint8x8_t src_u8 = vld1_u8(src_ptr); |
| const uint16x4_t src_u16 = vget_low_u16(vmovl_u8(src_u8)); |
| |
| vst1_u16(filtered_output, CalculateFilteredOutput<5>(a_v, b_v, src_u16)); |
| |
| src_ptr += stride; |
| filtered_output += kIntermediateStride; |
| } while (++y < height); |
| x += 4; |
| } while (x < width); |
| } |
| |
| template <int min_width> |
| inline void SelfGuidedSingleMultiplier(const uint8_t* src, |
| const ptrdiff_t src_stride, |
| uint16_t* box_filter_process_output, |
| uint8_t* dst, const ptrdiff_t dst_stride, |
| const int width, const int height, |
| const int16_t w_combo, |
| const int16x4_t w_single) { |
| static_assert(min_width == 4 || min_width == 8, ""); |
| |
| int y = 0; |
| do { |
| if (min_width == 8) { |
| int x = 0; |
| do { |
| const int16x8_t u = vreinterpretq_s16_u16( |
| vshll_n_u8(vld1_u8(src + x), kSgrProjRestoreBits)); |
| const int16x8_t p = |
| vreinterpretq_s16_u16(vld1q_u16(box_filter_process_output + x)); |
| |
| // u * w1 + u * wN == u * (w1 + wN) |
| int32x4_t v_lo = vmull_n_s16(vget_low_s16(u), w_combo); |
| v_lo = vmlal_s16(v_lo, vget_low_s16(p), w_single); |
| |
| int32x4_t v_hi = vmull_n_s16(vget_high_s16(u), w_combo); |
| v_hi = vmlal_s16(v_hi, vget_high_s16(p), w_single); |
| |
| const int16x4_t s_lo = |
| vrshrn_n_s32(v_lo, kSgrProjRestoreBits + kSgrProjPrecisionBits); |
| const int16x4_t s_hi = |
| vrshrn_n_s32(v_hi, kSgrProjRestoreBits + kSgrProjPrecisionBits); |
| vst1_u8(dst + x, vqmovun_s16(vcombine_s16(s_lo, s_hi))); |
| x += 8; |
| } while (x < width); |
| } else if (min_width == 4) { |
| const int16x8_t u = |
| vreinterpretq_s16_u16(vshll_n_u8(vld1_u8(src), kSgrProjRestoreBits)); |
| const int16x8_t p = |
| vreinterpretq_s16_u16(vld1q_u16(box_filter_process_output)); |
| |
| // u * w1 + u * wN == u * (w1 + wN) |
| int32x4_t v_lo = vmull_n_s16(vget_low_s16(u), w_combo); |
| v_lo = vmlal_s16(v_lo, vget_low_s16(p), w_single); |
| |
| int32x4_t v_hi = vmull_n_s16(vget_high_s16(u), w_combo); |
| v_hi = vmlal_s16(v_hi, vget_high_s16(p), w_single); |
| |
| const int16x4_t s_lo = |
| vrshrn_n_s32(v_lo, kSgrProjRestoreBits + kSgrProjPrecisionBits); |
| const int16x4_t s_hi = |
| vrshrn_n_s32(v_hi, kSgrProjRestoreBits + kSgrProjPrecisionBits); |
| StoreLo4(dst, vqmovun_s16(vcombine_s16(s_lo, s_hi))); |
| } |
| src += src_stride; |
| dst += dst_stride; |
| box_filter_process_output += kIntermediateStride; |
| } while (++y < height); |
| } |
| |
| template <int min_width> |
| inline void SelfGuidedDoubleMultiplier(const uint8_t* src, |
| const ptrdiff_t src_stride, |
| uint16_t* box_filter_process_output[2], |
| uint8_t* dst, const ptrdiff_t dst_stride, |
| const int width, const int height, |
| const int16x4_t w0, const int w1, |
| const int16x4_t w2) { |
| static_assert(min_width == 4 || min_width == 8, ""); |
| |
| int y = 0; |
| do { |
| if (min_width == 8) { |
| int x = 0; |
| do { |
| // |wN| values are signed. |src| values can be treated as int16_t. |
| const int16x8_t u = vreinterpretq_s16_u16( |
| vshll_n_u8(vld1_u8(src + x), kSgrProjRestoreBits)); |
| // |box_filter_process_output| is 14 bits, also safe to treat as |
| // int16_t. |
| const int16x8_t p0 = |
| vreinterpretq_s16_u16(vld1q_u16(box_filter_process_output[0] + x)); |
| const int16x8_t p1 = |
| vreinterpretq_s16_u16(vld1q_u16(box_filter_process_output[1] + x)); |
| |
| int32x4_t v_lo = vmull_n_s16(vget_low_s16(u), w1); |
| v_lo = vmlal_s16(v_lo, vget_low_s16(p0), w0); |
| v_lo = vmlal_s16(v_lo, vget_low_s16(p1), w2); |
| |
| int32x4_t v_hi = vmull_n_s16(vget_high_s16(u), w1); |
| v_hi = vmlal_s16(v_hi, vget_high_s16(p0), w0); |
| v_hi = vmlal_s16(v_hi, vget_high_s16(p1), w2); |
| |
| // |s| is saturated to uint8_t. |
| const int16x4_t s_lo = |
| vrshrn_n_s32(v_lo, kSgrProjRestoreBits + kSgrProjPrecisionBits); |
| const int16x4_t s_hi = |
| vrshrn_n_s32(v_hi, kSgrProjRestoreBits + kSgrProjPrecisionBits); |
| vst1_u8(dst + x, vqmovun_s16(vcombine_s16(s_lo, s_hi))); |
| x += 8; |
| } while (x < width); |
| } else if (min_width == 4) { |
| // |wN| values are signed. |src| values can be treated as int16_t. |
| // Load 8 values but ignore 4. |
| const int16x4_t u = vget_low_s16( |
| vreinterpretq_s16_u16(vshll_n_u8(vld1_u8(src), kSgrProjRestoreBits))); |
| // |box_filter_process_output| is 14 bits, also safe to treat as |
| // int16_t. |
| const int16x4_t p0 = |
| vreinterpret_s16_u16(vld1_u16(box_filter_process_output[0])); |
| const int16x4_t p1 = |
| vreinterpret_s16_u16(vld1_u16(box_filter_process_output[1])); |
| |
| int32x4_t v = vmull_n_s16(u, w1); |
| v = vmlal_s16(v, p0, w0); |
| v = vmlal_s16(v, p1, w2); |
| |
| // |s| is saturated to uint8_t. |
| const int16x4_t s = |
| vrshrn_n_s32(v, kSgrProjRestoreBits + kSgrProjPrecisionBits); |
| StoreLo4(dst, vqmovun_s16(vcombine_s16(s, s))); |
| } |
| src += src_stride; |
| dst += dst_stride; |
| box_filter_process_output[0] += kIntermediateStride; |
| box_filter_process_output[1] += kIntermediateStride; |
| } while (++y < height); |
| } |
| |
| // The two regions of |box_filter_process_output| are carved out of |
| // |intermediate_buffer|, declared below. Each has a stride of |
| // |kIntermediateStride| and is large enough for |width| * |height| values. |
| void SelfGuidedFilter_NEON(const void* const source, void* const dest, |
| const RestorationUnitInfo& restoration_info, |
| ptrdiff_t source_stride, ptrdiff_t dest_stride, |
| const int width, const int height, |
| RestorationBuffer* const /*buffer*/) { |
| // The output frame is broken into blocks of 64x64 (32x32 if U/V are |
| // subsampled). If either dimension is less than 32/64 it indicates it is at |
| // the right or bottom edge of the frame. It is safe to overwrite the output |
| // as it will not be part of the visible frame. This saves us from having to |
| // handle non-multiple-of-8 widths. |
| // We could round here, but the for loop with += 8 does the same thing. |
| |
| // width = (width + 7) & ~0x7; |
| |
| // -96 to 96 (Sgrproj_Xqd_Min/Max) |
| const int8_t w0 = restoration_info.sgr_proj_info.multiplier[0]; |
| const int8_t w1 = restoration_info.sgr_proj_info.multiplier[1]; |
| const int16_t w2 = (1 << kSgrProjPrecisionBits) - w0 - w1; |
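| // e.g. with hypothetical multipliers w0 = 31 and w1 = 90, |
| // w2 = 128 - 31 - 90 = 7; the three weights always sum to |
| // 1 << kSgrProjPrecisionBits. |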
| const int index = restoration_info.sgr_proj_info.index; |
| const uint8_t radius_pass_0 = kSgrProjParams[index][0]; |
| const uint8_t radius_pass_1 = kSgrProjParams[index][2]; |
| const auto* src = static_cast<const uint8_t*>(source); |
| auto* dst = static_cast<uint8_t*>(dest); |
| |
| // |intermediate_buffer| is broken down into three distinct regions, each with |
| // size |kIntermediateStride| * |kIntermediateHeight|. |
| // The first is for final output of the first pass of PreProcess/Process. It |
| // can be stored in |width| * |height| (at most 64x64). |
| // The second and third are scratch space for |a2| and |b2| values from |
| // PreProcess. |
| // |
| // At the end of BoxFilterProcess_SecondPass() the output is stored over |a2|. |
| |
| uint16_t intermediate_buffer[3 * kIntermediateStride * kIntermediateHeight]; |
| uint16_t* box_filter_process_output[2] = { |
| intermediate_buffer, |
| intermediate_buffer + kIntermediateStride * kIntermediateHeight}; |
| |
| // If |radius| is 0 then there is nothing to do. If |radius| is not 0, it is |
| // always 2 for the first pass and 1 for the second pass. |
| if (radius_pass_0 != 0) { |
| BoxFilterProcess_FirstPass(src, source_stride, width, height, |
| kSgrScaleParameter[index][0], |
| intermediate_buffer); |
| } |
| |
| if (radius_pass_1 != 0) { |
| BoxFilterProcess_SecondPass(src, source_stride, width, height, |
| kSgrScaleParameter[index][1], |
| intermediate_buffer); |
| } |
| |
| // Put |w[02]| in vectors because we can use vmull_n_s16() for |w1| but there |
| // is no vmlal_n_s16(). |
| const int16x4_t w0_v = vdup_n_s16(w0); |
| const int16x4_t w2_v = vdup_n_s16(w2); |
| if (radius_pass_0 != 0 && radius_pass_1 != 0) { |
| if (width > 4) { |
| SelfGuidedDoubleMultiplier<8>(src, source_stride, |
| box_filter_process_output, dst, dest_stride, |
| width, height, w0_v, w1, w2_v); |
| } else /* if (width == 4) */ { |
| SelfGuidedDoubleMultiplier<4>(src, source_stride, |
| box_filter_process_output, dst, dest_stride, |
| width, height, w0_v, w1, w2_v); |
| } |
| } else { |
| int16_t w_combo; |
| int16x4_t w_single; |
| uint16_t* box_filter_process_output_n; |
| if (radius_pass_0 != 0) { |
| w_combo = w1 + w2; |
| w_single = w0_v; |
| box_filter_process_output_n = box_filter_process_output[0]; |
| } else /* if (radius_pass_1 != 0) */ { |
| w_combo = w1 + w0; |
| w_single = w2_v; |
| box_filter_process_output_n = box_filter_process_output[1]; |
| } |
| |
| if (width > 4) { |
| SelfGuidedSingleMultiplier<8>( |
| src, source_stride, box_filter_process_output_n, dst, dest_stride, |
| width, height, w_combo, w_single); |
| } else /* if (width == 4) */ { |
| SelfGuidedSingleMultiplier<4>( |
| src, source_stride, box_filter_process_output_n, dst, dest_stride, |
| width, height, w_combo, w_single); |
| } |
| } |
| } |
| |
| void Init8bpp() { |
| Dsp* const dsp = dsp_internal::GetWritableDspTable(8); |
| assert(dsp != nullptr); |
| dsp->loop_restorations[0] = WienerFilter_NEON; |
| dsp->loop_restorations[1] = SelfGuidedFilter_NEON; |
| } |
| |
| } // namespace |
| } // namespace low_bitdepth |
| |
| void LoopRestorationInit_NEON() { low_bitdepth::Init8bpp(); } |
| |
| } // namespace dsp |
| } // namespace libgav1 |
| |
| #else // !LIBGAV1_ENABLE_NEON |
| namespace libgav1 { |
| namespace dsp { |
| |
| void LoopRestorationInit_NEON() {} |
| |
| } // namespace dsp |
| } // namespace libgav1 |
| #endif // LIBGAV1_ENABLE_NEON |