#include "src/dsp/convolve.h"
#include "src/dsp/dsp.h"
#if LIBGAV1_ENABLE_NEON
#include <arm_neon.h>
#include <algorithm>
#include <cassert>
#include <cstddef>
#include <cstdint>
#include "src/dsp/arm/common_neon.h"
#include "src/utils/common.h"
namespace libgav1 {
namespace dsp {
namespace low_bitdepth {
namespace {
constexpr int kBitdepth8 = 8;
constexpr int kIntermediateStride = kMaxSuperBlockSizeInPixels;
constexpr int kSubPixelMask = (1 << kSubPixelBits) - 1;
constexpr int kHorizontalOffset = 3;
constexpr int kVerticalOffset = 3;
constexpr int kInterRoundBitsVertical = 11;
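// When the filtered dimension (width or height) is 4 or less, the 8 tap
// filters are replaced by their 4 tap equivalents (indices 4 and 5). See
// NumTapsInFilter() for the tap count implied by each index.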
int GetFilterIndex(const int filter_index, const int length) {
if (length <= 4) {
if (filter_index == kInterpolationFilterEightTap ||
filter_index == kInterpolationFilterEightTapSharp) {
return 4;
}
if (filter_index == kInterpolationFilterEightTapSmooth) {
return 5;
}
}
return filter_index;
}
inline int16x8_t ZeroExtend(const uint8x8_t in) {
return vreinterpretq_s16_u16(vmovl_u8(in));
}
inline void Load8x8(const uint8_t* s, const ptrdiff_t p, int16x8_t* dst) {
dst[0] = ZeroExtend(vld1_u8(s));
s += p;
dst[1] = ZeroExtend(vld1_u8(s));
s += p;
dst[2] = ZeroExtend(vld1_u8(s));
s += p;
dst[3] = ZeroExtend(vld1_u8(s));
s += p;
dst[4] = ZeroExtend(vld1_u8(s));
s += p;
dst[5] = ZeroExtend(vld1_u8(s));
s += p;
dst[6] = ZeroExtend(vld1_u8(s));
s += p;
dst[7] = ZeroExtend(vld1_u8(s));
}
// Multiply every entry in |src[]| by the corresponding lane in |taps| and sum.
// The sum of the entries in |taps| is always 128. In some situations negative
// values are used, which means the positive taps can sum to more than 128. An
// example is:
// {-4, 10, -24, 100, 60, -20, 8, -2}
// The negative taps never sum to less than -128. The center taps are always
// positive and the remaining positive taps never sum to more than 128.
// Summing these naively can overflow int16_t. This can be avoided by adding the
// center taps last and saturating the result.
// We do not need to expand to int32_t because later in the function the value
// is shifted by |kFilterBits| (7) and saturated to uint8_t. This means any
// value of 32576 or more ((255 << 7) minus the rounding bit of 64) is clamped
// to 255.
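// As a worked example (input chosen for illustration): with the sharp filter
// {-4, 10, -24, 100, 60, -20, 8, -2}, pixels of 255 under the positive taps
// and 0 under the negative taps give a true sum of
// (10 + 100 + 60 + 8) * 255 = 45390, which would overflow int16_t if the
// positive products were accumulated together. Adding the six outer taps
// first gives (10 + 8) * 255 = 4590, and the saturating adds of the center
// products (25500 and 15300) clamp at 32767, which is still at or above the
// 32576 threshold, so the narrowed result is 255 either way.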
template <int num_taps>
int16x8_t SumTaps(const int16x8_t* const src, const int16x8_t taps) {
int16x8_t sum;
if (num_taps == 8) {
const int16x4_t taps_lo = vget_low_s16(taps);
const int16x4_t taps_hi = vget_high_s16(taps);
sum = vmulq_lane_s16(src[0], taps_lo, 0);
sum = vmlaq_lane_s16(sum, src[1], taps_lo, 1);
sum = vmlaq_lane_s16(sum, src[2], taps_lo, 2);
sum = vmlaq_lane_s16(sum, src[5], taps_hi, 1);
sum = vmlaq_lane_s16(sum, src[6], taps_hi, 2);
sum = vmlaq_lane_s16(sum, src[7], taps_hi, 3);
// Center taps.
sum = vqaddq_s16(sum, vmulq_lane_s16(src[3], taps_lo, 3));
sum = vqaddq_s16(sum, vmulq_lane_s16(src[4], taps_hi, 0));
} else if (num_taps == 6) {
const int16x4_t taps_lo = vget_low_s16(taps);
const int16x4_t taps_hi = vget_high_s16(taps);
sum = vmulq_lane_s16(src[0], taps_lo, 1);
sum = vmlaq_lane_s16(sum, src[1], taps_lo, 2);
sum = vmlaq_lane_s16(sum, src[4], taps_hi, 1);
sum = vmlaq_lane_s16(sum, src[5], taps_hi, 2);
// Center taps.
sum = vqaddq_s16(sum, vmulq_lane_s16(src[2], taps_lo, 3));
sum = vqaddq_s16(sum, vmulq_lane_s16(src[3], taps_hi, 0));
} else if (num_taps == 4) {
const int16x4_t taps_lo = vget_low_s16(taps);
sum = vmulq_lane_s16(src[0], taps_lo, 0);
sum = vmlaq_lane_s16(sum, src[3], taps_lo, 3);
// Center taps.
sum = vqaddq_s16(sum, vmulq_lane_s16(src[1], taps_lo, 1));
sum = vqaddq_s16(sum, vmulq_lane_s16(src[2], taps_lo, 2));
} else {
assert(num_taps == 2);
// All the taps are positive so there is no concern regarding saturation.
const int16x4_t taps_lo = vget_low_s16(taps);
sum = vmulq_lane_s16(src[0], taps_lo, 1);
sum = vmlaq_lane_s16(sum, src[1], taps_lo, 2);
}
return sum;
}
// Add an offset to ensure the sum is positive and it fits within uint16_t.
template <int num_taps>
uint16x8_t SumTaps8To16(const int16x8_t* const src, const int16x8_t taps) {
// The worst case sum of negative taps is -56. The worst case sum of positive
// taps is 184. With the single pass versions of the Convolve we could safely
// saturate to int16_t because the final shift and narrow to uint8_t clamps
// out of range values anyway. For the 2D Convolve the intermediate values are
// 16 bits so we don't have that option.
// 184 * 255 = 46920 which is greater than int16_t can hold, but not uint16_t.
// The minimum value we need to handle is -56 * 255 = -14280.
// By offsetting the sum with 1 << 14 = 16384 we ensure that the sum is never
// negative and that 46920 + 16384 = 63304 fits comfortably in uint16_t. This
// allows us to use 16 bit registers instead of 32 bit registers.
// When considering the bit operations it is safe to ignore signedness. Due to
// the magic of 2's complement and well defined rollover rules the bit
// representations are equivalent.
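// As a check: -14280 + 16384 = 2104 >= 0 and 46920 + 16384 = 63304 <= 65535,
// so both extremes fit in uint16_t.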
const int16x4_t taps_lo = vget_low_s16(taps);
const int16x4_t taps_hi = vget_high_s16(taps);
// |offset| == 1 << (bitdepth + kFilterBits - 1);
int16x8_t sum = vdupq_n_s16(1 << 14);
if (num_taps == 8) {
sum = vmlaq_lane_s16(sum, src[0], taps_lo, 0);
sum = vmlaq_lane_s16(sum, src[1], taps_lo, 1);
sum = vmlaq_lane_s16(sum, src[2], taps_lo, 2);
sum = vmlaq_lane_s16(sum, src[3], taps_lo, 3);
sum = vmlaq_lane_s16(sum, src[4], taps_hi, 0);
sum = vmlaq_lane_s16(sum, src[5], taps_hi, 1);
sum = vmlaq_lane_s16(sum, src[6], taps_hi, 2);
sum = vmlaq_lane_s16(sum, src[7], taps_hi, 3);
} else if (num_taps == 6) {
sum = vmlaq_lane_s16(sum, src[0], taps_lo, 1);
sum = vmlaq_lane_s16(sum, src[1], taps_lo, 2);
sum = vmlaq_lane_s16(sum, src[2], taps_lo, 3);
sum = vmlaq_lane_s16(sum, src[3], taps_hi, 0);
sum = vmlaq_lane_s16(sum, src[4], taps_hi, 1);
sum = vmlaq_lane_s16(sum, src[5], taps_hi, 2);
} else if (num_taps == 4) {
sum = vmlaq_lane_s16(sum, src[0], taps_lo, 2);
sum = vmlaq_lane_s16(sum, src[1], taps_lo, 3);
sum = vmlaq_lane_s16(sum, src[2], taps_hi, 0);
sum = vmlaq_lane_s16(sum, src[3], taps_hi, 1);
} else if (num_taps == 2) {
sum = vmlaq_lane_s16(sum, src[0], taps_lo, 3);
sum = vmlaq_lane_s16(sum, src[1], taps_hi, 0);
}
// This is guaranteed to be positive. Convert it for the final shift.
return vreinterpretq_u16_s16(sum);
}
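// Filters 8 horizontally adjacent outputs. The shifted source windows are
// built with vextq_u8 and each tap is applied with vmlal_u8 or vmlsl_u8
// according to its sign (see the "tap signs" comments below); the 1 << 14
// offset keeps the running sum non-negative.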
template <int num_taps, int filter_index, bool negative_outside_taps = true>
uint16x8_t SumCompoundHorizontalTaps(const uint8_t* const src,
const uint8x8_t* const v_tap) {
// Start with an offset to guarantee the sum is non negative.
uint16x8_t v_sum = vdupq_n_u16(1 << 14);
uint8x16_t v_src[8];
v_src[0] = vld1q_u8(&src[0]);
if (num_taps == 8) {
v_src[1] = vextq_u8(v_src[0], v_src[0], 1);
v_src[2] = vextq_u8(v_src[0], v_src[0], 2);
v_src[3] = vextq_u8(v_src[0], v_src[0], 3);
v_src[4] = vextq_u8(v_src[0], v_src[0], 4);
v_src[5] = vextq_u8(v_src[0], v_src[0], 5);
v_src[6] = vextq_u8(v_src[0], v_src[0], 6);
v_src[7] = vextq_u8(v_src[0], v_src[0], 7);
// tap signs : - + - + + - + -
v_sum = vmlsl_u8(v_sum, vget_low_u8(v_src[0]), v_tap[0]);
v_sum = vmlal_u8(v_sum, vget_low_u8(v_src[1]), v_tap[1]);
v_sum = vmlsl_u8(v_sum, vget_low_u8(v_src[2]), v_tap[2]);
v_sum = vmlal_u8(v_sum, vget_low_u8(v_src[3]), v_tap[3]);
v_sum = vmlal_u8(v_sum, vget_low_u8(v_src[4]), v_tap[4]);
v_sum = vmlsl_u8(v_sum, vget_low_u8(v_src[5]), v_tap[5]);
v_sum = vmlal_u8(v_sum, vget_low_u8(v_src[6]), v_tap[6]);
v_sum = vmlsl_u8(v_sum, vget_low_u8(v_src[7]), v_tap[7]);
} else if (num_taps == 6) {
v_src[1] = vextq_u8(v_src[0], v_src[0], 1);
v_src[2] = vextq_u8(v_src[0], v_src[0], 2);
v_src[3] = vextq_u8(v_src[0], v_src[0], 3);
v_src[4] = vextq_u8(v_src[0], v_src[0], 4);
v_src[5] = vextq_u8(v_src[0], v_src[0], 5);
v_src[6] = vextq_u8(v_src[0], v_src[0], 6);
if (filter_index == 0) {
// tap signs : + - + + - +
v_sum = vmlal_u8(v_sum, vget_low_u8(v_src[1]), v_tap[1]);
v_sum = vmlsl_u8(v_sum, vget_low_u8(v_src[2]), v_tap[2]);
v_sum = vmlal_u8(v_sum, vget_low_u8(v_src[3]), v_tap[3]);
v_sum = vmlal_u8(v_sum, vget_low_u8(v_src[4]), v_tap[4]);
v_sum = vmlsl_u8(v_sum, vget_low_u8(v_src[5]), v_tap[5]);
v_sum = vmlal_u8(v_sum, vget_low_u8(v_src[6]), v_tap[6]);
} else {
if (negative_outside_taps) {
// tap signs : - + + + + -
v_sum = vmlsl_u8(v_sum, vget_low_u8(v_src[1]), v_tap[1]);
v_sum = vmlal_u8(v_sum, vget_low_u8(v_src[2]), v_tap[2]);
v_sum = vmlal_u8(v_sum, vget_low_u8(v_src[3]), v_tap[3]);
v_sum = vmlal_u8(v_sum, vget_low_u8(v_src[4]), v_tap[4]);
v_sum = vmlal_u8(v_sum, vget_low_u8(v_src[5]), v_tap[5]);
v_sum = vmlsl_u8(v_sum, vget_low_u8(v_src[6]), v_tap[6]);
} else {
// tap signs : + + + + + +
v_sum = vmlal_u8(v_sum, vget_low_u8(v_src[1]), v_tap[1]);
v_sum = vmlal_u8(v_sum, vget_low_u8(v_src[2]), v_tap[2]);
v_sum = vmlal_u8(v_sum, vget_low_u8(v_src[3]), v_tap[3]);
v_sum = vmlal_u8(v_sum, vget_low_u8(v_src[4]), v_tap[4]);
v_sum = vmlal_u8(v_sum, vget_low_u8(v_src[5]), v_tap[5]);
v_sum = vmlal_u8(v_sum, vget_low_u8(v_src[6]), v_tap[6]);
}
}
} else if (num_taps == 4) {
v_src[2] = vextq_u8(v_src[0], v_src[0], 2);
v_src[3] = vextq_u8(v_src[0], v_src[0], 3);
v_src[4] = vextq_u8(v_src[0], v_src[0], 4);
v_src[5] = vextq_u8(v_src[0], v_src[0], 5);
if (filter_index == 4) {
// tap signs : - + + -
v_sum = vmlsl_u8(v_sum, vget_low_u8(v_src[2]), v_tap[2]);
v_sum = vmlal_u8(v_sum, vget_low_u8(v_src[3]), v_tap[3]);
v_sum = vmlal_u8(v_sum, vget_low_u8(v_src[4]), v_tap[4]);
v_sum = vmlsl_u8(v_sum, vget_low_u8(v_src[5]), v_tap[5]);
} else {
// tap signs : + + + +
v_sum = vmlal_u8(v_sum, vget_low_u8(v_src[2]), v_tap[2]);
v_sum = vmlal_u8(v_sum, vget_low_u8(v_src[3]), v_tap[3]);
v_sum = vmlal_u8(v_sum, vget_low_u8(v_src[4]), v_tap[4]);
v_sum = vmlal_u8(v_sum, vget_low_u8(v_src[5]), v_tap[5]);
}
} else {
assert(num_taps == 2);
v_src[3] = vextq_u8(v_src[0], v_src[0], 3);
v_src[4] = vextq_u8(v_src[0], v_src[0], 4);
// tap signs : + +
v_sum = vmlal_u8(v_sum, vget_low_u8(v_src[3]), v_tap[3]);
v_sum = vmlal_u8(v_sum, vget_low_u8(v_src[4]), v_tap[4]);
}
return v_sum;
}
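// Filters a 2 pixel wide column two rows at a time: the rows are interleaved
// with vzip_u8 so one widening multiply-accumulate per tap covers both rows.
// Only the 2 and 4 tap filters reach this path (|width| == 2).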
template <int num_taps, int filter_index>
uint16x8_t SumHorizontalTaps2xH(const uint8_t* src, const ptrdiff_t src_stride,
const uint8x8_t* const v_tap) {
constexpr int positive_offset_bits = kBitdepth8 + kFilterBits - 1;
uint16x8_t sum = vdupq_n_u16(1 << positive_offset_bits);
uint8x8_t input0 = vld1_u8(src);
src += src_stride;
uint8x8_t input1 = vld1_u8(src);
uint8x8x2_t input = vzip_u8(input0, input1);
if (num_taps == 2) {
// tap signs : + +
sum = vmlal_u8(sum, vext_u8(input.val[0], input.val[1], 6), v_tap[3]);
sum = vmlal_u8(sum, input.val[1], v_tap[4]);
} else if (filter_index == 4) {
// tap signs : - + + -
sum = vmlsl_u8(sum, RightShift<4 * 8>(input.val[0]), v_tap[2]);
sum = vmlal_u8(sum, vext_u8(input.val[0], input.val[1], 6), v_tap[3]);
sum = vmlal_u8(sum, input.val[1], v_tap[4]);
sum = vmlsl_u8(sum, RightShift<2 * 8>(input.val[1]), v_tap[5]);
} else {
// tap signs : + + + +
sum = vmlal_u8(sum, RightShift<4 * 8>(input.val[0]), v_tap[2]);
sum = vmlal_u8(sum, vext_u8(input.val[0], input.val[1], 6), v_tap[3]);
sum = vmlal_u8(sum, input.val[1], v_tap[4]);
sum = vmlal_u8(sum, RightShift<2 * 8>(input.val[1]), v_tap[5]);
}
return vrshrq_n_u16(sum, kInterRoundBitsHorizontal);
}
// TODO(johannkoenig): Rename this function. It works for more than just
// compound convolutions.
template <int num_taps, int step, int filter_index,
bool negative_outside_taps = true, bool is_2d = false,
bool is_8bit = false>
void ConvolveCompoundHorizontalBlock(const uint8_t* src,
const ptrdiff_t src_stride,
void* const dest,
const ptrdiff_t pred_stride,
const int width, const int height,
const uint8x8_t* const v_tap) {
const uint16x8_t v_compound_round_offset = vdupq_n_u16(1 << (kBitdepth8 + 4));
const int16x8_t v_inter_round_bits_0 =
vdupq_n_s16(-kInterRoundBitsHorizontal);
auto* dest8 = static_cast<uint8_t*>(dest);
auto* dest16 = static_cast<uint16_t*>(dest);
if (width > 4) {
int y = 0;
do {
int x = 0;
do {
uint16x8_t v_sum =
SumCompoundHorizontalTaps<num_taps, filter_index,
negative_outside_taps>(&src[x], v_tap);
if (is_8bit) {
// Split the shifts the way they are in C. They can be combined but that
// makes removing the 1 << 14 offset much more difficult.
v_sum = vrshrq_n_u16(v_sum, kInterRoundBitsHorizontal);
int16x8_t v_sum_signed = vreinterpretq_s16_u16(vsubq_u16(
v_sum, vdupq_n_u16(1 << (14 - kInterRoundBitsHorizontal))));
uint8x8_t result = vqrshrun_n_s16(
v_sum_signed, kFilterBits - kInterRoundBitsHorizontal);
vst1_u8(&dest8[x], result);
} else {
v_sum = vrshlq_u16(v_sum, v_inter_round_bits_0);
if (!is_2d) {
v_sum = vaddq_u16(v_sum, v_compound_round_offset);
}
vst1q_u16(&dest16[x], v_sum);
}
x += step;
} while (x < width);
src += src_stride;
dest8 += pred_stride;
dest16 += pred_stride;
} while (++y < height);
return;
} else if (width == 4) {
int y = 0;
do {
uint16x8_t v_sum =
SumCompoundHorizontalTaps<num_taps, filter_index,
negative_outside_taps>(&src[0], v_tap);
if (is_8bit) {
v_sum = vrshrq_n_u16(v_sum, kInterRoundBitsHorizontal);
int16x8_t v_sum_signed = vreinterpretq_s16_u16(vsubq_u16(
v_sum, vdupq_n_u16(1 << (14 - kInterRoundBitsHorizontal))));
uint8x8_t result = vqrshrun_n_s16(
v_sum_signed, kFilterBits - kInterRoundBitsHorizontal);
StoreLo4(&dest8[0], result);
} else {
v_sum = vrshlq_u16(v_sum, v_inter_round_bits_0);
if (!is_2d) {
v_sum = vaddq_u16(v_sum, v_compound_round_offset);
}
vst1_u16(&dest16[0], vget_low_u16(v_sum));
}
src += src_stride;
dest8 += pred_stride;
dest16 += pred_stride;
} while (++y < height);
return;
}
// The horizontal pass only needs to account for |num_taps| of 2 and 4 when
// |width| == 2.
assert(width == 2);
assert(num_taps <= 4);
constexpr int positive_offset_bits = kBitdepth8 + kFilterBits - 1;
// Leave off + 1 << (kBitdepth8 + 3).
constexpr int compound_round_offset = 1 << (kBitdepth8 + 4);
if (num_taps <= 4) {
int y = 0;
do {
// TODO(johannkoenig): Re-order the values for storing.
uint16x8_t sum =
SumHorizontalTaps2xH<num_taps, filter_index>(src, src_stride, v_tap);
if (is_2d) {
dest16[0] = vgetq_lane_u16(sum, 0);
dest16[1] = vgetq_lane_u16(sum, 2);
dest16 += pred_stride;
dest16[0] = vgetq_lane_u16(sum, 1);
dest16[1] = vgetq_lane_u16(sum, 3);
dest16 += pred_stride;
} else if (!is_8bit) {
// None of the test vectors hit this path but the unit tests do.
sum = vaddq_u16(sum, vdupq_n_u16(compound_round_offset));
dest16[0] = vgetq_lane_u16(sum, 0);
dest16[1] = vgetq_lane_u16(sum, 2);
dest16 += pred_stride;
dest16[0] = vgetq_lane_u16(sum, 1);
dest16[1] = vgetq_lane_u16(sum, 3);
dest16 += pred_stride;
} else {
// Split the shifts the way they are in C. They can be combined but that
// makes removing the 1 << 14 offset much more difficult.
int16x8_t sum_signed = vreinterpretq_s16_u16(vsubq_u16(
sum, vdupq_n_u16(
1 << (positive_offset_bits - kInterRoundBitsHorizontal))));
uint8x8_t result =
vqrshrun_n_s16(sum_signed, kFilterBits - kInterRoundBitsHorizontal);
// Could de-interleave and vst1_lane_u16().
dest8[0] = vget_lane_u8(result, 0);
dest8[1] = vget_lane_u8(result, 2);
dest8 += pred_stride;
dest8[0] = vget_lane_u8(result, 1);
dest8[1] = vget_lane_u8(result, 3);
dest8 += pred_stride;
}
src += src_stride << 1;
y += 2;
} while (y < height - 1);
// The 2d filters have an odd |height| because the horizontal pass generates
// context for the vertical pass.
if (is_2d) {
assert(height % 2 == 1);
uint16x8_t sum = vdupq_n_u16(1 << positive_offset_bits);
uint8x8_t input = vld1_u8(src);
if (filter_index == 3) { // |num_taps| == 2
sum = vmlal_u8(sum, RightShift<3 * 8>(input), v_tap[3]);
sum = vmlal_u8(sum, RightShift<4 * 8>(input), v_tap[4]);
} else if (filter_index == 4) {
sum = vmlsl_u8(sum, RightShift<2 * 8>(input), v_tap[2]);
sum = vmlal_u8(sum, RightShift<3 * 8>(input), v_tap[3]);
sum = vmlal_u8(sum, RightShift<4 * 8>(input), v_tap[4]);
sum = vmlsl_u8(sum, RightShift<5 * 8>(input), v_tap[5]);
} else {
assert(filter_index == 5);
sum = vmlal_u8(sum, RightShift<2 * 8>(input), v_tap[2]);
sum = vmlal_u8(sum, RightShift<3 * 8>(input), v_tap[3]);
sum = vmlal_u8(sum, RightShift<4 * 8>(input), v_tap[4]);
sum = vmlal_u8(sum, RightShift<5 * 8>(input), v_tap[5]);
sum = vrshrq_n_u16(sum, kInterRoundBitsHorizontal);
}
Store2<0>(dest16, sum);
}
}
}
// Process 16 bit inputs and output 32 bits.
template <int num_taps>
uint32x4x2_t Sum2DVerticalTaps(const int16x8_t* const src,
const int16x8_t taps) {
// In order to get the rollover correct with the lengthening instruction we
// need to treat these as signed so that they sign extend properly.
const int16x4_t taps_lo = vget_low_s16(taps);
const int16x4_t taps_hi = vget_high_s16(taps);
// An offset to guarantee the sum is non-negative. It covers 56 * -4590 =
// -257040 (the worst case negative value from the horizontal pass). It should
// be possible to use 1 << 18 (262144) instead of 1 << 19 but there probably
// isn't any benefit.
// |offset_bits| = bitdepth + 2 * kFilterBits - kInterRoundBitsHorizontal
// == 19.
int32x4_t sum_lo = vdupq_n_s32(1 << 19);
int32x4_t sum_hi = sum_lo;
if (num_taps == 8) {
sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[0]), taps_lo, 0);
sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[0]), taps_lo, 0);
sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[1]), taps_lo, 1);
sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[1]), taps_lo, 1);
sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[2]), taps_lo, 2);
sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[2]), taps_lo, 2);
sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[3]), taps_lo, 3);
sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[3]), taps_lo, 3);
sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[4]), taps_hi, 0);
sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[4]), taps_hi, 0);
sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[5]), taps_hi, 1);
sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[5]), taps_hi, 1);
sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[6]), taps_hi, 2);
sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[6]), taps_hi, 2);
sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[7]), taps_hi, 3);
sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[7]), taps_hi, 3);
} else if (num_taps == 6) {
sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[0]), taps_lo, 1);
sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[0]), taps_lo, 1);
sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[1]), taps_lo, 2);
sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[1]), taps_lo, 2);
sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[2]), taps_lo, 3);
sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[2]), taps_lo, 3);
sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[3]), taps_hi, 0);
sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[3]), taps_hi, 0);
sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[4]), taps_hi, 1);
sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[4]), taps_hi, 1);
sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[5]), taps_hi, 2);
sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[5]), taps_hi, 2);
} else if (num_taps == 4) {
sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[0]), taps_lo, 2);
sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[0]), taps_lo, 2);
sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[1]), taps_lo, 3);
sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[1]), taps_lo, 3);
sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[2]), taps_hi, 0);
sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[2]), taps_hi, 0);
sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[3]), taps_hi, 1);
sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[3]), taps_hi, 1);
} else if (num_taps == 2) {
sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[0]), taps_lo, 3);
sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[0]), taps_lo, 3);
sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[1]), taps_hi, 0);
sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[1]), taps_hi, 0);
}
// This is guaranteed to be positive. Convert it for the final shift.
const uint32x4x2_t return_val = {vreinterpretq_u32_s32(sum_lo),
vreinterpretq_u32_s32(sum_hi)};
return return_val;
}
// Process 16 bit inputs and output 32 bits.
template <int num_taps>
uint32x4_t Sum2DVerticalTaps(const int16x4_t* const src, const int16x8_t taps) {
// In order to get the rollover correct with the lengthening instruction we
// need to treat these as signed so that they sign extend properly.
const int16x4_t taps_lo = vget_low_s16(taps);
const int16x4_t taps_hi = vget_high_s16(taps);
// An offset to guarantee the sum is non-negative. It covers 56 * -4590 =
// -257040 (the worst case negative value from the horizontal pass). It should
// be possible to use 1 << 18 (262144) instead of 1 << 19 but there probably
// isn't any benefit.
// |offset_bits| = bitdepth + 2 * kFilterBits - kInterRoundBitsHorizontal
// == 19.
int32x4_t sum = vdupq_n_s32(1 << 19);
if (num_taps == 8) {
sum = vmlal_lane_s16(sum, src[0], taps_lo, 0);
sum = vmlal_lane_s16(sum, src[1], taps_lo, 1);
sum = vmlal_lane_s16(sum, src[2], taps_lo, 2);
sum = vmlal_lane_s16(sum, src[3], taps_lo, 3);
sum = vmlal_lane_s16(sum, src[4], taps_hi, 0);
sum = vmlal_lane_s16(sum, src[5], taps_hi, 1);
sum = vmlal_lane_s16(sum, src[6], taps_hi, 2);
sum = vmlal_lane_s16(sum, src[7], taps_hi, 3);
} else if (num_taps == 6) {
sum = vmlal_lane_s16(sum, src[0], taps_lo, 1);
sum = vmlal_lane_s16(sum, src[1], taps_lo, 2);
sum = vmlal_lane_s16(sum, src[2], taps_lo, 3);
sum = vmlal_lane_s16(sum, src[3], taps_hi, 0);
sum = vmlal_lane_s16(sum, src[4], taps_hi, 1);
sum = vmlal_lane_s16(sum, src[5], taps_hi, 2);
} else if (num_taps == 4) {
sum = vmlal_lane_s16(sum, src[0], taps_lo, 2);
sum = vmlal_lane_s16(sum, src[1], taps_lo, 3);
sum = vmlal_lane_s16(sum, src[2], taps_hi, 0);
sum = vmlal_lane_s16(sum, src[3], taps_hi, 1);
} else if (num_taps == 2) {
sum = vmlal_lane_s16(sum, src[0], taps_lo, 3);
sum = vmlal_lane_s16(sum, src[1], taps_hi, 0);
}
// This is guaranteed to be positive. Convert it for the final shift.
return vreinterpretq_u32_s32(sum);
}
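// Vertical pass of the 2D filter. |src| holds the 16 bit intermediate values
// from the horizontal pass. A sliding window of |num_taps| rows is kept in
// |srcs| and shifted down by one row for each output row.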
template <int num_taps, bool is_compound = false>
void Filter2DVertical(const uint16_t* src, const ptrdiff_t src_stride,
void* const dst, const ptrdiff_t dst_stride,
const int width, const int height, const int16x8_t taps,
const int inter_round_bits_vertical) {
constexpr int next_row = num_taps - 1;
const int32x4_t v_inter_round_bits_vertical =
vdupq_n_s32(-inter_round_bits_vertical);
auto* dst8 = static_cast<uint8_t*>(dst);
auto* dst16 = static_cast<uint16_t*>(dst);
if (width > 4) {
int x = 0;
do {
int16x8_t srcs[8];
srcs[0] = vreinterpretq_s16_u16(vld1q_u16(src + x));
if (num_taps >= 4) {
srcs[1] = vreinterpretq_s16_u16(vld1q_u16(src + x + src_stride));
srcs[2] = vreinterpretq_s16_u16(vld1q_u16(src + x + 2 * src_stride));
if (num_taps >= 6) {
srcs[3] = vreinterpretq_s16_u16(vld1q_u16(src + x + 3 * src_stride));
srcs[4] = vreinterpretq_s16_u16(vld1q_u16(src + x + 4 * src_stride));
if (num_taps == 8) {
srcs[5] =
vreinterpretq_s16_u16(vld1q_u16(src + x + 5 * src_stride));
srcs[6] =
vreinterpretq_s16_u16(vld1q_u16(src + x + 6 * src_stride));
}
}
}
int y = 0;
do {
srcs[next_row] = vreinterpretq_s16_u16(
vld1q_u16(src + x + (y + next_row) * src_stride));
const uint32x4x2_t sums = Sum2DVerticalTaps<num_taps>(srcs, taps);
if (is_compound) {
const uint16x8_t results = vcombine_u16(
vmovn_u32(vqrshlq_u32(sums.val[0], v_inter_round_bits_vertical)),
vmovn_u32(vqrshlq_u32(sums.val[1], v_inter_round_bits_vertical)));
vst1q_u16(dst16 + x + y * dst_stride, results);
} else {
const uint16x8_t first_shift =
vcombine_u16(vqrshrn_n_u32(sums.val[0], kInterRoundBitsVertical),
vqrshrn_n_u32(sums.val[1], kInterRoundBitsVertical));
// |single_round_offset| == (1 << bitdepth) + (1 << (bitdepth - 1)) ==
// 384
const uint8x8_t results =
vqmovn_u16(vqsubq_u16(first_shift, vdupq_n_u16(384)));
vst1_u8(dst8 + x + y * dst_stride, results);
}
srcs[0] = srcs[1];
if (num_taps >= 4) {
srcs[1] = srcs[2];
srcs[2] = srcs[3];
if (num_taps >= 6) {
srcs[3] = srcs[4];
srcs[4] = srcs[5];
if (num_taps == 8) {
srcs[5] = srcs[6];
srcs[6] = srcs[7];
}
}
}
} while (++y < height);
x += 8;
} while (x < width);
return;
}
assert(width == 4);
int16x4_t srcs[8];
srcs[0] = vreinterpret_s16_u16(vld1_u16(src));
src += src_stride;
if (num_taps >= 4) {
srcs[1] = vreinterpret_s16_u16(vld1_u16(src));
src += src_stride;
srcs[2] = vreinterpret_s16_u16(vld1_u16(src));
src += src_stride;
if (num_taps >= 6) {
srcs[3] = vreinterpret_s16_u16(vld1_u16(src));
src += src_stride;
srcs[4] = vreinterpret_s16_u16(vld1_u16(src));
src += src_stride;
if (num_taps == 8) {
srcs[5] = vreinterpret_s16_u16(vld1_u16(src));
src += src_stride;
srcs[6] = vreinterpret_s16_u16(vld1_u16(src));
src += src_stride;
}
}
}
int y = 0;
do {
srcs[next_row] = vreinterpret_s16_u16(vld1_u16(src));
src += src_stride;
const uint32x4_t sums = Sum2DVerticalTaps<num_taps>(srcs, taps);
if (is_compound) {
const uint16x4_t results =
vmovn_u32(vqrshlq_u32(sums, v_inter_round_bits_vertical));
vst1_u16(dst16, results);
dst16 += dst_stride;
} else {
const uint16x4_t first_shift =
vqrshrn_n_u32(sums, kInterRoundBitsVertical);
// |single_round_offset| == (1 << bitdepth) + (1 << (bitdepth - 1)) ==
// 384
const uint8x8_t results = vqmovn_u16(
vcombine_u16(vqsub_u16(first_shift, vdup_n_u16(384)), vdup_n_u16(0)));
StoreLo4(dst8, results);
dst8 += dst_stride;
}
srcs[0] = srcs[1];
if (num_taps >= 4) {
srcs[1] = srcs[2];
srcs[2] = srcs[3];
if (num_taps >= 6) {
srcs[3] = srcs[4];
srcs[4] = srcs[5];
if (num_taps == 8) {
srcs[5] = srcs[6];
srcs[6] = srcs[7];
}
}
}
} while (++y < height);
}
template <bool is_2d = false, bool is_8bit = false>
void HorizontalPass(const uint8_t* const src, const ptrdiff_t src_stride,
void* const dst, const ptrdiff_t dst_stride,
const int width, const int height, const int subpixel,
const int filter_index) {
// Duplicate the absolute value for each tap. Negative taps are corrected
// by using the vmlsl_u8 instruction. Positive taps use vmlal_u8.
uint8x8_t v_tap[kSubPixelTaps];
const int filter_id = (subpixel >> 6) & kSubPixelMask;
for (int k = 0; k < kSubPixelTaps; ++k) {
v_tap[k] = vreinterpret_u8_s8(
vabs_s8(vdup_n_s8(kSubPixelFilters[filter_index][filter_id][k])));
}
if (filter_index == 2) { // 8 tap.
ConvolveCompoundHorizontalBlock<8, 8, 2, true, is_2d, is_8bit>(
src, src_stride, dst, dst_stride, width, height, v_tap);
} else if (filter_index == 1) { // 6 tap.
// Check if outside taps are positive.
if ((filter_id == 1) | (filter_id == 15)) {
ConvolveCompoundHorizontalBlock<6, 8, 1, false, is_2d, is_8bit>(
src, src_stride, dst, dst_stride, width, height, v_tap);
} else {
ConvolveCompoundHorizontalBlock<6, 8, 1, true, is_2d, is_8bit>(
src, src_stride, dst, dst_stride, width, height, v_tap);
}
} else if (filter_index == 0) { // 6 tap.
ConvolveCompoundHorizontalBlock<6, 8, 0, true, is_2d, is_8bit>(
src, src_stride, dst, dst_stride, width, height, v_tap);
} else if (filter_index == 4) { // 4 tap.
ConvolveCompoundHorizontalBlock<4, 8, 4, true, is_2d, is_8bit>(
src, src_stride, dst, dst_stride, width, height, v_tap);
} else if (filter_index == 5) { // 4 tap.
ConvolveCompoundHorizontalBlock<4, 8, 5, true, is_2d, is_8bit>(
src, src_stride, dst, dst_stride, width, height, v_tap);
} else { // 2 tap.
ConvolveCompoundHorizontalBlock<2, 8, 3, true, is_2d, is_8bit>(
src, src_stride, dst, dst_stride, width, height, v_tap);
}
}
// There are three forms of this function:
// 2D: input 8bit, output 16bit. |is_compound| has no effect.
// 1D Horizontal: input 8bit, output 8bit.
// 1D Compound Horizontal: input 8bit, output 16bit. Different rounding from 2D.
// |width| is guaranteed to be 2 because all other cases are handled in NEON.
template <bool is_2d = true, bool is_compound = false>
void HorizontalPass2xH(const uint8_t* src, const ptrdiff_t src_stride,
void* const dst, const ptrdiff_t dst_stride,
const int height, const int filter_index, const int taps,
const int subpixel) {
// Even though |is_compound| has no effect when |is_2d| is true we block this
// combination in case the compiler gets confused.
static_assert(!is_2d || !is_compound, "|is_compound| is ignored.");
// Since this only handles |width| == 2, we only need to be concerned with
// 2 or 4 tap filters.
assert(taps == 2 || taps == 4);
auto* dst8 = static_cast<uint8_t*>(dst);
auto* dst16 = static_cast<uint16_t*>(dst);
const int compound_round_offset =
(1 << (kBitdepth8 + 4)) + (1 << (kBitdepth8 + 3));
const int filter_id = (subpixel >> 6) & kSubPixelMask;
const int taps_start = (kSubPixelTaps - taps) / 2;
int y = 0;
do {
int x = 0;
do {
int sum;
if (is_2d) {
// An offset to guarantee the sum is non negative.
sum = 1 << (kBitdepth8 + kFilterBits - 1);
} else if (is_compound) {
sum = 0;
} else {
// 1D non-Compound. The C uses a two stage shift with rounding. Here the
// shifts are combined and the rounding bit from the first stage is
// added in.
// (((sum + 4) >> 3) + 8) >> 4 == (sum + 64 + 4) >> 7
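// For example, with sum == 1000: ((1000 + 4) >> 3) == 125,
// (125 + 8) >> 4 == 8, and (1000 + 68) >> 7 == 8.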
sum = 4;
}
for (int k = 0; k < taps; ++k) {
const int tap = k + taps_start;
sum += kSubPixelFilters[filter_index][filter_id][tap] * src[x + k];
}
if (is_2d) {
dst16[x] = static_cast<int16_t>(
RightShiftWithRounding(sum, kInterRoundBitsHorizontal));
} else if (is_compound) {
sum = RightShiftWithRounding(sum, kInterRoundBitsHorizontal);
dst16[x] = sum + compound_round_offset;
} else {
// 1D non-Compound.
dst8[x] = static_cast<uint8_t>(
Clip3(RightShiftWithRounding(sum, kFilterBits), 0, 255));
}
} while (++x < 2);
src += src_stride;
dst8 += dst_stride;
dst16 += dst_stride;
} while (++y < height);
}
// This will always need to handle all |filter_index| values. Even with |width|
// restricted to 2 the value of |height| can go up to at least 16.
template <bool is_2d = true, bool is_compound = false>
void VerticalPass2xH(const void* const src, const ptrdiff_t src_stride,
void* const dst, const ptrdiff_t dst_stride,
const int height, const int inter_round_bits_vertical,
const int filter_index, const int taps,
const int subpixel) {
const auto* src8 = static_cast<const uint8_t*>(src);
const auto* src16 = static_cast<const uint16_t*>(src);
auto* dst8 = static_cast<uint8_t*>(dst);
auto* dst16 = static_cast<uint16_t*>(dst);
const int filter_id = (subpixel >> 6) & kSubPixelMask;
const int taps_start = (kSubPixelTaps - taps) / 2;
constexpr int max_pixel_value = (1 << kBitdepth8) - 1;
int y = 0;
do {
int x = 0;
do {
int sum;
if (is_2d) {
sum = 1 << (kBitdepth8 + 2 * kFilterBits - kInterRoundBitsHorizontal);
} else if (is_compound) {
// TODO(johannkoenig): Keeping the sum positive is valuable for NEON but
// may not actually help the C implementation. Investigate removing
// this.
// Once shifted right by 3 below, this offset supplies the
// 1 << (kBitdepth8 + 3) term that was left out of |compound_round_offset|.
sum = (1 << (kBitdepth8 + 3)) << 3;
} else {
sum = 0;
}
for (int k = 0; k < taps; ++k) {
const int tap = k + taps_start;
if (is_2d) {
sum += kSubPixelFilters[filter_index][filter_id][tap] *
src16[x + k * src_stride];
} else {
sum += kSubPixelFilters[filter_index][filter_id][tap] *
src8[x + k * src_stride];
}
}
if (is_2d) {
if (is_compound) {
dst16[x] = static_cast<uint16_t>(
RightShiftWithRounding(sum, inter_round_bits_vertical));
} else {
constexpr int single_round_offset =
(1 << kBitdepth8) + (1 << (kBitdepth8 - 1));
dst8[x] = static_cast<uint8_t>(
Clip3(RightShiftWithRounding(sum, kInterRoundBitsVertical) -
single_round_offset,
0, max_pixel_value));
}
} else if (is_compound) {
// Leave off + 1 << (kBitdepth8 + 3).
constexpr int compound_round_offset = 1 << (kBitdepth8 + 4);
dst16[x] = RightShiftWithRounding(sum, 3) + compound_round_offset;
} else {
// 1D non-compound.
dst8[x] = static_cast<uint8_t>(Clip3(
RightShiftWithRounding(sum, kFilterBits), 0, max_pixel_value));
}
} while (++x < 2);
src8 += src_stride;
src16 += src_stride;
dst8 += dst_stride;
dst16 += dst_stride;
} while (++y < height);
}
int NumTapsInFilter(const int filter_index) {
if (filter_index < 2) {
// Despite the names these only use 6 taps.
// kInterpolationFilterEightTap
// kInterpolationFilterEightTapSmooth
return 6;
}
if (filter_index == 2) {
// kInterpolationFilterEightTapSharp
return 8;
}
if (filter_index == 3) {
// kInterpolationFilterBilinear
return 2;
}
assert(filter_index > 3);
// For small sizes (width/height <= 4) the large filters are replaced with 4
// tap options.
// If the original filters were |kInterpolationFilterEightTap| or
// |kInterpolationFilterEightTapSharp| then it becomes
// |kInterpolationFilterSwitchable|.
// If it was |kInterpolationFilterEightTapSmooth| then it becomes an unnamed 4
// tap filter.
return 4;
}
void Convolve2D_NEON(const void* const reference,
const ptrdiff_t reference_stride,
const int horizontal_filter_index,
const int vertical_filter_index,
const int /*inter_round_bits_vertical*/,
const int subpixel_x, const int subpixel_y,
const int /*step_x*/, const int /*step_y*/,
const int width, const int height, void* prediction,
const ptrdiff_t pred_stride) {
const int horiz_filter_index = GetFilterIndex(horizontal_filter_index, width);
const int vert_filter_index = GetFilterIndex(vertical_filter_index, height);
const int horizontal_taps = NumTapsInFilter(horiz_filter_index);
const int vertical_taps = NumTapsInFilter(vert_filter_index);
// The output of the horizontal filter is guaranteed to fit in 16 bits.
uint16_t
intermediate_result[kMaxSuperBlockSizeInPixels *
(kMaxSuperBlockSizeInPixels + kSubPixelTaps - 1)];
const int intermediate_stride = width;
const int intermediate_height = height + vertical_taps - 1;
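// The horizontal pass is run over |vertical_taps| - 1 extra rows so the
// vertical pass has the context it needs.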
if (width >= 4) {
const ptrdiff_t src_stride = reference_stride;
const auto* src = static_cast<const uint8_t*>(reference) -
(vertical_taps / 2 - 1) * src_stride - kHorizontalOffset;
HorizontalPass<true>(src, src_stride, intermediate_result,
intermediate_stride, width, intermediate_height,
subpixel_x, horiz_filter_index);
// Vertical filter.
auto* dest = static_cast<uint8_t*>(prediction);
const ptrdiff_t dest_stride = pred_stride;
const int filter_id = ((subpixel_y & 1023) >> 6) & kSubPixelMask;
const int16x8_t taps =
vld1q_s16(kSubPixelFilters[vert_filter_index][filter_id]);
if (vertical_taps == 8) {
Filter2DVertical<8>(intermediate_result, intermediate_stride, dest,
dest_stride, width, height, taps, 0);
} else if (vertical_taps == 6) {
Filter2DVertical<6>(intermediate_result, intermediate_stride, dest,
dest_stride, width, height, taps, 0);
} else if (vertical_taps == 4) {
Filter2DVertical<4>(intermediate_result, intermediate_stride, dest,
dest_stride, width, height, taps, 0);
} else { // |vertical_taps| == 2
Filter2DVertical<2>(intermediate_result, intermediate_stride, dest,
dest_stride, width, height, taps, 0);
}
} else {
assert(width == 2);
// Horizontal filter.
const auto* const src = static_cast<const uint8_t*>(reference) -
((vertical_taps / 2) - 1) * reference_stride -
((horizontal_taps / 2) - 1);
HorizontalPass2xH(src, reference_stride, intermediate_result,
intermediate_stride, intermediate_height,
horiz_filter_index, horizontal_taps, subpixel_x);
// Vertical filter.
auto* dest = static_cast<uint8_t*>(prediction);
const ptrdiff_t dest_stride = pred_stride;
VerticalPass2xH(intermediate_result, intermediate_stride, dest, dest_stride,
height, 0, vert_filter_index, vertical_taps, subpixel_y);
}
}
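// Widens |sum| to 32 bits, adds the two center tap products there (where they
// cannot overflow) and narrows back to 16 bits with a rounding shift of 3
// (kInterRoundBitsHorizontal).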
template <int tap_lane0, int tap_lane1>
inline int16x8_t CombineFilterTapsLong(const int16x8_t sum,
const int16x8_t src0, int16x8_t src1,
int16x4_t taps0, int16x4_t taps1) {
int32x4_t sum_lo = vmovl_s16(vget_low_s16(sum));
int32x4_t sum_hi = vmovl_s16(vget_high_s16(sum));
const int16x8_t product0 = vmulq_lane_s16(src0, taps0, tap_lane0);
const int16x8_t product1 = vmulq_lane_s16(src1, taps1, tap_lane1);
const int32x4_t center_vals_lo =
vaddl_s16(vget_low_s16(product0), vget_low_s16(product1));
const int32x4_t center_vals_hi =
vaddl_s16(vget_high_s16(product0), vget_high_s16(product1));
sum_lo = vaddq_s32(sum_lo, center_vals_lo);
sum_hi = vaddq_s32(sum_hi, center_vals_hi);
return vcombine_s16(vrshrn_n_s32(sum_lo, 3), vrshrn_n_s32(sum_hi, 3));
}
// TODO(b/133525024): Replace usage of this function with version that uses
// unsigned trick, once cl/263050071 is submitted.
template <int num_taps>
inline int16x8_t SumTapsCompound(const int16x8_t* const src,
const int16x8_t taps) {
int16x8_t sum = vdupq_n_s16(1 << (kBitdepth8 + kFilterBits - 1));
if (num_taps == 8) {
const int16x4_t taps_lo = vget_low_s16(taps);
const int16x4_t taps_hi = vget_high_s16(taps);
sum = vmlaq_lane_s16(sum, src[0], taps_lo, 0);
sum = vmlaq_lane_s16(sum, src[1], taps_lo, 1);
sum = vmlaq_lane_s16(sum, src[2], taps_lo, 2);
sum = vmlaq_lane_s16(sum, src[5], taps_hi, 1);
sum = vmlaq_lane_s16(sum, src[6], taps_hi, 2);
sum = vmlaq_lane_s16(sum, src[7], taps_hi, 3);
// Center taps may sum to as much as 160, which pollutes the sign bit in
// int16 types.
sum = CombineFilterTapsLong<3, 0>(sum, src[3], src[4], taps_lo, taps_hi);
} else if (num_taps == 6) {
const int16x4_t taps_lo = vget_low_s16(taps);
const int16x4_t taps_hi = vget_high_s16(taps);
sum = vmlaq_lane_s16(sum, src[0], taps_lo, 0);
sum = vmlaq_lane_s16(sum, src[1], taps_lo, 1);
sum = vmlaq_lane_s16(sum, src[4], taps_hi, 0);
sum = vmlaq_lane_s16(sum, src[5], taps_hi, 1);
// Center taps in filter 0 may sum to as much as 148, which pollutes the
// sign bit in int16 types. This is not true of filter 1.
sum = CombineFilterTapsLong<2, 3>(sum, src[2], src[3], taps_lo, taps_lo);
} else if (num_taps == 4) {
const int16x4_t taps_lo = vget_low_s16(taps);
sum = vmlaq_lane_s16(sum, src[0], taps_lo, 0);
sum = vmlaq_lane_s16(sum, src[3], taps_lo, 3);
// Center taps.
sum = vqaddq_s16(sum, vmulq_lane_s16(src[1], taps_lo, 1));
sum = vrshrq_n_s16(vqaddq_s16(sum, vmulq_lane_s16(src[2], taps_lo, 2)),
kInterRoundBitsHorizontal);
} else {
assert(num_taps == 2);
// All the taps are positive so there is no concern regarding saturation.
const int16x4_t taps_lo = vget_low_s16(taps);
sum = vmlaq_lane_s16(sum, src[0], taps_lo, 0);
sum = vrshrq_n_s16(vmlaq_lane_s16(sum, src[1], taps_lo, 1),
kInterRoundBitsHorizontal);
}
return sum;
}
// |grade_x| determines an upper limit on how many whole-pixel steps will be
// realized with 8 |step_x| increments.
template <int filter_index, int num_taps, int grade_x>
inline void ConvolveHorizontalScaled_NEON(const uint8_t* src,
const ptrdiff_t src_stride,
const int width, const int subpixel_x,
const int step_x,
const int intermediate_height,
int16_t* dst) {
const int dst_stride = kMaxSuperBlockSizeInPixels;
const int kernel_offset = (8 - num_taps) / 2;
const int ref_x = subpixel_x >> kScaleSubPixelBits;
int y = intermediate_height;
do { // y > 0
int p = subpixel_x;
int prev_p = p;
int x = 0;
int16x8_t s[(grade_x + 1) * 8];
const uint8_t* src_x =
&src[(p >> kScaleSubPixelBits) - ref_x + kernel_offset];
// TODO(petersonab,b/139707209): Fix source buffer overreads.
// For example, when |height| == 2 and |num_taps| == 8 then
// |intermediate_height| == 9. On the second pass this will load and
// transpose 7 rows past where |src| may end.
Load8x8(src_x, src_stride, s);
Transpose8x8(s);
if (grade_x > 1) {
Load8x8(src_x + 8, src_stride, &s[8]);
Transpose8x8(&s[8]);
}
do { // x < width
int16x8_t result[8];
src_x = &src[(p >> kScaleSubPixelBits) - ref_x + kernel_offset];
// process 8 src_x steps
Load8x8(src_x + 8, src_stride, &s[8]);
Transpose8x8(&s[8]);
if (grade_x > 1) {
Load8x8(src_x + 16, src_stride, &s[16]);
Transpose8x8(&s[16]);
}
// Remainder after whole index increments.
int pixel_offset = p & ((1 << kScaleSubPixelBits) - 1);
for (int z = 0; z < 8; ++z) {
const int16x8_t filter = vld1q_s16(
&kSubPixelFilters[filter_index][(p >> 6) & 0xF][kernel_offset]);
result[z] = SumTapsCompound<num_taps>(
&s[pixel_offset >> kScaleSubPixelBits], filter);
pixel_offset += step_x;
p += step_x;
}
// Transpose the 8x8 filtered values back to dst.
Transpose8x8(result);
vst1q_s16(&dst[x + 0 * dst_stride], result[0]);
vst1q_s16(&dst[x + 1 * dst_stride], result[1]);
vst1q_s16(&dst[x + 2 * dst_stride], result[2]);
vst1q_s16(&dst[x + 3 * dst_stride], result[3]);
vst1q_s16(&dst[x + 4 * dst_stride], result[4]);
vst1q_s16(&dst[x + 5 * dst_stride], result[5]);
vst1q_s16(&dst[x + 6 * dst_stride], result[6]);
vst1q_s16(&dst[x + 7 * dst_stride], result[7]);
for (int i = 0; i < 8; ++i) {
s[i] =
s[(p >> kScaleSubPixelBits) - (prev_p >> kScaleSubPixelBits) + i];
if (grade_x > 1) {
s[i + 8] = s[(p >> kScaleSubPixelBits) -
(prev_p >> kScaleSubPixelBits) + i + 8];
}
}
prev_p = p;
x += 8;
} while (x < width);
src += src_stride * 8;
dst += dst_stride * 8;
y -= 8;
} while (y > 0);
}
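// The Get*TapFilter() helpers below store each filter table column-wise: row
// |tap_index| holds tap |tap_index| for every filter_id (0-15), so the
// per-lane taps for a vector of filter_ids can be gathered with a single
// table lookup (VQTbl1U8).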
inline uint8x16_t GetPositive2TapFilter(const int tap_index) {
assert(tap_index < 2);
constexpr uint8_t kSubPixel2TapFilterColumns[2][16] = {
{128, 120, 112, 104, 96, 88, 80, 72, 64, 56, 48, 40, 32, 24, 16, 8},
{0, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120}};
return vld1q_u8(kSubPixel2TapFilterColumns[tap_index]);
}
inline void ConvolveKernelHorizontal2Tap(const uint8_t* src,
const ptrdiff_t src_stride,
const int width, const int subpixel_x,
const int step_x,
const int intermediate_height,
int16_t* intermediate) {
const int kIntermediateStride = kMaxSuperBlockSizeInPixels;
// Account for the 0-taps that precede the 2 nonzero taps.
const int kernel_offset = 3;
const int ref_x = subpixel_x >> kScaleSubPixelBits;
const int step_x8 = step_x << 3;
const uint8x16_t filter_taps0 = GetPositive2TapFilter(0);
const uint8x16_t filter_taps1 = GetPositive2TapFilter(1);
const uint16x8_t sum = vdupq_n_u16(1 << (kBitdepth8 + kFilterBits - 1));
uint16x8_t index_steps = vmulq_n_u16(vmovl_u8(vcreate_u8(0x0706050403020100)),
static_cast<uint16_t>(step_x));
const uint8x8_t filter_index_mask = vdup_n_u8(kSubPixelMask);
for (int x = 0, p = subpixel_x; x < width; x += 8, p += step_x8) {
const uint8_t* src_x =
&src[(p >> kScaleSubPixelBits) - ref_x + kernel_offset];
int16_t* intermediate_x = intermediate + x;
// Only add steps to the 10-bit truncated p to avoid overflow.
const uint16x8_t p_fraction = vdupq_n_u16(p & 1023);
const uint16x8_t subpel_index_offsets = vaddq_u16(index_steps, p_fraction);
const uint8x8_t filter_indices =
vand_u8(vshrn_n_u16(subpel_index_offsets, 6), filter_index_mask);
// This is a special case. The 2-tap filter has no negative taps, so we
// can use unsigned values.
// For each x, a lane of tapsK has
// kSubPixelFilters[filter_index][filter_id][k], where filter_id depends
// on x.
const uint8x8_t taps0 = VQTbl1U8(filter_taps0, filter_indices);
const uint8x8_t taps1 = VQTbl1U8(filter_taps1, filter_indices);
for (int y = 0; y < intermediate_height; ++y) {
// Load a pool of samples to select from using stepped indices.
uint8x16_t src_vals = vld1q_u8(src_x);
const uint8x8_t src_indices =
vmovn_u16(vshrq_n_u16(subpel_index_offsets, kScaleSubPixelBits));
// For each x, a lane of srcK contains src_x[k].
const uint8x8_t src0 = VQTbl1U8(src_vals, src_indices);
const uint8x8_t src1 =
VQTbl1U8(src_vals, vadd_u8(src_indices, vdup_n_u8(1)));
const uint16x8_t product0 = vmlal_u8(sum, taps0, src0);
// product0 + product1
const uint16x8_t result = vmlal_u8(product0, taps1, src1);
vst1q_s16(intermediate_x, vreinterpretq_s16_u16(vrshrq_n_u16(result, 3)));
src_x += src_stride;
intermediate_x += kIntermediateStride;
}
}
}
inline uint8x16_t GetPositive4TapFilter(const int tap_index) {
assert(tap_index < 4);
constexpr uint8_t kSubPixel4TapPositiveFilterColumns[4][16] = {
{0, 30, 26, 22, 20, 18, 16, 14, 12, 12, 10, 8, 6, 4, 4, 2},
{128, 62, 62, 62, 60, 58, 56, 54, 52, 48, 46, 44, 42, 40, 36, 34},
{0, 34, 36, 40, 42, 44, 46, 48, 52, 54, 56, 58, 60, 62, 62, 62},
{0, 2, 4, 4, 6, 8, 10, 12, 12, 14, 16, 18, 20, 22, 26, 30}};
uint8x16_t filter_taps =
vld1q_u8(kSubPixel4TapPositiveFilterColumns[tap_index]);
return filter_taps;
}
// This filter is only possible when width <= 4.
inline void ConvolveKernelHorizontalPositive4Tap(
const uint8_t* src, const ptrdiff_t src_stride, const int subpixel_x,
const int step_x, const int intermediate_height, int16_t* intermediate) {
const int kernel_offset = 2;
const int ref_x = subpixel_x >> kScaleSubPixelBits;
const uint8x8_t filter_index_mask = vdup_n_u8(kSubPixelMask);
const uint8x16_t filter_taps0 = GetPositive4TapFilter(0);
const uint8x16_t filter_taps1 = GetPositive4TapFilter(1);
const uint8x16_t filter_taps2 = GetPositive4TapFilter(2);
const uint8x16_t filter_taps3 = GetPositive4TapFilter(3);
uint16x8_t index_steps = vmulq_n_u16(vmovl_u8(vcreate_u8(0x0706050403020100)),
static_cast<uint16_t>(step_x));
int p = subpixel_x;
const uint16x8_t base = vdupq_n_u16(1 << (kBitdepth8 + kFilterBits - 1));
// The first filter is special: just a 128 tap on the center.
const uint8_t* src_x =
&src[(p >> kScaleSubPixelBits) - ref_x + kernel_offset];
// Only add steps to the 10-bit truncated p to avoid overflow.
const uint16x8_t p_fraction = vdupq_n_u16(p & 1023);
const uint16x8_t subpel_index_offsets = vaddq_u16(index_steps, p_fraction);
const uint8x8_t filter_indices =
vand_u8(vshrn_n_u16(subpel_index_offsets, 6), filter_index_mask);
// Note that filter_id depends on x.
// For each x, tapsK has kSubPixelFilters[filter_index][filter_id][k].
const uint8x8_t taps0 = VQTbl1U8(filter_taps0, filter_indices);
const uint8x8_t taps1 = VQTbl1U8(filter_taps1, filter_indices);
const uint8x8_t taps2 = VQTbl1U8(filter_taps2, filter_indices);
const uint8x8_t taps3 = VQTbl1U8(filter_taps3, filter_indices);
const uint8x8_t src_indices =
vmovn_u16(vshrq_n_u16(subpel_index_offsets, kScaleSubPixelBits));
for (int y = 0; y < intermediate_height; ++y) {
// Load a pool of samples to select from using stepped index vectors.
uint8x16_t src_vals = vld1q_u8(src_x);
// For each x, srcK contains src_x[k].
// While the taps come from different arrays, the src pixels are drawn from
// the same contiguous line.
const uint8x8_t src0 = VQTbl1U8(src_vals, src_indices);
const uint8x8_t src1 =
VQTbl1U8(src_vals, vadd_u8(src_indices, vdup_n_u8(1)));
const uint8x8_t src2 =
VQTbl1U8(src_vals, vadd_u8(src_indices, vdup_n_u8(2)));
const uint8x8_t src3 =
VQTbl1U8(src_vals, vadd_u8(src_indices, vdup_n_u8(3)));
uint16x8_t sum = vmlal_u8(base, taps0, src0);
sum = vmlal_u8(sum, taps1, src1);
sum = vmlal_u8(sum, taps2, src2);
sum = vmlal_u8(sum, taps3, src3);
vst1_s16(intermediate,
vreinterpret_s16_u16(vrshr_n_u16(vget_low_u16(sum), 3)));
src_x += src_stride;
intermediate += kIntermediateStride;
}
}
inline uint8x16_t GetSigned4TapFilter(const int tap_index) {
assert(tap_index < 4);
// The first and fourth taps of each filter are negative. However, 128 does
// not fit in an 8-bit signed integer, so the tap magnitudes are kept unsigned
// and the negative taps are applied with subtraction (vmlsl_u8).
constexpr uint8_t kSubPixel4TapSignedFilterColumns[4][16] = {
{0, 4, 8, 10, 12, 12, 14, 12, 12, 10, 10, 10, 8, 6, 4, 2},
{128, 126, 122, 116, 110, 102, 94, 84, 76, 66, 58, 48, 38, 28, 18, 8},
{0, 8, 18, 28, 38, 48, 58, 66, 76, 84, 94, 102, 110, 116, 122, 126},
{0, 2, 4, 6, 8, 10, 10, 10, 12, 12, 14, 12, 12, 10, 8, 4}};
uint8x16_t filter_taps =
vld1q_u8(kSubPixel4TapSignedFilterColumns[tap_index]);
return filter_taps;
}
// This filter is only possible when width <= 4.
inline void ConvolveKernelHorizontalSigned4Tap(
const uint8_t* src, const ptrdiff_t src_stride, const int subpixel_x,
const int step_x, const int intermediate_height, int16_t* intermediate) {
const int kernel_offset = 2;
const int ref_x = subpixel_x >> kScaleSubPixelBits;
const uint8x8_t filter_index_mask = vdup_n_u8(kSubPixelMask);
const uint8x16_t filter_taps0 = GetSigned4TapFilter(0);
const uint8x16_t filter_taps1 = GetSigned4TapFilter(1);
const uint8x16_t filter_taps2 = GetSigned4TapFilter(2);
const uint8x16_t filter_taps3 = GetSigned4TapFilter(3);
const uint16x8_t index_steps = vmulq_n_u16(vmovl_u8(vcreate_u8(0x03020100)),
static_cast<uint16_t>(step_x));
const uint16x8_t base = vdupq_n_u16(1 << (kBitdepth8 + kFilterBits - 1));
int p = subpixel_x;
const uint8_t* src_x =
&src[(p >> kScaleSubPixelBits) - ref_x + kernel_offset];
// Only add steps to the 10-bit truncated p to avoid overflow.
const uint16x8_t p_fraction = vdupq_n_u16(p & 1023);
const uint16x8_t subpel_index_offsets = vaddq_u16(index_steps, p_fraction);
const uint8x8_t filter_indices =
vand_u8(vshrn_n_u16(subpel_index_offsets, 6), filter_index_mask);
// Note that filter_id depends on x.
// For each x, tapsK has kSubPixelFilters[filter_index][filter_id][k].
const uint8x8_t taps0 = VQTbl1U8(filter_taps0, filter_indices);
const uint8x8_t taps1 = VQTbl1U8(filter_taps1, filter_indices);
const uint8x8_t taps2 = VQTbl1U8(filter_taps2, filter_indices);
const uint8x8_t taps3 = VQTbl1U8(filter_taps3, filter_indices);
for (int y = 0; y < intermediate_height; ++y) {
// Load a pool of samples to select from using stepped indices.
uint8x16_t src_vals = vld1q_u8(src_x);
const uint8x8_t src_indices =
vmovn_u16(vshrq_n_u16(subpel_index_offsets, kScaleSubPixelBits));
// For each x, srcK contains src_x[k].
// While the taps come from different arrays, the src pixels are drawn from
// the same contiguous line.
const uint8x8_t src0 = VQTbl1U8(src_vals, src_indices);
const uint8x8_t src1 =
VQTbl1U8(src_vals, vadd_u8(src_indices, vdup_n_u8(1)));
const uint8x8_t src2 =
VQTbl1U8(src_vals, vadd_u8(src_indices, vdup_n_u8(2)));
const uint8x8_t src3 =
VQTbl1U8(src_vals, vadd_u8(src_indices, vdup_n_u8(3)));
// Offsetting by |base| guarantees the sum stays positive.
uint16x8_t sum = vmlsl_u8(base, taps0, src0);
sum = vmlal_u8(sum, taps1, src1);
sum = vmlal_u8(sum, taps2, src2);
sum = vmlsl_u8(sum, taps3, src3);
vst1_s16(intermediate,
vreinterpret_s16_u16(vrshr_n_u16(vget_low_u16(sum), 3)));
src_x += src_stride;
intermediate += kIntermediateStride;
}
}
void ConvolveCompoundScale2D_NEON(
const void* const reference, const ptrdiff_t reference_stride,
const int horizontal_filter_index, const int vertical_filter_index,
const int inter_round_bits_vertical, const int subpixel_x,
const int subpixel_y, const int step_x, const int step_y, const int width,
const int height, void* prediction, const ptrdiff_t pred_stride) {
const int intermediate_height =
(((height - 1) * step_y + (1 << kScaleSubPixelBits) - 1) >>
kScaleSubPixelBits) +
kSubPixelTaps;
// TODO(b/133525024): Decide whether it's worth branching to a special case
// when step_x or step_y is 1024.
assert(step_x <= 2048);
// The output of the horizontal filter, i.e. the intermediate_result, is
// guaranteed to fit in int16_t.
int16_t intermediate_result[kMaxSuperBlockSizeInPixels *
(2 * kMaxSuperBlockSizeInPixels + 8)];
// Horizontal filter.
// Filter types used for width <= 4 are different from those for width > 4.
// When width > 4, the valid filter index range is always [0, 3].
// When width <= 4, the valid filter index range is always [3, 5].
// Similarly for height.
const int kIntermediateStride = kMaxSuperBlockSizeInPixels;
int filter_index = GetFilterIndex(horizontal_filter_index, width);
int16_t* intermediate = intermediate_result;
const auto* src = static_cast<const uint8_t*>(reference);
const ptrdiff_t src_stride = reference_stride;
auto* dest = static_cast<uint16_t*>(prediction);
switch (filter_index) {
case 0:
if (step_x < 1024) {
ConvolveHorizontalScaled_NEON<0, 6, 1>(
src, src_stride, width, subpixel_x, step_x, intermediate_height,
intermediate);
} else {
ConvolveHorizontalScaled_NEON<0, 6, 2>(
src, src_stride, width, subpixel_x, step_x, intermediate_height,
intermediate);
}
break;
case 1:
if (step_x < 1024) {
ConvolveHorizontalScaled_NEON<1, 6, 1>(
src, src_stride, width, subpixel_x, step_x, intermediate_height,
intermediate);
} else {
ConvolveHorizontalScaled_NEON<1, 6, 2>(
src, src_stride, width, subpixel_x, step_x, intermediate_height,
intermediate);
}
break;
case 2:
if (step_x <= 1024) {
ConvolveHorizontalScaled_NEON<2, 8, 1>(
src, src_stride, width, subpixel_x, step_x, intermediate_height,
intermediate);
} else {
ConvolveHorizontalScaled_NEON<2, 8, 2>(
src, src_stride, width, subpixel_x, step_x, intermediate_height,
intermediate);
}
break;
case 3:
ConvolveKernelHorizontal2Tap(src, src_stride, width, subpixel_x, step_x,
intermediate_height, intermediate);
break;
case 4:
assert(width <= 4);
ConvolveKernelHorizontalSigned4Tap(src, src_stride, subpixel_x, step_x,
intermediate_height, intermediate);
break;
default:
assert(filter_index == 5);
ConvolveKernelHorizontalPositive4Tap(src, src_stride, subpixel_x, step_x,
intermediate_height, intermediate);
}
// Vertical filter.
filter_index = GetFilterIndex(vertical_filter_index, height);
intermediate = intermediate_result;
const int offset_bits = kBitdepth8 + 2 * kFilterBits - 3;
for (int y = 0, p = subpixel_y & 1023; y < height; ++y, p += step_y) {
const int filter_id = (p >> 6) & kSubPixelMask;
for (int x = 0; x < width; ++x) {
// An offset to guarantee the sum is non negative.
int sum = 1 << offset_bits;
for (int k = 0; k < kSubPixelTaps; ++k) {
sum +=
kSubPixelFilters[filter_index][filter_id][k] *
intermediate[((p >> kScaleSubPixelBits) + k) * kIntermediateStride +
x];
}
assert(sum >= 0 && sum < (1 << (offset_bits + 2)));
dest[x] = static_cast<uint16_t>(
RightShiftWithRounding(sum, inter_round_bits_vertical));
}
dest += pred_stride;
}
}
void ConvolveHorizontal_NEON(const void* const reference,
const ptrdiff_t reference_stride,
const int horizontal_filter_index,
const int /*vertical_filter_index*/,
const int /*inter_round_bits_vertical*/,
const int subpixel_x, const int /*subpixel_y*/,
const int /*step_x*/, const int /*step_y*/,
const int width, const int height,
void* prediction, const ptrdiff_t pred_stride) {
// For 8 (and 10) bit calculations |inter_round_bits_horizontal| is 3.
const int filter_index = GetFilterIndex(horizontal_filter_index, width);
// Set |src| to the outermost tap.
const auto* src = static_cast<const uint8_t*>(reference) - kHorizontalOffset;
auto* dest = static_cast<uint8_t*>(prediction);
HorizontalPass<false, true>(src, reference_stride, dest, pred_stride, width,
height, subpixel_x, filter_index);
}
template <int min_width, int num_taps>
void FilterVertical(const uint8_t* src, const ptrdiff_t src_stride,
uint8_t* dst, const ptrdiff_t dst_stride, const int width,
const int height, const int16x8_t taps) {
constexpr int next_row = num_taps - 1;
// |src| points to the outermost tap of the first value. When doing fewer than
// 8 taps it needs to be adjusted.
if (num_taps == 6) {
src += src_stride;
} else if (num_taps == 4) {
src += 2 * src_stride;
} else if (num_taps == 2) {
src += 3 * src_stride;
}
int x = 0;
do {
int16x8_t srcs[8];
srcs[0] = ZeroExtend(vld1_u8(src + x));
if (num_taps >= 4) {
srcs[1] = ZeroExtend(vld1_u8(src + x + src_stride));
srcs[2] = ZeroExtend(vld1_u8(src + x + 2 * src_stride));
if (num_taps >= 6) {
srcs[3] = ZeroExtend(vld1_u8(src + x + 3 * src_stride));
srcs[4] = ZeroExtend(vld1_u8(src + x + 4 * src_stride));
if (num_taps == 8) {
srcs[5] = ZeroExtend(vld1_u8(src + x + 5 * src_stride));
srcs[6] = ZeroExtend(vld1_u8(src + x + 6 * src_stride));
}
}
}
int y = 0;
do {
srcs[next_row] =
ZeroExtend(vld1_u8(src + x + (y + next_row) * src_stride));
const int16x8_t sums = SumTaps<num_taps>(srcs, taps);
const uint8x8_t results = vqrshrun_n_s16(sums, kFilterBits);
if (min_width == 4) {
StoreLo4(dst + x + y * dst_stride, results);
} else {
vst1_u8(dst + x + y * dst_stride, results);
}
srcs[0] = srcs[1];
if (num_taps >= 4) {
srcs[1] = srcs[2];
srcs[2] = srcs[3];
if (num_taps >= 6) {
srcs[3] = srcs[4];
srcs[4] = srcs[5];
if (num_taps == 8) {
srcs[5] = srcs[6];
srcs[6] = srcs[7];
}
}
}
} while (++y < height);
x += 8;
} while (x < width);
}
// This function is a simplified version of Convolve2D_C.
// It is used in single prediction mode, where only vertical filtering is
// required.
// The output is the single prediction of the block, clipped to valid pixel
// range.
void ConvolveVertical_NEON(const void* const reference,
const ptrdiff_t reference_stride,
const int /*horizontal_filter_index*/,
const int vertical_filter_index,
const int /*inter_round_bits_vertical*/,
const int /*subpixel_x*/, const int subpixel_y,
const int /*step_x*/, const int /*step_y*/,
const int width, const int height, void* prediction,
const ptrdiff_t pred_stride) {
const int filter_index = GetFilterIndex(vertical_filter_index, height);
const ptrdiff_t src_stride = reference_stride;
const auto* src =
static_cast<const uint8_t*>(reference) - kVerticalOffset * src_stride;
auto* dest = static_cast<uint8_t*>(prediction);
const ptrdiff_t dest_stride = pred_stride;
const int filter_id = (subpixel_y >> 6) & kSubPixelMask;
// First filter is always a copy.
if (filter_id == 0) {
// Point |src| at the actual values rather than the start of the context.
src = static_cast<const uint8_t*>(reference);
int y = 0;
do {
memcpy(dest, src, width * sizeof(src[0]));
src += src_stride;
dest += dest_stride;
} while (++y < height);
return;
}
// Break up by # of taps
// |filter_index| taps enum InterpolationFilter
// 0 6 kInterpolationFilterEightTap
// 1 6 kInterpolationFilterEightTapSmooth
// 2 8 kInterpolationFilterEightTapSharp
// 3 2 kInterpolationFilterBilinear
// 4 4 kInterpolationFilterSwitchable
// 5 4 !!! SECRET FILTER !!! only for Wx4.
if (width >= 4) {
if (filter_index == 2) { // 8 tap.
const int16x8_t taps =
vld1q_s16(kSubPixelFilters[filter_index][filter_id]);
if (width == 4) {
FilterVertical<4, 8>(src, src_stride, dest, dest_stride, width, height,
taps);
} else {
FilterVertical<8, 8>(src, src_stride, dest, dest_stride, width, height,
taps);
}
} else if (filter_index < 2) { // 6 tap.
const int16x8_t taps =
vld1q_s16(kSubPixelFilters[filter_index][filter_id]);
if (width == 4) {
FilterVertical<4, 6>(src, src_stride, dest, dest_stride, width, height,
taps);
} else {
FilterVertical<8, 6>(src, src_stride, dest, dest_stride, width, height,
taps);
}
} else if (filter_index > 3) { // 4 tap.
// Store taps in vget_low_s16(taps).
const int16x8_t taps =
vld1q_s16(kSubPixelFilters[filter_index][filter_id] + 2);
if (width == 4) {
FilterVertical<4, 4>(src, src_stride, dest, dest_stride, width, height,
taps);
} else {
FilterVertical<8, 4>(src, src_stride, dest, dest_stride, width, height,
taps);
}
} else { // 2 tap.
// Store taps in vget_low_s16(taps).
const int16x8_t taps =
vld1q_s16(kSubPixelFilters[filter_index][filter_id] + 2);
if (width == 4) {
FilterVertical<4, 2>(src, src_stride, dest, dest_stride, width, height,
taps);
} else {
FilterVertical<8, 2>(src, src_stride, dest, dest_stride, width, height,
taps);
}
}
} else {
assert(width == 2);
const int taps = NumTapsInFilter(filter_index);
src =
static_cast<const uint8_t*>(reference) - ((taps / 2) - 1) * src_stride;
VerticalPass2xH</*is_2d=*/false>(src, src_stride, dest, pred_stride, height,
0, filter_index, taps, subpixel_y);
}
}
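// Compound copy: each source pixel is shifted left by 4 and
// |compound_round_offset| is added before being stored to the 16 bit
// prediction buffer. No filtering is performed.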
void ConvolveCompoundCopy_NEON(
const void* const reference, const ptrdiff_t reference_stride,
const int /*horizontal_filter_index*/, const int /*vertical_filter_index*/,
const int /*inter_round_bits_vertical*/, const int /*subpixel_x*/,
const int /*subpixel_y*/, const int /*step_x*/, const int /*step_y*/,
const int width, const int height, void* prediction,
const ptrdiff_t pred_stride) {
const auto* src = static_cast<const uint8_t*>(reference);
const ptrdiff_t src_stride = reference_stride;
auto* dest = static_cast<uint16_t*>(prediction);
const int compound_round_offset =
(1 << (kBitdepth8 + 4)) + (1 << (kBitdepth8 + 3));
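// With kBitdepth8 == 8 this is (1 << 12) + (1 << 11) == 4096 + 2048 == 6144.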
const uint16x8_t v_compound_round_offset = vdupq_n_u16(compound_round_offset);
if (width >= 16) {
int y = 0;
do {
int x = 0;
do {
const uint8x16_t v_src = vld1q_u8(&src[x]);
const uint16x8_t v_src_x16_lo = vshll_n_u8(vget_low_u8(v_src), 4);
const uint16x8_t v_src_x16_hi = vshll_n_u8(vget_high_u8(v_src), 4);
const uint16x8_t v_dest_lo =
vaddq_u16(v_src_x16_lo, v_compound_round_offset);
const uint16x8_t v_dest_hi =
vaddq_u16(v_src_x16_hi, v_compound_round_offset);
vst1q_u16(&dest[x], v_dest_lo);
x += 8;
vst1q_u16(&dest[x], v_dest_hi);
x += 8;
} while (x < width);
src += src_stride;
dest += pred_stride;
} while (++y < height);
} else if (width == 8) {
int y = 0;
do {
const uint8x8_t v_src = vld1_u8(&src[0]);
const uint16x8_t v_src_x16 = vshll_n_u8(v_src, 4);
vst1q_u16(&dest[0], vaddq_u16(v_src_x16, v_compound_round_offset));
src += src_stride;
dest += pred_stride;
} while (++y < height);
} else if (width == 4) {
const uint8x8_t zero = vdup_n_u8(0);
int y = 0;
do {
const uint8x8_t v_src = LoadLo4(&src[0], zero);
const uint16x8_t v_src_x16 = vshll_n_u8(v_src, 4);
const uint16x8_t v_dest = vaddq_u16(v_src_x16, v_compound_round_offset);
vst1_u16(&dest[0], vget_low_u16(v_dest));
src += src_stride;
dest += pred_stride;
} while (++y < height);
} else { // width == 2
assert(width == 2);
int y = 0;
do {
dest[0] = (src[0] << 4) + compound_round_offset;
dest[1] = (src[1] << 4) + compound_round_offset;
src += src_stride;
dest += pred_stride;
} while (++y < height);
}
}
// Input 8 bits and output 16 bits.
template <int min_width, int num_taps>
void FilterCompoundVertical(const uint8_t* src, const ptrdiff_t src_stride,
uint16_t* dst, const ptrdiff_t dst_stride,
const int width, const int height,
const int16x8_t taps) {
constexpr int next_row = num_taps - 1;
// |src| points to the outermost tap of the first output row. When fewer than
// 8 taps are used it needs to be adjusted.
if (num_taps == 6) {
src += src_stride;
} else if (num_taps == 4) {
src += 2 * src_stride;
} else if (num_taps == 2) {
src += 3 * src_stride;
}
const uint16x8_t compound_round_offset = vdupq_n_u16(1 << 12);
int x = 0;
do {
int16x8_t srcs[8];
srcs[0] = ZeroExtend(vld1_u8(src + x));
if (num_taps >= 4) {
srcs[1] = ZeroExtend(vld1_u8(src + x + src_stride));
srcs[2] = ZeroExtend(vld1_u8(src + x + 2 * src_stride));
if (num_taps >= 6) {
srcs[3] = ZeroExtend(vld1_u8(src + x + 3 * src_stride));
srcs[4] = ZeroExtend(vld1_u8(src + x + 4 * src_stride));
if (num_taps == 8) {
srcs[5] = ZeroExtend(vld1_u8(src + x + 5 * src_stride));
srcs[6] = ZeroExtend(vld1_u8(src + x + 6 * src_stride));
}
}
}
int y = 0;
do {
srcs[next_row] =
ZeroExtend(vld1_u8(src + x + (y + next_row) * src_stride));
const uint16x8_t sums = SumTaps8To16<num_taps>(srcs, taps);
const uint16x8_t shifted = vrshrq_n_u16(sums, 3);
// In order to keep the sum in 16 bits we add an offset to the sum
// (1 << (bitdepth + kFilterBits - 1) == 1 << 14). This ensures that the
// results will never be negative.
// Normally ConvolveCompoundVertical would add |compound_round_offset|, i.e.
// (1 << (bitdepth + 4)) + (1 << (bitdepth + 3)) == (1 << 12) + (1 << 11),
// at the end. Here the initial offset supplies part of that value.
// The reference calculation is
// RightShiftWithRounding(LeftShift(sum, bits_shift),
//                        inter_round_bits_vertical)
// where bits_shift == kFilterBits - kInterRoundBitsHorizontal == 4 and
// inter_round_bits_vertical == 7, which simplifies to
// RightShiftWithRounding(sum, 3). After that shift the initial offset becomes
// 1 << 14 >> 3 == 1 << 11, so |compound_round_offset| can be reduced to
// 1 << 12.
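// Concretely: the 1 << 14 == 16384 carried in |sums| becomes
// 16384 >> 3 == 2048 after the rounding shift; adding 1 << 12 == 4096 here
// gives 2048 + 4096 == 6144, matching the offset added in
// ConvolveCompoundCopy_NEON above.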
const uint16x8_t offset = vaddq_u16(shifted, compound_round_offset);
if (min_width == 4) {
vst1_u16(dst + x + y * dst_stride, vget_low_u16(offset));
} else {
vst1q_u16(dst + x + y * dst_stride, offset);
}
srcs[0] = srcs[1];
if (num_taps >= 4) {
srcs[1] = srcs[2];
srcs[2] = srcs[3];
if (num_taps >= 6) {
srcs[3] = srcs[4];
srcs[4] = srcs[5];
if (num_taps == 8) {
srcs[5] = srcs[6];
srcs[6] = srcs[7];
}
}
}
} while (++y < height);
x += 8;
} while (x < width);
}
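// Vertical-only filtering for compound prediction. Unlike
// ConvolveVertical_NEON the results are stored as unclipped 16 bit values
// (FilterCompoundVertical for width >= 4, VerticalPass2xH for width == 2).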
void ConvolveCompoundVertical_NEON(
const void* const reference, const ptrdiff_t reference_stride,
const int /*horizontal_filter_index*/, const int vertical_filter_index,
const int /*inter_round_bits_vertical*/, const int /*subpixel_x*/,
const int subpixel_y, const int /*step_x*/, const int /*step_y*/,
const int width, const int height, void* prediction,
const ptrdiff_t pred_stride) {
const int filter_index = GetFilterIndex(vertical_filter_index, height);
const ptrdiff_t src_stride = reference_stride;
const auto* src =
static_cast<const uint8_t*>(reference) - kVerticalOffset * src_stride;
auto* dest = static_cast<uint16_t*>(prediction);
const int filter_id = (subpixel_y >> 6) & kSubPixelMask;
if (width >= 4) {
const int16x8_t taps = vld1q_s16(kSubPixelFilters[filter_index][filter_id]);
if (filter_index == 2) { // 8 tap.
if (width == 4) {
FilterCompoundVertical<4, 8>(src, src_stride, dest, pred_stride, width,
height, taps);
} else {
FilterCompoundVertical<8, 8>(src, src_stride, dest, pred_stride, width,
height, taps);
}
} else if (filter_index < 2) { // 6 tap.
if (width == 4) {
FilterCompoundVertical<4, 6>(src, src_stride, dest, pred_stride, width,
height, taps);
} else {
FilterCompoundVertical<8, 6>(src, src_stride, dest, pred_stride, width,
height, taps);
}
} else if (filter_index == 3) { // 2 tap.
if (width == 4) {
FilterCompoundVertical<4, 2>(src, src_stride, dest, pred_stride, width,
height, taps);
} else {
FilterCompoundVertical<8, 2>(src, src_stride, dest, pred_stride, width,
height, taps);
}
} else if (filter_index > 3) { // 4 tap.
if (width == 4) {
FilterCompoundVertical<4, 4>(src, src_stride, dest, pred_stride, width,
height, taps);
} else {
FilterCompoundVertical<8, 4>(src, src_stride, dest, pred_stride, width,
height, taps);
}
}
} else {
assert(width == 2);
const int taps = NumTapsInFilter(filter_index);
src =
static_cast<const uint8_t*>(reference) - ((taps / 2) - 1) * src_stride;
VerticalPass2xH</*is_2d=*/false, /*is_compound=*/true>(
src, src_stride, dest, pred_stride, height, 0, filter_index, taps,
subpixel_y);
}
}
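// Horizontal-only filtering for compound prediction. HorizontalPass writes
// its 16 bit output directly to the compound prediction buffer.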
void ConvolveCompoundHorizontal_NEON(
const void* const reference, const ptrdiff_t reference_stride,
const int horizontal_filter_index, const int /*vertical_filter_index*/,
const int /*inter_round_bits_vertical*/, const int subpixel_x,
const int /*subpixel_y*/, const int /*step_x*/, const int /*step_y*/,
const int width, const int height, void* prediction,
const ptrdiff_t pred_stride) {
const int filter_index = GetFilterIndex(horizontal_filter_index, width);
const auto* src = static_cast<const uint8_t*>(reference) - kHorizontalOffset;
auto* dest = static_cast<uint16_t*>(prediction);
HorizontalPass(src, reference_stride, dest, pred_stride, width, height,
subpixel_x, filter_index);
}
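// Compound 2D filtering: a horizontal pass into |intermediate_result|
// followed by a vertical pass (Filter2DVertical, or VerticalPass2xH for
// width == 2) that writes the 16 bit compound prediction.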
void ConvolveCompound2D_NEON(const void* const reference,
const ptrdiff_t reference_stride,
const int horizontal_filter_index,
const int vertical_filter_index,
const int inter_round_bits_vertical,
const int subpixel_x, const int subpixel_y,
const int /*step_x*/, const int /*step_y*/,
const int width, const int height,
void* prediction, const ptrdiff_t pred_stride) {
// The output of the horizontal filter, i.e. |intermediate_result|, is
// guaranteed to fit in 16 bits.
uint16_t
intermediate_result[kMaxSuperBlockSizeInPixels *
(kMaxSuperBlockSizeInPixels + kSubPixelTaps - 1)];
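// The buffer is sized for the worst case: kMaxSuperBlockSizeInPixels columns
// by kMaxSuperBlockSizeInPixels + kSubPixelTaps - 1 rows, since the
// horizontal pass must also filter the rows of vertical filter context.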
const int intermediate_stride = kMaxSuperBlockSizeInPixels;
// Horizontal filter.
// Filter types used for width <= 4 are different from those for width > 4.
// When width > 4, the valid filter index range is always [0, 3].
// When width <= 4, the valid filter index range is always [4, 5].
// Similarly for height.
const int horiz_filter_index = GetFilterIndex(horizontal_filter_index, width);
const int vert_filter_index = GetFilterIndex(vertical_filter_index, height);
const int horizontal_taps = NumTapsInFilter(horiz_filter_index);
const int vertical_taps = NumTapsInFilter(vert_filter_index);
uint16_t* intermediate = intermediate_result;
const int intermediate_height = height + vertical_taps - 1;
const ptrdiff_t src_stride = reference_stride;
const auto* src = static_cast<const uint8_t*>(reference) -
kVerticalOffset * src_stride - kHorizontalOffset;
auto* dest = static_cast<uint16_t*>(prediction);
int filter_id = (subpixel_x >> 6) & kSubPixelMask;
if (width >= 4) {
// TODO(johannkoenig): Use |width| for |intermediate_stride|.
src = static_cast<const uint8_t*>(reference) -
(vertical_taps / 2 - 1) * src_stride - kHorizontalOffset;
HorizontalPass<true>(src, src_stride, intermediate_result,
intermediate_stride, width, intermediate_height,
subpixel_x, horiz_filter_index);
// Vertical filter.
intermediate = intermediate_result;
filter_id = ((subpixel_y & 1023) >> 6) & kSubPixelMask;
const ptrdiff_t dest_stride = pred_stride;
const int16x8_t taps =
vld1q_s16(kSubPixelFilters[vert_filter_index][filter_id]);
if (vertical_taps == 8) {
Filter2DVertical<8, /*is_compound=*/true>(
intermediate, intermediate_stride, dest, dest_stride, width, height,
taps, inter_round_bits_vertical);
} else if (vertical_taps == 6) {
Filter2DVertical<6, /*is_compound=*/true>(
intermediate, intermediate_stride, dest, dest_stride, width, height,
taps, inter_round_bits_vertical);
} else if (vertical_taps == 4) {
Filter2DVertical<4, /*is_compound=*/true>(
intermediate, intermediate_stride, dest, dest_stride, width, height,
taps, inter_round_bits_vertical);
} else { // |vertical_taps| == 2
Filter2DVertical<2, /*is_compound=*/true>(
intermediate, intermediate_stride, dest, dest_stride, width, height,
taps, inter_round_bits_vertical);
}
} else {
src = static_cast<const uint8_t*>(reference) -
((vertical_taps / 2) - 1) * src_stride - ((horizontal_taps / 2) - 1);
HorizontalPass2xH(src, src_stride, intermediate_result, intermediate_stride,
intermediate_height, horiz_filter_index, horizontal_taps,
subpixel_x);
VerticalPass2xH</*is_2d=*/true, /*is_compound=*/true>(
intermediate_result, intermediate_stride, dest, pred_stride, height,
inter_round_bits_vertical, vert_filter_index, vertical_taps,
subpixel_y);
}
}
void Init8bpp() {
Dsp* const dsp = dsp_internal::GetWritableDspTable(8);
assert(dsp != nullptr);
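// Judging from the handlers assigned below, the convolve table is indexed by
// compound prediction in the second dimension and by the presence of vertical
// and horizontal filtering in the third and fourth.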
dsp->convolve[0][0][0][1] = ConvolveHorizontal_NEON;
dsp->convolve[0][0][1][0] = ConvolveVertical_NEON;
dsp->convolve[0][0][1][1] = Convolve2D_NEON;
dsp->convolve[0][1][0][0] = ConvolveCompoundCopy_NEON;
dsp->convolve[0][1][0][1] = ConvolveCompoundHorizontal_NEON;
dsp->convolve[0][1][1][0] = ConvolveCompoundVertical_NEON;
dsp->convolve[0][1][1][1] = ConvolveCompound2D_NEON;
// TODO(petersonab,b/139707209): Fix source buffer overreads.
// dsp->convolve_scale[1] = ConvolveCompoundScale2D_NEON;
static_cast<void>(ConvolveCompoundScale2D_NEON);
}
} // namespace
} // namespace low_bitdepth
void ConvolveInit_NEON() { low_bitdepth::Init8bpp(); }
} // namespace dsp
} // namespace libgav1
#else // !LIBGAV1_ENABLE_NEON
namespace libgav1 {
namespace dsp {
void ConvolveInit_NEON() {}
} // namespace dsp
} // namespace libgav1
#endif // LIBGAV1_ENABLE_NEON