src/dsp/arm/convolve_10bit_neon.cc - platform/external/libgav1 - Git at Google

 // Copyright 2021 The libgav1 Authors
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
 //      http://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.

 #include "src/dsp/convolve.h"
 #include "src/utils/cpu.h"

 #if LIBGAV1_ENABLE_NEON && LIBGAV1_MAX_BITDEPTH >= 10
 #include <arm_neon.h>

 #include <algorithm>
 #include <cassert>
 #include <cstdint>

 #include "src/dsp/arm/common_neon.h"
 #include "src/dsp/constants.h"
 #include "src/dsp/dsp.h"
 #include "src/utils/common.h"
 #include "src/utils/compiler_attributes.h"
 #include "src/utils/constants.h"

 namespace libgav1 {
 namespace dsp {
 namespace {

 // Include the constants and utility functions inside the anonymous namespace.
 #include "src/dsp/convolve.inc"

 template <int filter_index>
 int32x4x2_t SumOnePassTaps(const uint16x8_t* const src,
                            const int16x4_t* const taps) {
   const auto* ssrc = reinterpret_cast<const int16x8_t*>(src);
   int32x4x2_t sum;
   if (filter_index < 2) {
     // 6 taps.
     sum.val[0] = vmull_s16(vget_low_s16(ssrc[0]), taps[0]);
     sum.val[0] = vmlal_s16(sum.val[0], vget_low_s16(ssrc[1]), taps[1]);
     sum.val[0] = vmlal_s16(sum.val[0], vget_low_s16(ssrc[2]), taps[2]);
     sum.val[0] = vmlal_s16(sum.val[0], vget_low_s16(ssrc[3]), taps[3]);
     sum.val[0] = vmlal_s16(sum.val[0], vget_low_s16(ssrc[4]), taps[4]);
     sum.val[0] = vmlal_s16(sum.val[0], vget_low_s16(ssrc[5]), taps[5]);

     sum.val[1] = vmull_s16(vget_high_s16(ssrc[0]), taps[0]);
     sum.val[1] = vmlal_s16(sum.val[1], vget_high_s16(ssrc[1]), taps[1]);
     sum.val[1] = vmlal_s16(sum.val[1], vget_high_s16(ssrc[2]), taps[2]);
     sum.val[1] = vmlal_s16(sum.val[1], vget_high_s16(ssrc[3]), taps[3]);
     sum.val[1] = vmlal_s16(sum.val[1], vget_high_s16(ssrc[4]), taps[4]);
     sum.val[1] = vmlal_s16(sum.val[1], vget_high_s16(ssrc[5]), taps[5]);
   } else if (filter_index == 2) {
     // 8 taps.
     sum.val[0] = vmull_s16(vget_low_s16(ssrc[0]), taps[0]);
     sum.val[0] = vmlal_s16(sum.val[0], vget_low_s16(ssrc[1]), taps[1]);
     sum.val[0] = vmlal_s16(sum.val[0], vget_low_s16(ssrc[2]), taps[2]);
     sum.val[0] = vmlal_s16(sum.val[0], vget_low_s16(ssrc[3]), taps[3]);
     sum.val[0] = vmlal_s16(sum.val[0], vget_low_s16(ssrc[4]), taps[4]);
     sum.val[0] = vmlal_s16(sum.val[0], vget_low_s16(ssrc[5]), taps[5]);
     sum.val[0] = vmlal_s16(sum.val[0], vget_low_s16(ssrc[6]), taps[6]);
     sum.val[0] = vmlal_s16(sum.val[0], vget_low_s16(ssrc[7]), taps[7]);

     sum.val[1] = vmull_s16(vget_high_s16(ssrc[0]), taps[0]);
     sum.val[1] = vmlal_s16(sum.val[1], vget_high_s16(ssrc[1]), taps[1]);
     sum.val[1] = vmlal_s16(sum.val[1], vget_high_s16(ssrc[2]), taps[2]);
     sum.val[1] = vmlal_s16(sum.val[1], vget_high_s16(ssrc[3]), taps[3]);
     sum.val[1] = vmlal_s16(sum.val[1], vget_high_s16(ssrc[4]), taps[4]);
     sum.val[1] = vmlal_s16(sum.val[1], vget_high_s16(ssrc[5]), taps[5]);
     sum.val[1] = vmlal_s16(sum.val[1], vget_high_s16(ssrc[6]), taps[6]);
     sum.val[1] = vmlal_s16(sum.val[1], vget_high_s16(ssrc[7]), taps[7]);
   } else if (filter_index == 3) {
     // 2 taps.
     sum.val[0] = vmull_s16(vget_low_s16(ssrc[0]), taps[0]);
     sum.val[0] = vmlal_s16(sum.val[0], vget_low_s16(ssrc[1]), taps[1]);

     sum.val[1] = vmull_s16(vget_high_s16(ssrc[0]), taps[0]);
     sum.val[1] = vmlal_s16(sum.val[1], vget_high_s16(ssrc[1]), taps[1]);
   } else {
     // 4 taps.
     sum.val[0] = vmull_s16(vget_low_s16(ssrc[0]), taps[0]);
     sum.val[0] = vmlal_s16(sum.val[0], vget_low_s16(ssrc[1]), taps[1]);
     sum.val[0] = vmlal_s16(sum.val[0], vget_low_s16(ssrc[2]), taps[2]);
     sum.val[0] = vmlal_s16(sum.val[0], vget_low_s16(ssrc[3]), taps[3]);

     sum.val[1] = vmull_s16(vget_high_s16(ssrc[0]), taps[0]);
     sum.val[1] = vmlal_s16(sum.val[1], vget_high_s16(ssrc[1]), taps[1]);
     sum.val[1] = vmlal_s16(sum.val[1], vget_high_s16(ssrc[2]), taps[2]);
     sum.val[1] = vmlal_s16(sum.val[1], vget_high_s16(ssrc[3]), taps[3]);
   }
   return sum;
 }

 template <int filter_index>
 int32x4_t SumOnePassTaps(const uint16x4_t* const src,
                          const int16x4_t* const taps) {
   const auto* ssrc = reinterpret_cast<const int16x4_t*>(src);
   int32x4_t sum;
   if (filter_index < 2) {
     // 6 taps.
     sum = vmull_s16(ssrc[0], taps[0]);
     sum = vmlal_s16(sum, ssrc[1], taps[1]);
     sum = vmlal_s16(sum, ssrc[2], taps[2]);
     sum = vmlal_s16(sum, ssrc[3], taps[3]);
     sum = vmlal_s16(sum, ssrc[4], taps[4]);
     sum = vmlal_s16(sum, ssrc[5], taps[5]);
   } else if (filter_index == 2) {
     // 8 taps.
     sum = vmull_s16(ssrc[0], taps[0]);
     sum = vmlal_s16(sum, ssrc[1], taps[1]);
     sum = vmlal_s16(sum, ssrc[2], taps[2]);
     sum = vmlal_s16(sum, ssrc[3], taps[3]);
     sum = vmlal_s16(sum, ssrc[4], taps[4]);
     sum = vmlal_s16(sum, ssrc[5], taps[5]);
     sum = vmlal_s16(sum, ssrc[6], taps[6]);
     sum = vmlal_s16(sum, ssrc[7], taps[7]);
   } else if (filter_index == 3) {
     // 2 taps.
     sum = vmull_s16(ssrc[0], taps[0]);
     sum = vmlal_s16(sum, ssrc[1], taps[1]);
   } else {
     // 4 taps.
     sum = vmull_s16(ssrc[0], taps[0]);
     sum = vmlal_s16(sum, ssrc[1], taps[1]);
     sum = vmlal_s16(sum, ssrc[2], taps[2]);
     sum = vmlal_s16(sum, ssrc[3], taps[3]);
   }
   return sum;
 }

 template <int filter_index, bool is_compound>
 void FilterHorizontalWidth8AndUp(const uint16_t* LIBGAV1_RESTRICT src,
                                  const ptrdiff_t src_stride,
                                  void* LIBGAV1_RESTRICT const dest,
                                  const ptrdiff_t pred_stride, const int width,
                                  const int height,
                                  const int16x4_t* const v_tap) {
   auto* dest16 = static_cast<uint16_t*>(dest);
   const uint16x4_t v_max_bitdepth = vdup_n_u16((1 << kBitdepth10) - 1);
   int y = height;
   do {
     int x = 0;
     do {
       const uint16x8_t src_long = vld1q_u16(src + x);
       const uint16x8_t src_long_hi = vld1q_u16(src + x + 8);
       uint16x8_t v_src[8];
       int32x4x2_t v_sum;
       if (filter_index < 2) {
         v_src[0] = src_long;
         v_src[1] = vextq_u16(src_long, src_long_hi, 1);
         v_src[2] = vextq_u16(src_long, src_long_hi, 2);
         v_src[3] = vextq_u16(src_long, src_long_hi, 3);
         v_src[4] = vextq_u16(src_long, src_long_hi, 4);
         v_src[5] = vextq_u16(src_long, src_long_hi, 5);
         v_sum = SumOnePassTaps<filter_index>(v_src, v_tap + 1);
       } else if (filter_index == 2) {
         v_src[0] = src_long;
         v_src[1] = vextq_u16(src_long, src_long_hi, 1);
         v_src[2] = vextq_u16(src_long, src_long_hi, 2);
         v_src[3] = vextq_u16(src_long, src_long_hi, 3);
         v_src[4] = vextq_u16(src_long, src_long_hi, 4);
         v_src[5] = vextq_u16(src_long, src_long_hi, 5);
         v_src[6] = vextq_u16(src_long, src_long_hi, 6);
         v_src[7] = vextq_u16(src_long, src_long_hi, 7);
         v_sum = SumOnePassTaps<filter_index>(v_src, v_tap);
       } else if (filter_index == 3) {
         v_src[0] = src_long;
         v_src[1] = vextq_u16(src_long, src_long_hi, 1);
         v_sum = SumOnePassTaps<filter_index>(v_src, v_tap + 3);
       } else if (filter_index > 3) {
         v_src[0] = src_long;
         v_src[1] = vextq_u16(src_long, src_long_hi, 1);
         v_src[2] = vextq_u16(src_long, src_long_hi, 2);
         v_src[3] = vextq_u16(src_long, src_long_hi, 3);
         v_sum = SumOnePassTaps<filter_index>(v_src, v_tap + 2);
       }
       if (is_compound) {
         const int16x4_t v_compound_offset = vdup_n_s16(kCompoundOffset);
         const int16x4_t d0 =
             vqrshrn_n_s32(v_sum.val[0], kInterRoundBitsHorizontal - 1);
         const int16x4_t d1 =
             vqrshrn_n_s32(v_sum.val[1], kInterRoundBitsHorizontal - 1);
         vst1_u16(&dest16[x],
                  vreinterpret_u16_s16(vadd_s16(d0, v_compound_offset)));
         vst1_u16(&dest16[x + 4],
                  vreinterpret_u16_s16(vadd_s16(d1, v_compound_offset)));
       } else {
         // Normally the Horizontal pass does the downshift in two passes:
         // kInterRoundBitsHorizontal - 1 and then (kFilterBits -
         // kInterRoundBitsHorizontal). Each one uses a rounding shift.
         // Combining them requires adding the rounding offset from the skipped
         // shift.
         const int32x4_t v_first_shift_rounding_bit =
             vdupq_n_s32(1 << (kInterRoundBitsHorizontal - 2));
         v_sum.val[0] = vaddq_s32(v_sum.val[0], v_first_shift_rounding_bit);
         v_sum.val[1] = vaddq_s32(v_sum.val[1], v_first_shift_rounding_bit);
         const uint16x4_t d0 = vmin_u16(
             vqrshrun_n_s32(v_sum.val[0], kFilterBits - 1), v_max_bitdepth);
         const uint16x4_t d1 = vmin_u16(
             vqrshrun_n_s32(v_sum.val[1], kFilterBits - 1), v_max_bitdepth);
         vst1_u16(&dest16[x], d0);
         vst1_u16(&dest16[x + 4], d1);
       }
       x += 8;
     } while (x < width);
     src += src_stride >> 1;
     dest16 += is_compound ? pred_stride : pred_stride >> 1;
   } while (--y != 0);
 }

 template <int filter_index, bool is_compound>
 void FilterHorizontalWidth4(const uint16_t* LIBGAV1_RESTRICT src,
                             const ptrdiff_t src_stride,
                             void* LIBGAV1_RESTRICT const dest,
                             const ptrdiff_t pred_stride, const int height,
                             const int16x4_t* const v_tap) {
   auto* dest16 = static_cast<uint16_t*>(dest);
   const uint16x4_t v_max_bitdepth = vdup_n_u16((1 << kBitdepth10) - 1);
   int y = height;
   do {
     const uint16x8_t v_zero = vdupq_n_u16(0);
     uint16x4_t v_src[4];
     int32x4_t v_sum;
     const uint16x8_t src_long = vld1q_u16(src);
     v_src[0] = vget_low_u16(src_long);
     if (filter_index == 3) {
       v_src[1] = vget_low_u16(vextq_u16(src_long, v_zero, 1));
       v_sum = SumOnePassTaps<filter_index>(v_src, v_tap + 3);
     } else {
       v_src[1] = vget_low_u16(vextq_u16(src_long, v_zero, 1));
       v_src[2] = vget_low_u16(vextq_u16(src_long, v_zero, 2));
       v_src[3] = vget_low_u16(vextq_u16(src_long, v_zero, 3));
       v_sum = SumOnePassTaps<filter_index>(v_src, v_tap + 2);
     }
     if (is_compound) {
       const int16x4_t d0 = vqrshrn_n_s32(v_sum, kInterRoundBitsHorizontal - 1);
       vst1_u16(&dest16[0],
                vreinterpret_u16_s16(vadd_s16(d0, vdup_n_s16(kCompoundOffset))));
     } else {
       const int32x4_t v_first_shift_rounding_bit =
           vdupq_n_s32(1 << (kInterRoundBitsHorizontal - 2));
       v_sum = vaddq_s32(v_sum, v_first_shift_rounding_bit);
       const uint16x4_t d0 =
           vmin_u16(vqrshrun_n_s32(v_sum, kFilterBits - 1), v_max_bitdepth);
       vst1_u16(&dest16[0], d0);
     }
     src += src_stride >> 1;
     dest16 += is_compound ? pred_stride : pred_stride >> 1;
   } while (--y != 0);
 }

 template <int filter_index>
 void FilterHorizontalWidth2(const uint16_t* LIBGAV1_RESTRICT src,
                             const ptrdiff_t src_stride,
                             void* LIBGAV1_RESTRICT const dest,
                             const ptrdiff_t pred_stride, const int height,
                             const int16x4_t* const v_tap) {
   auto* dest16 = static_cast<uint16_t*>(dest);
   const uint16x4_t v_max_bitdepth = vdup_n_u16((1 << kBitdepth10) - 1);
   int y = height >> 1;
   do {
     const int16x8_t v_zero = vdupq_n_s16(0);
     const int16x8_t input0 = vreinterpretq_s16_u16(vld1q_u16(src));
     const int16x8_t input1 =
         vreinterpretq_s16_u16(vld1q_u16(src + (src_stride >> 1)));
     const int16x8x2_t input = vzipq_s16(input0, input1);
     int32x4_t v_sum;
     if (filter_index == 3) {
       v_sum = vmull_s16(vget_low_s16(input.val[0]), v_tap[3]);
       v_sum = vmlal_s16(v_sum,
                         vget_low_s16(vextq_s16(input.val[0], input.val[1], 2)),
                         v_tap[4]);
     } else {
       v_sum = vmull_s16(vget_low_s16(input.val[0]), v_tap[2]);
       v_sum = vmlal_s16(v_sum, vget_low_s16(vextq_s16(input.val[0], v_zero, 2)),
                         v_tap[3]);
       v_sum = vmlal_s16(v_sum, vget_low_s16(vextq_s16(input.val[0], v_zero, 4)),
                         v_tap[4]);
       v_sum = vmlal_s16(v_sum,
                         vget_low_s16(vextq_s16(input.val[0], input.val[1], 6)),
                         v_tap[5]);
     }
     // Normally the Horizontal pass does the downshift in two passes:
     // kInterRoundBitsHorizontal - 1 and then (kFilterBits -
     // kInterRoundBitsHorizontal). Each one uses a rounding shift.
     // Combining them requires adding the rounding offset from the skipped
     // shift.
     const int32x4_t v_first_shift_rounding_bit =
         vdupq_n_s32(1 << (kInterRoundBitsHorizontal - 2));
     v_sum = vaddq_s32(v_sum, v_first_shift_rounding_bit);
     const uint16x4_t d0 =
         vmin_u16(vqrshrun_n_s32(v_sum, kFilterBits - 1), v_max_bitdepth);
     dest16[0] = vget_lane_u16(d0, 0);
     dest16[1] = vget_lane_u16(d0, 2);
     dest16 += pred_stride >> 1;
     dest16[0] = vget_lane_u16(d0, 1);
     dest16[1] = vget_lane_u16(d0, 3);
     dest16 += pred_stride >> 1;
     src += src_stride;
   } while (--y != 0);
 }

 template <int filter_index, bool is_compound>
 void FilterHorizontal(const uint16_t* LIBGAV1_RESTRICT const src,
                       const ptrdiff_t src_stride,
                       void* LIBGAV1_RESTRICT const dest,
                       const ptrdiff_t pred_stride, const int width,
                       const int height, const int16x4_t* const v_tap) {
   assert(width < 8 || filter_index <= 3);
   // Don't simplify the redundant if conditions with the template parameters,
   // which helps the compiler generate compact code.
   if (width >= 8 && filter_index <= 3) {
     FilterHorizontalWidth8AndUp<filter_index, is_compound>(
         src, src_stride, dest, pred_stride, width, height, v_tap);
     return;
   }

   // Horizontal passes only needs to account for number of taps 2 and 4 when
   // |width| <= 4.
   assert(width <= 4);
   assert(filter_index >= 3 && filter_index <= 5);
   if (filter_index >= 3 && filter_index <= 5) {
     if (width == 4) {
       FilterHorizontalWidth4<filter_index, is_compound>(
           src, src_stride, dest, pred_stride, height, v_tap);
       return;
     }
     assert(width == 2);
     if (!is_compound) {
       FilterHorizontalWidth2<filter_index>(src, src_stride, dest, pred_stride,
                                            height, v_tap);
     }
   }
 }

 template <bool is_compound = false>
 LIBGAV1_ALWAYS_INLINE void DoHorizontalPass(
     const uint16_t* LIBGAV1_RESTRICT const src, const ptrdiff_t src_stride,
     void* LIBGAV1_RESTRICT const dst, const ptrdiff_t dst_stride,
     const int width, const int height, const int filter_id,
     const int filter_index) {
   // Duplicate the absolute value for each tap.  Negative taps are corrected
   // by using the vmlsl_u8 instruction.  Positive taps use vmlal_u8.
   int16x4_t v_tap[kSubPixelTaps];
   assert(filter_id != 0);

   for (int k = 0; k < kSubPixelTaps; ++k) {
     v_tap[k] = vdup_n_s16(kHalfSubPixelFilters[filter_index][filter_id][k]);
   }

   if (filter_index == 2) {  // 8 tap.
     FilterHorizontal<2, is_compound>(src, src_stride, dst, dst_stride, width,
                                      height, v_tap);
   } else if (filter_index == 1) {  // 6 tap.
     FilterHorizontal<1, is_compound>(src + 1, src_stride, dst, dst_stride,
                                      width, height, v_tap);
   } else if (filter_index == 0) {  // 6 tap.
     FilterHorizontal<0, is_compound>(src + 1, src_stride, dst, dst_stride,
                                      width, height, v_tap);
   } else if (filter_index == 4) {  // 4 tap.
     FilterHorizontal<4, is_compound>(src + 2, src_stride, dst, dst_stride,
                                      width, height, v_tap);
   } else if (filter_index == 5) {  // 4 tap.
     FilterHorizontal<5, is_compound>(src + 2, src_stride, dst, dst_stride,
                                      width, height, v_tap);
   } else {  // 2 tap.
     FilterHorizontal<3, is_compound>(src + 3, src_stride, dst, dst_stride,
                                      width, height, v_tap);
   }
 }

 void ConvolveHorizontal_NEON(
     const void* LIBGAV1_RESTRICT const reference,
     const ptrdiff_t reference_stride, const int horizontal_filter_index,
     const int /*vertical_filter_index*/, const int horizontal_filter_id,
     const int /*vertical_filter_id*/, const int width, const int height,
     void* LIBGAV1_RESTRICT const prediction, const ptrdiff_t pred_stride) {
   const int filter_index = GetFilterIndex(horizontal_filter_index, width);
   // Set |src| to the outermost tap.
   const auto* const src =
       static_cast<const uint16_t*>(reference) - kHorizontalOffset;
   auto* const dest = static_cast<uint16_t*>(prediction);

   DoHorizontalPass(src, reference_stride, dest, pred_stride, width, height,
                    horizontal_filter_id, filter_index);
 }

 void ConvolveCompoundHorizontal_NEON(
     const void* LIBGAV1_RESTRICT const reference,
     const ptrdiff_t reference_stride, const int horizontal_filter_index,
     const int /*vertical_filter_index*/, const int horizontal_filter_id,
     const int /*vertical_filter_id*/, const int width, const int height,
     void* LIBGAV1_RESTRICT const prediction, const ptrdiff_t /*pred_stride*/) {
   const int filter_index = GetFilterIndex(horizontal_filter_index, width);
   const auto* const src =
       static_cast<const uint16_t*>(reference) - kHorizontalOffset;
   auto* const dest = static_cast<uint16_t*>(prediction);

   DoHorizontalPass</*is_compound=*/true>(src, reference_stride, dest, width,
                                          width, height, horizontal_filter_id,
                                          filter_index);
 }

 template <int filter_index, bool is_compound = false>
 void FilterVertical(const uint16_t* LIBGAV1_RESTRICT const src,
                     const ptrdiff_t src_stride,
                     void* LIBGAV1_RESTRICT const dst,
                     const ptrdiff_t dst_stride, const int width,
                     const int height, const int16x4_t* const taps) {
   const int num_taps = GetNumTapsInFilter(filter_index);
   const int next_row = num_taps - 1;
   const uint16x4_t v_max_bitdepth = vdup_n_u16((1 << kBitdepth10) - 1);
   auto* const dst16 = static_cast<uint16_t*>(dst);
   assert(width >= 8);

   int x = 0;
   do {
     const uint16_t* src_x = src + x;
     uint16x8_t srcs[8];
     srcs[0] = vld1q_u16(src_x);
     src_x += src_stride;
     if (num_taps >= 4) {
       srcs[1] = vld1q_u16(src_x);
       src_x += src_stride;
       srcs[2] = vld1q_u16(src_x);
       src_x += src_stride;
       if (num_taps >= 6) {
         srcs[3] = vld1q_u16(src_x);
         src_x += src_stride;
         srcs[4] = vld1q_u16(src_x);
         src_x += src_stride;
         if (num_taps == 8) {
           srcs[5] = vld1q_u16(src_x);
           src_x += src_stride;
           srcs[6] = vld1q_u16(src_x);
           src_x += src_stride;
         }
       }
     }

     // Decreasing the y loop counter produces worse code with clang.
     // Don't unroll this loop since it generates too much code and the decoder
     // is even slower.
     int y = 0;
     do {
       srcs[next_row] = vld1q_u16(src_x);
       src_x += src_stride;

       const int32x4x2_t v_sum = SumOnePassTaps<filter_index>(srcs, taps);
       if (is_compound) {
         const int16x4_t v_compound_offset = vdup_n_s16(kCompoundOffset);
         const int16x4_t d0 =
             vqrshrn_n_s32(v_sum.val[0], kInterRoundBitsHorizontal - 1);
         const int16x4_t d1 =
             vqrshrn_n_s32(v_sum.val[1], kInterRoundBitsHorizontal - 1);
         vst1_u16(dst16 + x + y * dst_stride,
                  vreinterpret_u16_s16(vadd_s16(d0, v_compound_offset)));
         vst1_u16(dst16 + x + 4 + y * dst_stride,
                  vreinterpret_u16_s16(vadd_s16(d1, v_compound_offset)));
       } else {
         const uint16x4_t d0 = vmin_u16(
             vqrshrun_n_s32(v_sum.val[0], kFilterBits - 1), v_max_bitdepth);
         const uint16x4_t d1 = vmin_u16(
             vqrshrun_n_s32(v_sum.val[1], kFilterBits - 1), v_max_bitdepth);
         vst1_u16(dst16 + x + y * dst_stride, d0);
         vst1_u16(dst16 + x + 4 + y * dst_stride, d1);
       }

       srcs[0] = srcs[1];
       if (num_taps >= 4) {
         srcs[1] = srcs[2];
         srcs[2] = srcs[3];
         if (num_taps >= 6) {
           srcs[3] = srcs[4];
           srcs[4] = srcs[5];
           if (num_taps == 8) {
             srcs[5] = srcs[6];
             srcs[6] = srcs[7];
           }
         }
       }
     } while (++y < height);
     x += 8;
   } while (x < width);
 }

 template <int filter_index, bool is_compound = false>
 void FilterVertical4xH(const uint16_t* LIBGAV1_RESTRICT src,
                        const ptrdiff_t src_stride,
                        void* LIBGAV1_RESTRICT const dst,
                        const ptrdiff_t dst_stride, const int height,
                        const int16x4_t* const taps) {
   const int num_taps = GetNumTapsInFilter(filter_index);
   const int next_row = num_taps - 1;
   const uint16x4_t v_max_bitdepth = vdup_n_u16((1 << kBitdepth10) - 1);
   auto* dst16 = static_cast<uint16_t*>(dst);

   uint16x4_t srcs[9];
   srcs[0] = vld1_u16(src);
   src += src_stride;
   if (num_taps >= 4) {
     srcs[1] = vld1_u16(src);
     src += src_stride;
     srcs[2] = vld1_u16(src);
     src += src_stride;
     if (num_taps >= 6) {
       srcs[3] = vld1_u16(src);
       src += src_stride;
       srcs[4] = vld1_u16(src);
       src += src_stride;
       if (num_taps == 8) {
         srcs[5] = vld1_u16(src);
         src += src_stride;
         srcs[6] = vld1_u16(src);
         src += src_stride;
       }
     }
   }

   int y = height;
   do {
     srcs[next_row] = vld1_u16(src);
     src += src_stride;
     srcs[num_taps] = vld1_u16(src);
     src += src_stride;

     const int32x4_t v_sum = SumOnePassTaps<filter_index>(srcs, taps);
     const int32x4_t v_sum_1 = SumOnePassTaps<filter_index>(srcs + 1, taps);
     if (is_compound) {
       const int16x4_t d0 = vqrshrn_n_s32(v_sum, kInterRoundBitsHorizontal - 1);
       const int16x4_t d1 =
           vqrshrn_n_s32(v_sum_1, kInterRoundBitsHorizontal - 1);
       vst1_u16(dst16,
                vreinterpret_u16_s16(vadd_s16(d0, vdup_n_s16(kCompoundOffset))));
       dst16 += dst_stride;
       vst1_u16(dst16,
                vreinterpret_u16_s16(vadd_s16(d1, vdup_n_s16(kCompoundOffset))));
       dst16 += dst_stride;
     } else {
       const uint16x4_t d0 =
           vmin_u16(vqrshrun_n_s32(v_sum, kFilterBits - 1), v_max_bitdepth);
       const uint16x4_t d1 =
           vmin_u16(vqrshrun_n_s32(v_sum_1, kFilterBits - 1), v_max_bitdepth);
       vst1_u16(dst16, d0);
       dst16 += dst_stride;
       vst1_u16(dst16, d1);
       dst16 += dst_stride;
     }

     srcs[0] = srcs[2];
     if (num_taps >= 4) {
       srcs[1] = srcs[3];
       srcs[2] = srcs[4];
       if (num_taps >= 6) {
         srcs[3] = srcs[5];
         srcs[4] = srcs[6];
         if (num_taps == 8) {
           srcs[5] = srcs[7];
           srcs[6] = srcs[8];
         }
       }
     }
     y -= 2;
   } while (y != 0);
 }

 template <int filter_index>
 void FilterVertical2xH(const uint16_t* LIBGAV1_RESTRICT src,
                        const ptrdiff_t src_stride,
                        void* LIBGAV1_RESTRICT const dst,
                        const ptrdiff_t dst_stride, const int height,
                        const int16x4_t* const taps) {
   const int num_taps = GetNumTapsInFilter(filter_index);
   const int next_row = num_taps - 1;
   const uint16x4_t v_max_bitdepth = vdup_n_u16((1 << kBitdepth10) - 1);
   auto* dst16 = static_cast<uint16_t*>(dst);
   const uint16x4_t v_zero = vdup_n_u16(0);

   uint16x4_t srcs[9];
   srcs[0] = Load2<0>(src, v_zero);
   src += src_stride;
   if (num_taps >= 4) {
     srcs[0] = Load2<1>(src, srcs[0]);
     src += src_stride;
     srcs[2] = Load2<0>(src, v_zero);
     src += src_stride;
     srcs[1] = vext_u16(srcs[0], srcs[2], 2);
     if (num_taps >= 6) {
       srcs[2] = Load2<1>(src, srcs[2]);
       src += src_stride;
       srcs[4] = Load2<0>(src, v_zero);
       src += src_stride;
       srcs[3] = vext_u16(srcs[2], srcs[4], 2);
       if (num_taps == 8) {
         srcs[4] = Load2<1>(src, srcs[4]);
         src += src_stride;
         srcs[6] = Load2<0>(src, v_zero);
         src += src_stride;
         srcs[5] = vext_u16(srcs[4], srcs[6], 2);
       }
     }
   }

   int y = height;
   do {
     srcs[next_row - 1] = Load2<1>(src, srcs[next_row - 1]);
     src += src_stride;
     srcs[num_taps] = Load2<0>(src, v_zero);
     src += src_stride;
     srcs[next_row] = vext_u16(srcs[next_row - 1], srcs[num_taps], 2);

     const int32x4_t v_sum = SumOnePassTaps<filter_index>(srcs, taps);
     const uint16x4_t d0 =
         vmin_u16(vqrshrun_n_s32(v_sum, kFilterBits - 1), v_max_bitdepth);
     Store2<0>(dst16, d0);
     dst16 += dst_stride;
     Store2<1>(dst16, d0);
     dst16 += dst_stride;

     srcs[0] = srcs[2];
     if (num_taps >= 4) {
       srcs[1] = srcs[3];
       srcs[2] = srcs[4];
       if (num_taps >= 6) {
         srcs[3] = srcs[5];
         srcs[4] = srcs[6];
         if (num_taps == 8) {
           srcs[5] = srcs[7];
           srcs[6] = srcs[8];
         }
       }
     }
     y -= 2;
   } while (y != 0);
 }

 void ConvolveVertical_NEON(
     const void* LIBGAV1_RESTRICT const reference,
     const ptrdiff_t reference_stride, const int /*horizontal_filter_index*/,
     const int vertical_filter_index, const int /*horizontal_filter_id*/,
     const int vertical_filter_id, const int width, const int height,
     void* LIBGAV1_RESTRICT const prediction, const ptrdiff_t pred_stride) {
   const int filter_index = GetFilterIndex(vertical_filter_index, height);
   const int vertical_taps = GetNumTapsInFilter(filter_index);
   const ptrdiff_t src_stride = reference_stride >> 1;
   const auto* src = static_cast<const uint16_t*>(reference) -
                     (vertical_taps / 2 - 1) * src_stride;
   auto* const dest = static_cast<uint16_t*>(prediction);
   const ptrdiff_t dest_stride = pred_stride >> 1;
   assert(vertical_filter_id != 0);

   int16x4_t taps[8];
   for (int k = 0; k < kSubPixelTaps; ++k) {
     taps[k] =
         vdup_n_s16(kHalfSubPixelFilters[filter_index][vertical_filter_id][k]);
   }

   if (filter_index == 0) {  // 6 tap.
     if (width == 2) {
       FilterVertical2xH<0>(src, src_stride, dest, dest_stride, height,
                            taps + 1);
     } else if (width == 4) {
       FilterVertical4xH<0>(src, src_stride, dest, dest_stride, height,
                            taps + 1);
     } else {
       FilterVertical<0>(src, src_stride, dest, dest_stride, width, height,
                         taps + 1);
     }
   } else if ((filter_index == 1) &
              ((vertical_filter_id == 1) | (vertical_filter_id == 7) |
               (vertical_filter_id == 8) | (vertical_filter_id == 9) |
               (vertical_filter_id == 15))) {  // 6 tap.
     if (width == 2) {
       FilterVertical2xH<1>(src, src_stride, dest, dest_stride, height,
                            taps + 1);
     } else if (width == 4) {
       FilterVertical4xH<1>(src, src_stride, dest, dest_stride, height,
                            taps + 1);
     } else {
       FilterVertical<1>(src, src_stride, dest, dest_stride, width, height,
                         taps + 1);
     }
   } else if (filter_index == 2) {  // 8 tap.
     if (width == 2) {
       FilterVertical2xH<2>(src, src_stride, dest, dest_stride, height, taps);
     } else if (width == 4) {
       FilterVertical4xH<2>(src, src_stride, dest, dest_stride, height, taps);
     } else {
       FilterVertical<2>(src, src_stride, dest, dest_stride, width, height,
                         taps);
     }
   } else if (filter_index == 3) {  // 2 tap.
     if (width == 2) {
       FilterVertical2xH<3>(src, src_stride, dest, dest_stride, height,
                            taps + 3);
     } else if (width == 4) {
       FilterVertical4xH<3>(src, src_stride, dest, dest_stride, height,
                            taps + 3);
     } else {
       FilterVertical<3>(src, src_stride, dest, dest_stride, width, height,
                         taps + 3);
     }
   } else {
     // 4 tap. When |filter_index| == 1 the |vertical_filter_id| values listed
     // below map to 4 tap filters.
     assert(filter_index == 5 || filter_index == 4 ||
            (filter_index == 1 &&
             (vertical_filter_id == 0 || vertical_filter_id == 2 ||
              vertical_filter_id == 3 || vertical_filter_id == 4 ||
              vertical_filter_id == 5 || vertical_filter_id == 6 ||
              vertical_filter_id == 10 || vertical_filter_id == 11 ||
              vertical_filter_id == 12 || vertical_filter_id == 13 ||
              vertical_filter_id == 14)));
     // According to GetNumTapsInFilter() this has 6 taps but here we are
     // treating it as though it has 4.
     if (filter_index == 1) src += src_stride;
     if (width == 2) {
       FilterVertical2xH<5>(src, src_stride, dest, dest_stride, height,
                            taps + 2);
     } else if (width == 4) {
       FilterVertical4xH<5>(src, src_stride, dest, dest_stride, height,
                            taps + 2);
     } else {
       FilterVertical<5>(src, src_stride, dest, dest_stride, width, height,
                         taps + 2);
     }
   }
 }

 void ConvolveCompoundVertical_NEON(
     const void* LIBGAV1_RESTRICT const reference,
     const ptrdiff_t reference_stride, const int /*horizontal_filter_index*/,
     const int vertical_filter_index, const int /*horizontal_filter_id*/,
     const int vertical_filter_id, const int width, const int height,
     void* LIBGAV1_RESTRICT const prediction, const ptrdiff_t /*pred_stride*/) {
   const int filter_index = GetFilterIndex(vertical_filter_index, height);
   const int vertical_taps = GetNumTapsInFilter(filter_index);
   const ptrdiff_t src_stride = reference_stride >> 1;
   const auto* src = static_cast<const uint16_t*>(reference) -
                     (vertical_taps / 2 - 1) * src_stride;
   auto* const dest = static_cast<uint16_t*>(prediction);
   assert(vertical_filter_id != 0);

   int16x4_t taps[8];
   for (int k = 0; k < kSubPixelTaps; ++k) {
     taps[k] =
         vdup_n_s16(kHalfSubPixelFilters[filter_index][vertical_filter_id][k]);
   }

   if (filter_index == 0) {  // 6 tap.
     if (width == 4) {
       FilterVertical4xH<0, /*is_compound=*/true>(src, src_stride, dest, 4,
                                                  height, taps + 1);
     } else {
       FilterVertical<0, /*is_compound=*/true>(src, src_stride, dest, width,
                                               width, height, taps + 1);
     }
   } else if ((filter_index == 1) &
              ((vertical_filter_id == 1) | (vertical_filter_id == 7) |
               (vertical_filter_id == 8) | (vertical_filter_id == 9) |
               (vertical_filter_id == 15))) {  // 6 tap.
     if (width == 4) {
       FilterVertical4xH<1, /*is_compound=*/true>(src, src_stride, dest, 4,
                                                  height, taps + 1);
     } else {
       FilterVertical<1, /*is_compound=*/true>(src, src_stride, dest, width,
                                               width, height, taps + 1);
     }
   } else if (filter_index == 2) {  // 8 tap.
     if (width == 4) {
       FilterVertical4xH<2, /*is_compound=*/true>(src, src_stride, dest, 4,
                                                  height, taps);
     } else {
       FilterVertical<2, /*is_compound=*/true>(src, src_stride, dest, width,
                                               width, height, taps);
     }
   } else if (filter_index == 3) {  // 2 tap.
     if (width == 4) {
       FilterVertical4xH<3, /*is_compound=*/true>(src, src_stride, dest, 4,
                                                  height, taps + 3);
     } else {
       FilterVertical<3, /*is_compound=*/true>(src, src_stride, dest, width,
                                               width, height, taps + 3);
     }
   } else {
     // 4 tap. When |filter_index| == 1 the |filter_id| values listed below map
     // to 4 tap filters.
     assert(filter_index == 5 || filter_index == 4 ||
            (filter_index == 1 &&
             (vertical_filter_id == 2 || vertical_filter_id == 3 ||
              vertical_filter_id == 4 || vertical_filter_id == 5 ||
              vertical_filter_id == 6 || vertical_filter_id == 10 ||
              vertical_filter_id == 11 || vertical_filter_id == 12 ||
              vertical_filter_id == 13 || vertical_filter_id == 14)));
     // According to GetNumTapsInFilter() this has 6 taps but here we are
     // treating it as though it has 4.
     if (filter_index == 1) src += src_stride;
     if (width == 4) {
       FilterVertical4xH<5, /*is_compound=*/true>(src, src_stride, dest, 4,
                                                  height, taps + 2);
     } else {
       FilterVertical<5, /*is_compound=*/true>(src, src_stride, dest, width,
                                               width, height, taps + 2);
     }
   }
 }

 void ConvolveCompoundCopy_NEON(
     const void* const reference, const ptrdiff_t reference_stride,
     const int /*horizontal_filter_index*/, const int /*vertical_filter_index*/,
     const int /*horizontal_filter_id*/, const int /*vertical_filter_id*/,
     const int width, const int height, void* const prediction,
     const ptrdiff_t /*pred_stride*/) {
   const auto* src = static_cast<const uint16_t*>(reference);
   const ptrdiff_t src_stride = reference_stride >> 1;
   auto* dest = static_cast<uint16_t*>(prediction);
   constexpr int final_shift =
       kInterRoundBitsVertical - kInterRoundBitsCompoundVertical;
   const uint16x8_t offset =
       vdupq_n_u16((1 << kBitdepth10) + (1 << (kBitdepth10 - 1)));

   if (width >= 16) {
     int y = height;
     do {
       int x = 0;
       int w = width;
       do {
         const uint16x8_t v_src_lo = vld1q_u16(&src[x]);
         const uint16x8_t v_src_hi = vld1q_u16(&src[x + 8]);
         const uint16x8_t v_sum_lo = vaddq_u16(v_src_lo, offset);
         const uint16x8_t v_sum_hi = vaddq_u16(v_src_hi, offset);
         const uint16x8_t v_dest_lo = vshlq_n_u16(v_sum_lo, final_shift);
         const uint16x8_t v_dest_hi = vshlq_n_u16(v_sum_hi, final_shift);
         vst1q_u16(&dest[x], v_dest_lo);
         vst1q_u16(&dest[x + 8], v_dest_hi);
         x += 16;
         w -= 16;
       } while (w != 0);
       src += src_stride;
       dest += width;
     } while (--y != 0);
   } else if (width == 8) {
     int y = height;
     do {
       const uint16x8_t v_src_lo = vld1q_u16(&src[0]);
       const uint16x8_t v_src_hi = vld1q_u16(&src[src_stride]);
       const uint16x8_t v_sum_lo = vaddq_u16(v_src_lo, offset);
       const uint16x8_t v_sum_hi = vaddq_u16(v_src_hi, offset);
       const uint16x8_t v_dest_lo = vshlq_n_u16(v_sum_lo, final_shift);
       const uint16x8_t v_dest_hi = vshlq_n_u16(v_sum_hi, final_shift);
       vst1q_u16(&dest[0], v_dest_lo);
       vst1q_u16(&dest[8], v_dest_hi);
       src += src_stride << 1;
       dest += 16;
       y -= 2;
     } while (y != 0);
   } else {  // width == 4
     int y = height;
     do {
       const uint16x4_t v_src_lo = vld1_u16(&src[0]);
       const uint16x4_t v_src_hi = vld1_u16(&src[src_stride]);
       const uint16x4_t v_sum_lo = vadd_u16(v_src_lo, vget_low_u16(offset));
       const uint16x4_t v_sum_hi = vadd_u16(v_src_hi, vget_low_u16(offset));
       const uint16x4_t v_dest_lo = vshl_n_u16(v_sum_lo, final_shift);
       const uint16x4_t v_dest_hi = vshl_n_u16(v_sum_hi, final_shift);
       vst1_u16(&dest[0], v_dest_lo);
       vst1_u16(&dest[4], v_dest_hi);
       src += src_stride << 1;
       dest += 8;
       y -= 2;
     } while (y != 0);
   }
 }

 void Init10bpp() {
   Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10);
   assert(dsp != nullptr);
   dsp->convolve[0][0][0][1] = ConvolveHorizontal_NEON;
   dsp->convolve[0][0][1][0] = ConvolveVertical_NEON;

   dsp->convolve[0][1][0][0] = ConvolveCompoundCopy_NEON;
   dsp->convolve[0][1][0][1] = ConvolveCompoundHorizontal_NEON;
   dsp->convolve[0][1][1][0] = ConvolveCompoundVertical_NEON;
 }

 }  // namespace

 void ConvolveInit10bpp_NEON() { Init10bpp(); }

 }  // namespace dsp
 }  // namespace libgav1

 #else   // !(LIBGAV1_ENABLE_NEON && LIBGAV1_MAX_BITDEPTH >= 10)

 namespace libgav1 {
 namespace dsp {

 void ConvolveInit10bpp_NEON() {}

 }  // namespace dsp
 }  // namespace libgav1
 #endif  // LIBGAV1_ENABLE_NEON && LIBGAV1_MAX_BITDEPTH >= 10