// Copyright 2019 The libgav1 Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "src/dsp/loop_filter.h"
#include "src/utils/cpu.h"
#if LIBGAV1_ENABLE_NEON
#include <arm_neon.h>
#include <cassert>
#include <cstddef>
#include <cstdint>
#include "src/dsp/arm/common_neon.h"
#include "src/dsp/constants.h"
#include "src/dsp/dsp.h"
namespace libgav1 {
namespace dsp {
namespace low_bitdepth {
namespace {
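// Layout convention used throughout this file: each 8-byte vector packs the
// four |p| samples of the edge being filtered in its low 32 bits and the four
// matching |q| samples in its high 32 bits (e.g. |p0q0| holds the four p0
// values followed by the four q0 values), so one instruction handles both
// sides of the edge. Transpose32() swaps the two halves, RightShift<32>()
// moves the high half down, and InterleaveLow32() broadcasts the low half
// (see common_neon.h).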
// (abs(p1 - p0) > thresh) || (abs(q1 - q0) > thresh)
inline uint8x8_t Hev(const uint8x8_t abd_p0p1_q0q1, const uint8_t thresh) {
const uint8x8_t a = vcgt_u8(abd_p0p1_q0q1, vdup_n_u8(thresh));
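// |a| has the p-side comparisons in its low 32 bits and the q-side
// comparisons in its high 32 bits; or them together so the low half holds the
// combined per-pixel decision.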
return vorr_u8(a, RightShift<32>(a));
}
// abs(p0 - q0) * 2 + abs(p1 - q1) / 2 <= outer_thresh
inline uint8x8_t OuterThreshold(const uint8x8_t p0q0, const uint8x8_t p1q1,
const uint8_t outer_thresh) {
const uint8x8x2_t a = Interleave32(p0q0, p1q1);
const uint8x8_t b = vabd_u8(a.val[0], a.val[1]);
const uint8x8_t p0q0_double = vqadd_u8(b, b);
const uint8x8_t p1q1_half = RightShift<32>(vshr_n_u8(b, 1));
const uint8x8_t c = vqadd_u8(p0q0_double, p1q1_half);
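// The saturating adds keep large differences pinned at 255 rather than
// wrapping, so they still exceed |outer_thresh|.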
return vcle_u8(c, vdup_n_u8(outer_thresh));
}
// abs(p1 - p0) <= inner_thresh && abs(q1 - q0) <= inner_thresh &&
// OuterThreshold()
inline uint8x8_t NeedsFilter4(const uint8x8_t abd_p0p1_q0q1,
const uint8x8_t p0q0, const uint8x8_t p1q1,
const uint8_t inner_thresh,
const uint8_t outer_thresh) {
const uint8x8_t a = vcle_u8(abd_p0p1_q0q1, vdup_n_u8(inner_thresh));
const uint8x8_t inner_mask = vand_u8(a, RightShift<32>(a));
const uint8x8_t outer_mask = OuterThreshold(p0q0, p1q1, outer_thresh);
return vand_u8(inner_mask, outer_mask);
}
inline void Filter4Masks(const uint8x8_t p0q0, const uint8x8_t p1q1,
const uint8_t hev_thresh, const uint8_t outer_thresh,
const uint8_t inner_thresh, uint8x8_t* const hev_mask,
uint8x8_t* const needs_filter4_mask) {
const uint8x8_t p0p1_q0q1 = vabd_u8(p0q0, p1q1);
// This includes cases where NeedsFilter4() is not true and so Filter2() will
// not be applied.
const uint8x8_t hev_tmp_mask = Hev(p0p1_q0q1, hev_thresh);
*needs_filter4_mask =
NeedsFilter4(p0p1_q0q1, p0q0, p1q1, inner_thresh, outer_thresh);
// Filter2() will only be applied if both NeedsFilter4() and Hev() are true.
*hev_mask = vand_u8(hev_tmp_mask, *needs_filter4_mask);
}
// Calculate Filter4() or Filter2() based on |hev_mask|.
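// In scalar terms, per pixel (a restatement of the steps below; Clip3()
// clamps to the signed 8-bit range and the final narrowing stores clamp the
// results to [0, 255]):
//   a  = 3 * (q0 - p0) + Clip3(p1 - q1)   // p1 - q1 only when |hev_mask| set
//   a1 = Clip3(a + 4) >> 3
//   a2 = Clip3(a + 3) >> 3
//   a3 = (a1 + 1) >> 1
//   p0' = p0 + a2    q0' = q0 - a1
//   p1' = p1 + a3    q1' = q1 - a3        // only selected when Hev() is false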
inline void Filter4(const uint8x8_t q0p1, const uint8x8_t p0q1,
const uint8x8_t hev_mask, uint8x8_t* const p1q1_result,
uint8x8_t* const p0q0_result) {
const int16x4_t zero = vdup_n_s16(0);
// a = 3 * (q0 - p0) + Clip3(p1 - q1, min_signed_val, max_signed_val);
const int16x8_t q0mp0_p1mq1 = vreinterpretq_s16_u16(vsubl_u8(q0p1, p0q1));
const int16x4_t q0mp0_3 = vmul_n_s16(vget_low_s16(q0mp0_p1mq1), 3);
// If this is for Filter2() then include |p1mq1|. Otherwise zero it.
const int16x4_t p1mq1 = vget_high_s16(q0mp0_p1mq1);
const int8x8_t p1mq1_saturated = vqmovn_s16(vcombine_s16(p1mq1, zero));
const int8x8_t hev_option =
vand_s8(vreinterpret_s8_u8(hev_mask), p1mq1_saturated);
const int16x4_t a =
vget_low_s16(vaddw_s8(vcombine_s16(q0mp0_3, zero), hev_option));
// We cannot shift with rounding because the clamp comes *before* the shift.
// a1 = Clip3(a + 4, min_signed_val, max_signed_val) >> 3;
// a2 = Clip3(a + 3, min_signed_val, max_signed_val) >> 3;
const int16x4_t plus_four = vadd_s16(a, vdup_n_s16(4));
const int16x4_t plus_three = vadd_s16(a, vdup_n_s16(3));
const int8x8_t a2_a1 =
vshr_n_s8(vqmovn_s16(vcombine_s16(plus_three, plus_four)), 3);
// a3 is in the high 4 values.
// a3 = (a1 + 1) >> 1;
const int8x8_t a3 = vrshr_n_s8(a2_a1, 1);
const int16x8_t p0q1_l = vreinterpretq_s16_u16(vmovl_u8(p0q1));
const int16x8_t q0p1_l = vreinterpretq_s16_u16(vmovl_u8(q0p1));
const int16x8_t p1q1_l =
vcombine_s16(vget_high_s16(q0p1_l), vget_high_s16(p0q1_l));
const int8x8_t a3_ma3 = InterleaveHigh32(a3, vneg_s8(a3));
const int16x8_t p1q1_a3 = vaddw_s8(p1q1_l, a3_ma3);
const int16x8_t p0q0_l =
vcombine_s16(vget_low_s16(p0q1_l), vget_low_s16(q0p1_l));
// Need to shift the second term or we end up with a2_ma2.
const int8x8_t a2_ma1 =
InterleaveLow32(a2_a1, RightShift<32>(vneg_s8(a2_a1)));
const int16x8_t p0q0_a = vaddw_s8(p0q0_l, a2_ma1);
*p1q1_result = vqmovun_s16(p1q1_a3);
*p0q0_result = vqmovun_s16(p0q0_a);
}
void Horizontal4_NEON(void* const dest, const ptrdiff_t stride,
const int outer_thresh, const int inner_thresh,
const int hev_thresh) {
uint8_t* dst = static_cast<uint8_t*>(dest);
const uint8x8_t p1_v = Load4(dst - 2 * stride);
const uint8x8_t p0_v = Load4(dst - stride);
const uint8x8_t p0q0 = Load4<1>(dst, p0_v);
const uint8x8_t p1q1 = Load4<1>(dst + stride, p1_v);
uint8x8_t hev_mask;
uint8x8_t needs_filter4_mask;
Filter4Masks(p0q0, p1q1, hev_thresh, outer_thresh, inner_thresh, &hev_mask,
&needs_filter4_mask);
// Copy the masks to the high bits for packed comparisons later.
hev_mask = InterleaveLow32(hev_mask, hev_mask);
needs_filter4_mask = InterleaveLow32(needs_filter4_mask, needs_filter4_mask);
#if defined(__aarch64__)
// This provides a good speedup for the unit test. Not sure how applicable it
// is to valid streams though.
// Consider doing this on armv7 if there is a quick way to check if a vector
// is zero.
if (vaddv_u8(needs_filter4_mask) == 0) {
// None of the values will be filtered.
return;
}
#endif // defined(__aarch64__)
uint8x8_t f_p1q1;
uint8x8_t f_p0q0;
const uint8x8x2_t q0p1xp0q1 = Interleave32(Transpose32(p0q0), p1q1);
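// |q0p1xp0q1.val[0]| holds q0 in its low half and p1 in its high half;
// |val[1]| holds p0 and q1. This ordering lets Filter4() form q0 - p0 and
// p1 - q1 with a single vsubl_u8().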
Filter4(q0p1xp0q1.val[0], q0p1xp0q1.val[1], hev_mask, &f_p1q1, &f_p0q0);
// Already integrated the Hev mask when calculating the filtered values.
const uint8x8_t p0q0_output = vbsl_u8(needs_filter4_mask, f_p0q0, p0q0);
// p1/q1 are left unmodified when Hev() is true (the Filter2() case). The xor
// works because |hev_mask| was and'd with |needs_filter4_mask| previously.
const uint8x8_t p1q1_mask = veor_u8(hev_mask, needs_filter4_mask);
const uint8x8_t p1q1_output = vbsl_u8(p1q1_mask, f_p1q1, p1q1);
StoreLo4(dst - 2 * stride, p1q1_output);
StoreLo4(dst - stride, p0q0_output);
StoreHi4(dst, p0q0_output);
StoreHi4(dst + stride, p1q1_output);
}
void Vertical4_NEON(void* const dest, const ptrdiff_t stride,
const int outer_thresh, const int inner_thresh,
const int hev_thresh) {
uint8_t* dst = static_cast<uint8_t*>(dest);
// Move |dst| to the left side of the filter window.
dst -= 2;
// |p1q0| and |p0q1| are named for the values they will contain after the
// transpose.
const uint8x8_t row0 = Load4(dst);
uint8x8_t p1q0 = Load4<1>(dst + stride, row0);
const uint8x8_t row2 = Load4(dst + 2 * stride);
uint8x8_t p0q1 = Load4<1>(dst + 3 * stride, row2);
Transpose4x4(&p1q0, &p0q1);
// Rearrange.
const uint8x8x2_t p1q1xq0p0 = Interleave32(p1q0, Transpose32(p0q1));
const uint8x8x2_t p1q1xp0q0 = {p1q1xq0p0.val[0],
Transpose32(p1q1xq0p0.val[1])};
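// |p1q1xp0q0.val[0]| now holds p1/q1 and |val[1]| holds p0/q0 in the same
// packed layout used by the horizontal path.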
uint8x8_t hev_mask;
uint8x8_t needs_filter4_mask;
Filter4Masks(p1q1xp0q0.val[1], p1q1xp0q0.val[0], hev_thresh, outer_thresh,
inner_thresh, &hev_mask, &needs_filter4_mask);
// Copy the masks to the high bits for packed comparisons later.
hev_mask = InterleaveLow32(hev_mask, hev_mask);
needs_filter4_mask = InterleaveLow32(needs_filter4_mask, needs_filter4_mask);
#if defined(__aarch64__)
// This provides a good speedup for the unit test. Not sure how applicable it
// is to valid streams though.
// Consider doing this on armv7 if there is a quick way to check if a vector
// is zero.
if (vaddv_u8(needs_filter4_mask) == 0) {
// None of the values will be filtered.
return;
}
#endif // defined(__aarch64__)
uint8x8_t f_p1q1;
uint8x8_t f_p0q0;
Filter4(Transpose32(p1q0), p0q1, hev_mask, &f_p1q1, &f_p0q0);
// Already integrated the Hev mask when calculating the filtered values.
const uint8x8_t p0q0_output =
vbsl_u8(needs_filter4_mask, f_p0q0, p1q1xp0q0.val[1]);
// p1/q1 are left unmodified when Hev() is true (the Filter2() case). The xor
// works because |hev_mask| was and'd with |needs_filter4_mask| previously.
const uint8x8_t p1q1_mask = veor_u8(hev_mask, needs_filter4_mask);
const uint8x8_t p1q1_output = vbsl_u8(p1q1_mask, f_p1q1, p1q1xp0q0.val[0]);
// Put things back in order to reverse the transpose.
const uint8x8x2_t p1p0xq1q0 = Interleave32(p1q1_output, p0q0_output);
uint8x8_t output_0 = p1p0xq1q0.val[0],
output_1 = Transpose32(p1p0xq1q0.val[1]);
Transpose4x4(&output_0, &output_1);
StoreLo4(dst, output_0);
StoreLo4(dst + stride, output_1);
StoreHi4(dst + 2 * stride, output_0);
StoreHi4(dst + 3 * stride, output_1);
}
// abs(p1 - p0) <= flat_thresh && abs(q1 - q0) <= flat_thresh &&
// abs(p2 - p0) <= flat_thresh && abs(q2 - q0) <= flat_thresh
// |flat_thresh| == 1 for 8 bit decode.
inline uint8x8_t IsFlat3(const uint8x8_t abd_p0p1_q0q1,
const uint8x8_t abd_p0p2_q0q2) {
const uint8x8_t a = vmax_u8(abd_p0p1_q0q1, abd_p0p2_q0q2);
const uint8x8_t b = vcle_u8(a, vdup_n_u8(1));
return vand_u8(b, RightShift<32>(b));
}
// abs(p2 - p1) <= inner_thresh && abs(p1 - p0) <= inner_thresh &&
// abs(q1 - q0) <= inner_thresh && abs(q2 - q1) <= inner_thresh &&
// OuterThreshold()
inline uint8x8_t NeedsFilter6(const uint8x8_t abd_p0p1_q0q1,
const uint8x8_t abd_p1p2_q1q2,
const uint8x8_t p0q0, const uint8x8_t p1q1,
const uint8_t inner_thresh,
const uint8_t outer_thresh) {
const uint8x8_t a = vmax_u8(abd_p0p1_q0q1, abd_p1p2_q1q2);
const uint8x8_t b = vcle_u8(a, vdup_n_u8(inner_thresh));
const uint8x8_t inner_mask = vand_u8(b, RightShift<32>(b));
const uint8x8_t outer_mask = OuterThreshold(p0q0, p1q1, outer_thresh);
return vand_u8(inner_mask, outer_mask);
}
inline void Filter6Masks(const uint8x8_t p2q2, const uint8x8_t p1q1,
const uint8x8_t p0q0, const uint8_t hev_thresh,
const uint8_t outer_thresh, const uint8_t inner_thresh,
uint8x8_t* const needs_filter6_mask,
uint8x8_t* const is_flat3_mask,
uint8x8_t* const hev_mask) {
const uint8x8_t p0p1_q0q1 = vabd_u8(p0q0, p1q1);
*hev_mask = Hev(p0p1_q0q1, hev_thresh);
*is_flat3_mask = IsFlat3(p0p1_q0q1, vabd_u8(p0q0, p2q2));
*needs_filter6_mask = NeedsFilter6(p0p1_q0q1, vabd_u8(p1q1, p2q2), p0q0, p1q1,
inner_thresh, outer_thresh);
}
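// Filter6() computes the 6-tap outputs for both sides at once. In scalar
// terms:
//   p1' = (3 * p2 + 2 * p1 + 2 * p0 + q0 + 4) >> 3
//   p0' = (p2 + 2 * p1 + 2 * p0 + 2 * q0 + q1 + 4) >> 3
// with the mirrored expressions for q1'/q0'. The rounding shift by 3 supplies
// the + 4.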
inline void Filter6(const uint8x8_t p2q2, const uint8x8_t p1q1,
const uint8x8_t p0q0, uint8x8_t* const p1q1_output,
uint8x8_t* const p0q0_output) {
// Sum p1 and q1 output from opposite directions
// p1 = (3 * p2) + (2 * p1) + (2 * p0) + q0
// ^^^^^^^^
// q1 = p0 + (2 * q0) + (2 * q1) + (3 * q2)
// ^^^^^^^^
const uint16x8_t p2q2_double = vaddl_u8(p2q2, p2q2);
uint16x8_t sum = vaddw_u8(p2q2_double, p2q2);
// p1 = (3 * p2) + (2 * p1) + (2 * p0) + q0
// ^^^^^^^^
// q1 = p0 + (2 * q0) + (2 * q1) + (3 * q2)
// ^^^^^^^^
sum = vaddq_u16(vaddl_u8(p1q1, p1q1), sum);
// p1 = (3 * p2) + (2 * p1) + (2 * p0) + q0
// ^^^^^^^^
// q1 = p0 + (2 * q0) + (2 * q1) + (3 * q2)
// ^^^^^^^^
sum = vaddq_u16(vaddl_u8(p0q0, p0q0), sum);
// p1 = (3 * p2) + (2 * p1) + (2 * p0) + q0
// ^^
// q1 = p0 + (2 * q0) + (2 * q1) + (3 * q2)
// ^^
const uint8x8_t q0p0 = Transpose32(p0q0);
sum = vaddw_u8(sum, q0p0);
*p1q1_output = vrshrn_n_u16(sum, 3);
// Convert to p0 and q0 output:
// p0 = p1 - (2 * p2) + q0 + q1
// q0 = q1 - (2 * q2) + p0 + p1
sum = vsubq_u16(sum, p2q2_double);
const uint8x8_t q1p1 = Transpose32(p1q1);
sum = vaddq_u16(vaddl_u8(q0p0, q1p1), sum);
*p0q0_output = vrshrn_n_u16(sum, 3);
}
void Horizontal6_NEON(void* const dest, const ptrdiff_t stride,
const int outer_thresh, const int inner_thresh,
const int hev_thresh) {
auto* dst = static_cast<uint8_t*>(dest);
const uint8x8_t p2_v = Load4(dst - 3 * stride);
const uint8x8_t p1_v = Load4(dst - 2 * stride);
const uint8x8_t p0_v = Load4(dst - stride);
const uint8x8_t p0q0 = Load4<1>(dst, p0_v);
const uint8x8_t p1q1 = Load4<1>(dst + stride, p1_v);
const uint8x8_t p2q2 = Load4<1>(dst + 2 * stride, p2_v);
uint8x8_t needs_filter6_mask, is_flat3_mask, hev_mask;
Filter6Masks(p2q2, p1q1, p0q0, hev_thresh, outer_thresh, inner_thresh,
&needs_filter6_mask, &is_flat3_mask, &hev_mask);
needs_filter6_mask = InterleaveLow32(needs_filter6_mask, needs_filter6_mask);
is_flat3_mask = InterleaveLow32(is_flat3_mask, is_flat3_mask);
hev_mask = InterleaveLow32(hev_mask, hev_mask);
#if defined(__aarch64__)
// This provides a good speedup for the unit test. Not sure how applicable it
// is to valid streams though.
// Consider doing this on armv7 if there is a quick way to check if a vector
// is zero.
if (vaddv_u8(needs_filter6_mask) == 0) {
// None of the values will be filtered.
return;
}
#endif // defined(__aarch64__)
uint8x8_t f_p1q1;
uint8x8_t f_p0q0;
const uint8x8x2_t q0p1xp0q1 = Interleave32(Transpose32(p0q0), p1q1);
Filter4(q0p1xp0q1.val[0], q0p1xp0q1.val[1], hev_mask, &f_p1q1, &f_p0q0);
// Reset the outer values if only a Hev() mask was required.
f_p1q1 = vbsl_u8(hev_mask, p1q1, f_p1q1);
uint8x8_t f6_p1q1, f6_p0q0;
#if defined(__aarch64__)
if (vaddv_u8(vand_u8(is_flat3_mask, needs_filter6_mask)) == 0) {
// Filter6() does not apply.
const uint8x8_t zero = vdup_n_u8(0);
f6_p1q1 = zero;
f6_p0q0 = zero;
} else {
#endif // defined(__aarch64__)
Filter6(p2q2, p1q1, p0q0, &f6_p1q1, &f6_p0q0);
#if defined(__aarch64__)
}
#endif // defined(__aarch64__)
uint8x8_t p1q1_output = vbsl_u8(is_flat3_mask, f6_p1q1, f_p1q1);
p1q1_output = vbsl_u8(needs_filter6_mask, p1q1_output, p1q1);
StoreLo4(dst - 2 * stride, p1q1_output);
StoreHi4(dst + stride, p1q1_output);
uint8x8_t p0q0_output = vbsl_u8(is_flat3_mask, f6_p0q0, f_p0q0);
p0q0_output = vbsl_u8(needs_filter6_mask, p0q0_output, p0q0);
StoreLo4(dst - stride, p0q0_output);
StoreHi4(dst, p0q0_output);
}
void Vertical6_NEON(void* const dest, const ptrdiff_t stride,
const int outer_thresh, const int inner_thresh,
const int hev_thresh) {
auto* dst = static_cast<uint8_t*>(dest);
// Move |dst| to the left side of the filter window.
dst -= 3;
// |p2q1|, |p1q2|, |p0xx| and |q0xx| are named for the values they will
// contain after the transpose.
// These over-read by 2 bytes. We only need 6.
uint8x8_t p2q1 = vld1_u8(dst);
uint8x8_t p1q2 = vld1_u8(dst + stride);
uint8x8_t p0xx = vld1_u8(dst + 2 * stride);
uint8x8_t q0xx = vld1_u8(dst + 3 * stride);
Transpose8x4(&p2q1, &p1q2, &p0xx, &q0xx);
const uint8x8x2_t p2q2xq1p1 = Interleave32(p2q1, Transpose32(p1q2));
const uint8x8_t p2q2 = p2q2xq1p1.val[0];
const uint8x8_t p1q1 = Transpose32(p2q2xq1p1.val[1]);
const uint8x8_t p0q0 = InterleaveLow32(p0xx, q0xx);
uint8x8_t needs_filter6_mask, is_flat3_mask, hev_mask;
Filter6Masks(p2q2, p1q1, p0q0, hev_thresh, outer_thresh, inner_thresh,
&needs_filter6_mask, &is_flat3_mask, &hev_mask);
needs_filter6_mask = InterleaveLow32(needs_filter6_mask, needs_filter6_mask);
is_flat3_mask = InterleaveLow32(is_flat3_mask, is_flat3_mask);
hev_mask = InterleaveLow32(hev_mask, hev_mask);
#if defined(__aarch64__)
// This provides a good speedup for the unit test. Not sure how applicable it
// is to valid streams though.
// Consider doing this on armv7 if there is a quick way to check if a vector
// is zero.
if (vaddv_u8(needs_filter6_mask) == 0) {
// None of the values will be filtered.
return;
}
#endif // defined(__aarch64__)
uint8x8_t f_p1q1;
uint8x8_t f_p0q0;
const uint8x8x2_t q0p1xp0q1 = Interleave32(Transpose32(p0q0), p1q1);
Filter4(q0p1xp0q1.val[0], q0p1xp0q1.val[1], hev_mask, &f_p1q1, &f_p0q0);
// Reset the outer values if only a Hev() mask was required.
f_p1q1 = vbsl_u8(hev_mask, p1q1, f_p1q1);
uint8x8_t f6_p1q1, f6_p0q0;
#if defined(__aarch64__)
if (vaddv_u8(vand_u8(is_flat3_mask, needs_filter6_mask)) == 0) {
// Filter6() does not apply.
const uint8x8_t zero = vdup_n_u8(0);
f6_p1q1 = zero;
f6_p0q0 = zero;
} else {
#endif // defined(__aarch64__)
Filter6(p2q2, p1q1, p0q0, &f6_p1q1, &f6_p0q0);
#if defined(__aarch64__)
}
#endif // defined(__aarch64__)
uint8x8_t p1q1_output = vbsl_u8(is_flat3_mask, f6_p1q1, f_p1q1);
p1q1_output = vbsl_u8(needs_filter6_mask, p1q1_output, p1q1);
uint8x8_t p0q0_output = vbsl_u8(is_flat3_mask, f6_p0q0, f_p0q0);
p0q0_output = vbsl_u8(needs_filter6_mask, p0q0_output, p0q0);
// The six tap filter only reads six input pixels, and the output is limited
// to p1-q1, so advance |dst| past the unmodified p2 column.
dst += 1;
// Put things back in order to reverse the transpose.
const uint8x8x2_t p1p0xq1q0 = Interleave32(p1q1_output, p0q0_output);
uint8x8_t output_0 = p1p0xq1q0.val[0];
uint8x8_t output_1 = Transpose32(p1p0xq1q0.val[1]);
Transpose4x4(&output_0, &output_1);
StoreLo4(dst, output_0);
StoreLo4(dst + stride, output_1);
StoreHi4(dst + 2 * stride, output_0);
StoreHi4(dst + 3 * stride, output_1);
}
// IsFlat4 uses N=1; the outer flatness check (|is_flat_outer4_mask|) reuses
// it with N=4.
// abs(p[N] - p0) <= flat_thresh && abs(q[N] - q0) <= flat_thresh &&
// abs(p[N+1] - p0) <= flat_thresh && abs(q[N+1] - q0) <= flat_thresh &&
// abs(p[N+2] - p0) <= flat_thresh && abs(q[N+2] - q0) <= flat_thresh
// |flat_thresh| == 1 for 8 bit decode.
inline uint8x8_t IsFlat4(const uint8x8_t abd_p0n0_q0n0,
const uint8x8_t abd_p0n1_q0n1,
const uint8x8_t abd_p0n2_q0n2) {
const uint8x8_t a = vmax_u8(abd_p0n0_q0n0, abd_p0n1_q0n1);
const uint8x8_t b = vmax_u8(a, abd_p0n2_q0n2);
const uint8x8_t c = vcle_u8(b, vdup_n_u8(1));
return vand_u8(c, RightShift<32>(c));
}
// abs(p3 - p2) <= inner_thresh && abs(p2 - p1) <= inner_thresh &&
// abs(p1 - p0) <= inner_thresh && abs(q1 - q0) <= inner_thresh &&
// abs(q2 - q1) <= inner_thresh && abs(q3 - q2) <= inner_thresh &&
// OuterThreshold()
inline uint8x8_t NeedsFilter8(const uint8x8_t abd_p0p1_q0q1,
const uint8x8_t abd_p1p2_q1q2,
const uint8x8_t abd_p2p3_q2q3,
const uint8x8_t p0q0, const uint8x8_t p1q1,
const uint8_t inner_thresh,
const uint8_t outer_thresh) {
const uint8x8_t a = vmax_u8(abd_p0p1_q0q1, abd_p1p2_q1q2);
const uint8x8_t b = vmax_u8(a, abd_p2p3_q2q3);
const uint8x8_t c = vcle_u8(b, vdup_n_u8(inner_thresh));
const uint8x8_t inner_mask = vand_u8(c, RightShift<32>(c));
const uint8x8_t outer_mask = OuterThreshold(p0q0, p1q1, outer_thresh);
return vand_u8(inner_mask, outer_mask);
}
inline void Filter8Masks(const uint8x8_t p3q3, const uint8x8_t p2q2,
const uint8x8_t p1q1, const uint8x8_t p0q0,
const uint8_t hev_thresh, const uint8_t outer_thresh,
const uint8_t inner_thresh,
uint8x8_t* const needs_filter8_mask,
uint8x8_t* const is_flat4_mask,
uint8x8_t* const hev_mask) {
const uint8x8_t p0p1_q0q1 = vabd_u8(p0q0, p1q1);
*hev_mask = Hev(p0p1_q0q1, hev_thresh);
*is_flat4_mask = IsFlat4(p0p1_q0q1, vabd_u8(p0q0, p2q2), vabd_u8(p0q0, p3q3));
*needs_filter8_mask =
NeedsFilter8(p0p1_q0q1, vabd_u8(p1q1, p2q2), vabd_u8(p2q2, p3q3), p0q0,
p1q1, inner_thresh, outer_thresh);
}
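// Filter8() computes the 8-tap outputs for both sides at once. In scalar
// terms:
//   p2' = (3 * p3 + 2 * p2 + p1 + p0 + q0 + 4) >> 3
//   p1' = (2 * p3 + p2 + 2 * p1 + p0 + q0 + q1 + 4) >> 3
//   p0' = (p3 + p2 + p1 + 2 * p0 + q0 + q1 + q2 + 4) >> 3
// with the mirrored expressions for the q side. The running sum built for
// p2'/q2' is reused for the narrower outputs by swapping a few terms.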
inline void Filter8(const uint8x8_t p3q3, const uint8x8_t p2q2,
const uint8x8_t p1q1, const uint8x8_t p0q0,
uint8x8_t* const p2q2_output, uint8x8_t* const p1q1_output,
uint8x8_t* const p0q0_output) {
// Sum p2 and q2 output from opposite directions
// p2 = (3 * p3) + (2 * p2) + p1 + p0 + q0
// ^^^^^^^^
// q2 = p0 + q0 + q1 + (2 * q2) + (3 * q3)
// ^^^^^^^^
uint16x8_t sum = vaddw_u8(vaddl_u8(p3q3, p3q3), p3q3);
// p2 = (3 * p3) + (2 * p2) + p1 + p0 + q0
// ^^^^^^^^
// q2 = p0 + q0 + q1 + (2 * q2) + (3 * q3)
// ^^^^^^^^
sum = vaddq_u16(vaddl_u8(p2q2, p2q2), sum);
// p2 = (3 * p3) + (2 * p2) + p1 + p0 + q0
// ^^^^^^^
// q2 = p0 + q0 + q1 + (2 * q2) + (3 * q3)
// ^^^^^^^
sum = vaddq_u16(vaddl_u8(p1q1, p0q0), sum);
// p2 = (3 * p3) + (2 * p2) + p1 + p0 + q0
// ^^
// q2 = p0 + q0 + q1 + (2 * q2) + (3 * q3)
// ^^
const uint8x8_t q0p0 = Transpose32(p0q0);
sum = vaddw_u8(sum, q0p0);
*p2q2_output = vrshrn_n_u16(sum, 3);
// Convert to p1 and q1 output:
// p1 = p2 - p3 - p2 + p1 + q1
// q1 = q2 - q3 - q2 + q1 + p1
sum = vsubq_u16(sum, vaddl_u8(p3q3, p2q2));
const uint8x8_t q1p1 = Transpose32(p1q1);
sum = vaddq_u16(vaddl_u8(p1q1, q1p1), sum);
*p1q1_output = vrshrn_n_u16(sum, 3);
// Convert to p0 and q0 output:
// p0 = p1 - p3 - p1 + p0 + q2
// q0 = q1 - q3 - q1 + q0 + p2
sum = vsubq_u16(sum, vaddl_u8(p3q3, p1q1));
const uint8x8_t q2p2 = Transpose32(p2q2);
sum = vaddq_u16(vaddl_u8(p0q0, q2p2), sum);
*p0q0_output = vrshrn_n_u16(sum, 3);
}
void Horizontal8_NEON(void* const dest, const ptrdiff_t stride,
const int outer_thresh, const int inner_thresh,
const int hev_thresh) {
auto* dst = static_cast<uint8_t*>(dest);
const uint8x8_t p3_v = Load4(dst - 4 * stride);
const uint8x8_t p2_v = Load4(dst - 3 * stride);
const uint8x8_t p1_v = Load4(dst - 2 * stride);
const uint8x8_t p0_v = Load4(dst - stride);
const uint8x8_t p0q0 = Load4<1>(dst, p0_v);
const uint8x8_t p1q1 = Load4<1>(dst + stride, p1_v);
const uint8x8_t p2q2 = Load4<1>(dst + 2 * stride, p2_v);
const uint8x8_t p3q3 = Load4<1>(dst + 3 * stride, p3_v);
uint8x8_t needs_filter8_mask, is_flat4_mask, hev_mask;
Filter8Masks(p3q3, p2q2, p1q1, p0q0, hev_thresh, outer_thresh, inner_thresh,
&needs_filter8_mask, &is_flat4_mask, &hev_mask);
needs_filter8_mask = InterleaveLow32(needs_filter8_mask, needs_filter8_mask);
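// Filter8() output is only selected where the filter applies at all, so fold
// |needs_filter8_mask| into |is_flat4_mask| before broadcasting it.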
is_flat4_mask = vand_u8(is_flat4_mask, needs_filter8_mask);
is_flat4_mask = InterleaveLow32(is_flat4_mask, is_flat4_mask);
hev_mask = InterleaveLow32(hev_mask, hev_mask);
#if defined(__aarch64__)
// This provides a good speedup for the unit test. Not sure how applicable it
// is to valid streams though.
// Consider doing this on armv7 if there is a quick way to check if a vector
// is zero.
if (vaddv_u8(needs_filter8_mask) == 0) {
// None of the values will be filtered.
return;
}
#endif // defined(__aarch64__)
uint8x8_t f_p1q1;
uint8x8_t f_p0q0;
const uint8x8x2_t q0p1xp0q1 = Interleave32(Transpose32(p0q0), p1q1);
Filter4(q0p1xp0q1.val[0], q0p1xp0q1.val[1], hev_mask, &f_p1q1, &f_p0q0);
// Reset the outer values if only a Hev() mask was required.
f_p1q1 = vbsl_u8(hev_mask, p1q1, f_p1q1);
uint8x8_t f8_p2q2, f8_p1q1, f8_p0q0;
#if defined(__aarch64__)
if (vaddv_u8(is_flat4_mask) == 0) {
// Filter8() does not apply.
const uint8x8_t zero = vdup_n_u8(0);
f8_p2q2 = zero;
f8_p1q1 = zero;
f8_p0q0 = zero;
} else {
#endif // defined(__aarch64__)
Filter8(p3q3, p2q2, p1q1, p0q0, &f8_p2q2, &f8_p1q1, &f8_p0q0);
const uint8x8_t p2q2_output = vbsl_u8(is_flat4_mask, f8_p2q2, p2q2);
StoreLo4(dst - 3 * stride, p2q2_output);
StoreHi4(dst + 2 * stride, p2q2_output);
#if defined(__aarch64__)
}
#endif // defined(__aarch64__)
uint8x8_t p1q1_output = vbsl_u8(is_flat4_mask, f8_p1q1, f_p1q1);
p1q1_output = vbsl_u8(needs_filter8_mask, p1q1_output, p1q1);
StoreLo4(dst - 2 * stride, p1q1_output);
StoreHi4(dst + stride, p1q1_output);
uint8x8_t p0q0_output = vbsl_u8(is_flat4_mask, f8_p0q0, f_p0q0);
p0q0_output = vbsl_u8(needs_filter8_mask, p0q0_output, p0q0);
StoreLo4(dst - stride, p0q0_output);
StoreHi4(dst, p0q0_output);
}
void Vertical8_NEON(void* const dest, const ptrdiff_t stride,
const int outer_thresh, const int inner_thresh,
const int hev_thresh) {
auto* dst = static_cast<uint8_t*>(dest);
// Move |dst| to the left side of the filter window.
dst -= 4;
// |p3q0|, |p2q1|, |p1q2| and |p0q3| are named for the values they will
// contain after the transpose.
uint8x8_t p3q0 = vld1_u8(dst);
uint8x8_t p2q1 = vld1_u8(dst + stride);
uint8x8_t p1q2 = vld1_u8(dst + 2 * stride);
uint8x8_t p0q3 = vld1_u8(dst + 3 * stride);
Transpose8x4(&p3q0, &p2q1, &p1q2, &p0q3);
const uint8x8x2_t p3q3xq0p0 = Interleave32(p3q0, Transpose32(p0q3));
const uint8x8_t p3q3 = p3q3xq0p0.val[0];
const uint8x8_t p0q0 = Transpose32(p3q3xq0p0.val[1]);
const uint8x8x2_t p2q2xq1p1 = Interleave32(p2q1, Transpose32(p1q2));
const uint8x8_t p2q2 = p2q2xq1p1.val[0];
const uint8x8_t p1q1 = Transpose32(p2q2xq1p1.val[1]);
uint8x8_t needs_filter8_mask, is_flat4_mask, hev_mask;
Filter8Masks(p3q3, p2q2, p1q1, p0q0, hev_thresh, outer_thresh, inner_thresh,
&needs_filter8_mask, &is_flat4_mask, &hev_mask);
needs_filter8_mask = InterleaveLow32(needs_filter8_mask, needs_filter8_mask);
is_flat4_mask = vand_u8(is_flat4_mask, needs_filter8_mask);
is_flat4_mask = InterleaveLow32(is_flat4_mask, is_flat4_mask);
hev_mask = InterleaveLow32(hev_mask, hev_mask);
#if defined(__aarch64__)
// This provides a good speedup for the unit test. Not sure how applicable it
// is to valid streams though.
// Consider doing this on armv7 if there is a quick way to check if a vector
// is zero.
if (vaddv_u8(needs_filter8_mask) == 0) {
// None of the values will be filtered.
return;
}
#endif // defined(__aarch64__)
uint8x8_t f_p1q1;
uint8x8_t f_p0q0;
const uint8x8x2_t q0p1xp0q1 = Interleave32(Transpose32(p0q0), p1q1);
Filter4(q0p1xp0q1.val[0], q0p1xp0q1.val[1], hev_mask, &f_p1q1, &f_p0q0);
// Reset the outer values if only a Hev() mask was required.
f_p1q1 = vbsl_u8(hev_mask, p1q1, f_p1q1);
uint8x8_t f8_p2q2, f8_p1q1, f8_p0q0;
#if defined(__aarch64__)
if (vaddv_u8(is_flat4_mask) == 0) {
// Filter8() does not apply.
const uint8x8_t zero = vdup_n_u8(0);
f8_p2q2 = zero;
f8_p1q1 = zero;
f8_p0q0 = zero;
} else {
#endif // defined(__aarch64__)
Filter8(p3q3, p2q2, p1q1, p0q0, &f8_p2q2, &f8_p1q1, &f8_p0q0);
#if defined(__aarch64__)
}
#endif // defined(__aarch64__)
// Always prepare and store p2/q2 because we need to transpose it anyway.
const uint8x8_t p2q2_output = vbsl_u8(is_flat4_mask, f8_p2q2, p2q2);
uint8x8_t p1q1_output = vbsl_u8(is_flat4_mask, f8_p1q1, f_p1q1);
p1q1_output = vbsl_u8(needs_filter8_mask, p1q1_output, p1q1);
uint8x8_t p0q0_output = vbsl_u8(is_flat4_mask, f8_p0q0, f_p0q0);
p0q0_output = vbsl_u8(needs_filter8_mask, p0q0_output, p0q0);
// Write out p3/q3 as well. There isn't a good way to write out 6 bytes.
// Variable names reflect the values before transposition.
const uint8x8x2_t p3q0xq3p0_output =
Interleave32(p3q3, Transpose32(p0q0_output));
uint8x8_t p3q0_output = p3q0xq3p0_output.val[0];
uint8x8_t p0q3_output = Transpose32(p3q0xq3p0_output.val[1]);
const uint8x8x2_t p2q1xq2p1_output =
Interleave32(p2q2_output, Transpose32(p1q1_output));
uint8x8_t p2q1_output = p2q1xq2p1_output.val[0];
uint8x8_t p1q2_output = Transpose32(p2q1xq2p1_output.val[1]);
Transpose8x4(&p3q0_output, &p2q1_output, &p1q2_output, &p0q3_output);
vst1_u8(dst, p3q0_output);
vst1_u8(dst + stride, p2q1_output);
vst1_u8(dst + 2 * stride, p1q2_output);
vst1_u8(dst + 3 * stride, p0q3_output);
}
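// Filter14() computes the six filtered outputs per side for the size 14 loop
// filter using a sliding sum: the widest output is built first and each later
// output subtracts two terms and adds two terms before the rounding shift by
// 4. In scalar terms the first and last outputs are:
//   p5' = (7 * p6 + 2 * p5 + 2 * p4 + p3 + p2 + p1 + p0 + q0 + 8) >> 4
//   p0' = (p6 + p5 + p4 + p3 + p2 + 2 * p1 + 2 * p0 + 2 * q0 + q1 + q2 + q3 +
//          q4 + q5 + 8) >> 4
// with the mirrored expressions for the q side.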
inline void Filter14(const uint8x8_t p6q6, const uint8x8_t p5q5,
const uint8x8_t p4q4, const uint8x8_t p3q3,
const uint8x8_t p2q2, const uint8x8_t p1q1,
const uint8x8_t p0q0, uint8x8_t* const p5q5_output,
uint8x8_t* const p4q4_output, uint8x8_t* const p3q3_output,
uint8x8_t* const p2q2_output, uint8x8_t* const p1q1_output,
uint8x8_t* const p0q0_output) {
// Sum p5 and q5 output from opposite directions
// p5 = (7 * p6) + (2 * p5) + (2 * p4) + p3 + p2 + p1 + p0 + q0
// ^^^^^^^^
// q5 = p0 + q0 + q1 + q2 + q3 + (2 * q4) + (2 * q5) + (7 * q6)
// ^^^^^^^^
uint16x8_t sum = vsubw_u8(vshll_n_u8(p6q6, 3), p6q6);
// p5 = (7 * p6) + (2 * p5) + (2 * p4) + p3 + p2 + p1 + p0 + q0
// ^^^^^^^^
// q5 = p0 + q0 + q1 + q2 + q3 + (2 * q4) + (2 * q5) + (7 * q6)
// ^^^^^^^^
sum = vaddq_u16(vaddl_u8(p5q5, p5q5), sum);
// p5 = (7 * p6) + (2 * p5) + (2 * p4) + p3 + p2 + p1 + p0 + q0
// ^^^^^^^^
// q5 = p0 + q0 + q1 + q2 + q3 + (2 * q4) + (2 * q5) + (7 * q6)
// ^^^^^^^^
sum = vaddq_u16(vaddl_u8(p4q4, p4q4), sum);
// p5 = (7 * p6) + (2 * p5) + (2 * p4) + p3 + p2 + p1 + p0 + q0
// ^^^^^^^
// q5 = p0 + q0 + q1 + q2 + q3 + (2 * q4) + (2 * q5) + (7 * q6)
// ^^^^^^^
sum = vaddq_u16(vaddl_u8(p3q3, p2q2), sum);
// p5 = (7 * p6) + (2 * p5) + (2 * p4) + p3 + p2 + p1 + p0 + q0
// ^^^^^^^
// q5 = p0 + q0 + q1 + q2 + q3 + (2 * q4) + (2 * q5) + (7 * q6)
// ^^^^^^^
sum = vaddq_u16(vaddl_u8(p1q1, p0q0), sum);
// p5 = (7 * p6) + (2 * p5) + (2 * p4) + p3 + p2 + p1 + p0 + q0
// ^^
// q5 = p0 + q0 + q1 + q2 + q3 + (2 * q4) + (2 * q5) + (7 * q6)
// ^^
const uint8x8_t q0p0 = Transpose32(p0q0);
sum = vaddw_u8(sum, q0p0);
*p5q5_output = vrshrn_n_u16(sum, 4);
// Convert to p4 and q4 output:
// p4 = p5 - (2 * p6) + p3 + q1
// q4 = q5 - (2 * q6) + q3 + p1
sum = vsubq_u16(sum, vaddl_u8(p6q6, p6q6));
const uint8x8_t q1p1 = Transpose32(p1q1);
sum = vaddq_u16(vaddl_u8(p3q3, q1p1), sum);
*p4q4_output = vrshrn_n_u16(sum, 4);
// Convert to p3 and q3 output:
// p3 = p4 - p6 - p5 + p2 + q2
// q3 = q4 - q6 - q5 + q2 + p2
sum = vsubq_u16(sum, vaddl_u8(p6q6, p5q5));
const uint8x8_t q2p2 = Transpose32(p2q2);
sum = vaddq_u16(vaddl_u8(p2q2, q2p2), sum);
*p3q3_output = vrshrn_n_u16(sum, 4);
// Convert to p2 and q2 output:
// p2 = p3 - p6 - p4 + p1 + q3
// q2 = q3 - q6 - q4 + q1 + p3
sum = vsubq_u16(sum, vaddl_u8(p6q6, p4q4));
const uint8x8_t q3p3 = Transpose32(p3q3);
sum = vaddq_u16(vaddl_u8(p1q1, q3p3), sum);
*p2q2_output = vrshrn_n_u16(sum, 4);
// Convert to p1 and q1 output:
// p1 = p2 - p6 - p3 + p0 + q4
// q1 = q2 - q6 - q3 + q0 + p4
sum = vsubq_u16(sum, vaddl_u8(p6q6, p3q3));
const uint8x8_t q4p4 = Transpose32(p4q4);
sum = vaddq_u16(vaddl_u8(p0q0, q4p4), sum);
*p1q1_output = vrshrn_n_u16(sum, 4);
// Convert to p0 and q0 output:
// p0 = p1 - p6 - p2 + q0 + q5
// q0 = q1 - q6 - q2 + p0 + p5
sum = vsubq_u16(sum, vaddl_u8(p6q6, p2q2));
const uint8x8_t q5p5 = Transpose32(p5q5);
sum = vaddq_u16(vaddl_u8(q0p0, q5p5), sum);
*p0q0_output = vrshrn_n_u16(sum, 4);
}
void Horizontal14_NEON(void* const dest, const ptrdiff_t stride,
const int outer_thresh, const int inner_thresh,
const int hev_thresh) {
auto* dst = static_cast<uint8_t*>(dest);
const uint8x8_t p6_v = Load4(dst - 7 * stride);
const uint8x8_t p5_v = Load4(dst - 6 * stride);
const uint8x8_t p4_v = Load4(dst - 5 * stride);
const uint8x8_t p3_v = Load4(dst - 4 * stride);
const uint8x8_t p2_v = Load4(dst - 3 * stride);
const uint8x8_t p1_v = Load4(dst - 2 * stride);
const uint8x8_t p0_v = Load4(dst - stride);
const uint8x8_t p0q0 = Load4<1>(dst, p0_v);
const uint8x8_t p1q1 = Load4<1>(dst + stride, p1_v);
const uint8x8_t p2q2 = Load4<1>(dst + 2 * stride, p2_v);
const uint8x8_t p3q3 = Load4<1>(dst + 3 * stride, p3_v);
const uint8x8_t p4q4 = Load4<1>(dst + 4 * stride, p4_v);
const uint8x8_t p5q5 = Load4<1>(dst + 5 * stride, p5_v);
const uint8x8_t p6q6 = Load4<1>(dst + 6 * stride, p6_v);
uint8x8_t needs_filter8_mask, is_flat4_mask, hev_mask;
Filter8Masks(p3q3, p2q2, p1q1, p0q0, hev_thresh, outer_thresh, inner_thresh,
&needs_filter8_mask, &is_flat4_mask, &hev_mask);
needs_filter8_mask = InterleaveLow32(needs_filter8_mask, needs_filter8_mask);
is_flat4_mask = vand_u8(is_flat4_mask, needs_filter8_mask);
is_flat4_mask = InterleaveLow32(is_flat4_mask, is_flat4_mask);
hev_mask = InterleaveLow32(hev_mask, hev_mask);
#if defined(__aarch64__)
// This provides a good speedup for the unit test. Not sure how applicable it
// is to valid streams though.
// Consider doing this on armv7 if there is a quick way to check if a vector
// is zero.
if (vaddv_u8(needs_filter8_mask) == 0) {
// None of the values will be filtered.
return;
}
#endif // defined(__aarch64__)
// Decide between Filter8() and Filter14().
uint8x8_t is_flat_outer4_mask =
IsFlat4(vabd_u8(p0q0, p4q4), vabd_u8(p0q0, p5q5), vabd_u8(p0q0, p6q6));
is_flat_outer4_mask = vand_u8(is_flat4_mask, is_flat_outer4_mask);
is_flat_outer4_mask =
InterleaveLow32(is_flat_outer4_mask, is_flat_outer4_mask);
uint8x8_t f_p1q1;
uint8x8_t f_p0q0;
const uint8x8x2_t q0p1xp0q1 = Interleave32(Transpose32(p0q0), p1q1);
Filter4(q0p1xp0q1.val[0], q0p1xp0q1.val[1], hev_mask, &f_p1q1, &f_p0q0);
// Reset the outer values if only a Hev() mask was required.
f_p1q1 = vbsl_u8(hev_mask, p1q1, f_p1q1);
uint8x8_t f8_p1q1, f8_p0q0;
uint8x8_t f14_p2q2, f14_p1q1, f14_p0q0;
#if defined(__aarch64__)
if (vaddv_u8(is_flat4_mask) == 0) {
// Filter8() and Filter14() do not apply.
const uint8x8_t zero = vdup_n_u8(0);
f8_p1q1 = zero;
f8_p0q0 = zero;
f14_p1q1 = zero;
f14_p0q0 = zero;
} else {
#endif // defined(__aarch64__)
uint8x8_t f8_p2q2;
Filter8(p3q3, p2q2, p1q1, p0q0, &f8_p2q2, &f8_p1q1, &f8_p0q0);
#if defined(__aarch64__)
if (vaddv_u8(is_flat_outer4_mask) == 0) {
// Filter14() does not apply.
const uint8x8_t zero = vdup_n_u8(0);
f14_p2q2 = zero;
f14_p1q1 = zero;
f14_p0q0 = zero;
} else {
#endif // defined(__aarch64__)
uint8x8_t f14_p5q5, f14_p4q4, f14_p3q3;
Filter14(p6q6, p5q5, p4q4, p3q3, p2q2, p1q1, p0q0, &f14_p5q5, &f14_p4q4,
&f14_p3q3, &f14_p2q2, &f14_p1q1, &f14_p0q0);
const uint8x8_t p5q5_output =
vbsl_u8(is_flat_outer4_mask, f14_p5q5, p5q5);
StoreLo4(dst - 6 * stride, p5q5_output);
StoreHi4(dst + 5 * stride, p5q5_output);
const uint8x8_t p4q4_output =
vbsl_u8(is_flat_outer4_mask, f14_p4q4, p4q4);
StoreLo4(dst - 5 * stride, p4q4_output);
StoreHi4(dst + 4 * stride, p4q4_output);
const uint8x8_t p3q3_output =
vbsl_u8(is_flat_outer4_mask, f14_p3q3, p3q3);
StoreLo4(dst - 4 * stride, p3q3_output);
StoreHi4(dst + 3 * stride, p3q3_output);
#if defined(__aarch64__)
}
#endif // defined(__aarch64__)
uint8x8_t p2q2_output = vbsl_u8(is_flat_outer4_mask, f14_p2q2, f8_p2q2);
p2q2_output = vbsl_u8(is_flat4_mask, p2q2_output, p2q2);
StoreLo4(dst - 3 * stride, p2q2_output);
StoreHi4(dst + 2 * stride, p2q2_output);
#if defined(__aarch64__)
}
#endif // defined(__aarch64__)
uint8x8_t p1q1_output = vbsl_u8(is_flat_outer4_mask, f14_p1q1, f8_p1q1);
p1q1_output = vbsl_u8(is_flat4_mask, p1q1_output, f_p1q1);
p1q1_output = vbsl_u8(needs_filter8_mask, p1q1_output, p1q1);
StoreLo4(dst - 2 * stride, p1q1_output);
StoreHi4(dst + stride, p1q1_output);
uint8x8_t p0q0_output = vbsl_u8(is_flat_outer4_mask, f14_p0q0, f8_p0q0);
p0q0_output = vbsl_u8(is_flat4_mask, p0q0_output, f_p0q0);
p0q0_output = vbsl_u8(needs_filter8_mask, p0q0_output, p0q0);
StoreLo4(dst - stride, p0q0_output);
StoreHi4(dst, p0q0_output);
}
void Vertical14_NEON(void* const dest, const ptrdiff_t stride,
const int outer_thresh, const int inner_thresh,
const int hev_thresh) {
auto* dst = static_cast<uint8_t*>(dest);
dst -= 8;
// input
// p7 p6 p5 p4 p3 p2 p1 p0 q0 q1 q2 q3 q4 q5 q6 q7
const uint8x16_t x0 = vld1q_u8(dst);
dst += stride;
const uint8x16_t x1 = vld1q_u8(dst);
dst += stride;
const uint8x16_t x2 = vld1q_u8(dst);
dst += stride;
const uint8x16_t x3 = vld1q_u8(dst);
dst -= (stride * 3);
// re-order input
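// The lookup indices below are byte lists read from the low byte up:
// |index_qp3toqp0| picks input bytes 7..4 and 8..11 (p0 p1 p2 p3 q0 q1 q2 q3)
// and |index_qp7toqp4| picks bytes 3..0 and 12..15 (p4 p5 p6 p7 q4 q5 q6 q7).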
#if defined(__aarch64__)
const uint8x8_t index_qp3toqp0 = vcreate_u8(0x0b0a090804050607);
const uint8x8_t index_qp7toqp4 = vcreate_u8(0x0f0e0d0c00010203);
const uint8x16_t index_qp7toqp0 = vcombine_u8(index_qp3toqp0, index_qp7toqp4);
uint8x16_t input_0 = vqtbl1q_u8(x0, index_qp7toqp0);
uint8x16_t input_1 = vqtbl1q_u8(x1, index_qp7toqp0);
uint8x16_t input_2 = vqtbl1q_u8(x2, index_qp7toqp0);
uint8x16_t input_3 = vqtbl1q_u8(x3, index_qp7toqp0);
#else
const uint8x8_t index_qp3toqp0 = vcreate_u8(0x0b0a090804050607);
const uint8x8_t index_qp7toqp4 = vcreate_u8(0x0f0e0d0c00010203);
const uint8x8_t x0_qp3qp0 = VQTbl1U8(x0, index_qp3toqp0);
const uint8x8_t x1_qp3qp0 = VQTbl1U8(x1, index_qp3toqp0);
const uint8x8_t x2_qp3qp0 = VQTbl1U8(x2, index_qp3toqp0);
const uint8x8_t x3_qp3qp0 = VQTbl1U8(x3, index_qp3toqp0);
const uint8x8_t x0_qp7qp4 = VQTbl1U8(x0, index_qp7toqp4);
const uint8x8_t x1_qp7qp4 = VQTbl1U8(x1, index_qp7toqp4);
const uint8x8_t x2_qp7qp4 = VQTbl1U8(x2, index_qp7toqp4);
const uint8x8_t x3_qp7qp4 = VQTbl1U8(x3, index_qp7toqp4);
const uint8x16_t input_0 = vcombine_u8(x0_qp3qp0, x0_qp7qp4);
const uint8x16_t input_1 = vcombine_u8(x1_qp3qp0, x1_qp7qp4);
const uint8x16_t input_2 = vcombine_u8(x2_qp3qp0, x2_qp7qp4);
const uint8x16_t input_3 = vcombine_u8(x3_qp3qp0, x3_qp7qp4);
#endif
// input after re-order
// p0 p1 p2 p3 q0 q1 q2 q3 p4 p5 p6 p7 q4 q5 q6 q7
const uint8x16x2_t in01 = vtrnq_u8(input_0, input_1);
const uint8x16x2_t in23 = vtrnq_u8(input_2, input_3);
const uint16x8x2_t in02 = vtrnq_u16(vreinterpretq_u16_u8(in01.val[0]),
vreinterpretq_u16_u8(in23.val[0]));
const uint16x8x2_t in13 = vtrnq_u16(vreinterpretq_u16_u8(in01.val[1]),
vreinterpretq_u16_u8(in23.val[1]));
const uint8x8_t p0q0 = vget_low_u8(vreinterpretq_u8_u16(in02.val[0]));
const uint8x8_t p1q1 = vget_low_u8(vreinterpretq_u8_u16(in13.val[0]));
const uint8x8_t p2q2 = vget_low_u8(vreinterpretq_u8_u16(in02.val[1]));
const uint8x8_t p3q3 = vget_low_u8(vreinterpretq_u8_u16(in13.val[1]));
const uint8x8_t p4q4 = vget_high_u8(vreinterpretq_u8_u16(in02.val[0]));
const uint8x8_t p5q5 = vget_high_u8(vreinterpretq_u8_u16(in13.val[0]));
const uint8x8_t p6q6 = vget_high_u8(vreinterpretq_u8_u16(in02.val[1]));
const uint8x8_t p7q7 = vget_high_u8(vreinterpretq_u8_u16(in13.val[1]));
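// p7/q7 are never filtered; |p7q7| is only kept so the final transpose can
// rebuild complete 16 byte rows for the stores.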
uint8x8_t needs_filter8_mask, is_flat4_mask, hev_mask;
Filter8Masks(p3q3, p2q2, p1q1, p0q0, hev_thresh, outer_thresh, inner_thresh,
&needs_filter8_mask, &is_flat4_mask, &hev_mask);
needs_filter8_mask = InterleaveLow32(needs_filter8_mask, needs_filter8_mask);
is_flat4_mask = vand_u8(is_flat4_mask, needs_filter8_mask);
is_flat4_mask = InterleaveLow32(is_flat4_mask, is_flat4_mask);
hev_mask = InterleaveLow32(hev_mask, hev_mask);
#if defined(__aarch64__)
// This provides a good speedup for the unit test. Not sure how applicable it
// is to valid streams though.
// Consider doing this on armv7 if there is a quick way to check if a vector
// is zero.
if (vaddv_u8(needs_filter8_mask) == 0) {
// None of the values will be filtered.
return;
}
#endif // defined(__aarch64__)
// Decide between Filter8() and Filter14().
uint8x8_t is_flat_outer4_mask =
IsFlat4(vabd_u8(p0q0, p4q4), vabd_u8(p0q0, p5q5), vabd_u8(p0q0, p6q6));
is_flat_outer4_mask = vand_u8(is_flat4_mask, is_flat_outer4_mask);
is_flat_outer4_mask =
InterleaveLow32(is_flat_outer4_mask, is_flat_outer4_mask);
uint8x8_t f_p0q0, f_p1q1;
const uint8x8x2_t q0p1xp0q1 = Interleave32(Transpose32(p0q0), p1q1);
Filter4(q0p1xp0q1.val[0], q0p1xp0q1.val[1], hev_mask, &f_p1q1, &f_p0q0);
// Reset the outer values if only a Hev() mask was required.
f_p1q1 = vbsl_u8(hev_mask, p1q1, f_p1q1);
uint8x8_t p1q1_output, p0q0_output;
uint8x8_t p5q5_output, p4q4_output, p3q3_output, p2q2_output;
#if defined(__aarch64__)
if (vaddv_u8(is_flat4_mask) == 0) {
// Filter8() and Filter14() do not apply.
p1q1_output = p1q1;
p0q0_output = p0q0;
p5q5_output = p5q5;
p4q4_output = p4q4;
p3q3_output = p3q3;
p2q2_output = p2q2;
} else {
#endif // defined(__aarch64__)
uint8x8_t f8_p2q2, f8_p1q1, f8_p0q0;
Filter8(p3q3, p2q2, p1q1, p0q0, &f8_p2q2, &f8_p1q1, &f8_p0q0);
#if defined(__aarch64__)
if (vaddv_u8(is_flat_outer4_mask) == 0) {
// Filter14() does not apply.
p5q5_output = p5q5;
p4q4_output = p4q4;
p3q3_output = p3q3;
p2q2_output = f8_p2q2;
p1q1_output = f8_p1q1;
p0q0_output = f8_p0q0;
} else {
#endif // defined(__aarch64__)
uint8x8_t f14_p5q5, f14_p4q4, f14_p3q3, f14_p2q2, f14_p1q1, f14_p0q0;
Filter14(p6q6, p5q5, p4q4, p3q3, p2q2, p1q1, p0q0, &f14_p5q5, &f14_p4q4,
&f14_p3q3, &f14_p2q2, &f14_p1q1, &f14_p0q0);
p5q5_output = vbsl_u8(is_flat_outer4_mask, f14_p5q5, p5q5);
p4q4_output = vbsl_u8(is_flat_outer4_mask, f14_p4q4, p4q4);
p3q3_output = vbsl_u8(is_flat_outer4_mask, f14_p3q3, p3q3);
p2q2_output = vbsl_u8(is_flat_outer4_mask, f14_p2q2, f8_p2q2);
p1q1_output = vbsl_u8(is_flat_outer4_mask, f14_p1q1, f8_p1q1);
p0q0_output = vbsl_u8(is_flat_outer4_mask, f14_p0q0, f8_p0q0);
#if defined(__aarch64__)
}
#endif // defined(__aarch64__)
p2q2_output = vbsl_u8(is_flat4_mask, p2q2_output, p2q2);
#if defined(__aarch64__)
}
#endif // defined(__aarch64__)
p1q1_output = vbsl_u8(is_flat4_mask, p1q1_output, f_p1q1);
p1q1_output = vbsl_u8(needs_filter8_mask, p1q1_output, p1q1);
p0q0_output = vbsl_u8(is_flat4_mask, p0q0_output, f_p0q0);
p0q0_output = vbsl_u8(needs_filter8_mask, p0q0_output, p0q0);
const uint8x16_t p0q0_p4q4 = vcombine_u8(p0q0_output, p4q4_output);
const uint8x16_t p2q2_p6q6 = vcombine_u8(p2q2_output, p6q6);
const uint8x16_t p1q1_p5q5 = vcombine_u8(p1q1_output, p5q5_output);
const uint8x16_t p3q3_p7q7 = vcombine_u8(p3q3_output, p7q7);
const uint16x8x2_t out02 = vtrnq_u16(vreinterpretq_u16_u8(p0q0_p4q4),
vreinterpretq_u16_u8(p2q2_p6q6));
const uint16x8x2_t out13 = vtrnq_u16(vreinterpretq_u16_u8(p1q1_p5q5),
vreinterpretq_u16_u8(p3q3_p7q7));
const uint8x16x2_t out01 = vtrnq_u8(vreinterpretq_u8_u16(out02.val[0]),
vreinterpretq_u8_u16(out13.val[0]));
const uint8x16x2_t out23 = vtrnq_u8(vreinterpretq_u8_u16(out02.val[1]),
vreinterpretq_u8_u16(out13.val[1]));
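// Undo the input re-ordering so each 16 byte row is stored back in memory
// order: p7 p6 ... p0 q0 q1 ... q7.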
#if defined(__aarch64__)
const uint8x8_t index_p7top0 = vcreate_u8(0x0001020308090a0b);
const uint8x8_t index_q7toq0 = vcreate_u8(0x0f0e0d0c07060504);
const uint8x16_t index_p7toq7 = vcombine_u8(index_p7top0, index_q7toq0);
const uint8x16_t output_0 = vqtbl1q_u8(out01.val[0], index_p7toq7);
const uint8x16_t output_1 = vqtbl1q_u8(out01.val[1], index_p7toq7);
const uint8x16_t output_2 = vqtbl1q_u8(out23.val[0], index_p7toq7);
const uint8x16_t output_3 = vqtbl1q_u8(out23.val[1], index_p7toq7);
#else
const uint8x8_t index_p7top0 = vcreate_u8(0x0001020308090a0b);
const uint8x8_t index_q7toq0 = vcreate_u8(0x0f0e0d0c07060504);
const uint8x8_t x0_p7p0 = VQTbl1U8(out01.val[0], index_p7top0);
const uint8x8_t x1_p7p0 = VQTbl1U8(out01.val[1], index_p7top0);
const uint8x8_t x2_p7p0 = VQTbl1U8(out23.val[0], index_p7top0);
const uint8x8_t x3_p7p0 = VQTbl1U8(out23.val[1], index_p7top0);
const uint8x8_t x0_q7q0 = VQTbl1U8(out01.val[0], index_q7toq0);
const uint8x8_t x1_q7q0 = VQTbl1U8(out01.val[1], index_q7toq0);
const uint8x8_t x2_q7q0 = VQTbl1U8(out23.val[0], index_q7toq0);
const uint8x8_t x3_q7q0 = VQTbl1U8(out23.val[1], index_q7toq0);
const uint8x16_t output_0 = vcombine_u8(x0_p7p0, x0_q7q0);
const uint8x16_t output_1 = vcombine_u8(x1_p7p0, x1_q7q0);
const uint8x16_t output_2 = vcombine_u8(x2_p7p0, x2_q7q0);
const uint8x16_t output_3 = vcombine_u8(x3_p7p0, x3_q7q0);
#endif
vst1q_u8(dst, output_0);
dst += stride;
vst1q_u8(dst, output_1);
dst += stride;
vst1q_u8(dst, output_2);
dst += stride;
vst1q_u8(dst, output_3);
}
void Init8bpp() {
Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
assert(dsp != nullptr);
dsp->loop_filters[kLoopFilterSize4][kLoopFilterTypeHorizontal] =
Horizontal4_NEON;
dsp->loop_filters[kLoopFilterSize4][kLoopFilterTypeVertical] = Vertical4_NEON;
dsp->loop_filters[kLoopFilterSize6][kLoopFilterTypeHorizontal] =
Horizontal6_NEON;
dsp->loop_filters[kLoopFilterSize6][kLoopFilterTypeVertical] = Vertical6_NEON;
dsp->loop_filters[kLoopFilterSize8][kLoopFilterTypeHorizontal] =
Horizontal8_NEON;
dsp->loop_filters[kLoopFilterSize8][kLoopFilterTypeVertical] = Vertical8_NEON;
dsp->loop_filters[kLoopFilterSize14][kLoopFilterTypeHorizontal] =
Horizontal14_NEON;
dsp->loop_filters[kLoopFilterSize14][kLoopFilterTypeVertical] =
Vertical14_NEON;
}
} // namespace
} // namespace low_bitdepth
void LoopFilterInit_NEON() { low_bitdepth::Init8bpp(); }
} // namespace dsp
} // namespace libgav1
#else // !LIBGAV1_ENABLE_NEON
namespace libgav1 {
namespace dsp {
void LoopFilterInit_NEON() {}
} // namespace dsp
} // namespace libgav1
#endif // LIBGAV1_ENABLE_NEON