| // Copyright 2019 The libgav1 Authors |
| // |
| // Licensed under the Apache License, Version 2.0 (the "License"); |
| // you may not use this file except in compliance with the License. |
| // You may obtain a copy of the License at |
| // |
| // http://www.apache.org/licenses/LICENSE-2.0 |
| // |
| // Unless required by applicable law or agreed to in writing, software |
| // distributed under the License is distributed on an "AS IS" BASIS, |
| // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| // See the License for the specific language governing permissions and |
| // limitations under the License. |
| |
| #include "src/dsp/loop_filter.h" |
| #include "src/utils/cpu.h" |
| |
| #if LIBGAV1_ENABLE_SSE4_1 |
| |
| #include <smmintrin.h> |
| |
| #include <cassert> |
| #include <cstddef> |
| #include <cstdint> |
| #include <cstring> |
| |
| #include "src/dsp/constants.h" |
| #include "src/dsp/dsp.h" |
| #include "src/dsp/x86/common_sse4.h" |
| |
| namespace libgav1 { |
| namespace dsp { |
| namespace { |
| |
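| // Updates a running 16-bit filter sum: returns total + a1 + a2 - s1 - s2. |
| // The wider filters below use this to slide their tap window one pixel at a |
| // time between outputs. |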
| inline __m128i FilterAdd2Sub2(const __m128i& total, const __m128i& a1, |
| const __m128i& a2, const __m128i& s1, |
| const __m128i& s2) { |
| __m128i x = _mm_add_epi16(a1, total); |
| x = _mm_add_epi16(_mm_sub_epi16(x, _mm_add_epi16(s1, s2)), a2); |
| return x; |
| } |
| |
| } // namespace |
| |
| namespace low_bitdepth { |
| namespace { |
| |
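| // Returns abs(a - b) for packed unsigned bytes: at least one of the two |
| // saturating differences is zero, so their bitwise OR is the absolute |
| // difference. |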
| inline __m128i AbsDiff(const __m128i& a, const __m128i& b) { |
| return _mm_or_si128(_mm_subs_epu8(a, b), _mm_subs_epu8(b, a)); |
| } |
| |
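| // Returns the saturated difference against outer_thresh: zero in lanes where |
| // the outer threshold test below holds, nonzero otherwise. |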
| inline __m128i CheckOuterThreshF4(const __m128i& q1q0, const __m128i& p1p0, |
| const __m128i& outer_thresh) { |
| const __m128i fe = _mm_set1_epi8(static_cast<int8_t>(0xfe)); |
| // abs(p0 - q0) * 2 + abs(p1 - q1) / 2 <= outer_thresh; |
| const __m128i abs_pmq = AbsDiff(p1p0, q1q0); |
| const __m128i a = _mm_adds_epu8(abs_pmq, abs_pmq); |
| const __m128i b = _mm_srli_epi16(_mm_and_si128(abs_pmq, fe), 1); |
| const __m128i c = _mm_adds_epu8(a, _mm_srli_si128(b, 4)); |
| return _mm_subs_epu8(c, outer_thresh); |
| } |
| |
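| // Computes the high edge variance mask: 0xff bytes where |
| // max(abs(p1 - p0), abs(q1 - q0)) > hev_thresh. The p and q differences sit |
| // in adjacent 32-bit lanes, so a 4-byte shift aligns them for the max. |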
| inline __m128i Hev(const __m128i& qp1, const __m128i& qp0, |
| const __m128i& hev_thresh) { |
| const __m128i abs_qp1mqp0 = AbsDiff(qp1, qp0); |
| const __m128i max_pq = |
| _mm_max_epu8(abs_qp1mqp0, _mm_srli_si128(abs_qp1mqp0, 4)); |
| const __m128i hev_mask0 = _mm_cvtepu8_epi16(max_pq); |
| const __m128i hev_mask1 = _mm_cmpgt_epi16(hev_mask0, hev_thresh); |
| const __m128i hev_mask = _mm_packs_epi16(hev_mask1, hev_mask1); |
| return hev_mask; |
| } |
| |
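| // Saturating add followed by a signed >> 3. There is no 8-bit arithmetic |
| // shift, so each byte is duplicated into a 16-bit lane and shifted by 8 + 3; |
| // the final pack recovers the low bytes. AddShift1() below is the same with |
| // a shift of 8 + 1. |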
| inline __m128i AddShift3(const __m128i& a, const __m128i& b) { |
| const __m128i c = _mm_adds_epi8(a, b); |
| const __m128i d = _mm_unpacklo_epi8(c, c); |
| const __m128i e = _mm_srai_epi16(d, 11); /* >> 3 */ |
| return _mm_packs_epi16(e, e); |
| } |
| |
| inline __m128i AddShift1(const __m128i& a, const __m128i& b) { |
| const __m128i c = _mm_adds_epi8(a, b); |
| const __m128i d = _mm_unpacklo_epi8(c, c); |
| const __m128i e = _mm_srai_epi16(d, 9); /* >> 1 */ |
| return _mm_packs_epi16(e, e); |
| } |
| |
| //------------------------------------------------------------------------------ |
| // 4-tap filters |
| |
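| // Builds the filter decision mask: a lane is filtered only when the outer |
| // threshold test and the inner test max(abs(p1 - p0), abs(q1 - q0)) <= |
| // inner_thresh both pass. Comparing the combined saturated results against |
| // zero yields a 0xff/0x00 byte mask. |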
| inline __m128i NeedsFilter4(const __m128i& q1q0, const __m128i& p1p0, |
| const __m128i& qp1, const __m128i& qp0, |
| const __m128i& outer_thresh, |
| const __m128i& inner_thresh) { |
| const __m128i outer_mask = CheckOuterThreshF4(q1q0, p1p0, outer_thresh); |
| const __m128i abs_qp1mqp0 = AbsDiff(qp1, qp0); |
| const __m128i inner_mask = _mm_subs_epu8( |
| _mm_max_epu8(abs_qp1mqp0, _mm_srli_si128(abs_qp1mqp0, 4)), inner_thresh); |
| // ~mask |
| const __m128i zero = _mm_setzero_si128(); |
| const __m128i a = _mm_or_si128(outer_mask, inner_mask); |
| const __m128i b = _mm_cmpeq_epi8(a, zero); |
| return b; |
| } |
| |
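| // Applies the narrow 4-tap filter to qp1/qp0 on signed values (biased by |
| // xor 0x80): |
| //   a = 3 * (q0 - p0) + ((p1 - q1) & hev), with int8 saturation, then & mask |
| //   a1 = (a + 4) >> 3 is subtracted from q0, a2 = (a + 3) >> 3 is added to |
| //   p0, and a3 = (a1 + 1) >> 1 adjusts p1 and q1 in lanes where hev is off. |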
| inline void Filter4(const __m128i& qp1, const __m128i& qp0, __m128i* oqp1, |
| __m128i* oqp0, const __m128i& mask, const __m128i& hev) { |
| const __m128i t80 = _mm_set1_epi8(static_cast<int8_t>(0x80)); |
| const __m128i t1 = _mm_set1_epi8(0x1); |
| const __m128i qp1qp0 = _mm_unpacklo_epi64(qp0, qp1); |
| const __m128i qps1qps0 = _mm_xor_si128(qp1qp0, t80); |
| const __m128i ps1qs0 = _mm_shuffle_epi32(qps1qps0, 0x09); |
| const __m128i qs1ps0 = _mm_shuffle_epi32(qps1qps0, 0x0c); |
| const __m128i _hev = _mm_unpacklo_epi32(hev, hev); |
| const __m128i x = _mm_subs_epi8(ps1qs0, qs1ps0); |
| __m128i a = _mm_and_si128(_mm_srli_si128(x, 4), _hev); |
| |
| a = _mm_adds_epi8(a, x); |
| a = _mm_adds_epi8(a, x); |
| a = _mm_adds_epi8(a, x); |
| a = _mm_and_si128(a, mask); |
| a = _mm_unpacklo_epi32(a, a); |
| |
| const __m128i t4t3 = _mm_set_epi32(0x0, 0x0, 0x04040404, 0x03030303); |
| const __m128i a1a2 = AddShift3(a, t4t3); |
| const __m128i a1a1 = _mm_shuffle_epi32(a1a2, 0x55); |
| const __m128i a3a3 = _mm_andnot_si128(_hev, AddShift1(a1a1, t1)); |
| // Sign pattern, low byte first: 1 1 1 1 -1 -1 -1 -1 1 1 1 1 -1 -1 -1 -1 |
| // (add the deltas to the p0/p1 lanes, subtract them from the q0/q1 lanes). |
| const __m128i adjust_sign_for_add = |
| _mm_unpacklo_epi32(t1, _mm_cmpeq_epi8(t1, t1)); |
| |
| const __m128i a3a3a1a2 = _mm_unpacklo_epi64(a1a2, a3a3); |
| const __m128i ma3a3ma1a2 = _mm_sign_epi8(a3a3a1a2, adjust_sign_for_add); |
| |
| const __m128i b = _mm_adds_epi8(qps1qps0, ma3a3ma1a2); |
| const __m128i c = _mm_xor_si128(b, t80); |
| |
| *oqp0 = c; |
| *oqp1 = _mm_srli_si128(c, 8); |
| } |
| |
| void Horizontal4(void* dest, ptrdiff_t stride, int outer_thresh, |
| int inner_thresh, int hev_thresh) { |
| auto* const dst = static_cast<uint8_t*>(dest); |
| const __m128i zero = _mm_setzero_si128(); |
| const __m128i v_outer_thresh = |
| _mm_shuffle_epi8(_mm_cvtsi32_si128(outer_thresh), zero); |
| const __m128i v_inner_thresh = |
| _mm_shuffle_epi8(_mm_cvtsi32_si128(inner_thresh), zero); |
| const __m128i v_hev_thresh = |
| _mm_shufflelo_epi16(_mm_cvtsi32_si128(hev_thresh), 0); |
| |
| const __m128i p1 = Load4(dst - 2 * stride); |
| const __m128i p0 = Load4(dst - 1 * stride); |
| const __m128i q0 = Load4(dst + 0 * stride); |
| const __m128i q1 = Load4(dst + 1 * stride); |
| const __m128i qp1 = _mm_unpacklo_epi32(p1, q1); |
| const __m128i qp0 = _mm_unpacklo_epi32(p0, q0); |
| const __m128i q1q0 = _mm_unpacklo_epi32(q0, q1); |
| const __m128i p1p0 = _mm_unpacklo_epi32(p0, p1); |
| |
| const __m128i v_hev_mask = Hev(qp1, qp0, v_hev_thresh); |
| const __m128i v_needs_mask = |
| NeedsFilter4(q1q0, p1p0, qp1, qp0, v_outer_thresh, v_inner_thresh); |
| |
| __m128i oqp1; |
| __m128i oqp0; |
| Filter4(qp1, qp0, &oqp1, &oqp0, v_needs_mask, v_hev_mask); |
| |
| Store4(dst - 2 * stride, oqp1); |
| Store4(dst - 1 * stride, oqp0); |
| Store4(dst + 0 * stride, _mm_srli_si128(oqp0, 4)); |
| Store4(dst + 1 * stride, _mm_srli_si128(oqp1, 4)); |
| } |
| |
| inline void Transpose4x4(const __m128i& x0, const __m128i& x1, |
| const __m128i& x2, const __m128i& x3, __m128i* d0, |
| __m128i* d1, __m128i* d2, __m128i* d3) { |
| // input |
| // x0 00 01 02 03 xx xx xx xx xx xx xx xx xx xx xx xx |
| // x1 10 11 12 13 xx xx xx xx xx xx xx xx xx xx xx xx |
| // x2 20 21 22 23 xx xx xx xx xx xx xx xx xx xx xx xx |
| // x3 30 31 32 33 xx xx xx xx xx xx xx xx xx xx xx xx |
| // output |
| // d0 00 10 20 30 xx xx xx xx xx xx xx xx xx xx xx xx |
| // d1 01 11 21 31 xx xx xx xx xx xx xx xx xx xx xx xx |
| // d2 02 12 22 32 xx xx xx xx xx xx xx xx xx xx xx xx |
| // d3 03 13 23 33 xx xx xx xx xx xx xx xx xx xx xx xx |
| |
| // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17 |
| const __m128i w0 = _mm_unpacklo_epi8(x0, x1); |
| // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37 |
| const __m128i w1 = _mm_unpacklo_epi8(x2, x3); |
| |
| // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33 |
| *d0 = _mm_unpacklo_epi16(w0, w1); |
| // 01 11 21 31 xx xx xx xx xx xx xx xx xx xx xx xx |
| *d1 = _mm_srli_si128(*d0, 4); |
| // 02 12 22 32 xx xx xx xx xx xx xx xx xx xx xx xx |
| *d2 = _mm_srli_si128(*d0, 8); |
| // 03 13 23 33 xx xx xx xx xx xx xx xx xx xx xx xx |
| *d3 = _mm_srli_si128(*d0, 12); |
| } |
| |
| void Vertical4(void* dest, ptrdiff_t stride, int outer_thresh, int inner_thresh, |
| int hev_thresh) { |
| auto* const dst = static_cast<uint8_t*>(dest); |
| const __m128i zero = _mm_setzero_si128(); |
| const __m128i v_outer_thresh = |
| _mm_shuffle_epi8(_mm_cvtsi32_si128(outer_thresh), zero); |
| const __m128i v_inner_thresh = |
| _mm_shuffle_epi8(_mm_cvtsi32_si128(inner_thresh), zero); |
| const __m128i v_hev_thresh0 = |
| _mm_shuffle_epi8(_mm_cvtsi32_si128(hev_thresh), zero); |
| const __m128i v_hev_thresh = _mm_unpacklo_epi8(v_hev_thresh0, zero); |
| |
| __m128i x0 = Load4(dst - 2 + 0 * stride); |
| __m128i x1 = Load4(dst - 2 + 1 * stride); |
| __m128i x2 = Load4(dst - 2 + 2 * stride); |
| __m128i x3 = Load4(dst - 2 + 3 * stride); |
| |
| // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17 |
| const __m128i w0 = _mm_unpacklo_epi8(x0, x1); |
| // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37 |
| const __m128i w1 = _mm_unpacklo_epi8(x2, x3); |
| // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33 |
| const __m128i d0 = _mm_unpacklo_epi16(w0, w1); |
| const __m128i qp1 = _mm_shuffle_epi32(d0, 0xc); |
| const __m128i qp0 = _mm_srli_si128(d0, 4); |
| const __m128i q1q0 = _mm_srli_si128(d0, 8); |
| const __m128i p1p0 = _mm_shuffle_epi32(d0, 0x1); |
| |
| const __m128i v_hev_mask = Hev(qp1, qp0, v_hev_thresh); |
| const __m128i v_needs_mask = |
| NeedsFilter4(q1q0, p1p0, qp1, qp0, v_outer_thresh, v_inner_thresh); |
| |
| __m128i oqp1; |
| __m128i oqp0; |
| Filter4(qp1, qp0, &oqp1, &oqp0, v_needs_mask, v_hev_mask); |
| |
| const __m128i p1 = oqp1; |
| const __m128i p0 = oqp0; |
| const __m128i q0 = _mm_srli_si128(oqp0, 4); |
| const __m128i q1 = _mm_srli_si128(oqp1, 4); |
| |
| Transpose4x4(p1, p0, q0, q1, &x0, &x1, &x2, &x3); |
| |
| Store4(dst - 2 + 0 * stride, x0); |
| Store4(dst - 2 + 1 * stride, x1); |
| Store4(dst - 2 + 2 * stride, x2); |
| Store4(dst - 2 + 3 * stride, x3); |
| } |
| |
| //------------------------------------------------------------------------------ |
| // 5-tap (chroma) filters |
| |
| inline __m128i NeedsFilter6(const __m128i& q1q0, const __m128i& p1p0, |
| const __m128i& qp2, const __m128i& qp1, |
| const __m128i& qp0, const __m128i& outer_thresh, |
| const __m128i& inner_thresh) { |
| const __m128i outer_mask = CheckOuterThreshF4(q1q0, p1p0, outer_thresh); |
| const __m128i abs_qp2mqp1 = AbsDiff(qp2, qp1); |
| const __m128i abs_qp1mqp0 = AbsDiff(qp1, qp0); |
| const __m128i max_pq = _mm_max_epu8(abs_qp2mqp1, abs_qp1mqp0); |
| const __m128i inner_mask = _mm_subs_epu8( |
| _mm_max_epu8(max_pq, _mm_srli_si128(max_pq, 4)), inner_thresh); |
| // ~mask |
| const __m128i zero = _mm_setzero_si128(); |
| const __m128i a = _mm_or_si128(outer_mask, inner_mask); |
| const __m128i b = _mm_cmpeq_epi8(a, zero); |
| return b; |
| } |
| |
| inline __m128i IsFlat3(const __m128i& qp2, const __m128i& qp1, |
| const __m128i& qp0, const __m128i& flat_thresh) { |
| const __m128i abs_pq2mpq0 = AbsDiff(qp2, qp0); |
| const __m128i abs_qp1mqp0 = AbsDiff(qp1, qp0); |
| const __m128i max_pq = _mm_max_epu8(abs_pq2mpq0, abs_qp1mqp0); |
| const __m128i flat_mask = _mm_subs_epu8( |
| _mm_max_epu8(max_pq, _mm_srli_si128(max_pq, 4)), flat_thresh); |
| // ~mask |
| const __m128i zero = _mm_setzero_si128(); |
| const __m128i a = _mm_cmpeq_epi8(flat_mask, zero); |
| return a; |
| } |
| |
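| // Computes the size-6 smoothed outputs for p1/q1 and p0/q0: each is a |
| // weighted tap sum with total weight 8, plus the rounding constant 4, |
| // shifted right by 3. The p and q sums share one register as 16-bit lanes |
| // (p in the low 64 bits, q in the high 64 bits). |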
| inline void Filter6(const __m128i& qp2, const __m128i& qp1, const __m128i& qp0, |
| __m128i* oqp1, __m128i* oqp0) { |
| const __m128i four = _mm_set1_epi16(4); |
| const __m128i qp2_lo = _mm_cvtepu8_epi16(qp2); |
| const __m128i qp1_lo = _mm_cvtepu8_epi16(qp1); |
| const __m128i qp0_lo = _mm_cvtepu8_epi16(qp0); |
| const __m128i pq1_lo = _mm_shuffle_epi32(qp1_lo, 0x4e); |
| const __m128i pq0_lo = _mm_shuffle_epi32(qp0_lo, 0x4e); |
| |
| __m128i f6_lo = |
| _mm_add_epi16(_mm_add_epi16(qp2_lo, four), _mm_add_epi16(qp2_lo, qp2_lo)); |
| |
| f6_lo = _mm_add_epi16(_mm_add_epi16(f6_lo, qp1_lo), qp1_lo); |
| |
| f6_lo = _mm_add_epi16(_mm_add_epi16(f6_lo, qp0_lo), |
| _mm_add_epi16(qp0_lo, pq0_lo)); |
| |
| // p2 * 3 + p1 * 2 + p0 * 2 + q0 |
| // q2 * 3 + q1 * 2 + q0 * 2 + p0 |
| *oqp1 = _mm_srli_epi16(f6_lo, 3); |
| *oqp1 = _mm_packus_epi16(*oqp1, *oqp1); |
| |
| // p2 + p1 * 2 + p0 * 2 + q0 * 2 + q1 |
| // q2 + q1 * 2 + q0 * 2 + p0 * 2 + p1 |
| f6_lo = FilterAdd2Sub2(f6_lo, pq0_lo, pq1_lo, qp2_lo, qp2_lo); |
| *oqp0 = _mm_srli_epi16(f6_lo, 3); |
| *oqp0 = _mm_packus_epi16(*oqp0, *oqp0); |
| } |
| |
| void Horizontal6(void* dest, ptrdiff_t stride, int outer_thresh, |
| int inner_thresh, int hev_thresh) { |
| auto* const dst = static_cast<uint8_t*>(dest); |
| const __m128i zero = _mm_setzero_si128(); |
| const __m128i v_flat_thresh = _mm_set1_epi8(1); |
| const __m128i v_outer_thresh = |
| _mm_shuffle_epi8(_mm_cvtsi32_si128(outer_thresh), zero); |
| const __m128i v_inner_thresh = |
| _mm_shuffle_epi8(_mm_cvtsi32_si128(inner_thresh), zero); |
| const __m128i v_hev_thresh0 = |
| _mm_shuffle_epi8(_mm_cvtsi32_si128(hev_thresh), zero); |
| const __m128i v_hev_thresh = _mm_unpacklo_epi8(v_hev_thresh0, zero); |
| |
| const __m128i p2 = Load4(dst - 3 * stride); |
| const __m128i p1 = Load4(dst - 2 * stride); |
| const __m128i p0 = Load4(dst - 1 * stride); |
| const __m128i q0 = Load4(dst + 0 * stride); |
| const __m128i q1 = Load4(dst + 1 * stride); |
| const __m128i q2 = Load4(dst + 2 * stride); |
| const __m128i qp2 = _mm_unpacklo_epi32(p2, q2); |
| const __m128i qp1 = _mm_unpacklo_epi32(p1, q1); |
| const __m128i qp0 = _mm_unpacklo_epi32(p0, q0); |
| const __m128i q1q0 = _mm_unpacklo_epi32(q0, q1); |
| const __m128i p1p0 = _mm_unpacklo_epi32(p0, p1); |
| |
| const __m128i v_hev_mask = Hev(qp1, qp0, v_hev_thresh); |
| const __m128i v_needs_mask = |
| NeedsFilter6(q1q0, p1p0, qp2, qp1, qp0, v_outer_thresh, v_inner_thresh); |
| __m128i oqp1; |
| __m128i oqp0; |
| |
| Filter4(qp1, qp0, &oqp1, &oqp0, v_needs_mask, v_hev_mask); |
| |
| const __m128i v_isflat3_mask = IsFlat3(qp2, qp1, qp0, v_flat_thresh); |
| const __m128i v_mask = |
| _mm_shuffle_epi32(_mm_and_si128(v_needs_mask, v_isflat3_mask), 0); |
| |
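| // Skip the 6-tap path when no lane is both filterable and flat; |
| // _mm_test_all_zeros() returns 1 only when every bit of v_mask is clear. |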
| if (_mm_test_all_zeros(v_mask, _mm_cmpeq_epi8(v_mask, v_mask)) == 0) { |
| __m128i oqp1_f6; |
| __m128i oqp0_f6; |
| |
| Filter6(qp2, qp1, qp0, &oqp1_f6, &oqp0_f6); |
| |
| oqp1 = _mm_blendv_epi8(oqp1, oqp1_f6, v_mask); |
| oqp0 = _mm_blendv_epi8(oqp0, oqp0_f6, v_mask); |
| } |
| |
| Store4(dst - 2 * stride, oqp1); |
| Store4(dst - 1 * stride, oqp0); |
| Store4(dst + 0 * stride, _mm_srli_si128(oqp0, 4)); |
| Store4(dst + 1 * stride, _mm_srli_si128(oqp1, 4)); |
| } |
| |
| inline void Transpose8x4To4x8(const __m128i& x0, const __m128i& x1, |
| const __m128i& x2, const __m128i& x3, __m128i* d0, |
| __m128i* d1, __m128i* d2, __m128i* d3, |
| __m128i* d4, __m128i* d5, __m128i* d6, |
| __m128i* d7) { |
| // input |
| // x0 00 01 02 03 04 05 06 07 xx xx xx xx xx xx xx xx |
| // x1 10 11 12 13 14 15 16 17 xx xx xx xx xx xx xx xx |
| // x2 20 21 22 23 24 25 26 27 xx xx xx xx xx xx xx xx |
| // x3 30 31 32 33 34 35 36 37 xx xx xx xx xx xx xx xx |
| // output |
| // 00 10 20 30 xx xx xx xx xx xx xx xx xx xx xx xx |
| // 01 11 21 31 xx xx xx xx xx xx xx xx xx xx xx xx |
| // 02 12 22 32 xx xx xx xx xx xx xx xx xx xx xx xx |
| // 03 13 23 33 xx xx xx xx xx xx xx xx xx xx xx xx |
| // 04 14 24 34 xx xx xx xx xx xx xx xx xx xx xx xx |
| // 05 15 25 35 xx xx xx xx xx xx xx xx xx xx xx xx |
| // 06 16 26 36 xx xx xx xx xx xx xx xx xx xx xx xx |
| // 07 17 27 37 xx xx xx xx xx xx xx xx xx xx xx xx |
| |
| // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17 |
| const __m128i w0 = _mm_unpacklo_epi8(x0, x1); |
| // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37 |
| const __m128i w1 = _mm_unpacklo_epi8(x2, x3); |
| // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33 |
| const __m128i ww0 = _mm_unpacklo_epi16(w0, w1); |
| // 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37 |
| const __m128i ww1 = _mm_unpackhi_epi16(w0, w1); |
| |
| // 00 10 20 30 xx xx xx xx xx xx xx xx xx xx xx xx |
| *d0 = ww0; |
| // 01 11 21 31 xx xx xx xx xx xx xx xx xx xx xx xx |
| *d1 = _mm_srli_si128(ww0, 4); |
| // 02 12 22 32 xx xx xx xx xx xx xx xx xx xx xx xx |
| *d2 = _mm_srli_si128(ww0, 8); |
| // 03 13 23 33 xx xx xx xx xx xx xx xx xx xx xx xx |
| *d3 = _mm_srli_si128(ww0, 12); |
| // 04 14 24 34 xx xx xx xx xx xx xx xx xx xx xx xx |
| *d4 = ww1; |
| // 05 15 25 35 xx xx xx xx xx xx xx xx xx xx xx xx |
| *d5 = _mm_srli_si128(ww1, 4); |
| // 06 16 26 36 xx xx xx xx xx xx xx xx xx xx xx xx |
| *d6 = _mm_srli_si128(ww1, 8); |
| // 07 17 27 37 xx xx xx xx xx xx xx xx xx xx xx xx |
| *d7 = _mm_srli_si128(ww1, 12); |
| } |
| |
| void Vertical6(void* dest, ptrdiff_t stride, int outer_thresh, int inner_thresh, |
| int hev_thresh) { |
| auto* const dst = static_cast<uint8_t*>(dest); |
| const __m128i zero = _mm_setzero_si128(); |
| const __m128i v_flat_thresh = _mm_set1_epi8(1); |
| const __m128i v_outer_thresh = |
| _mm_shuffle_epi8(_mm_cvtsi32_si128(outer_thresh), zero); |
| const __m128i v_inner_thresh = |
| _mm_shuffle_epi8(_mm_cvtsi32_si128(inner_thresh), zero); |
| const __m128i v_hev_thresh0 = |
| _mm_shuffle_epi8(_mm_cvtsi32_si128(hev_thresh), zero); |
| const __m128i v_hev_thresh = _mm_unpacklo_epi8(v_hev_thresh0, zero); |
| |
| __m128i x0 = LoadLo8(dst - 3 + 0 * stride); |
| __m128i x1 = LoadLo8(dst - 3 + 1 * stride); |
| __m128i x2 = LoadLo8(dst - 3 + 2 * stride); |
| __m128i x3 = LoadLo8(dst - 3 + 3 * stride); |
| |
| __m128i p2, p1, p0, q0, q1, q2; |
| __m128i z0, z1; // not used |
| |
| Transpose8x4To4x8(x0, x1, x2, x3, &p2, &p1, &p0, &q0, &q1, &q2, &z0, &z1); |
| |
| const __m128i qp2 = _mm_unpacklo_epi32(p2, q2); |
| const __m128i qp1 = _mm_unpacklo_epi32(p1, q1); |
| const __m128i qp0 = _mm_unpacklo_epi32(p0, q0); |
| const __m128i q1q0 = _mm_unpacklo_epi32(q0, q1); |
| const __m128i p1p0 = _mm_unpacklo_epi32(p0, p1); |
| |
| const __m128i v_hev_mask = Hev(qp1, qp0, v_hev_thresh); |
| const __m128i v_needs_mask = |
| NeedsFilter6(q1q0, p1p0, qp2, qp1, qp0, v_outer_thresh, v_inner_thresh); |
| __m128i oqp1; |
| __m128i oqp0; |
| |
| Filter4(qp1, qp0, &oqp1, &oqp0, v_needs_mask, v_hev_mask); |
| |
| const __m128i v_isflat3_mask = IsFlat3(qp2, qp1, qp0, v_flat_thresh); |
| const __m128i v_mask = |
| _mm_shuffle_epi32(_mm_and_si128(v_needs_mask, v_isflat3_mask), 0); |
| |
| if (_mm_test_all_zeros(v_mask, _mm_cmpeq_epi8(v_mask, v_mask)) == 0) { |
| __m128i oqp1_f6; |
| __m128i oqp0_f6; |
| |
| Filter6(qp2, qp1, qp0, &oqp1_f6, &oqp0_f6); |
| |
| oqp1 = _mm_blendv_epi8(oqp1, oqp1_f6, v_mask); |
| oqp0 = _mm_blendv_epi8(oqp0, oqp0_f6, v_mask); |
| } |
| |
| p1 = oqp1; |
| p0 = oqp0; |
| q0 = _mm_srli_si128(oqp0, 4); |
| q1 = _mm_srli_si128(oqp1, 4); |
| |
| Transpose4x4(p1, p0, q0, q1, &x0, &x1, &x2, &x3); |
| |
| Store4(dst - 2 + 0 * stride, x0); |
| Store4(dst - 2 + 1 * stride, x1); |
| Store4(dst - 2 + 2 * stride, x2); |
| Store4(dst - 2 + 3 * stride, x3); |
| } |
| |
| //------------------------------------------------------------------------------ |
| // 7-tap filters |
| |
| inline __m128i NeedsFilter8(const __m128i& q1q0, const __m128i& p1p0, |
| const __m128i& qp3, const __m128i& qp2, |
| const __m128i& qp1, const __m128i& qp0, |
| const __m128i& outer_thresh, |
| const __m128i& inner_thresh) { |
| const __m128i outer_mask = CheckOuterThreshF4(q1q0, p1p0, outer_thresh); |
| const __m128i abs_qp2mqp1 = AbsDiff(qp2, qp1); |
| const __m128i abs_qp1mqp0 = AbsDiff(qp1, qp0); |
| const __m128i max_pq_a = _mm_max_epu8(abs_qp2mqp1, abs_qp1mqp0); |
| const __m128i abs_pq3mpq2 = AbsDiff(qp3, qp2); |
| const __m128i max_pq = _mm_max_epu8(max_pq_a, abs_pq3mpq2); |
| const __m128i inner_mask = _mm_subs_epu8( |
| _mm_max_epu8(max_pq, _mm_srli_si128(max_pq, 4)), inner_thresh); |
| // ~mask |
| const __m128i zero = _mm_setzero_si128(); |
| const __m128i a = _mm_or_si128(outer_mask, inner_mask); |
| const __m128i b = _mm_cmpeq_epi8(a, zero); |
| return b; |
| } |
| |
| inline __m128i IsFlat4(const __m128i& qp3, const __m128i& qp2, |
| const __m128i& qp1, const __m128i& qp0, |
| const __m128i& flat_thresh) { |
| const __m128i abs_pq2mpq0 = AbsDiff(qp2, qp0); |
| const __m128i abs_qp1mqp0 = AbsDiff(qp1, qp0); |
| const __m128i max_pq_a = _mm_max_epu8(abs_pq2mpq0, abs_qp1mqp0); |
| const __m128i abs_pq3mpq0 = AbsDiff(qp3, qp0); |
| const __m128i max_pq = _mm_max_epu8(max_pq_a, abs_pq3mpq0); |
| const __m128i flat_mask = _mm_subs_epu8( |
| _mm_max_epu8(max_pq, _mm_srli_si128(max_pq, 4)), flat_thresh); |
| // ~mask |
| const __m128i zero = _mm_setzero_si128(); |
| const __m128i a = _mm_cmpeq_epi8(flat_mask, zero); |
| return a; |
| } |
| |
| inline void Filter8(const __m128i& qp3, const __m128i& qp2, const __m128i& qp1, |
| const __m128i& qp0, __m128i* oqp2, __m128i* oqp1, |
| __m128i* oqp0) { |
| const __m128i four = _mm_set1_epi16(4); |
| const __m128i qp3_lo = _mm_cvtepu8_epi16(qp3); |
| const __m128i qp2_lo = _mm_cvtepu8_epi16(qp2); |
| const __m128i qp1_lo = _mm_cvtepu8_epi16(qp1); |
| const __m128i qp0_lo = _mm_cvtepu8_epi16(qp0); |
| const __m128i pq2_lo = _mm_shuffle_epi32(qp2_lo, 0x4e); |
| const __m128i pq1_lo = _mm_shuffle_epi32(qp1_lo, 0x4e); |
| const __m128i pq0_lo = _mm_shuffle_epi32(qp0_lo, 0x4e); |
| |
| __m128i f8_lo = |
| _mm_add_epi16(_mm_add_epi16(qp3_lo, four), _mm_add_epi16(qp3_lo, qp3_lo)); |
| |
| f8_lo = _mm_add_epi16(_mm_add_epi16(f8_lo, qp2_lo), qp2_lo); |
| |
| f8_lo = _mm_add_epi16(_mm_add_epi16(f8_lo, qp1_lo), |
| _mm_add_epi16(qp0_lo, pq0_lo)); |
| |
| // p3 + p3 + p3 + 2 * p2 + p1 + p0 + q0 |
| // q3 + q3 + q3 + 2 * q2 + q1 + q0 + p0 |
| *oqp2 = _mm_srli_epi16(f8_lo, 3); |
| *oqp2 = _mm_packus_epi16(*oqp2, *oqp2); |
| |
| // p3 + p3 + p2 + 2 * p1 + p0 + q0 + q1 |
| // q3 + q3 + q2 + 2 * q1 + q0 + p0 + p1 |
| f8_lo = FilterAdd2Sub2(f8_lo, qp1_lo, pq1_lo, qp3_lo, qp2_lo); |
| *oqp1 = _mm_srli_epi16(f8_lo, 3); |
| *oqp1 = _mm_packus_epi16(*oqp1, *oqp1); |
| |
| // p3 + p2 + p1 + 2 * p0 + q0 + q1 + q2 |
| // q3 + q2 + q1 + 2 * q0 + p0 + p1 + p2 |
| f8_lo = FilterAdd2Sub2(f8_lo, qp0_lo, pq2_lo, qp3_lo, qp1_lo); |
| *oqp0 = _mm_srli_epi16(f8_lo, 3); |
| *oqp0 = _mm_packus_epi16(*oqp0, *oqp0); |
| } |
| |
| void Horizontal8(void* dest, ptrdiff_t stride, int outer_thresh, |
| int inner_thresh, int hev_thresh) { |
| auto* const dst = static_cast<uint8_t*>(dest); |
| const __m128i zero = _mm_setzero_si128(); |
| const __m128i v_flat_thresh = _mm_set1_epi8(1); |
| const __m128i v_outer_thresh = |
| _mm_shuffle_epi8(_mm_cvtsi32_si128(outer_thresh), zero); |
| const __m128i v_inner_thresh = |
| _mm_shuffle_epi8(_mm_cvtsi32_si128(inner_thresh), zero); |
| const __m128i v_hev_thresh0 = |
| _mm_shuffle_epi8(_mm_cvtsi32_si128(hev_thresh), zero); |
| const __m128i v_hev_thresh = _mm_unpacklo_epi8(v_hev_thresh0, zero); |
| |
| const __m128i p3 = Load4(dst - 4 * stride); |
| const __m128i p2 = Load4(dst - 3 * stride); |
| const __m128i p1 = Load4(dst - 2 * stride); |
| const __m128i p0 = Load4(dst - 1 * stride); |
| const __m128i q0 = Load4(dst + 0 * stride); |
| const __m128i q1 = Load4(dst + 1 * stride); |
| const __m128i q2 = Load4(dst + 2 * stride); |
| const __m128i q3 = Load4(dst + 3 * stride); |
| |
| const __m128i qp3 = _mm_unpacklo_epi32(p3, q3); |
| const __m128i qp2 = _mm_unpacklo_epi32(p2, q2); |
| const __m128i qp1 = _mm_unpacklo_epi32(p1, q1); |
| const __m128i qp0 = _mm_unpacklo_epi32(p0, q0); |
| const __m128i q1q0 = _mm_unpacklo_epi32(q0, q1); |
| const __m128i p1p0 = _mm_unpacklo_epi32(p0, p1); |
| |
| const __m128i v_hev_mask = Hev(qp1, qp0, v_hev_thresh); |
| const __m128i v_needs_mask = NeedsFilter8(q1q0, p1p0, qp3, qp2, qp1, qp0, |
| v_outer_thresh, v_inner_thresh); |
| __m128i oqp1; |
| __m128i oqp0; |
| |
| Filter4(qp1, qp0, &oqp1, &oqp0, v_needs_mask, v_hev_mask); |
| |
| const __m128i v_isflat4_mask = IsFlat4(qp3, qp2, qp1, qp0, v_flat_thresh); |
| const __m128i v_mask = |
| _mm_shuffle_epi32(_mm_and_si128(v_needs_mask, v_isflat4_mask), 0); |
| |
| if (_mm_test_all_zeros(v_mask, _mm_cmpeq_epi8(v_mask, v_mask)) == 0) { |
| __m128i oqp2_f8; |
| __m128i oqp1_f8; |
| __m128i oqp0_f8; |
| |
| Filter8(qp3, qp2, qp1, qp0, &oqp2_f8, &oqp1_f8, &oqp0_f8); |
| |
| oqp2_f8 = _mm_blendv_epi8(qp2, oqp2_f8, v_mask); |
| oqp1 = _mm_blendv_epi8(oqp1, oqp1_f8, v_mask); |
| oqp0 = _mm_blendv_epi8(oqp0, oqp0_f8, v_mask); |
| Store4(dst - 3 * stride, oqp2_f8); |
| Store4(dst + 2 * stride, _mm_srli_si128(oqp2_f8, 4)); |
| } |
| |
| Store4(dst - 2 * stride, oqp1); |
| Store4(dst - 1 * stride, oqp0); |
| Store4(dst + 0 * stride, _mm_srli_si128(oqp0, 4)); |
| Store4(dst + 1 * stride, _mm_srli_si128(oqp1, 4)); |
| } |
| |
| inline void Transpose8x8To8x4(const __m128i& x0, const __m128i& x1, |
| const __m128i& x2, const __m128i& x3, |
| const __m128i& x4, const __m128i& x5, |
| const __m128i& x6, const __m128i& x7, __m128i* d0, |
| __m128i* d1, __m128i* d2, __m128i* d3) { |
| // input |
| // x0 00 01 02 03 04 05 06 07 |
| // x1 10 11 12 13 14 15 16 17 |
| // x2 20 21 22 23 24 25 26 27 |
| // x3 30 31 32 33 34 35 36 37 |
| // x4 40 41 42 43 44 45 46 47 |
| // x5 50 51 52 53 54 55 56 57 |
| // x6 60 61 62 63 64 65 66 67 |
| // x7 70 71 72 73 74 75 76 77 |
| // output |
| // d0 00 10 20 30 40 50 60 70 xx xx xx xx xx xx xx xx |
| // d1 01 11 21 31 41 51 61 71 xx xx xx xx xx xx xx xx |
| // d2 02 12 22 32 42 52 62 72 xx xx xx xx xx xx xx xx |
| // d3 03 13 23 33 43 53 63 73 xx xx xx xx xx xx xx xx |
| |
| // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17 |
| const __m128i w0 = _mm_unpacklo_epi8(x0, x1); |
| // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37 |
| const __m128i w1 = _mm_unpacklo_epi8(x2, x3); |
| // 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57 |
| const __m128i w2 = _mm_unpacklo_epi8(x4, x5); |
| // 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77 |
| const __m128i w3 = _mm_unpacklo_epi8(x6, x7); |
| |
| // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33 |
| const __m128i w4 = _mm_unpacklo_epi16(w0, w1); |
| // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73 |
| const __m128i w5 = _mm_unpacklo_epi16(w2, w3); |
| |
| // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71 |
| *d0 = _mm_unpacklo_epi32(w4, w5); |
| *d1 = _mm_srli_si128(*d0, 8); |
| // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73 |
| *d2 = _mm_unpackhi_epi32(w4, w5); |
| *d3 = _mm_srli_si128(*d2, 8); |
| } |
| |
| void Vertical8(void* dest, ptrdiff_t stride, int outer_thresh, int inner_thresh, |
| int hev_thresh) { |
| auto* const dst = static_cast<uint8_t*>(dest); |
| const __m128i zero = _mm_setzero_si128(); |
| const __m128i v_flat_thresh = _mm_set1_epi8(1); |
| const __m128i v_outer_thresh = |
| _mm_shuffle_epi8(_mm_cvtsi32_si128(outer_thresh), zero); |
| const __m128i v_inner_thresh = |
| _mm_shuffle_epi8(_mm_cvtsi32_si128(inner_thresh), zero); |
| const __m128i v_hev_thresh0 = |
| _mm_shuffle_epi8(_mm_cvtsi32_si128(hev_thresh), zero); |
| const __m128i v_hev_thresh = _mm_unpacklo_epi8(v_hev_thresh0, zero); |
| |
| __m128i x0 = LoadLo8(dst - 4 + 0 * stride); |
| __m128i x1 = LoadLo8(dst - 4 + 1 * stride); |
| __m128i x2 = LoadLo8(dst - 4 + 2 * stride); |
| __m128i x3 = LoadLo8(dst - 4 + 3 * stride); |
| |
| __m128i p3, p2, p1, p0, q0, q1, q2, q3; |
| Transpose8x4To4x8(x0, x1, x2, x3, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3); |
| |
| const __m128i qp3 = _mm_unpacklo_epi32(p3, q3); |
| const __m128i qp2 = _mm_unpacklo_epi32(p2, q2); |
| const __m128i qp1 = _mm_unpacklo_epi32(p1, q1); |
| const __m128i qp0 = _mm_unpacklo_epi32(p0, q0); |
| const __m128i q1q0 = _mm_unpacklo_epi32(q0, q1); |
| const __m128i p1p0 = _mm_unpacklo_epi32(p0, p1); |
| |
| const __m128i v_hev_mask = Hev(qp1, qp0, v_hev_thresh); |
| const __m128i v_needs_mask = NeedsFilter8(q1q0, p1p0, qp3, qp2, qp1, qp0, |
| v_outer_thresh, v_inner_thresh); |
| __m128i oqp1; |
| __m128i oqp0; |
| |
| Filter4(qp1, qp0, &oqp1, &oqp0, v_needs_mask, v_hev_mask); |
| |
| const __m128i v_isflat4_mask = IsFlat4(qp3, qp2, qp1, qp0, v_flat_thresh); |
| const __m128i v_mask = |
| _mm_shuffle_epi32(_mm_and_si128(v_needs_mask, v_isflat4_mask), 0); |
| |
| if (_mm_test_all_zeros(v_mask, _mm_cmpeq_epi8(v_mask, v_mask)) == 0) { |
| __m128i oqp2_f8; |
| __m128i oqp1_f8; |
| __m128i oqp0_f8; |
| |
| Filter8(qp3, qp2, qp1, qp0, &oqp2_f8, &oqp1_f8, &oqp0_f8); |
| |
| oqp2_f8 = _mm_blendv_epi8(qp2, oqp2_f8, v_mask); |
| oqp1 = _mm_blendv_epi8(oqp1, oqp1_f8, v_mask); |
| oqp0 = _mm_blendv_epi8(oqp0, oqp0_f8, v_mask); |
| |
| p2 = oqp2_f8; |
| q2 = _mm_srli_si128(oqp2_f8, 4); |
| } |
| |
| p1 = oqp1; |
| p0 = oqp0; |
| q0 = _mm_srli_si128(oqp0, 4); |
| q1 = _mm_srli_si128(oqp1, 4); |
| |
| Transpose8x8To8x4(p3, p2, p1, p0, q0, q1, q2, q3, &x0, &x1, &x2, &x3); |
| |
| StoreLo8(dst - 4 + 0 * stride, x0); |
| StoreLo8(dst - 4 + 1 * stride, x1); |
| StoreLo8(dst - 4 + 2 * stride, x2); |
| StoreLo8(dst - 4 + 3 * stride, x3); |
| } |
| |
| //------------------------------------------------------------------------------ |
| // 13-tap filters |
| |
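| // Computes the six size-14 smoothed outputs: each is a weighted tap sum with |
| // total weight 16, plus the rounding constant 8, shifted right by 4. The sum |
| // is seeded with 7 * p6 (computed as (p6 << 3) - p6) and FilterAdd2Sub2() |
| // slides the window between outputs. |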
| inline void Filter14(const __m128i& qp6, const __m128i& qp5, const __m128i& qp4, |
| const __m128i& qp3, const __m128i& qp2, const __m128i& qp1, |
| const __m128i& qp0, __m128i* oqp5, __m128i* oqp4, |
| __m128i* oqp3, __m128i* oqp2, __m128i* oqp1, |
| __m128i* oqp0) { |
| const __m128i eight = _mm_set1_epi16(8); |
| const __m128i qp6_lo = _mm_cvtepu8_epi16(qp6); |
| const __m128i qp5_lo = _mm_cvtepu8_epi16(qp5); |
| const __m128i qp4_lo = _mm_cvtepu8_epi16(qp4); |
| const __m128i qp3_lo = _mm_cvtepu8_epi16(qp3); |
| const __m128i qp2_lo = _mm_cvtepu8_epi16(qp2); |
| const __m128i qp1_lo = _mm_cvtepu8_epi16(qp1); |
| const __m128i qp0_lo = _mm_cvtepu8_epi16(qp0); |
| const __m128i pq5_lo = _mm_shuffle_epi32(qp5_lo, 0x4e); |
| const __m128i pq4_lo = _mm_shuffle_epi32(qp4_lo, 0x4e); |
| const __m128i pq3_lo = _mm_shuffle_epi32(qp3_lo, 0x4e); |
| const __m128i pq2_lo = _mm_shuffle_epi32(qp2_lo, 0x4e); |
| const __m128i pq1_lo = _mm_shuffle_epi32(qp1_lo, 0x4e); |
| const __m128i pq0_lo = _mm_shuffle_epi32(qp0_lo, 0x4e); |
| |
| __m128i f14_lo = |
| _mm_add_epi16(eight, _mm_sub_epi16(_mm_slli_epi16(qp6_lo, 3), qp6_lo)); |
| |
| f14_lo = _mm_add_epi16(_mm_add_epi16(f14_lo, qp5_lo), |
| _mm_add_epi16(qp5_lo, qp4_lo)); |
| |
| f14_lo = _mm_add_epi16(_mm_add_epi16(f14_lo, qp4_lo), |
| _mm_add_epi16(qp3_lo, qp2_lo)); |
| |
| f14_lo = _mm_add_epi16(_mm_add_epi16(f14_lo, qp1_lo), |
| _mm_add_epi16(qp0_lo, pq0_lo)); |
| |
| // p6 * 7 + p5 * 2 + p4 * 2 + p3 + p2 + p1 + p0 + q0 |
| // q6 * 7 + q5 * 2 + q4 * 2 + q3 + q2 + q1 + q0 + p0 |
| *oqp5 = _mm_srli_epi16(f14_lo, 4); |
| *oqp5 = _mm_packus_epi16(*oqp5, *oqp5); |
| |
| // p6 * 5 + p5 * 2 + p4 * 2 + p3 * 2 + p2 + p1 + p0 + q0 + q1 |
| // q6 * 5 + q5 * 2 + q4 * 2 + q3 * 2 + q2 + q1 + q0 + p0 + p1 |
| f14_lo = FilterAdd2Sub2(f14_lo, qp3_lo, pq1_lo, qp6_lo, qp6_lo); |
| *oqp4 = _mm_srli_epi16(f14_lo, 4); |
| *oqp4 = _mm_packus_epi16(*oqp4, *oqp4); |
| |
| // p6 * 4 + p5 + p4 * 2 + p3 * 2 + p2 * 2 + p1 + p0 + q0 + q1 + q2 |
| // q6 * 4 + q5 + q4 * 2 + q3 * 2 + q2 * 2 + q1 + q0 + p0 + p1 + p2 |
| f14_lo = FilterAdd2Sub2(f14_lo, qp2_lo, pq2_lo, qp6_lo, qp5_lo); |
| *oqp3 = _mm_srli_epi16(f14_lo, 4); |
| *oqp3 = _mm_packus_epi16(*oqp3, *oqp3); |
| |
| // p6 * 3 + p5 + p4 + p3 * 2 + p2 * 2 + p1 * 2 + p0 + q0 + q1 + q2 + q3 |
| // q6 * 3 + q5 + q4 + q3 * 2 + q2 * 2 + q1 * 2 + q0 + p0 + p1 + p2 + p3 |
| f14_lo = FilterAdd2Sub2(f14_lo, qp1_lo, pq3_lo, qp6_lo, qp4_lo); |
| *oqp2 = _mm_srli_epi16(f14_lo, 4); |
| *oqp2 = _mm_packus_epi16(*oqp2, *oqp2); |
| |
| // p6 * 2 + p5 + p4 + p3 + p2 * 2 + p1 * 2 + p0 * 2 + q0 + q1 + q2 + q3 + q4 |
| // q6 * 2 + q5 + q4 + q3 + q2 * 2 + q1 * 2 + q0 * 2 + p0 + p1 + p2 + p3 + p4 |
| f14_lo = FilterAdd2Sub2(f14_lo, qp0_lo, pq4_lo, qp6_lo, qp3_lo); |
| *oqp1 = _mm_srli_epi16(f14_lo, 4); |
| *oqp1 = _mm_packus_epi16(*oqp1, *oqp1); |
| |
| // p6 + p5 + p4 + p3 + p2 + p1 * 2 + p0 * 2 + q0 * 2 + q1 + q2 + q3 + q4 + q5 |
| // q6 + q5 + q4 + q3 + q2 + q1 * 2 + q0 * 2 + p0 * 2 + p1 + p2 + p3 + p4 + p5 |
| f14_lo = FilterAdd2Sub2(f14_lo, pq0_lo, pq5_lo, qp6_lo, qp2_lo); |
| *oqp0 = _mm_srli_epi16(f14_lo, 4); |
| *oqp0 = _mm_packus_epi16(*oqp0, *oqp0); |
| } |
| |
| void Horizontal14(void* dest, ptrdiff_t stride, int outer_thresh, |
| int inner_thresh, int hev_thresh) { |
| auto* const dst = static_cast<uint8_t*>(dest); |
| const __m128i zero = _mm_setzero_si128(); |
| const __m128i v_flat_thresh = _mm_set1_epi8(1); |
| const __m128i v_outer_thresh = |
| _mm_shuffle_epi8(_mm_cvtsi32_si128(outer_thresh), zero); |
| const __m128i v_inner_thresh = |
| _mm_shuffle_epi8(_mm_cvtsi32_si128(inner_thresh), zero); |
| const __m128i v_hev_thresh0 = |
| _mm_shuffle_epi8(_mm_cvtsi32_si128(hev_thresh), zero); |
| const __m128i v_hev_thresh = _mm_unpacklo_epi8(v_hev_thresh0, zero); |
| |
| const __m128i p3 = Load4(dst - 4 * stride); |
| const __m128i p2 = Load4(dst - 3 * stride); |
| const __m128i p1 = Load4(dst - 2 * stride); |
| const __m128i p0 = Load4(dst - 1 * stride); |
| const __m128i q0 = Load4(dst + 0 * stride); |
| const __m128i q1 = Load4(dst + 1 * stride); |
| const __m128i q2 = Load4(dst + 2 * stride); |
| const __m128i q3 = Load4(dst + 3 * stride); |
| |
| const __m128i qp3 = _mm_unpacklo_epi32(p3, q3); |
| const __m128i qp2 = _mm_unpacklo_epi32(p2, q2); |
| const __m128i qp1 = _mm_unpacklo_epi32(p1, q1); |
| const __m128i qp0 = _mm_unpacklo_epi32(p0, q0); |
| const __m128i q1q0 = _mm_unpacklo_epi32(q0, q1); |
| const __m128i p1p0 = _mm_unpacklo_epi32(p0, p1); |
| |
| const __m128i v_hev_mask = Hev(qp1, qp0, v_hev_thresh); |
| const __m128i v_needs_mask = NeedsFilter8(q1q0, p1p0, qp3, qp2, qp1, qp0, |
| v_outer_thresh, v_inner_thresh); |
| |
| __m128i oqp1; |
| __m128i oqp0; |
| |
| Filter4(qp1, qp0, &oqp1, &oqp0, v_needs_mask, v_hev_mask); |
| |
| const __m128i v_isflat4_mask = IsFlat4(qp3, qp2, qp1, qp0, v_flat_thresh); |
| const __m128i v_mask = |
| _mm_shuffle_epi32(_mm_and_si128(v_needs_mask, v_isflat4_mask), 0); |
| |
| if (_mm_test_all_zeros(v_mask, _mm_cmpeq_epi8(v_mask, v_mask)) == 0) { |
| const __m128i p6 = Load4(dst - 7 * stride); |
| const __m128i p5 = Load4(dst - 6 * stride); |
| const __m128i p4 = Load4(dst - 5 * stride); |
| const __m128i q4 = Load4(dst + 4 * stride); |
| const __m128i q5 = Load4(dst + 5 * stride); |
| const __m128i q6 = Load4(dst + 6 * stride); |
| const __m128i qp6 = _mm_unpacklo_epi32(p6, q6); |
| const __m128i qp5 = _mm_unpacklo_epi32(p5, q5); |
| const __m128i qp4 = _mm_unpacklo_epi32(p4, q4); |
| |
| const __m128i v_isflatouter4_mask = |
| IsFlat4(qp6, qp5, qp4, qp0, v_flat_thresh); |
| const __m128i v_flat4_mask = |
| _mm_shuffle_epi32(_mm_and_si128(v_mask, v_isflatouter4_mask), 0); |
| |
| __m128i oqp2_f8; |
| __m128i oqp1_f8; |
| __m128i oqp0_f8; |
| |
| Filter8(qp3, qp2, qp1, qp0, &oqp2_f8, &oqp1_f8, &oqp0_f8); |
| |
| oqp2_f8 = _mm_blendv_epi8(qp2, oqp2_f8, v_mask); |
| oqp1 = _mm_blendv_epi8(oqp1, oqp1_f8, v_mask); |
| oqp0 = _mm_blendv_epi8(oqp0, oqp0_f8, v_mask); |
| |
| if (_mm_test_all_zeros(v_flat4_mask, |
| _mm_cmpeq_epi8(v_flat4_mask, v_flat4_mask)) == 0) { |
| __m128i oqp5_f14; |
| __m128i oqp4_f14; |
| __m128i oqp3_f14; |
| __m128i oqp2_f14; |
| __m128i oqp1_f14; |
| __m128i oqp0_f14; |
| |
| Filter14(qp6, qp5, qp4, qp3, qp2, qp1, qp0, &oqp5_f14, &oqp4_f14, |
| &oqp3_f14, &oqp2_f14, &oqp1_f14, &oqp0_f14); |
| |
| oqp5_f14 = _mm_blendv_epi8(qp5, oqp5_f14, v_flat4_mask); |
| oqp4_f14 = _mm_blendv_epi8(qp4, oqp4_f14, v_flat4_mask); |
| oqp3_f14 = _mm_blendv_epi8(qp3, oqp3_f14, v_flat4_mask); |
| oqp2_f8 = _mm_blendv_epi8(oqp2_f8, oqp2_f14, v_flat4_mask); |
| oqp1 = _mm_blendv_epi8(oqp1, oqp1_f14, v_flat4_mask); |
| oqp0 = _mm_blendv_epi8(oqp0, oqp0_f14, v_flat4_mask); |
| |
| Store4(dst - 6 * stride, oqp5_f14); |
| Store4(dst - 5 * stride, oqp4_f14); |
| Store4(dst - 4 * stride, oqp3_f14); |
| Store4(dst + 3 * stride, _mm_srli_si128(oqp3_f14, 4)); |
| Store4(dst + 4 * stride, _mm_srli_si128(oqp4_f14, 4)); |
| Store4(dst + 5 * stride, _mm_srli_si128(oqp5_f14, 4)); |
| } |
| |
| Store4(dst - 3 * stride, oqp2_f8); |
| Store4(dst + 2 * stride, _mm_srli_si128(oqp2_f8, 4)); |
| } |
| |
| Store4(dst - 2 * stride, oqp1); |
| Store4(dst - 1 * stride, oqp0); |
| Store4(dst + 0 * stride, _mm_srli_si128(oqp0, 4)); |
| Store4(dst + 1 * stride, _mm_srli_si128(oqp1, 4)); |
| } |
| |
| // Each of the 8x4 blocks of input data (p7-p0 and q0-q7) is transposed to |
| // 4x8, then unpacked into the corresponding qp register (qp7 - qp0). |
| // |
| // p7 p6 p5 p4 p3 p2 p1 p0 q0 q1 q2 q3 q4 q5 q6 q7 |
| // |
| // 00 01 02 03 04 05 06 07 08 09 0a 0b 0c 0d 0e 0f |
| // 10 11 12 13 14 15 16 17 18 19 1a 1b 1c 1d 1e 1f |
| // 20 21 22 23 24 25 26 27 28 29 2a 2b 2c 2d 2e 2f |
| // 30 31 32 33 34 35 36 37 38 39 3a 3b 3c 3d 3e 3f |
| |
| inline void DualTranspose8x4To4x8(const __m128i& x0, const __m128i& x1, |
| const __m128i& x2, const __m128i& x3, |
| __m128i* q0p0, __m128i* q1p1, __m128i* q2p2, |
| __m128i* q3p3, __m128i* q4p4, __m128i* q5p5, |
| __m128i* q6p6, __m128i* q7p7) { |
| // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17 |
| const __m128i w0 = _mm_unpacklo_epi8(x0, x1); |
| // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37 |
| const __m128i w1 = _mm_unpacklo_epi8(x2, x3); |
| // 08 18 09 19 0a 1a 0b 1b 0c 1c 0d 1d 0e 1e 0f 1f |
| const __m128i w2 = _mm_unpackhi_epi8(x0, x1); |
| // 28 38 29 39 2a 3a 2b 3b 2c 3c 2d 3d 2e 3e 2f 3f |
| const __m128i w3 = _mm_unpackhi_epi8(x2, x3); |
| // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33 |
| const __m128i ww0 = _mm_unpacklo_epi16(w0, w1); |
| // 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37 |
| const __m128i ww1 = _mm_unpackhi_epi16(w0, w1); |
| // 08 18 28 38 09 19 29 39 0a 1a 2a 3a 0b 1b 2b 3b |
| const __m128i ww2 = _mm_unpacklo_epi16(w2, w3); |
| // 0c 1c 2c 3c 0d 1d 2d 3d 0e 1e 2e 3e 0f 1f 2f 3f |
| const __m128i ww3 = _mm_unpackhi_epi16(w2, w3); |
| // 00 10 20 30 0f 1f 2f 3f xx xx xx xx xx xx xx xx |
| *q7p7 = _mm_unpacklo_epi32(ww0, _mm_srli_si128(ww3, 12)); |
| // 01 11 21 31 0e 1e 2e 3e xx xx xx xx xx xx xx xx |
| *q6p6 = _mm_unpackhi_epi32(_mm_slli_si128(ww0, 4), ww3); |
| // 02 12 22 32 0d 1d 2d 3d xx xx xx xx xx xx xx xx |
| *q5p5 = _mm_unpackhi_epi32(ww0, _mm_slli_si128(ww3, 4)); |
| // 03 13 23 33 0c 1c 2c 3c xx xx xx xx xx xx xx xx |
| *q4p4 = _mm_unpacklo_epi32(_mm_srli_si128(ww0, 12), ww3); |
| // 04 14 24 34 0b 1b 2b 3b xx xx xx xx xx xx xx xx |
| *q3p3 = _mm_unpacklo_epi32(ww1, _mm_srli_si128(ww2, 12)); |
| // 05 15 25 35 0a 1a 2a 3a xx xx xx xx xx xx xx xx |
| *q2p2 = _mm_unpackhi_epi32(_mm_slli_si128(ww1, 4), ww2); |
| // 06 16 26 36 09 19 29 39 xx xx xx xx xx xx xx xx |
| *q1p1 = _mm_unpackhi_epi32(ww1, _mm_slli_si128(ww2, 4)); |
| // 07 17 27 37 08 18 28 38 xx xx xx xx xx xx xx xx |
| *q0p0 = _mm_unpacklo_epi32(_mm_srli_si128(ww1, 12), ww2); |
| } |
| |
| inline void DualTranspose4x8To8x4(const __m128i& qp7, const __m128i& qp6, |
| const __m128i& qp5, const __m128i& qp4, |
| const __m128i& qp3, const __m128i& qp2, |
| const __m128i& qp1, const __m128i& qp0, |
| __m128i* x0, __m128i* x1, __m128i* x2, |
| __m128i* x3) { |
| // qp7: 00 10 20 30 0f 1f 2f 3f xx xx xx xx xx xx xx xx |
| // qp6: 01 11 21 31 0e 1e 2e 3e xx xx xx xx xx xx xx xx |
| // qp5: 02 12 22 32 0d 1d 2d 3d xx xx xx xx xx xx xx xx |
| // qp4: 03 13 23 33 0c 1c 2c 3c xx xx xx xx xx xx xx xx |
| // qp3: 04 14 24 34 0b 1b 2b 3b xx xx xx xx xx xx xx xx |
| // qp2: 05 15 25 35 0a 1a 2a 3a xx xx xx xx xx xx xx xx |
| // qp1: 06 16 26 36 09 19 29 39 xx xx xx xx xx xx xx xx |
| // qp0: 07 17 27 37 08 18 28 38 xx xx xx xx xx xx xx xx |
| |
| // 00 01 10 11 20 21 30 31 0f 0e 1f 1e 2f 2e 3f 3e |
| const __m128i w0 = _mm_unpacklo_epi8(qp7, qp6); |
| // 02 03 12 13 22 23 32 33 xx xx xx xx xx xx xx xx |
| const __m128i w1 = _mm_unpacklo_epi8(qp5, qp4); |
| // 04 05 14 15 24 25 34 35 xx xx xx xx xx xx xx xx |
| const __m128i w2 = _mm_unpacklo_epi8(qp3, qp2); |
| // 06 07 16 17 26 27 36 37 xx xx xx xx xx xx xx xx |
| const __m128i w3 = _mm_unpacklo_epi8(qp1, qp0); |
| // 00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33 |
| const __m128i w4 = _mm_unpacklo_epi16(w0, w1); |
| // 04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37 |
| const __m128i w5 = _mm_unpacklo_epi16(w2, w3); |
| // 00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17 |
| const __m128i d0 = _mm_unpacklo_epi32(w4, w5); |
| // 20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37 |
| const __m128i d2 = _mm_unpackhi_epi32(w4, w5); |
| // xx xx xx xx xx xx xx xx 08 09 18 19 28 29 38 39 |
| const __m128i w10 = _mm_unpacklo_epi8(qp0, qp1); |
| // xx xx xx xx xx xx xx xx 0a 0b 1a 1b 2a 2b 3a 3b |
| const __m128i w11 = _mm_unpacklo_epi8(qp2, qp3); |
| // xx xx xx xx xx xx xx xx 0c 0d 1c 1d 2c 2d 3c 3d |
| const __m128i w12 = _mm_unpacklo_epi8(qp4, qp5); |
| // xx xx xx xx xx xx xx xx 0e 0f 1e 1f 2e 2f 3e 3f |
| const __m128i w13 = _mm_unpacklo_epi8(qp6, qp7); |
| // 08 09 0a 0b 18 19 1a 1b 28 29 2a 2b 38 39 3a 3b |
| const __m128i w14 = _mm_unpackhi_epi16(w10, w11); |
| // 0c 0d 0e 0f 1c 1d 1e 1f 2c 2d 2e 2f 3c 3d 3e 3f |
| const __m128i w15 = _mm_unpackhi_epi16(w12, w13); |
| // 08 09 0a 0b 0c 0d 0e 0f 18 19 1a 1b 1c 1d 1e 1f |
| const __m128i d1 = _mm_unpacklo_epi32(w14, w15); |
| // 28 29 2a 2b 2c 2d 2e 2f 38 39 3a 3b 3c 3d 3e 3f |
| const __m128i d3 = _mm_unpackhi_epi32(w14, w15); |
| |
| // p7 p6 p5 p4 p3 p2 p1 p0 q0 q1 q2 q3 q4 q5 q6 q7 |
| // |
| // 00 01 02 03 04 05 06 07 08 09 0a 0b 0c 0d 0e 0f |
| *x0 = _mm_unpacklo_epi64(d0, d1); |
| // 10 11 12 13 14 15 16 17 18 19 1a 1b 1c 1d 1e 1f |
| *x1 = _mm_unpackhi_epi64(d0, d1); |
| // 20 21 22 23 24 25 26 27 28 29 2a 2b 2c 2d 2e 2f |
| *x2 = _mm_unpacklo_epi64(d2, d3); |
| // 30 31 32 33 34 35 36 37 38 39 3a 3b 3c 3d 3e 3f |
| *x3 = _mm_unpackhi_epi64(d2, d3); |
| } |
| |
| void Vertical14(void* dest, ptrdiff_t stride, int outer_thresh, |
| int inner_thresh, int hev_thresh) { |
| auto* const dst = static_cast<uint8_t*>(dest); |
| const __m128i zero = _mm_setzero_si128(); |
| const __m128i v_flat_thresh = _mm_set1_epi8(1); |
| const __m128i v_outer_thresh = |
| _mm_shuffle_epi8(_mm_cvtsi32_si128(outer_thresh), zero); |
| const __m128i v_inner_thresh = |
| _mm_shuffle_epi8(_mm_cvtsi32_si128(inner_thresh), zero); |
| const __m128i v_hev_thresh0 = |
| _mm_shuffle_epi8(_mm_cvtsi32_si128(hev_thresh), zero); |
| const __m128i v_hev_thresh = _mm_unpacklo_epi8(v_hev_thresh0, zero); |
| |
| __m128i x0 = LoadUnaligned16(dst - 8 + 0 * stride); |
| __m128i x1 = LoadUnaligned16(dst - 8 + 1 * stride); |
| __m128i x2 = LoadUnaligned16(dst - 8 + 2 * stride); |
| __m128i x3 = LoadUnaligned16(dst - 8 + 3 * stride); |
| |
| __m128i qp7, qp6, qp5, qp4, qp3, qp2, qp1, qp0; |
| |
| DualTranspose8x4To4x8(x0, x1, x2, x3, &qp0, &qp1, &qp2, &qp3, &qp4, &qp5, |
| &qp6, &qp7); |
| |
| const __m128i qp1qp0 = _mm_unpacklo_epi64(qp0, qp1); |
| const __m128i q1q0 = _mm_shuffle_epi32(qp1qp0, 0x0d); |
| const __m128i p1p0 = _mm_shuffle_epi32(qp1qp0, 0x08); |
| |
| const __m128i v_hev_mask = Hev(qp1, qp0, v_hev_thresh); |
| const __m128i v_needs_mask = NeedsFilter8(q1q0, p1p0, qp3, qp2, qp1, qp0, |
| v_outer_thresh, v_inner_thresh); |
| |
| __m128i oqp1; |
| __m128i oqp0; |
| |
| Filter4(qp1, qp0, &oqp1, &oqp0, v_needs_mask, v_hev_mask); |
| |
| const __m128i v_isflat4_mask = IsFlat4(qp3, qp2, qp1, qp0, v_flat_thresh); |
| const __m128i v_mask = |
| _mm_shuffle_epi32(_mm_and_si128(v_needs_mask, v_isflat4_mask), 0); |
| |
| if (_mm_test_all_zeros(v_mask, _mm_cmpeq_epi8(v_mask, v_mask)) == 0) { |
| const __m128i v_isflatouter4_mask = |
| IsFlat4(qp6, qp5, qp4, qp0, v_flat_thresh); |
| const __m128i v_flat4_mask = |
| _mm_shuffle_epi32(_mm_and_si128(v_mask, v_isflatouter4_mask), 0); |
| |
| __m128i oqp2_f8; |
| __m128i oqp1_f8; |
| __m128i oqp0_f8; |
| |
| Filter8(qp3, qp2, qp1, qp0, &oqp2_f8, &oqp1_f8, &oqp0_f8); |
| |
| oqp2_f8 = _mm_blendv_epi8(qp2, oqp2_f8, v_mask); |
| oqp1 = _mm_blendv_epi8(oqp1, oqp1_f8, v_mask); |
| oqp0 = _mm_blendv_epi8(oqp0, oqp0_f8, v_mask); |
| |
| if (_mm_test_all_zeros(v_flat4_mask, |
| _mm_cmpeq_epi8(v_flat4_mask, v_flat4_mask)) == 0) { |
| __m128i oqp5_f14; |
| __m128i oqp4_f14; |
| __m128i oqp3_f14; |
| __m128i oqp2_f14; |
| __m128i oqp1_f14; |
| __m128i oqp0_f14; |
| |
| Filter14(qp6, qp5, qp4, qp3, qp2, qp1, qp0, &oqp5_f14, &oqp4_f14, |
| &oqp3_f14, &oqp2_f14, &oqp1_f14, &oqp0_f14); |
| |
| oqp5_f14 = _mm_blendv_epi8(qp5, oqp5_f14, v_flat4_mask); |
| oqp4_f14 = _mm_blendv_epi8(qp4, oqp4_f14, v_flat4_mask); |
| oqp3_f14 = _mm_blendv_epi8(qp3, oqp3_f14, v_flat4_mask); |
| oqp2_f8 = _mm_blendv_epi8(oqp2_f8, oqp2_f14, v_flat4_mask); |
| oqp1 = _mm_blendv_epi8(oqp1, oqp1_f14, v_flat4_mask); |
| oqp0 = _mm_blendv_epi8(oqp0, oqp0_f14, v_flat4_mask); |
| qp3 = oqp3_f14; |
| qp4 = oqp4_f14; |
| qp5 = oqp5_f14; |
| } |
| qp2 = oqp2_f8; |
| } |
| |
| DualTranspose4x8To8x4(qp7, qp6, qp5, qp4, qp3, qp2, oqp1, oqp0, &x0, &x1, &x2, |
| &x3); |
| |
| StoreUnaligned16(dst - 8 + 0 * stride, x0); |
| StoreUnaligned16(dst - 8 + 1 * stride, x1); |
| StoreUnaligned16(dst - 8 + 2 * stride, x2); |
| StoreUnaligned16(dst - 8 + 3 * stride, x3); |
| } |
| |
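| // Registers the SSE4.1 loop filters in the dsp table; the per-function |
| // guards allow individual entries to be left to other implementations. |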
| void Init8bpp() { |
| Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8); |
| assert(dsp != nullptr); |
| static_cast<void>(dsp); |
| #if DSP_ENABLED_8BPP_SSE4_1(LoopFilterSize4_LoopFilterTypeHorizontal) |
| dsp->loop_filters[kLoopFilterSize4][kLoopFilterTypeHorizontal] = Horizontal4; |
| #endif |
| #if DSP_ENABLED_8BPP_SSE4_1(LoopFilterSize6_LoopFilterTypeHorizontal) |
| dsp->loop_filters[kLoopFilterSize6][kLoopFilterTypeHorizontal] = Horizontal6; |
| #endif |
| #if DSP_ENABLED_8BPP_SSE4_1(LoopFilterSize8_LoopFilterTypeHorizontal) |
| dsp->loop_filters[kLoopFilterSize8][kLoopFilterTypeHorizontal] = Horizontal8; |
| #endif |
| #if DSP_ENABLED_8BPP_SSE4_1(LoopFilterSize14_LoopFilterTypeHorizontal) |
| dsp->loop_filters[kLoopFilterSize14][kLoopFilterTypeHorizontal] = |
| Horizontal14; |
| #endif |
| #if DSP_ENABLED_8BPP_SSE4_1(LoopFilterSize4_LoopFilterTypeVertical) |
| dsp->loop_filters[kLoopFilterSize4][kLoopFilterTypeVertical] = Vertical4; |
| #endif |
| #if DSP_ENABLED_8BPP_SSE4_1(LoopFilterSize6_LoopFilterTypeVertical) |
| dsp->loop_filters[kLoopFilterSize6][kLoopFilterTypeVertical] = Vertical6; |
| #endif |
| #if DSP_ENABLED_8BPP_SSE4_1(LoopFilterSize8_LoopFilterTypeVertical) |
| dsp->loop_filters[kLoopFilterSize8][kLoopFilterTypeVertical] = Vertical8; |
| #endif |
| #if DSP_ENABLED_8BPP_SSE4_1(LoopFilterSize14_LoopFilterTypeVertical) |
| dsp->loop_filters[kLoopFilterSize14][kLoopFilterTypeVertical] = Vertical14; |
| #endif |
| } |
| } // namespace |
| } // namespace low_bitdepth |
| |
| //------------------------------------------------------------------------------ |
| namespace high_bitdepth { |
| namespace { |
| |
| #if LIBGAV1_MAX_BITDEPTH >= 10 |
| |
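| // The high-bitdepth filters operate on 16-bit pixels. The thresholds, which |
| // are specified for 8-bit video, are scaled up by kThreshShift = |
| // bitdepth - 8. |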
| template <int bitdepth> |
| struct LoopFilterFuncs_SSE4_1 { |
| LoopFilterFuncs_SSE4_1() = delete; |
| |
| static constexpr int kThreshShift = bitdepth - 8; |
| |
| static void Vertical4(void* dest, ptrdiff_t stride, int outer_thresh, |
| int inner_thresh, int hev_thresh); |
| static void Horizontal4(void* dest, ptrdiff_t stride, int outer_thresh, |
| int inner_thresh, int hev_thresh); |
| static void Vertical6(void* dest, ptrdiff_t stride, int outer_thresh, |
| int inner_thresh, int hev_thresh); |
| static void Horizontal6(void* dest, ptrdiff_t stride, int outer_thresh, |
| int inner_thresh, int hev_thresh); |
| static void Vertical8(void* dest, ptrdiff_t stride, int outer_thresh, |
| int inner_thresh, int hev_thresh); |
| static void Horizontal8(void* dest, ptrdiff_t stride, int outer_thresh, |
| int inner_thresh, int hev_thresh); |
| static void Vertical14(void* dest, ptrdiff_t stride, int outer_thresh, |
| int inner_thresh, int hev_thresh); |
| static void Horizontal14(void* dest, ptrdiff_t stride, int outer_thresh, |
| int inner_thresh, int hev_thresh); |
| }; |
| |
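| // 16-bit lanes have a true arithmetic shift, so the byte-duplication trick |
| // from the 8-bit path is unnecessary; explicit clamps to the signed pixel |
| // range stand in for int8 saturation. |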
| inline __m128i Clamp(const __m128i& min, const __m128i& max, |
| const __m128i& val) { |
| const __m128i a = _mm_min_epi16(val, max); |
| const __m128i b = _mm_max_epi16(a, min); |
| return b; |
| } |
| |
| inline __m128i AddShift3(const __m128i& a, const __m128i& b, |
| const __m128i& vmin, const __m128i& vmax) { |
| const __m128i c = _mm_adds_epi16(a, b); |
| const __m128i d = Clamp(vmin, vmax, c); |
| const __m128i e = _mm_srai_epi16(d, 3); /* >> 3 */ |
| return e; |
| } |
| |
| inline __m128i AddShift1(const __m128i& a, const __m128i& b) { |
| const __m128i c = _mm_adds_epi16(a, b); |
| const __m128i e = _mm_srai_epi16(c, 1); /* >> 1 */ |
| return e; |
| } |
| |
| inline __m128i AbsDiff(const __m128i& a, const __m128i& b) { |
| return _mm_or_si128(_mm_subs_epu16(a, b), _mm_subs_epu16(b, a)); |
| } |
| |
| inline __m128i Hev(const __m128i& qp1, const __m128i& qp0, |
| const __m128i& hev_thresh) { |
| const __m128i abs_qp1mqp0 = AbsDiff(qp1, qp0); |
| const __m128i max_pq = |
| _mm_max_epu16(abs_qp1mqp0, _mm_srli_si128(abs_qp1mqp0, 8)); |
| const __m128i hev_mask = _mm_cmpgt_epi16(max_pq, hev_thresh); |
| return hev_mask; |
| } |
| |
| inline __m128i CheckOuterThreshF4(const __m128i& q1q0, const __m128i& p1p0, |
| const __m128i& outer_thresh) { |
| // abs(p0 - q0) * 2 + abs(p1 - q1) / 2 <= outer_thresh; |
| const __m128i abs_pmq = AbsDiff(p1p0, q1q0); |
| const __m128i a = _mm_adds_epu16(abs_pmq, abs_pmq); |
| const __m128i b = _mm_srli_epi16(abs_pmq, 1); |
| const __m128i c = _mm_adds_epu16(a, _mm_srli_si128(b, 8)); |
| return _mm_subs_epu16(c, outer_thresh); |
| } |
| |
| inline __m128i NeedsFilter4(const __m128i& q1q0, const __m128i& p1p0, |
| const __m128i& qp1, const __m128i& qp0, |
| const __m128i& outer_thresh, |
| const __m128i& inner_thresh) { |
| const __m128i outer_mask = CheckOuterThreshF4(q1q0, p1p0, outer_thresh); |
| const __m128i abs_qp1mqp0 = AbsDiff(qp1, qp0); |
| const __m128i max_abs_qp1mqp = |
| _mm_max_epu16(abs_qp1mqp0, _mm_srli_si128(abs_qp1mqp0, 8)); |
| const __m128i inner_mask = _mm_subs_epu16(max_abs_qp1mqp, inner_thresh); |
| // ~mask |
| const __m128i zero = _mm_setzero_si128(); |
| const __m128i a = _mm_or_si128(outer_mask, inner_mask); |
| const __m128i b = _mm_cmpeq_epi16(a, zero); |
| return b; |
| } |
| |
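| // Same algorithm as the 8-bit Filter4(), but on signed 16-bit values: pixels |
| // are biased by subtracting t80 = 1 << (bitdepth - 1) rather than xor 0x80, |
| // and deltas are clamped to the signed range [-t80, t80 - 1]. |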
| inline void Filter4(const __m128i& qp1, const __m128i& qp0, __m128i* oqp1, |
| __m128i* oqp0, const __m128i& mask, const __m128i& hev, |
| int bitdepth) { |
| const __m128i t4 = _mm_set1_epi16(4); |
| const __m128i t3 = _mm_set1_epi16(3); |
| const __m128i t80 = _mm_set1_epi16(static_cast<int16_t>(1 << (bitdepth - 1))); |
| const __m128i t1 = _mm_set1_epi16(0x1); |
| const __m128i vmin = _mm_subs_epi16(_mm_setzero_si128(), t80); |
| const __m128i vmax = _mm_subs_epi16(t80, t1); |
| const __m128i ps1 = _mm_subs_epi16(qp1, t80); |
| const __m128i ps0 = _mm_subs_epi16(qp0, t80); |
| const __m128i qs0 = _mm_srli_si128(ps0, 8); |
| const __m128i qs1 = _mm_srli_si128(ps1, 8); |
| |
| __m128i a = _mm_subs_epi16(ps1, qs1); |
| a = _mm_and_si128(Clamp(vmin, vmax, a), hev); |
| |
| const __m128i x = _mm_subs_epi16(qs0, ps0); |
| a = _mm_adds_epi16(a, x); |
| a = _mm_adds_epi16(a, x); |
| a = _mm_adds_epi16(a, x); |
| a = _mm_and_si128(Clamp(vmin, vmax, a), mask); |
| |
| const __m128i a1 = AddShift3(a, t4, vmin, vmax); |
| const __m128i a2 = AddShift3(a, t3, vmin, vmax); |
| const __m128i a3 = _mm_andnot_si128(hev, AddShift1(a1, t1)); |
| |
| const __m128i ops1 = _mm_adds_epi16(ps1, a3); |
| const __m128i ops0 = _mm_adds_epi16(ps0, a2); |
| const __m128i oqs0 = _mm_subs_epi16(qs0, a1); |
| const __m128i oqs1 = _mm_subs_epi16(qs1, a3); |
| |
| __m128i oqps1 = _mm_unpacklo_epi64(ops1, oqs1); |
| __m128i oqps0 = _mm_unpacklo_epi64(ops0, oqs0); |
| |
| oqps1 = Clamp(vmin, vmax, oqps1); |
| oqps0 = Clamp(vmin, vmax, oqps0); |
| |
| *oqp1 = _mm_adds_epi16(oqps1, t80); |
| *oqp0 = _mm_adds_epi16(oqps0, t80); |
| } |
| |
| template <int bitdepth> |
| void LoopFilterFuncs_SSE4_1<bitdepth>::Horizontal4(void* dest, |
| ptrdiff_t stride8, |
| int outer_thresh, |
| int inner_thresh, |
| int hev_thresh) { |
| auto* const dst = static_cast<uint16_t*>(dest); |
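| // stride8 is in bytes; convert it to a stride in 16-bit pixels. |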
| const ptrdiff_t stride = stride8 / 2; |
| const __m128i v_outer_thresh = |
| _mm_shufflelo_epi16(_mm_cvtsi32_si128(outer_thresh << kThreshShift), 0); |
| const __m128i v_inner_thresh = |
| _mm_shufflelo_epi16(_mm_cvtsi32_si128(inner_thresh << kThreshShift), 0); |
| const __m128i v_hev_thresh = |
| _mm_shufflelo_epi16(_mm_cvtsi32_si128(hev_thresh << kThreshShift), 0); |
| const __m128i p1 = LoadLo8(dst - 2 * stride); |
| const __m128i p0 = LoadLo8(dst - 1 * stride); |
| const __m128i qp0 = LoadHi8(p0, dst + 0 * stride); |
| const __m128i qp1 = LoadHi8(p1, dst + 1 * stride); |
| const __m128i q1q0 = _mm_unpackhi_epi64(qp0, qp1); |
| const __m128i p1p0 = _mm_unpacklo_epi64(qp0, qp1); |
| const __m128i v_hev_mask = Hev(qp1, qp0, v_hev_thresh); |
| const __m128i v_needs_mask = |
| NeedsFilter4(q1q0, p1p0, qp1, qp0, v_outer_thresh, v_inner_thresh); |
| |
| __m128i oqp1; |
| __m128i oqp0; |
| Filter4(qp1, qp0, &oqp1, &oqp0, v_needs_mask, v_hev_mask, bitdepth); |
| |
| StoreLo8(dst - 2 * stride, oqp1); |
| StoreLo8(dst - 1 * stride, oqp0); |
| StoreHi8(dst + 0 * stride, oqp0); |
| StoreHi8(dst + 1 * stride, oqp1); |
| } |
| |
| template <int bitdepth> |
| void LoopFilterFuncs_SSE4_1<bitdepth>::Vertical4(void* dest, ptrdiff_t stride8, |
| int outer_thresh, |
| int inner_thresh, |
| int hev_thresh) { |
| auto* const dst = static_cast<uint16_t*>(dest); |
| const ptrdiff_t stride = stride8 / 2; |
| const __m128i v_outer_thresh = |
| _mm_shufflelo_epi16(_mm_cvtsi32_si128(outer_thresh << kThreshShift), 0); |
| const __m128i v_inner_thresh = |
| _mm_shufflelo_epi16(_mm_cvtsi32_si128(inner_thresh << kThreshShift), 0); |
| const __m128i v_hev_thresh = |
| _mm_shufflelo_epi16(_mm_cvtsi32_si128(hev_thresh << kThreshShift), 0); |
| const __m128i x0 = LoadLo8(dst - 2 + 0 * stride); |
| const __m128i x1 = LoadLo8(dst - 2 + 1 * stride); |
| const __m128i x2 = LoadLo8(dst - 2 + 2 * stride); |
| const __m128i x3 = LoadLo8(dst - 2 + 3 * stride); |
| // 00 10 01 11 02 12 03 13 |
| const __m128i w0 = _mm_unpacklo_epi16(x0, x1); |
| // 20 30 21 31 22 32 23 33 |
| const __m128i w1 = _mm_unpacklo_epi16(x2, x3); |
| // 00 10 20 30 01 11 21 31 p0p1 |
| const __m128i a = _mm_unpacklo_epi32(w0, w1); |
| const __m128i p1p0 = _mm_shuffle_epi32(a, 0x4e); |
| // 02 12 22 32 03 13 23 33 q1q0 |
| const __m128i q1q0 = _mm_unpackhi_epi32(w0, w1); |
| const __m128i qp1 = _mm_unpackhi_epi64(p1p0, q1q0); |
| const __m128i qp0 = _mm_unpacklo_epi64(p1p0, q1q0); |
| const __m128i v_hev_mask = Hev(qp1, qp0, v_hev_thresh); |
| const __m128i v_needs_mask = |
| NeedsFilter4(q1q0, p1p0, qp1, qp0, v_outer_thresh, v_inner_thresh); |
| |
| __m128i oqp1; |
| __m128i oqp0; |
| Filter4(qp1, qp0, &oqp1, &oqp0, v_needs_mask, v_hev_mask, bitdepth); |
| |
| // 00 10 01 11 02 12 03 13 |
| const __m128i w2 = _mm_unpacklo_epi16(oqp1, oqp0); |
| // 20 30 21 31 22 32 23 33 |
| const __m128i w3 = _mm_unpackhi_epi16(oqp0, oqp1); |
| // 00 10 20 30 01 11 21 31 |
| const __m128i op0p1 = _mm_unpacklo_epi32(w2, w3); |
| // 02 12 22 32 03 13 23 33 |
| const __m128i oq1q0 = _mm_unpackhi_epi32(w2, w3); |
| |
| StoreLo8(dst - 2 + 0 * stride, op0p1); |
| StoreHi8(dst - 2 + 1 * stride, op0p1); |
| StoreLo8(dst - 2 + 2 * stride, oq1q0); |
| StoreHi8(dst - 2 + 3 * stride, oq1q0); |
| } |
| |
| //------------------------------------------------------------------------------ |
| // 5-tap (chroma) filters |
| |
| inline __m128i CheckOuterThreshF6(const __m128i& qp1, const __m128i& qp0, |
| const __m128i& outer_thresh) { |
| // abs(p0 - q0) * 2 + abs(p1 - q1) / 2 <= outer_thresh; |
| const __m128i q1q0 = _mm_unpackhi_epi64(qp0, qp1); |
| const __m128i p1p0 = _mm_unpacklo_epi64(qp0, qp1); |
| return CheckOuterThreshF4(q1q0, p1p0, outer_thresh); |
| } |
| |
| inline __m128i NeedsFilter6(const __m128i& qp2, const __m128i& qp1, |
| const __m128i& qp0, const __m128i& outer_thresh, |
| const __m128i& inner_thresh) { |
| const __m128i outer_mask = CheckOuterThreshF6(qp1, qp0, outer_thresh); |
| const __m128i abs_qp2mqp1 = AbsDiff(qp2, qp1); |
| const __m128i abs_qp1mqp0 = AbsDiff(qp1, qp0); |
| const __m128i max_pq = _mm_max_epu16(abs_qp2mqp1, abs_qp1mqp0); |
| const __m128i inner_mask = _mm_subs_epu16( |
| _mm_max_epu16(max_pq, _mm_srli_si128(max_pq, 8)), inner_thresh); |
| // ~mask |
| const __m128i zero = _mm_setzero_si128(); |
| const __m128i a = _mm_or_si128(outer_mask, inner_mask); |
| const __m128i b = _mm_cmpeq_epi16(a, zero); |
| return b; |
| } |
| |
| inline __m128i IsFlat3(const __m128i& qp2, const __m128i& qp1, |
| const __m128i& qp0, const __m128i& flat_thresh) { |
| const __m128i abs_pq2mpq0 = AbsDiff(qp2, qp0); |
| const __m128i abs_qp1mqp0 = AbsDiff(qp1, qp0); |
| const __m128i max_pq = _mm_max_epu16(abs_pq2mpq0, abs_qp1mqp0); |
| const __m128i flat_mask = _mm_subs_epu16( |
| _mm_max_epu16(max_pq, _mm_srli_si128(max_pq, 8)), flat_thresh); |
| // ~mask |
| const __m128i zero = _mm_setzero_si128(); |
| const __m128i a = _mm_cmpeq_epi16(flat_mask, zero); |
| return a; |
| } |
| |
| inline void Filter6(const __m128i& qp2, const __m128i& qp1, const __m128i& qp0, |
| __m128i* oqp1, __m128i* oqp0) { |
| const __m128i four = _mm_set1_epi16(4); |
| const __m128i qp2_lo = qp2; |
| const __m128i qp1_lo = qp1; |
| const __m128i qp0_lo = qp0; |
| const __m128i pq1_lo = _mm_shuffle_epi32(qp1_lo, 0x4e); |
| const __m128i pq0_lo = _mm_shuffle_epi32(qp0_lo, 0x4e); |
| |
  __m128i f6_lo =
      _mm_add_epi16(_mm_add_epi16(qp2_lo, four), _mm_add_epi16(qp2_lo, qp2_lo));
| |
| f6_lo = _mm_add_epi16(_mm_add_epi16(f6_lo, qp1_lo), qp1_lo); |
| |
| f6_lo = _mm_add_epi16(_mm_add_epi16(f6_lo, qp0_lo), |
| _mm_add_epi16(qp0_lo, pq0_lo)); |
| |
| // p2 * 3 + p1 * 2 + p0 * 2 + q0 |
| // q2 * 3 + q1 * 2 + q0 * 2 + p0 |
| *oqp1 = _mm_srli_epi16(f6_lo, 3); |
| |
| // p2 + p1 * 2 + p0 * 2 + q0 * 2 + q1 |
| // q2 + q1 * 2 + q0 * 2 + p0 * 2 + p1 |
| f6_lo = FilterAdd2Sub2(f6_lo, pq0_lo, pq1_lo, qp2_lo, qp2_lo); |
| *oqp0 = _mm_srli_epi16(f6_lo, 3); |
| } |
| |
| template <int bitdepth> |
| void LoopFilterFuncs_SSE4_1<bitdepth>::Horizontal6(void* dest, |
| ptrdiff_t stride8, |
| int outer_thresh, |
| int inner_thresh, |
| int hev_thresh) { |
| auto* const dst = static_cast<uint16_t*>(dest); |
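  // stride8 is in bytes; convert to uint16_t units.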
| const ptrdiff_t stride = stride8 / 2; |
| const __m128i v_flat_thresh = |
| _mm_shufflelo_epi16(_mm_cvtsi32_si128(1 << kThreshShift), 0); |
| const __m128i v_outer_thresh = |
| _mm_shufflelo_epi16(_mm_cvtsi32_si128(outer_thresh << kThreshShift), 0); |
| const __m128i v_inner_thresh = |
| _mm_shufflelo_epi16(_mm_cvtsi32_si128(inner_thresh << kThreshShift), 0); |
| const __m128i v_hev_thresh = |
| _mm_shufflelo_epi16(_mm_cvtsi32_si128(hev_thresh << kThreshShift), 0); |
| |
| const __m128i p2 = LoadLo8(dst - 3 * stride); |
| const __m128i p1 = LoadLo8(dst - 2 * stride); |
| const __m128i p0 = LoadLo8(dst - 1 * stride); |
| const __m128i q0 = LoadLo8(dst + 0 * stride); |
| const __m128i q1 = LoadLo8(dst + 1 * stride); |
| const __m128i q2 = LoadLo8(dst + 2 * stride); |
| |
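  // Pack each p/q pair into one register: pN in the low half, qN in the high
  // half.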
| const __m128i qp2 = _mm_unpacklo_epi64(p2, q2); |
| const __m128i qp1 = _mm_unpacklo_epi64(p1, q1); |
| const __m128i qp0 = _mm_unpacklo_epi64(p0, q0); |
| |
| const __m128i v_hev_mask = Hev(qp1, qp0, v_hev_thresh); |
| const __m128i v_needs_mask = |
| NeedsFilter6(qp2, qp1, qp0, v_outer_thresh, v_inner_thresh); |
| __m128i oqp1; |
| __m128i oqp0; |
| |
| Filter4(qp1, qp0, &oqp1, &oqp0, v_needs_mask, v_hev_mask, bitdepth); |
| |
| const __m128i v_isflat3_mask = IsFlat3(qp2, qp1, qp0, v_flat_thresh); |
| const __m128i v_mask_lo = _mm_and_si128(v_needs_mask, v_isflat3_mask); |
| const __m128i v_mask = _mm_unpacklo_epi64(v_mask_lo, v_mask_lo); |
| |
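  // Use the Filter6() output only in lanes that both need filtering and are
  // flat; elsewhere keep the Filter4() result.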
| if (_mm_test_all_zeros(v_mask, _mm_cmpeq_epi16(v_mask, v_mask)) == 0) { |
| __m128i oqp1_f6; |
| __m128i oqp0_f6; |
| |
| Filter6(qp2, qp1, qp0, &oqp1_f6, &oqp0_f6); |
| |
| oqp1 = _mm_blendv_epi8(oqp1, oqp1_f6, v_mask); |
| oqp0 = _mm_blendv_epi8(oqp0, oqp0_f6, v_mask); |
| } |
| |
| StoreLo8(dst - 2 * stride, oqp1); |
| StoreLo8(dst - 1 * stride, oqp0); |
| StoreHi8(dst + 0 * stride, oqp0); |
| StoreHi8(dst + 1 * stride, oqp1); |
| } |
| |
| inline void Transpose8x4To4x8(const __m128i& x0, const __m128i& x1, |
| const __m128i& x2, const __m128i& x3, __m128i* d0, |
| __m128i* d1, __m128i* d2, __m128i* d3, |
| __m128i* d4, __m128i* d5, __m128i* d6, |
| __m128i* d7) { |
| // input |
| // x0 00 01 02 03 04 05 06 07 |
| // x1 10 11 12 13 14 15 16 17 |
| // x2 20 21 22 23 24 25 26 27 |
| // x3 30 31 32 33 34 35 36 37 |
| // output |
| // 00 10 20 30 xx xx xx xx |
| // 01 11 21 31 xx xx xx xx |
| // 02 12 22 32 xx xx xx xx |
| // 03 13 23 33 xx xx xx xx |
| // 04 14 24 34 xx xx xx xx |
| // 05 15 25 35 xx xx xx xx |
| // 06 16 26 36 xx xx xx xx |
| // 07 17 27 37 xx xx xx xx |
| |
| // 00 10 01 11 02 12 03 13 |
| const __m128i w0 = _mm_unpacklo_epi16(x0, x1); |
| // 20 30 21 31 22 32 23 33 |
| const __m128i w1 = _mm_unpacklo_epi16(x2, x3); |
| // 04 14 05 15 06 16 07 17 |
| const __m128i w2 = _mm_unpackhi_epi16(x0, x1); |
| // 24 34 25 35 26 36 27 37 |
| const __m128i w3 = _mm_unpackhi_epi16(x2, x3); |
| |
| // 00 10 20 30 01 11 21 31 |
| const __m128i ww0 = _mm_unpacklo_epi32(w0, w1); |
| // 04 14 24 34 05 15 25 35 |
| const __m128i ww1 = _mm_unpacklo_epi32(w2, w3); |
| // 02 12 22 32 03 13 23 33 |
| const __m128i ww2 = _mm_unpackhi_epi32(w0, w1); |
| // 06 16 26 36 07 17 27 37 |
| const __m128i ww3 = _mm_unpackhi_epi32(w2, w3); |
| |
| // 00 10 20 30 xx xx xx xx |
| *d0 = ww0; |
| // 01 11 21 31 xx xx xx xx |
| *d1 = _mm_srli_si128(ww0, 8); |
| // 02 12 22 32 xx xx xx xx |
| *d2 = ww2; |
| // 03 13 23 33 xx xx xx xx |
| *d3 = _mm_srli_si128(ww2, 8); |
| // 04 14 24 34 xx xx xx xx |
| *d4 = ww1; |
| // 05 15 25 35 xx xx xx xx |
| *d5 = _mm_srli_si128(ww1, 8); |
| // 06 16 26 36 xx xx xx xx |
| *d6 = ww3; |
| // 07 17 27 37 xx xx xx xx |
| *d7 = _mm_srli_si128(ww3, 8); |
| } |
| |
| template <int bitdepth> |
| void LoopFilterFuncs_SSE4_1<bitdepth>::Vertical6(void* dest, ptrdiff_t stride8, |
| int outer_thresh, |
| int inner_thresh, |
| int hev_thresh) { |
| auto* const dst = static_cast<uint16_t*>(dest); |
| const ptrdiff_t stride = stride8 / 2; |
| const __m128i v_flat_thresh = |
| _mm_shufflelo_epi16(_mm_cvtsi32_si128(1 << kThreshShift), 0); |
| const __m128i v_outer_thresh = |
| _mm_shufflelo_epi16(_mm_cvtsi32_si128(outer_thresh << kThreshShift), 0); |
| const __m128i v_inner_thresh = |
| _mm_shufflelo_epi16(_mm_cvtsi32_si128(inner_thresh << kThreshShift), 0); |
| const __m128i v_hev_thresh = |
| _mm_shufflelo_epi16(_mm_cvtsi32_si128(hev_thresh << kThreshShift), 0); |
| |
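  // Each row load starts at p2; after the transpose only the first six
  // columns (p2..q2) are used.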
| __m128i x0 = LoadUnaligned16(dst - 3 + 0 * stride); |
| __m128i x1 = LoadUnaligned16(dst - 3 + 1 * stride); |
| __m128i x2 = LoadUnaligned16(dst - 3 + 2 * stride); |
| __m128i x3 = LoadUnaligned16(dst - 3 + 3 * stride); |
| |
| __m128i p2, p1, p0, q0, q1, q2; |
| __m128i z0, z1; // not used |
| |
| Transpose8x4To4x8(x0, x1, x2, x3, &p2, &p1, &p0, &q0, &q1, &q2, &z0, &z1); |
| |
| const __m128i qp2 = _mm_unpacklo_epi64(p2, q2); |
| const __m128i qp1 = _mm_unpacklo_epi64(p1, q1); |
| const __m128i qp0 = _mm_unpacklo_epi64(p0, q0); |
| |
| const __m128i v_hev_mask = Hev(qp1, qp0, v_hev_thresh); |
| const __m128i v_needs_mask = |
| NeedsFilter6(qp2, qp1, qp0, v_outer_thresh, v_inner_thresh); |
| __m128i oqp1; |
| __m128i oqp0; |
| |
| Filter4(qp1, qp0, &oqp1, &oqp0, v_needs_mask, v_hev_mask, bitdepth); |
| |
| const __m128i v_isflat3_mask = IsFlat3(qp2, qp1, qp0, v_flat_thresh); |
| const __m128i v_mask_lo = _mm_and_si128(v_needs_mask, v_isflat3_mask); |
| const __m128i v_mask = _mm_unpacklo_epi64(v_mask_lo, v_mask_lo); |
| |
| if (_mm_test_all_zeros(v_mask, _mm_cmpeq_epi16(v_mask, v_mask)) == 0) { |
| __m128i oqp1_f6; |
| __m128i oqp0_f6; |
| |
| Filter6(qp2, qp1, qp0, &oqp1_f6, &oqp0_f6); |
| |
| oqp1 = _mm_blendv_epi8(oqp1, oqp1_f6, v_mask); |
| oqp0 = _mm_blendv_epi8(oqp0, oqp0_f6, v_mask); |
| } |
| |
| // 00 10 01 11 02 12 03 13 |
| const __m128i w2 = _mm_unpacklo_epi16(oqp1, oqp0); |
| // 20 30 21 31 22 32 23 33 |
| const __m128i w3 = _mm_unpackhi_epi16(oqp0, oqp1); |
| // 00 10 20 30 01 11 21 31 |
| const __m128i op0p1 = _mm_unpacklo_epi32(w2, w3); |
| // 02 12 22 32 03 13 23 33 |
| const __m128i oq1q0 = _mm_unpackhi_epi32(w2, w3); |
| |
| StoreLo8(dst - 2 + 0 * stride, op0p1); |
| StoreHi8(dst - 2 + 1 * stride, op0p1); |
| StoreLo8(dst - 2 + 2 * stride, oq1q0); |
| StoreHi8(dst - 2 + 3 * stride, oq1q0); |
| } |
| |
| //------------------------------------------------------------------------------ |
// 7-tap filters

| inline __m128i NeedsFilter8(const __m128i& qp3, const __m128i& qp2, |
| const __m128i& qp1, const __m128i& qp0, |
| const __m128i& outer_thresh, |
| const __m128i& inner_thresh) { |
| const __m128i outer_mask = CheckOuterThreshF6(qp1, qp0, outer_thresh); |
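  // inner_mask is nonzero where any of |p3 - p2|, |p2 - p1|, |p1 - p0| or
  // their q-side equivalents exceeds inner_thresh.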
| const __m128i abs_qp2mqp1 = AbsDiff(qp2, qp1); |
| const __m128i abs_qp1mqp0 = AbsDiff(qp1, qp0); |
| const __m128i max_pq_a = _mm_max_epu16(abs_qp2mqp1, abs_qp1mqp0); |
| const __m128i abs_pq3mpq2 = AbsDiff(qp3, qp2); |
| const __m128i max_pq = _mm_max_epu16(max_pq_a, abs_pq3mpq2); |
| const __m128i inner_mask = _mm_subs_epu16( |
| _mm_max_epu16(max_pq, _mm_srli_si128(max_pq, 8)), inner_thresh); |
| // ~mask |
| const __m128i zero = _mm_setzero_si128(); |
| const __m128i a = _mm_or_si128(outer_mask, inner_mask); |
| const __m128i b = _mm_cmpeq_epi16(a, zero); |
| return b; |
| } |
| |
| inline __m128i IsFlat4(const __m128i& qp3, const __m128i& qp2, |
| const __m128i& qp1, const __m128i& qp0, |
| const __m128i& flat_thresh) { |
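  // The edge is flat when |qp3 - qp0|, |qp2 - qp0| and |qp1 - qp0| are all
  // <= flat_thresh on both the p and q sides.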
| const __m128i abs_pq2mpq0 = AbsDiff(qp2, qp0); |
| const __m128i abs_qp1mqp0 = AbsDiff(qp1, qp0); |
| const __m128i max_pq_a = _mm_max_epu16(abs_pq2mpq0, abs_qp1mqp0); |
| const __m128i abs_pq3mpq0 = AbsDiff(qp3, qp0); |
| const __m128i max_pq = _mm_max_epu16(max_pq_a, abs_pq3mpq0); |
| const __m128i flat_mask = _mm_subs_epu16( |
| _mm_max_epu16(max_pq, _mm_srli_si128(max_pq, 8)), flat_thresh); |
| // ~mask |
| const __m128i zero = _mm_setzero_si128(); |
| const __m128i a = _mm_cmpeq_epi16(flat_mask, zero); |
| return a; |
| } |
| |
| inline void Filter8(const __m128i& qp3, const __m128i& qp2, const __m128i& qp1, |
| const __m128i& qp0, __m128i* oqp2, __m128i* oqp1, |
| __m128i* oqp0) { |
| const __m128i four = _mm_set1_epi16(4); |
| const __m128i qp3_lo = qp3; |
| const __m128i qp2_lo = qp2; |
| const __m128i qp1_lo = qp1; |
| const __m128i qp0_lo = qp0; |
| const __m128i pq2_lo = _mm_shuffle_epi32(qp2_lo, 0x4e); |
| const __m128i pq1_lo = _mm_shuffle_epi32(qp1_lo, 0x4e); |
| const __m128i pq0_lo = _mm_shuffle_epi32(qp0_lo, 0x4e); |
| |
| __m128i f8_lo = |
| _mm_add_epi16(_mm_add_epi16(qp3_lo, four), _mm_add_epi16(qp3_lo, qp3_lo)); |
| |
| f8_lo = _mm_add_epi16(_mm_add_epi16(f8_lo, qp2_lo), qp2_lo); |
| |
| f8_lo = _mm_add_epi16(_mm_add_epi16(f8_lo, qp1_lo), |
| _mm_add_epi16(qp0_lo, pq0_lo)); |
| |
| // p3 + p3 + p3 + 2 * p2 + p1 + p0 + q0 |
| // q3 + q3 + q3 + 2 * q2 + q1 + q0 + p0 |
| *oqp2 = _mm_srli_epi16(f8_lo, 3); |
| |
| // p3 + p3 + p2 + 2 * p1 + p0 + q0 + q1 |
| // q3 + q3 + q2 + 2 * q1 + q0 + p0 + p1 |
| f8_lo = FilterAdd2Sub2(f8_lo, qp1_lo, pq1_lo, qp3_lo, qp2_lo); |
| *oqp1 = _mm_srli_epi16(f8_lo, 3); |
| |
| // p3 + p2 + p1 + 2 * p0 + q0 + q1 + q2 |
| // q3 + q2 + q1 + 2 * q0 + p0 + p1 + p2 |
| f8_lo = FilterAdd2Sub2(f8_lo, qp0_lo, pq2_lo, qp3_lo, qp1_lo); |
| *oqp0 = _mm_srli_epi16(f8_lo, 3); |
| } |
| |
| template <int bitdepth> |
| void LoopFilterFuncs_SSE4_1<bitdepth>::Horizontal8(void* dest, |
| ptrdiff_t stride8, |
| int outer_thresh, |
| int inner_thresh, |
| int hev_thresh) { |
| auto* const dst = static_cast<uint16_t*>(dest); |
| const ptrdiff_t stride = stride8 / 2; |
| const __m128i v_flat_thresh = |
| _mm_shufflelo_epi16(_mm_cvtsi32_si128(1 << kThreshShift), 0); |
| const __m128i v_outer_thresh = |
| _mm_shufflelo_epi16(_mm_cvtsi32_si128(outer_thresh << kThreshShift), 0); |
| const __m128i v_inner_thresh = |
| _mm_shufflelo_epi16(_mm_cvtsi32_si128(inner_thresh << kThreshShift), 0); |
| const __m128i v_hev_thresh = |
| _mm_shufflelo_epi16(_mm_cvtsi32_si128(hev_thresh << kThreshShift), 0); |
| |
| const __m128i p3 = LoadLo8(dst - 4 * stride); |
| const __m128i p2 = LoadLo8(dst - 3 * stride); |
| const __m128i p1 = LoadLo8(dst - 2 * stride); |
| const __m128i p0 = LoadLo8(dst - 1 * stride); |
| const __m128i q0 = LoadLo8(dst + 0 * stride); |
| const __m128i q1 = LoadLo8(dst + 1 * stride); |
| const __m128i q2 = LoadLo8(dst + 2 * stride); |
| const __m128i q3 = LoadLo8(dst + 3 * stride); |
| const __m128i qp3 = _mm_unpacklo_epi64(p3, q3); |
| const __m128i qp2 = _mm_unpacklo_epi64(p2, q2); |
| const __m128i qp1 = _mm_unpacklo_epi64(p1, q1); |
| const __m128i qp0 = _mm_unpacklo_epi64(p0, q0); |
| |
| const __m128i v_hev_mask = Hev(qp1, qp0, v_hev_thresh); |
| const __m128i v_needs_mask = |
| NeedsFilter8(qp3, qp2, qp1, qp0, v_outer_thresh, v_inner_thresh); |
| __m128i oqp1; |
| __m128i oqp0; |
| |
| Filter4(qp1, qp0, &oqp1, &oqp0, v_needs_mask, v_hev_mask, bitdepth); |
| |
| const __m128i v_isflat4_mask = IsFlat4(qp3, qp2, qp1, qp0, v_flat_thresh); |
| const __m128i v_mask_lo = _mm_and_si128(v_needs_mask, v_isflat4_mask); |
| const __m128i v_mask = _mm_unpacklo_epi64(v_mask_lo, v_mask_lo); |
| |
| if (_mm_test_all_zeros(v_mask, _mm_cmpeq_epi16(v_mask, v_mask)) == 0) { |
| __m128i oqp2_f8; |
| __m128i oqp1_f8; |
| __m128i oqp0_f8; |
| |
| Filter8(qp3, qp2, qp1, qp0, &oqp2_f8, &oqp1_f8, &oqp0_f8); |
| |
| oqp2_f8 = _mm_blendv_epi8(qp2, oqp2_f8, v_mask); |
| oqp1 = _mm_blendv_epi8(oqp1, oqp1_f8, v_mask); |
| oqp0 = _mm_blendv_epi8(oqp0, oqp0_f8, v_mask); |
| StoreLo8(dst - 3 * stride, oqp2_f8); |
| StoreHi8(dst + 2 * stride, oqp2_f8); |
| } |
| |
| StoreLo8(dst - 2 * stride, oqp1); |
| StoreLo8(dst - 1 * stride, oqp0); |
| StoreHi8(dst + 0 * stride, oqp0); |
| StoreHi8(dst + 1 * stride, oqp1); |
| } |
| |
| inline void TransposeLower4x8To8x4(const __m128i& x0, const __m128i& x1, |
| const __m128i& x2, const __m128i& x3, |
| const __m128i& x4, const __m128i& x5, |
| const __m128i& x6, const __m128i& x7, |
| __m128i* d0, __m128i* d1, __m128i* d2, |
| __m128i* d3) { |
| // input |
// x0 00 01 02 03 xx xx xx xx
// x1 10 11 12 13 xx xx xx xx
// x2 20 21 22 23 xx xx xx xx
// x3 30 31 32 33 xx xx xx xx
// x4 40 41 42 43 xx xx xx xx
// x5 50 51 52 53 xx xx xx xx
// x6 60 61 62 63 xx xx xx xx
// x7 70 71 72 73 xx xx xx xx
| // output |
| // d0 00 10 20 30 40 50 60 70 |
| // d1 01 11 21 31 41 51 61 71 |
| // d2 02 12 22 32 42 52 62 72 |
| // d3 03 13 23 33 43 53 63 73 |
| |
| // 00 10 01 11 02 12 03 13 |
| const __m128i w0 = _mm_unpacklo_epi16(x0, x1); |
| // 20 30 21 31 22 32 23 33 |
| const __m128i w1 = _mm_unpacklo_epi16(x2, x3); |
| // 40 50 41 51 42 52 43 53 |
| const __m128i w2 = _mm_unpacklo_epi16(x4, x5); |
| // 60 70 61 71 62 72 63 73 |
| const __m128i w3 = _mm_unpacklo_epi16(x6, x7); |
| |
| // 00 10 20 30 01 11 21 31 |
| const __m128i w4 = _mm_unpacklo_epi32(w0, w1); |
| // 40 50 60 70 41 51 61 71 |
| const __m128i w5 = _mm_unpacklo_epi32(w2, w3); |
| // 02 12 22 32 03 13 23 33 |
| const __m128i w6 = _mm_unpackhi_epi32(w0, w1); |
| // 42 52 62 72 43 53 63 73 |
| const __m128i w7 = _mm_unpackhi_epi32(w2, w3); |
| |
| // 00 10 20 30 40 50 60 70 |
| *d0 = _mm_unpacklo_epi64(w4, w5); |
| // 01 11 21 31 41 51 61 71 |
| *d1 = _mm_unpackhi_epi64(w4, w5); |
| // 02 12 22 32 42 52 62 72 |
| *d2 = _mm_unpacklo_epi64(w6, w7); |
| // 03 13 23 33 43 53 63 73 |
| *d3 = _mm_unpackhi_epi64(w6, w7); |
| } |
| |
| template <int bitdepth> |
| void LoopFilterFuncs_SSE4_1<bitdepth>::Vertical8(void* dest, ptrdiff_t stride8, |
| int outer_thresh, |
| int inner_thresh, |
| int hev_thresh) { |
| auto* const dst = static_cast<uint16_t*>(dest); |
| const ptrdiff_t stride = stride8 / 2; |
| const __m128i v_flat_thresh = |
| _mm_shufflelo_epi16(_mm_cvtsi32_si128(1 << kThreshShift), 0); |
| const __m128i v_outer_thresh = |
| _mm_shufflelo_epi16(_mm_cvtsi32_si128(outer_thresh << kThreshShift), 0); |
| const __m128i v_inner_thresh = |
| _mm_shufflelo_epi16(_mm_cvtsi32_si128(inner_thresh << kThreshShift), 0); |
| const __m128i v_hev_thresh = |
| _mm_shufflelo_epi16(_mm_cvtsi32_si128(hev_thresh << kThreshShift), 0); |
| |
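  // Each 16-byte row load covers exactly the eight columns p3..q3.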
| __m128i x0 = LoadUnaligned16(dst - 4 + 0 * stride); |
| __m128i x1 = LoadUnaligned16(dst - 4 + 1 * stride); |
| __m128i x2 = LoadUnaligned16(dst - 4 + 2 * stride); |
| __m128i x3 = LoadUnaligned16(dst - 4 + 3 * stride); |
| |
| __m128i p3, p2, p1, p0, q0, q1, q2, q3; |
| Transpose8x4To4x8(x0, x1, x2, x3, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3); |
| |
| const __m128i qp3 = _mm_unpacklo_epi64(p3, q3); |
| const __m128i qp2 = _mm_unpacklo_epi64(p2, q2); |
| const __m128i qp1 = _mm_unpacklo_epi64(p1, q1); |
| const __m128i qp0 = _mm_unpacklo_epi64(p0, q0); |
| |
| const __m128i v_hev_mask = Hev(qp1, qp0, v_hev_thresh); |
| const __m128i v_needs_mask = |
| NeedsFilter8(qp3, qp2, qp1, qp0, v_outer_thresh, v_inner_thresh); |
| __m128i oqp1; |
| __m128i oqp0; |
| |
| Filter4(qp1, qp0, &oqp1, &oqp0, v_needs_mask, v_hev_mask, bitdepth); |
| |
| const __m128i v_isflat4_mask = IsFlat4(qp3, qp2, qp1, qp0, v_flat_thresh); |
| const __m128i v_mask_lo = _mm_and_si128(v_needs_mask, v_isflat4_mask); |
| const __m128i v_mask = _mm_unpacklo_epi64(v_mask_lo, v_mask_lo); |
| |
| if (_mm_test_all_zeros(v_mask, _mm_cmpeq_epi16(v_mask, v_mask)) == 0) { |
| __m128i oqp2_f8; |
| __m128i oqp1_f8; |
| __m128i oqp0_f8; |
| |
| Filter8(qp3, qp2, qp1, qp0, &oqp2_f8, &oqp1_f8, &oqp0_f8); |
| |
| oqp2_f8 = _mm_blendv_epi8(qp2, oqp2_f8, v_mask); |
| oqp1 = _mm_blendv_epi8(oqp1, oqp1_f8, v_mask); |
| oqp0 = _mm_blendv_epi8(oqp0, oqp0_f8, v_mask); |
| |
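    // oqp2_f8 holds the filtered p2 in its low half and q2 in its high half.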
| p2 = oqp2_f8; |
| q2 = _mm_srli_si128(oqp2_f8, 8); |
| } |
| |
| p1 = oqp1; |
| p0 = oqp0; |
| q0 = _mm_srli_si128(oqp0, 8); |
| q1 = _mm_srli_si128(oqp1, 8); |
| |
| TransposeLower4x8To8x4(p3, p2, p1, p0, q0, q1, q2, q3, &x0, &x1, &x2, &x3); |
| |
| StoreUnaligned16(dst - 4 + 0 * stride, x0); |
| StoreUnaligned16(dst - 4 + 1 * stride, x1); |
| StoreUnaligned16(dst - 4 + 2 * stride, x2); |
| StoreUnaligned16(dst - 4 + 3 * stride, x3); |
| } |
| |
| //------------------------------------------------------------------------------ |
| // 13-tap filters |
| |
| inline void Filter14(const __m128i& qp6, const __m128i& qp5, const __m128i& qp4, |
| const __m128i& qp3, const __m128i& qp2, const __m128i& qp1, |
| const __m128i& qp0, __m128i* oqp5, __m128i* oqp4, |
| __m128i* oqp3, __m128i* oqp2, __m128i* oqp1, |
| __m128i* oqp0) { |
| const __m128i eight = _mm_set1_epi16(8); |
| const __m128i qp6_lo = qp6; |
| const __m128i qp5_lo = qp5; |
| const __m128i qp4_lo = qp4; |
| const __m128i qp3_lo = qp3; |
| const __m128i qp2_lo = qp2; |
| const __m128i qp1_lo = qp1; |
| const __m128i qp0_lo = qp0; |
| const __m128i pq5_lo = _mm_shuffle_epi32(qp5_lo, 0x4e); |
| const __m128i pq4_lo = _mm_shuffle_epi32(qp4_lo, 0x4e); |
| const __m128i pq3_lo = _mm_shuffle_epi32(qp3_lo, 0x4e); |
| const __m128i pq2_lo = _mm_shuffle_epi32(qp2_lo, 0x4e); |
| const __m128i pq1_lo = _mm_shuffle_epi32(qp1_lo, 0x4e); |
| const __m128i pq0_lo = _mm_shuffle_epi32(qp0_lo, 0x4e); |
| |
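  // Start with p6 * 7 plus the rounding term 8 (and likewise q6 * 7 + 8 in
  // the upper lanes).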
| __m128i f14_lo = |
| _mm_add_epi16(eight, _mm_sub_epi16(_mm_slli_epi16(qp6_lo, 3), qp6_lo)); |
| |
| f14_lo = _mm_add_epi16(_mm_add_epi16(f14_lo, qp5_lo), |
| _mm_add_epi16(qp5_lo, qp4_lo)); |
| |
| f14_lo = _mm_add_epi16(_mm_add_epi16(f14_lo, qp4_lo), |
| _mm_add_epi16(qp3_lo, qp2_lo)); |
| |
| f14_lo = _mm_add_epi16(_mm_add_epi16(f14_lo, qp1_lo), |
| _mm_add_epi16(qp0_lo, pq0_lo)); |
| |
| // p6 * 7 + p5 * 2 + p4 * 2 + p3 + p2 + p1 + p0 + q0 |
| // q6 * 7 + q5 * 2 + q4 * 2 + q3 + q2 + q1 + q0 + p0 |
| *oqp5 = _mm_srli_epi16(f14_lo, 4); |
| |
| // p6 * 5 + p5 * 2 + p4 * 2 + p3 * 2 + p2 + p1 + p0 + q0 + q1 |
| // q6 * 5 + q5 * 2 + q4 * 2 + q3 * 2 + q2 + q1 + q0 + p0 + p1 |
| f14_lo = FilterAdd2Sub2(f14_lo, qp3_lo, pq1_lo, qp6_lo, qp6_lo); |
| *oqp4 = _mm_srli_epi16(f14_lo, 4); |
| |
| // p6 * 4 + p5 + p4 * 2 + p3 * 2 + p2 * 2 + p1 + p0 + q0 + q1 + q2 |
| // q6 * 4 + q5 + q4 * 2 + q3 * 2 + q2 * 2 + q1 + q0 + p0 + p1 + p2 |
| f14_lo = FilterAdd2Sub2(f14_lo, qp2_lo, pq2_lo, qp6_lo, qp5_lo); |
| *oqp3 = _mm_srli_epi16(f14_lo, 4); |
| |
| // p6 * 3 + p5 + p4 + p3 * 2 + p2 * 2 + p1 * 2 + p0 + q0 + q1 + q2 + q3 |
| // q6 * 3 + q5 + q4 + q3 * 2 + q2 * 2 + q1 * 2 + q0 + p0 + p1 + p2 + p3 |
| f14_lo = FilterAdd2Sub2(f14_lo, qp1_lo, pq3_lo, qp6_lo, qp4_lo); |
| *oqp2 = _mm_srli_epi16(f14_lo, 4); |
| |
| // p6 * 2 + p5 + p4 + p3 + p2 * 2 + p1 * 2 + p0 * 2 + q0 + q1 + q2 + q3 + q4 |
| // q6 * 2 + q5 + q4 + q3 + q2 * 2 + q1 * 2 + q0 * 2 + p0 + p1 + p2 + p3 + p4 |
| f14_lo = FilterAdd2Sub2(f14_lo, qp0_lo, pq4_lo, qp6_lo, qp3_lo); |
| *oqp1 = _mm_srli_epi16(f14_lo, 4); |
| |
| // p6 + p5 + p4 + p3 + p2 + p1 * 2 + p0 * 2 + q0 * 2 + q1 + q2 + q3 + q4 + q5 |
| // q6 + q5 + q4 + q3 + q2 + q1 * 2 + q0 * 2 + p0 * 2 + p1 + p2 + p3 + p4 + p5 |
| f14_lo = FilterAdd2Sub2(f14_lo, pq0_lo, pq5_lo, qp6_lo, qp2_lo); |
| *oqp0 = _mm_srli_epi16(f14_lo, 4); |
| } |
| |
| template <int bitdepth> |
| void LoopFilterFuncs_SSE4_1<bitdepth>::Horizontal14(void* dest, |
| ptrdiff_t stride8, |
| int outer_thresh, |
| int inner_thresh, |
| int hev_thresh) { |
| auto* const dst = static_cast<uint16_t*>(dest); |
| const ptrdiff_t stride = stride8 / 2; |
| const __m128i v_flat_thresh = |
| _mm_shufflelo_epi16(_mm_cvtsi32_si128(1 << kThreshShift), 0); |
| const __m128i v_outer_thresh = |
| _mm_shufflelo_epi16(_mm_cvtsi32_si128(outer_thresh << kThreshShift), 0); |
| const __m128i v_inner_thresh = |
| _mm_shufflelo_epi16(_mm_cvtsi32_si128(inner_thresh << kThreshShift), 0); |
| const __m128i v_hev_thresh = |
| _mm_shufflelo_epi16(_mm_cvtsi32_si128(hev_thresh << kThreshShift), 0); |
| |
| const __m128i p3 = LoadLo8(dst - 4 * stride); |
| const __m128i p2 = LoadLo8(dst - 3 * stride); |
| const __m128i p1 = LoadLo8(dst - 2 * stride); |
| const __m128i p0 = LoadLo8(dst - 1 * stride); |
| const __m128i q0 = LoadLo8(dst + 0 * stride); |
| const __m128i q1 = LoadLo8(dst + 1 * stride); |
| const __m128i q2 = LoadLo8(dst + 2 * stride); |
| const __m128i q3 = LoadLo8(dst + 3 * stride); |
| const __m128i qp3 = _mm_unpacklo_epi64(p3, q3); |
| const __m128i qp2 = _mm_unpacklo_epi64(p2, q2); |
| const __m128i qp1 = _mm_unpacklo_epi64(p1, q1); |
| const __m128i qp0 = _mm_unpacklo_epi64(p0, q0); |
| |
| const __m128i v_hev_mask = Hev(qp1, qp0, v_hev_thresh); |
| const __m128i v_needs_mask = |
| NeedsFilter8(qp3, qp2, qp1, qp0, v_outer_thresh, v_inner_thresh); |
| |
| __m128i oqp1; |
| __m128i oqp0; |
| |
| Filter4(qp1, qp0, &oqp1, &oqp0, v_needs_mask, v_hev_mask, bitdepth); |
| |
| const __m128i v_isflat4_mask = IsFlat4(qp3, qp2, qp1, qp0, v_flat_thresh); |
| const __m128i v_mask_lo = _mm_and_si128(v_needs_mask, v_isflat4_mask); |
| const __m128i v_mask = _mm_unpacklo_epi64(v_mask_lo, v_mask_lo); |
| |
| if (_mm_test_all_zeros(v_mask, _mm_cmpeq_epi16(v_mask, v_mask)) == 0) { |
| const __m128i p6 = LoadLo8(dst - 7 * stride); |
| const __m128i p5 = LoadLo8(dst - 6 * stride); |
| const __m128i p4 = LoadLo8(dst - 5 * stride); |
| const __m128i q4 = LoadLo8(dst + 4 * stride); |
| const __m128i q5 = LoadLo8(dst + 5 * stride); |
| const __m128i q6 = LoadLo8(dst + 6 * stride); |
| const __m128i qp6 = _mm_unpacklo_epi64(p6, q6); |
| const __m128i qp5 = _mm_unpacklo_epi64(p5, q5); |
| const __m128i qp4 = _mm_unpacklo_epi64(p4, q4); |
| |
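    // The 13-tap path also requires the outer taps to be flat:
    // max(|p6 - p0|, |p5 - p0|, |p4 - p0|) <= flat_thresh, and likewise on
    // the q side.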
| const __m128i v_isflatouter4_mask = |
| IsFlat4(qp6, qp5, qp4, qp0, v_flat_thresh); |
| const __m128i v_flat4_mask_lo = _mm_and_si128(v_mask, v_isflatouter4_mask); |
| const __m128i v_flat4_mask = |
| _mm_unpacklo_epi64(v_flat4_mask_lo, v_flat4_mask_lo); |
| |
| __m128i oqp2_f8; |
| __m128i oqp1_f8; |
| __m128i oqp0_f8; |
| |
| Filter8(qp3, qp2, qp1, qp0, &oqp2_f8, &oqp1_f8, &oqp0_f8); |
| |
| oqp2_f8 = _mm_blendv_epi8(qp2, oqp2_f8, v_mask); |
| oqp1 = _mm_blendv_epi8(oqp1, oqp1_f8, v_mask); |
| oqp0 = _mm_blendv_epi8(oqp0, oqp0_f8, v_mask); |
| |
| if (_mm_test_all_zeros(v_flat4_mask, |
| _mm_cmpeq_epi16(v_flat4_mask, v_flat4_mask)) == 0) { |
| __m128i oqp5_f14; |
| __m128i oqp4_f14; |
| __m128i oqp3_f14; |
| __m128i oqp2_f14; |
| __m128i oqp1_f14; |
| __m128i oqp0_f14; |
| |
| Filter14(qp6, qp5, qp4, qp3, qp2, qp1, qp0, &oqp5_f14, &oqp4_f14, |
| &oqp3_f14, &oqp2_f14, &oqp1_f14, &oqp0_f14); |
| |
| oqp5_f14 = _mm_blendv_epi8(qp5, oqp5_f14, v_flat4_mask); |
| oqp4_f14 = _mm_blendv_epi8(qp4, oqp4_f14, v_flat4_mask); |
| oqp3_f14 = _mm_blendv_epi8(qp3, oqp3_f14, v_flat4_mask); |
| oqp2_f8 = _mm_blendv_epi8(oqp2_f8, oqp2_f14, v_flat4_mask); |
| oqp1 = _mm_blendv_epi8(oqp1, oqp1_f14, v_flat4_mask); |
| oqp0 = _mm_blendv_epi8(oqp0, oqp0_f14, v_flat4_mask); |
| |
| StoreLo8(dst - 6 * stride, oqp5_f14); |
| StoreLo8(dst - 5 * stride, oqp4_f14); |
| StoreLo8(dst - 4 * stride, oqp3_f14); |
| |
| StoreHi8(dst + 3 * stride, oqp3_f14); |
| StoreHi8(dst + 4 * stride, oqp4_f14); |
| StoreHi8(dst + 5 * stride, oqp5_f14); |
| } |
| |
| StoreLo8(dst - 3 * stride, oqp2_f8); |
| StoreHi8(dst + 2 * stride, oqp2_f8); |
| } |
| |
| StoreLo8(dst - 2 * stride, oqp1); |
| StoreLo8(dst - 1 * stride, oqp0); |
| StoreHi8(dst + 0 * stride, oqp0); |
| StoreHi8(dst + 1 * stride, oqp1); |
| } |
| |
| inline void TransposeUpper4x8To8x4(const __m128i& x0, const __m128i& x1, |
| const __m128i& x2, const __m128i& x3, |
| const __m128i& x4, const __m128i& x5, |
| const __m128i& x6, const __m128i& x7, |
| __m128i* d0, __m128i* d1, __m128i* d2, |
| __m128i* d3) { |
| // input |
// x0 xx xx xx xx 00 01 02 03
// x1 xx xx xx xx 10 11 12 13
// x2 xx xx xx xx 20 21 22 23
// x3 xx xx xx xx 30 31 32 33
// x4 xx xx xx xx 40 41 42 43
// x5 xx xx xx xx 50 51 52 53
// x6 xx xx xx xx 60 61 62 63
// x7 xx xx xx xx 70 71 72 73
| // output |
| // d0 00 10 20 30 40 50 60 70 |
| // d1 01 11 21 31 41 51 61 71 |
| // d2 02 12 22 32 42 52 62 72 |
| // d3 03 13 23 33 43 53 63 73 |
| |
| // 00 10 01 11 02 12 03 13 |
| const __m128i w0 = _mm_unpackhi_epi16(x0, x1); |
| // 20 30 21 31 22 32 23 33 |
| const __m128i w1 = _mm_unpackhi_epi16(x2, x3); |
| // 40 50 41 51 42 52 43 53 |
| const __m128i w2 = _mm_unpackhi_epi16(x4, x5); |
| // 60 70 61 71 62 72 63 73 |
| const __m128i w3 = _mm_unpackhi_epi16(x6, x7); |
| |
| // 00 10 20 30 01 11 21 31 |
| const __m128i w4 = _mm_unpacklo_epi32(w0, w1); |
| // 40 50 60 70 41 51 61 71 |
| const __m128i w5 = _mm_unpacklo_epi32(w2, w3); |
| // 02 12 22 32 03 13 23 33 |
| const __m128i w6 = _mm_unpackhi_epi32(w0, w1); |
| // 42 52 62 72 43 53 63 73 |
| const __m128i w7 = _mm_unpackhi_epi32(w2, w3); |
| |
| // 00 10 20 30 40 50 60 70 |
| *d0 = _mm_unpacklo_epi64(w4, w5); |
| // 01 11 21 31 41 51 61 71 |
| *d1 = _mm_unpackhi_epi64(w4, w5); |
| // 02 12 22 32 42 52 62 72 |
| *d2 = _mm_unpacklo_epi64(w6, w7); |
| // 03 13 23 33 43 53 63 73 |
| *d3 = _mm_unpackhi_epi64(w6, w7); |
| } |
| |
| template <int bitdepth> |
| void LoopFilterFuncs_SSE4_1<bitdepth>::Vertical14(void* dest, ptrdiff_t stride8, |
| int outer_thresh, |
| int inner_thresh, |
| int hev_thresh) { |
| auto* const dst = static_cast<uint16_t*>(dest); |
| const ptrdiff_t stride = stride8 / 2; |
| const __m128i v_flat_thresh = |
| _mm_shufflelo_epi16(_mm_cvtsi32_si128(1 << kThreshShift), 0); |
| const __m128i v_outer_thresh = |
| _mm_shufflelo_epi16(_mm_cvtsi32_si128(outer_thresh << kThreshShift), 0); |
| const __m128i v_inner_thresh = |
| _mm_shufflelo_epi16(_mm_cvtsi32_si128(inner_thresh << kThreshShift), 0); |
| const __m128i v_hev_thresh = |
| _mm_shufflelo_epi16(_mm_cvtsi32_si128(hev_thresh << kThreshShift), 0); |
| |
| // p7 p6 p5 p4 p3 p2 p1 p0 q0 q1 q2 q3 q4 q5 q6 q7 |
| // |
| // 00 01 02 03 04 05 06 07 08 09 0a 0b 0c 0d 0e 0f |
| // 10 11 12 13 14 15 16 17 18 19 1a 1b 1c 1d 1e 1f |
| // 20 21 22 23 24 25 26 27 28 29 2a 2b 2c 2d 2e 2f |
| // 30 31 32 33 34 35 36 37 38 39 3a 3b 3c 3d 3e 3f |
| |
| __m128i x0 = LoadUnaligned16(dst - 8 + 0 * stride); |
| __m128i x1 = LoadUnaligned16(dst - 8 + 1 * stride); |
| __m128i x2 = LoadUnaligned16(dst - 8 + 2 * stride); |
| __m128i x3 = LoadUnaligned16(dst - 8 + 3 * stride); |
| |
| __m128i p7, p6, p5, p4, p3, p2, p1, p0; |
| __m128i q7, q6, q5, q4, q3, q2, q1, q0; |
| |
| Transpose8x4To4x8(x0, x1, x2, x3, &p7, &p6, &p5, &p4, &p3, &p2, &p1, &p0); |
| |
| x0 = LoadUnaligned16(dst - 8 + 8 + 0 * stride); |
| x1 = LoadUnaligned16(dst - 8 + 8 + 1 * stride); |
| x2 = LoadUnaligned16(dst - 8 + 8 + 2 * stride); |
| x3 = LoadUnaligned16(dst - 8 + 8 + 3 * stride); |
| |
| Transpose8x4To4x8(x0, x1, x2, x3, &q0, &q1, &q2, &q3, &q4, &q5, &q6, &q7); |
| |
| __m128i qp7 = _mm_unpacklo_epi64(p7, q7); |
| __m128i qp6 = _mm_unpacklo_epi64(p6, q6); |
| __m128i qp5 = _mm_unpacklo_epi64(p5, q5); |
| __m128i qp4 = _mm_unpacklo_epi64(p4, q4); |
| __m128i qp3 = _mm_unpacklo_epi64(p3, q3); |
| __m128i qp2 = _mm_unpacklo_epi64(p2, q2); |
| __m128i qp1 = _mm_unpacklo_epi64(p1, q1); |
| __m128i qp0 = _mm_unpacklo_epi64(p0, q0); |
| |
| const __m128i v_hev_mask = Hev(qp1, qp0, v_hev_thresh); |
| const __m128i v_needs_mask = |
| NeedsFilter8(qp3, qp2, qp1, qp0, v_outer_thresh, v_inner_thresh); |
| |
| __m128i oqp1; |
| __m128i oqp0; |
| |
| Filter4(qp1, qp0, &oqp1, &oqp0, v_needs_mask, v_hev_mask, bitdepth); |
| |
| const __m128i v_isflat4_mask = IsFlat4(qp3, qp2, qp1, qp0, v_flat_thresh); |
| const __m128i v_mask_lo = _mm_and_si128(v_needs_mask, v_isflat4_mask); |
| const __m128i v_mask = _mm_unpacklo_epi64(v_mask_lo, v_mask_lo); |
| |
| if (_mm_test_all_zeros(v_mask, _mm_cmpeq_epi16(v_mask, v_mask)) == 0) { |
| const __m128i v_isflatouter4_mask = |
| IsFlat4(qp6, qp5, qp4, qp0, v_flat_thresh); |
| const __m128i v_flat4_mask_lo = _mm_and_si128(v_mask, v_isflatouter4_mask); |
| const __m128i v_flat4_mask = |
| _mm_unpacklo_epi64(v_flat4_mask_lo, v_flat4_mask_lo); |
| |
| __m128i oqp2_f8; |
| __m128i oqp1_f8; |
| __m128i oqp0_f8; |
| |
| Filter8(qp3, qp2, qp1, qp0, &oqp2_f8, &oqp1_f8, &oqp0_f8); |
| |
| oqp2_f8 = _mm_blendv_epi8(qp2, oqp2_f8, v_mask); |
| oqp1 = _mm_blendv_epi8(oqp1, oqp1_f8, v_mask); |
| oqp0 = _mm_blendv_epi8(oqp0, oqp0_f8, v_mask); |
| |
| if (_mm_test_all_zeros(v_flat4_mask, |
| _mm_cmpeq_epi16(v_flat4_mask, v_flat4_mask)) == 0) { |
| __m128i oqp5_f14; |
| __m128i oqp4_f14; |
| __m128i oqp3_f14; |
| __m128i oqp2_f14; |
| __m128i oqp1_f14; |
| __m128i oqp0_f14; |
| |
| Filter14(qp6, qp5, qp4, qp3, qp2, qp1, qp0, &oqp5_f14, &oqp4_f14, |
| &oqp3_f14, &oqp2_f14, &oqp1_f14, &oqp0_f14); |
| |
| oqp5_f14 = _mm_blendv_epi8(qp5, oqp5_f14, v_flat4_mask); |
| oqp4_f14 = _mm_blendv_epi8(qp4, oqp4_f14, v_flat4_mask); |
| oqp3_f14 = _mm_blendv_epi8(qp3, oqp3_f14, v_flat4_mask); |
| oqp2_f8 = _mm_blendv_epi8(oqp2_f8, oqp2_f14, v_flat4_mask); |
| oqp1 = _mm_blendv_epi8(oqp1, oqp1_f14, v_flat4_mask); |
| oqp0 = _mm_blendv_epi8(oqp0, oqp0_f14, v_flat4_mask); |
| qp3 = oqp3_f14; |
| qp4 = oqp4_f14; |
| qp5 = oqp5_f14; |
| } |
| qp2 = oqp2_f8; |
| } |
| |
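  // Transpose the p side (low halves) back to rows for the left 8x4 block,
  // then the q side (high halves) for the right 8x4 block.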
| TransposeLower4x8To8x4(qp7, qp6, qp5, qp4, qp3, qp2, oqp1, oqp0, &x0, &x1, |
| &x2, &x3); |
| |
| StoreUnaligned16(dst - 8 + 0 * stride, x0); |
| StoreUnaligned16(dst - 8 + 1 * stride, x1); |
| StoreUnaligned16(dst - 8 + 2 * stride, x2); |
| StoreUnaligned16(dst - 8 + 3 * stride, x3); |
| |
| TransposeUpper4x8To8x4(oqp0, oqp1, qp2, qp3, qp4, qp5, qp6, qp7, &x0, &x1, |
| &x2, &x3); |
| |
| StoreUnaligned16(dst - 8 + 8 + 0 * stride, x0); |
| StoreUnaligned16(dst - 8 + 8 + 1 * stride, x1); |
| StoreUnaligned16(dst - 8 + 8 + 2 * stride, x2); |
| StoreUnaligned16(dst - 8 + 8 + 3 * stride, x3); |
| } |
| |
| using Defs10bpp = LoopFilterFuncs_SSE4_1<kBitdepth10>; |
| |
| void Init10bpp() { |
| Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10); |
| assert(dsp != nullptr); |
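  // Quiet an unused-variable warning when none of the functions below are
  // enabled.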
| static_cast<void>(dsp); |
| #if DSP_ENABLED_10BPP_SSE4_1(LoopFilterSize4_LoopFilterTypeHorizontal) |
| dsp->loop_filters[kLoopFilterSize4][kLoopFilterTypeHorizontal] = |
| Defs10bpp::Horizontal4; |
| #endif |
| #if DSP_ENABLED_10BPP_SSE4_1(LoopFilterSize6_LoopFilterTypeHorizontal) |
| dsp->loop_filters[kLoopFilterSize6][kLoopFilterTypeHorizontal] = |
| Defs10bpp::Horizontal6; |
| #endif |
| #if DSP_ENABLED_10BPP_SSE4_1(LoopFilterSize8_LoopFilterTypeHorizontal) |
| dsp->loop_filters[kLoopFilterSize8][kLoopFilterTypeHorizontal] = |
| Defs10bpp::Horizontal8; |
| #endif |
| #if DSP_ENABLED_10BPP_SSE4_1(LoopFilterSize14_LoopFilterTypeHorizontal) |
| dsp->loop_filters[kLoopFilterSize14][kLoopFilterTypeHorizontal] = |
| Defs10bpp::Horizontal14; |
| #endif |
| #if DSP_ENABLED_10BPP_SSE4_1(LoopFilterSize4_LoopFilterTypeVertical) |
| dsp->loop_filters[kLoopFilterSize4][kLoopFilterTypeVertical] = |
| Defs10bpp::Vertical4; |
| #endif |
| #if DSP_ENABLED_10BPP_SSE4_1(LoopFilterSize6_LoopFilterTypeVertical) |
| dsp->loop_filters[kLoopFilterSize6][kLoopFilterTypeVertical] = |
| Defs10bpp::Vertical6; |
| #endif |
| #if DSP_ENABLED_10BPP_SSE4_1(LoopFilterSize8_LoopFilterTypeVertical) |
| dsp->loop_filters[kLoopFilterSize8][kLoopFilterTypeVertical] = |
| Defs10bpp::Vertical8; |
| #endif |
| #if DSP_ENABLED_10BPP_SSE4_1(LoopFilterSize14_LoopFilterTypeVertical) |
| dsp->loop_filters[kLoopFilterSize14][kLoopFilterTypeVertical] = |
| Defs10bpp::Vertical14; |
| #endif |
| } |
} // namespace
} // namespace high_bitdepth
#endif // LIBGAV1_MAX_BITDEPTH >= 10
| |
| void LoopFilterInit_SSE4_1() { |
| low_bitdepth::Init8bpp(); |
| #if LIBGAV1_MAX_BITDEPTH >= 10 |
| high_bitdepth::Init10bpp(); |
| #endif |
| } |
| |
| } // namespace dsp |
| } // namespace libgav1 |
| |
| #else // !LIBGAV1_ENABLE_SSE4_1 |
| namespace libgav1 { |
| namespace dsp { |
| |
| void LoopFilterInit_SSE4_1() {} |
| |
| } // namespace dsp |
| } // namespace libgav1 |
| #endif // LIBGAV1_ENABLE_SSE4_1 |