// Copyright 2021 The libgav1 Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "src/dsp/intrapred_filter.h"
#include "src/utils/cpu.h"
#if LIBGAV1_TARGETING_SSE4_1
#include <xmmintrin.h>
#include <algorithm>
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <cstring>
#include "src/dsp/constants.h"
#include "src/dsp/dsp.h"
#include "src/dsp/x86/common_sse4.h"
#include "src/dsp/x86/transpose_sse4.h"
#include "src/utils/common.h"
#include "src/utils/constants.h"
namespace libgav1 {
namespace dsp {
namespace {
//------------------------------------------------------------------------------
// FilterIntraPredictor_SSE4_1
// Section 7.11.2.3. Recursive intra prediction process
// This filter applies recursively to 4x2 sub-blocks within the transform block,
// meaning that the predicted pixels in each sub-block are used as inputs to
// sub-blocks below and to the right, if present.
//
// Each output value in the sub-block is predicted by a different filter applied
// to the same array of top-left, top, and left values. If fn refers to the
// output of the nth filter, given this block:
// TL T0 T1 T2 T3
// L0 f0 f1 f2 f3
// L1 f4 f5 f6 f7
// The filter input order is p0, p1, p2, p3, p4, p5, p6:
// p0 p1 p2 p3 p4
// p5 f0 f1 f2 f3
// p6 f4 f5 f6 f7
// The filter taps are stored in rows of 8 for convenience, so we fix the 8th
// tap to 0 and disregard the value of the 8th input.
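// For illustration only (this helper is a sketch and is not called by the
// SIMD code below), the scalar equivalent of one output pixel, assuming the
// kFilterIntraTaps[pred][n] rows of 8 int8_t taps used throughout this file:
inline uint8_t FilterPixelScalar_Example(const int8_t* const taps,
                                         const uint8_t* const pixels) {
  int sum = 0;
  for (int i = 0; i < 7; ++i) sum += taps[i] * pixels[i];
  // Round, shift by 4, and clamp to [0, 255], matching
  // RightShiftWithRounding_S16 and _mm_packus_epi16 in Filter4x2_SSE4_1.
  return static_cast<uint8_t>(Clip3((sum + 8) >> 4, 0, 255));
}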
// This shuffle mask selects 32-bit blocks in the order 0, 1, 0, 1, which
// duplicates the first 8 bytes of a 128-bit vector into the second 8 bytes.
constexpr int kDuplicateFirstHalf = 0x44;
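// For example, _mm_shuffle_epi32(v, kDuplicateFirstHalf) maps bytes
// [a0 .. a7 | b0 .. b7] to [a0 .. a7 | a0 .. a7].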
// Applies all filter taps to the given 7 packed 8-bit pixel values; the 8th
// value is multiplied by the zero tap, which preserves the sums.
// |pixels| contains p0-p7 in order as shown above.
// |taps_0_1| contains the filter kernels used to predict f0 and f1, and so on.
inline void Filter4x2_SSE4_1(uint8_t* dst, const ptrdiff_t stride,
const __m128i& pixels, const __m128i& taps_0_1,
const __m128i& taps_2_3, const __m128i& taps_4_5,
const __m128i& taps_6_7) {
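// _mm_maddubs_epi16 multiplies each unsigned pixel byte by the corresponding
// signed tap byte and adds adjacent pairs of products, yielding four 16-bit
// partial sums per filter kernel.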
const __m128i mul_0_01 = _mm_maddubs_epi16(pixels, taps_0_1);
const __m128i mul_0_23 = _mm_maddubs_epi16(pixels, taps_2_3);
// |output_half| contains two 16-bit partial sums for each of f0-f3.
__m128i output_half = _mm_hadd_epi16(mul_0_01, mul_0_23);
__m128i output = _mm_hadd_epi16(output_half, output_half);
const __m128i output_row0 =
_mm_packus_epi16(RightShiftWithRounding_S16(output, 4),
/* unused half */ output);
Store4(dst, output_row0);
const __m128i mul_1_01 = _mm_maddubs_epi16(pixels, taps_4_5);
const __m128i mul_1_23 = _mm_maddubs_epi16(pixels, taps_6_7);
output_half = _mm_hadd_epi16(mul_1_01, mul_1_23);
output = _mm_hadd_epi16(output_half, output_half);
const __m128i output_row1 =
_mm_packus_epi16(RightShiftWithRounding_S16(output, 4),
/* arbitrary pack arg */ output);
Store4(dst + stride, output_row1);
}
// 4xH transform sizes are given special treatment because LoadLo8 goes out
// of bounds and every block involves the left column. The top-left pixel, p0,
// is stored in the top buffer for the first 4x2, but comes from the left buffer
// for successive blocks. This implementation takes advantage of the fact
// that the p5 and p6 for each sub-block come solely from the |left_ptr| buffer,
// using shifts to arrange things to fit reusable shuffle vectors.
inline void Filter4xH(uint8_t* dest, ptrdiff_t stride,
const uint8_t* const top_ptr,
const uint8_t* const left_ptr, FilterIntraPredictor pred,
const int height) {
// Two filter kernels per vector.
const __m128i taps_0_1 = LoadAligned16(kFilterIntraTaps[pred][0]);
const __m128i taps_2_3 = LoadAligned16(kFilterIntraTaps[pred][2]);
const __m128i taps_4_5 = LoadAligned16(kFilterIntraTaps[pred][4]);
const __m128i taps_6_7 = LoadAligned16(kFilterIntraTaps[pred][6]);
__m128i top = Load4(top_ptr - 1);
__m128i pixels = _mm_insert_epi8(top, top_ptr[3], 4);
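// For height 4 the left column holds only 4 pixels, so the full LoadLo8 would
// read past the end of the buffer.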
__m128i left = (height == 4 ? Load4(left_ptr) : LoadLo8(left_ptr));
left = _mm_slli_si128(left, 5);
// Relative pixels: top[-1], top[0], top[1], top[2], top[3], left[0], left[1],
// left[2], left[3], left[4], left[5], left[6], left[7]
// Let rn represent a pixel usable as pn for the 4x2 after this one. We get:
// 0  1  2  3  4  5  6  7  8  9  10 11 12 13 14 15
// p0 p1 p2 p3 p4 p5 p6 r5 r6 ...
//                   r0
pixels = _mm_or_si128(left, pixels);
// Two sets of the same input pixels to apply two filters at once.
pixels = _mm_shuffle_epi32(pixels, kDuplicateFirstHalf);
Filter4x2_SSE4_1(dest, stride, pixels, taps_0_1, taps_2_3, taps_4_5,
taps_6_7);
dest += stride; // Move to y = 1.
pixels = Load4(dest);
// Relative pixels: top[0], top[1], top[2], top[3], empty, left[-2], left[-1],
// left[0], left[1], ...
// 0  1  2  3  4  5  6  7  8  9  10 11 12 13 14 15
// p1 p2 p3 p4 xx xx p0 p5 p6 r5 r6 ...
//                         r0
pixels = _mm_or_si128(left, pixels);
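// _mm_shuffle_epi8 takes one source-byte index per output byte, starting with
// the least significant byte of the mask, so the hex constants below read
// right to left.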
// This mask rearranges bytes in the order: 6, 0, 1, 2, 3, 7, 8, 15. The last
// byte is an unused value, which shall be multiplied by 0 when we apply the
// filter.
constexpr int64_t kInsertTopLeftFirstMask = 0x0F08070302010006;
// Insert left[-1] in front as TL and put left[0] and left[1] at the end.
const __m128i pixel_order1 = _mm_set1_epi64x(kInsertTopLeftFirstMask);
pixels = _mm_shuffle_epi8(pixels, pixel_order1);
dest += stride; // Move to y = 2.
Filter4x2_SSE4_1(dest, stride, pixels, taps_0_1, taps_2_3, taps_4_5,
taps_6_7);
dest += stride; // Move to y = 3.
// Compute the middle 8 rows before using common code for the final 4 rows, in
// order to fit the assumption that |left| has the next TL at position 8.
if (height == 16) {
// This extra shift positions |left| so that pixel_order2 applies both now and
// again after the shift right by 2 below.
left = _mm_slli_si128(left, 1);
pixels = Load4(dest);
// Relative pixels: top[0], top[1], top[2], top[3], empty, empty, left[-4],
// left[-3], left[-2], left[-1], left[0], left[1], left[2], left[3]
// 0  1  2  3  4  5  6  7  8  9  10 11 12 13 14 15
// p1 p2 p3 p4 xx xx xx xx xx p0 p5 p6 r5 r6 ...
//                                  r0
pixels = _mm_or_si128(left, pixels);
// This mask rearranges bytes in the order: 9, 0, 1, 2, 3, 10, 11, 15. The
// last byte is an unused value, as above. The top-left was shifted to
// position nine to keep two empty spaces after the top pixels.
constexpr int64_t kInsertTopLeftSecondMask = 0x0F0B0A0302010009;
// Insert (relative) left[-1] in front as TL and put left[0] and left[1] at
// the end.
const __m128i pixel_order2 = _mm_set1_epi64x(kInsertTopLeftSecondMask);
pixels = _mm_shuffle_epi8(pixels, pixel_order2);
dest += stride; // Move to y = 4.
// First 4x2 in the if body.
Filter4x2_SSE4_1(dest, stride, pixels, taps_0_1, taps_2_3, taps_4_5,
taps_6_7);
// Clear all but final pixel in the first 8 of left column.
__m128i keep_top_left = _mm_srli_si128(left, 13);
dest += stride; // Move to y = 5.
pixels = Load4(dest);
left = _mm_srli_si128(left, 2);
// Relative pixels: top[0], top[1], top[2], top[3], left[-6],
// left[-5], left[-4], left[-3], left[-2], left[-1], left[0], left[1]
// 0  1  2  3  4  5  6  7  8  9  10 11 12 13 14 15
// p1 p2 p3 p4 xx xx xx xx xx p0 p5 p6 r5 r6 ...
//                                  r0
pixels = _mm_or_si128(left, pixels);
left = LoadLo8(left_ptr + 8);
pixels = _mm_shuffle_epi8(pixels, pixel_order2);
dest += stride; // Move to y = 6.
// Second 4x2 in the if body.
Filter4x2_SSE4_1(dest, stride, pixels, taps_0_1, taps_2_3, taps_4_5,
taps_6_7);
// Position TL value so we can use pixel_order1.
keep_top_left = _mm_slli_si128(keep_top_left, 6);
dest += stride; // Move to y = 7.
pixels = Load4(dest);
left = _mm_slli_si128(left, 7);
left = _mm_or_si128(left, keep_top_left);
// Relative pixels: top[0], top[1], top[2], top[3], empty, empty,
// left[-1], left[0], left[1], left[2], left[3], ...
// 0  1  2  3  4  5  6  7  8  9  10 11 12 13 14 15
// p1 p2 p3 p4 xx xx p0 p5 p6 r5 r6 ...
//                         r0
pixels = _mm_or_si128(left, pixels);
pixels = _mm_shuffle_epi8(pixels, pixel_order1);
dest += stride; // Move to y = 8.
// Third 4x2 in the if body.
Filter4x2_SSE4_1(dest, stride, pixels, taps_0_1, taps_2_3, taps_4_5,
taps_6_7);
dest += stride; // Move to y = 9.
// Prepare final inputs.
pixels = Load4(dest);
left = _mm_srli_si128(left, 2);
// Relative pixels: top[0], top[1], top[2], top[3], left[-3], left[-2],
// left[-1], left[0], left[1], left[2], left[3], ...
// 0  1  2  3  4  5  6  7  8  9  10 11 12 13 14 15
// p1 p2 p3 p4 xx xx p0 p5 p6 r5 r6 ...
//                         r0
pixels = _mm_or_si128(left, pixels);
pixels = _mm_shuffle_epi8(pixels, pixel_order1);
dest += stride; // Move to y = 10.
// Fourth 4x2 in the if body.
Filter4x2_SSE4_1(dest, stride, pixels, taps_0_1, taps_2_3, taps_4_5,
taps_6_7);
dest += stride; // Move to y = 11.
}
// At this point, in both the height 8 and height 16 cases, we can assume that
// |left| has the next TL at position 8.
if (height > 4) {
// Erase prior left pixels by shifting TL to position 0.
left = _mm_srli_si128(left, 8);
left = _mm_slli_si128(left, 6);
pixels = Load4(dest);
// Relative pixels: top[0], top[1], top[2], top[3], empty, empty,
// left[-1], left[0], left[1], left[2], left[3], ...
// 0  1  2  3  4  5  6  7  8  9  10 11 12 13 14 15
// p1 p2 p3 p4 xx xx p0 p5 p6 r5 r6 ...
//                         r0
pixels = _mm_or_si128(left, pixels);
pixels = _mm_shuffle_epi8(pixels, pixel_order1);
dest += stride; // Move to y = 12 or 4.
// First of final two 4x2 blocks.
Filter4x2_SSE4_1(dest, stride, pixels, taps_0_1, taps_2_3, taps_4_5,
taps_6_7);
dest += stride; // Move to y = 13 or 5.
pixels = Load4(dest);
left = _mm_srli_si128(left, 2);
// Relative pixels: top[0], top[1], top[2], top[3], left[-3], left[-2],
// left[-1], left[0], left[1], left[2], left[3], ...
// 0  1  2  3  4  5  6  7  8  9  10 11 12 13 14 15
// p1 p2 p3 p4 xx xx p0 p5 p6 r5 r6 ...
//                         r0
pixels = _mm_or_si128(left, pixels);
pixels = _mm_shuffle_epi8(pixels, pixel_order1);
dest += stride; // Move to y = 14 or 6.
// Last of final two 4x2 blocks.
Filter4x2_SSE4_1(dest, stride, pixels, taps_0_1, taps_2_3, taps_4_5,
taps_6_7);
}
}
void FilterIntraPredictor_SSE4_1(void* const dest, ptrdiff_t stride,
const void* const top_row,
const void* const left_column,
FilterIntraPredictor pred, const int width,
const int height) {
const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
auto* dst = static_cast<uint8_t*>(dest);
if (width == 4) {
Filter4xH(dst, stride, top_ptr, left_ptr, pred, height);
return;
}
// There is one set of 7 taps for each of the 4x2 output pixels.
const __m128i taps_0_1 = LoadAligned16(kFilterIntraTaps[pred][0]);
const __m128i taps_2_3 = LoadAligned16(kFilterIntraTaps[pred][2]);
const __m128i taps_4_5 = LoadAligned16(kFilterIntraTaps[pred][4]);
const __m128i taps_6_7 = LoadAligned16(kFilterIntraTaps[pred][6]);
// This mask rearranges bytes in the order: 0, 1, 2, 3, 4, 8, 9, 15. The 15 at
// the end is an unused value, which shall be multiplied by 0 when we apply
// the filter.
constexpr int64_t kCondenseLeftMask = 0x0F09080403020100;
// Takes the "left section" and puts it right after p0-p4.
const __m128i pixel_order1 = _mm_set1_epi64x(kCondenseLeftMask);
// This mask rearranges bytes in the order: 8, 0, 1, 2, 3, 9, 10, 15. The last
// byte is unused as above.
constexpr int64_t kInsertTopLeftMask = 0x0F0A090302010008;
// Shuffles the "top left" from the left section, to the front. Used when
// grabbing data from left_column and not top_row.
const __m128i pixel_order2 = _mm_set1_epi64x(kInsertTopLeftMask);
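// For example, applying pixel_order2 to [t0 t1 t2 t3 0 0 0 0 TL l0 l1 ...]
// yields [TL t0 t1 t2 t3 l0 l1 xx] in each 8-byte half: p0-p6 plus the
// ignored 8th input.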
// This first pass takes care of the cases where the top left pixel comes from
// top_row.
__m128i pixels = LoadLo8(top_ptr - 1);
__m128i left = _mm_slli_si128(Load4(left_column), 8);
pixels = _mm_or_si128(pixels, left);
// Two sets of the same pixels to multiply with two sets of taps.
pixels = _mm_shuffle_epi8(pixels, pixel_order1);
Filter4x2_SSE4_1(dst, stride, pixels, taps_0_1, taps_2_3, taps_4_5, taps_6_7);
left = _mm_srli_si128(left, 1);
// Load the second predicted row; it supplies the top pixels for the next 4x2.
pixels = Load4(dst + stride);
// Because of the above shift, this OR places left[0] in the last of the first
// 8 bytes of |pixels|. This is acceptable because that byte is never selected
// by pixel_order2, and the 8th filter tap is a padded 0 in any case.
pixels = _mm_or_si128(pixels, left);
pixels = _mm_shuffle_epi8(pixels, pixel_order2);
const ptrdiff_t stride2 = stride << 1;
const ptrdiff_t stride4 = stride << 2;
Filter4x2_SSE4_1(dst + stride2, stride, pixels, taps_0_1, taps_2_3, taps_4_5,
taps_6_7);
dst += 4;
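// |x| starts at 3 so that Load4(top_ptr + x) picks up the top-left pixel of
// the next 4x4 block along with the first three top pixels.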
for (int x = 3; x < width - 4; x += 4) {
pixels = Load4(top_ptr + x);
pixels = _mm_insert_epi8(pixels, top_ptr[x + 4], 4);
pixels = _mm_insert_epi8(pixels, dst[-1], 5);
pixels = _mm_insert_epi8(pixels, dst[stride - 1], 6);
// Duplicate bottom half into upper half.
pixels = _mm_shuffle_epi32(pixels, kDuplicateFirstHalf);
Filter4x2_SSE4_1(dst, stride, pixels, taps_0_1, taps_2_3, taps_4_5,
taps_6_7);
pixels = Load4(dst + stride - 1);
pixels = _mm_insert_epi8(pixels, dst[stride + 3], 4);
pixels = _mm_insert_epi8(pixels, dst[stride2 - 1], 5);
pixels = _mm_insert_epi8(pixels, dst[stride + stride2 - 1], 6);
// Duplicate bottom half into upper half.
pixels = _mm_shuffle_epi32(pixels, kDuplicateFirstHalf);
Filter4x2_SSE4_1(dst + stride2, stride, pixels, taps_0_1, taps_2_3,
taps_4_5, taps_6_7);
dst += 4;
}
// Now we handle the remaining rows, which reference previously predicted rows
// rather than top_row.
for (int y = 4; y < height; y += 4) {
// Leftmost 4x4 block for this height.
dst -= width;
dst += stride4;
// The top left is not available via an offset from |dst| in these leftmost
// blocks, so it is taken from |left_ptr|.
pixels = Load4(dst - stride);
left = _mm_slli_si128(Load4(left_ptr + y - 1), 8);
left = _mm_insert_epi8(left, left_ptr[y + 3], 12);
pixels = _mm_or_si128(pixels, left);
pixels = _mm_shuffle_epi8(pixels, pixel_order2);
Filter4x2_SSE4_1(dst, stride, pixels, taps_0_1, taps_2_3, taps_4_5,
taps_6_7);
// The bytes shifted into positions 6 and 7 will be ignored by the shuffle.
left = _mm_srli_si128(left, 2);
pixels = Load4(dst + stride);
pixels = _mm_or_si128(pixels, left);
pixels = _mm_shuffle_epi8(pixels, pixel_order2);
Filter4x2_SSE4_1(dst + stride2, stride, pixels, taps_0_1, taps_2_3,
taps_4_5, taps_6_7);
dst += 4;
// Remaining 4x4 blocks for this height.
for (int x = 4; x < width; x += 4) {
pixels = Load4(dst - stride - 1);
pixels = _mm_insert_epi8(pixels, dst[-stride + 3], 4);
pixels = _mm_insert_epi8(pixels, dst[-1], 5);
pixels = _mm_insert_epi8(pixels, dst[stride - 1], 6);
// Duplicate bottom half into upper half.
pixels = _mm_shuffle_epi32(pixels, kDuplicateFirstHalf);
Filter4x2_SSE4_1(dst, stride, pixels, taps_0_1, taps_2_3, taps_4_5,
taps_6_7);
pixels = Load4(dst + stride - 1);
pixels = _mm_insert_epi8(pixels, dst[stride + 3], 4);
pixels = _mm_insert_epi8(pixels, dst[stride2 - 1], 5);
pixels = _mm_insert_epi8(pixels, dst[stride2 + stride - 1], 6);
// Duplicate bottom half into upper half.
pixels = _mm_shuffle_epi32(pixels, kDuplicateFirstHalf);
Filter4x2_SSE4_1(dst + stride2, stride, pixels, taps_0_1, taps_2_3,
taps_4_5, taps_6_7);
dst += 4;
}
}
}
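// For reference, a sketch of how callers reach the function above through the
// dsp table once initialization has run (names per src/dsp/dsp.h; the call
// below is illustrative only, not part of this file's logic):
//   const Dsp* const dsp = GetDspTable(kBitdepth8);
//   dsp->filter_intra_predictor(dest, stride, top_row, left_column,
//                               kFilterIntraPredictorDc, width, height);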
void Init8bpp() {
Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
assert(dsp != nullptr);
static_cast<void>(dsp);
// These guards check that this version of the function was not superseded by
// a higher optimization level, such as AVX. The corresponding #define also
// prevents the C version from being added to the table.
#if DSP_ENABLED_8BPP_SSE4_1(FilterIntraPredictor)
dsp->filter_intra_predictor = FilterIntraPredictor_SSE4_1;
#endif
}
} // namespace
void IntraPredFilterInit_SSE4_1() { Init8bpp(); }
} // namespace dsp
} // namespace libgav1
#else // !LIBGAV1_TARGETING_SSE4_1
namespace libgav1 {
namespace dsp {
void IntraPredFilterInit_SSE4_1() {}
} // namespace dsp
} // namespace libgav1
#endif // LIBGAV1_TARGETING_SSE4_1