| // Copyright 2021 The libgav1 Authors |
| // |
| // Licensed under the Apache License, Version 2.0 (the "License"); |
| // you may not use this file except in compliance with the License. |
| // You may obtain a copy of the License at |
| // |
| // http://www.apache.org/licenses/LICENSE-2.0 |
| // |
| // Unless required by applicable law or agreed to in writing, software |
| // distributed under the License is distributed on an "AS IS" BASIS, |
| // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| // See the License for the specific language governing permissions and |
| // limitations under the License. |
| |
| #include "src/dsp/intrapred_filter.h" |
| #include "src/utils/cpu.h" |
| |
| #if LIBGAV1_TARGETING_SSE4_1 |
| |
#include <smmintrin.h>
| |
| #include <algorithm> |
| #include <cassert> |
| #include <cstddef> |
| #include <cstdint> |
| #include <cstring> |
| |
| #include "src/dsp/constants.h" |
| #include "src/dsp/dsp.h" |
| #include "src/dsp/x86/common_sse4.h" |
| #include "src/dsp/x86/transpose_sse4.h" |
| #include "src/utils/common.h" |
| #include "src/utils/constants.h" |
| |
| namespace libgav1 { |
| namespace dsp { |
| namespace { |
| |
| //------------------------------------------------------------------------------ |
| // FilterIntraPredictor_SSE4_1 |
| // Section 7.11.2.3. Recursive intra prediction process |
| // This filter applies recursively to 4x2 sub-blocks within the transform block, |
| // meaning that the predicted pixels in each sub-block are used as inputs to |
| // sub-blocks below and to the right, if present. |
| // |
| // Each output value in the sub-block is predicted by a different filter applied |
| // to the same array of top-left, top, and left values. If fn refers to the |
| // output of the nth filter, given this block: |
| // TL T0 T1 T2 T3 |
| // L0 f0 f1 f2 f3 |
| // L1 f4 f5 f6 f7 |
| // The filter input order is p0, p1, p2, p3, p4, p5, p6: |
| // p0 p1 p2 p3 p4 |
| // p5 f0 f1 f2 f3 |
| // p6 f4 f5 f6 f7 |
// The SIMD multiply-accumulate operates on 8 pixel/tap pairs at a time, so we
// fix the 8th filter tap to 0 and disregard the value of the 8th input.
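//
// A rough scalar sketch of one 4x2 block, for reference only (this assumes
// kFilterIntraTaps stores one row of 8 taps per output pixel, matching the
// tap order used by the SIMD loads below; the SIMD path is what actually
// runs):
//   for (int n = 0; n < 8; ++n) {
//     int sum = 0;
//     for (int k = 0; k < 7; ++k) sum += kFilterIntraTaps[pred][n][k] * p[k];
//     f[n] = Clip3(RightShiftWithRounding(sum, 4), 0, 255);
//   }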
| |
| // This shuffle mask selects 32-bit blocks in the order 0, 1, 0, 1, which |
| // duplicates the first 8 bytes of a 128-bit vector into the second 8 bytes. |
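// For example, 32-bit lanes {d0, d1, d2, d3} become {d0, d1, d0, d1}.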
| constexpr int kDuplicateFirstHalf = 0x44; |
| |
// Applies all filter taps to the given 7 packed 8-bit pixel values; the 8th
// tap is fixed at zero, so the 8th input byte cannot affect the sum.
| // |pixels| contains p0-p7 in order as shown above. |
| // |taps_0_1| contains the filter kernels used to predict f0 and f1, and so on. |
| inline void Filter4x2_SSE4_1(uint8_t* dst, const ptrdiff_t stride, |
| const __m128i& pixels, const __m128i& taps_0_1, |
| const __m128i& taps_2_3, const __m128i& taps_4_5, |
| const __m128i& taps_6_7) { |
| const __m128i mul_0_01 = _mm_maddubs_epi16(pixels, taps_0_1); |
| const __m128i mul_0_23 = _mm_maddubs_epi16(pixels, taps_2_3); |
  // |output_half| contains 8 partial sums: two for each of f0-f3.
| __m128i output_half = _mm_hadd_epi16(mul_0_01, mul_0_23); |
| __m128i output = _mm_hadd_epi16(output_half, output_half); |
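  // |output| now holds the un-rounded sums for f0-f3, duplicated into both
  // 64-bit halves by the self-hadd.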
| const __m128i output_row0 = |
| _mm_packus_epi16(RightShiftWithRounding_S16(output, 4), |
| /* unused half */ output); |
| Store4(dst, output_row0); |
| const __m128i mul_1_01 = _mm_maddubs_epi16(pixels, taps_4_5); |
| const __m128i mul_1_23 = _mm_maddubs_epi16(pixels, taps_6_7); |
| output_half = _mm_hadd_epi16(mul_1_01, mul_1_23); |
| output = _mm_hadd_epi16(output_half, output_half); |
| const __m128i output_row1 = |
| _mm_packus_epi16(RightShiftWithRounding_S16(output, 4), |
| /* arbitrary pack arg */ output); |
| Store4(dst + stride, output_row1); |
| } |
| |
// 4xH transform sizes are given special treatment because an 8-byte load of
// the top row would read out of bounds, and every block involves the left
// column. The top-left pixel, p0, comes from the top buffer for the first 4x2
// but from the left buffer for successive blocks. This implementation takes
// advantage of the fact that p5 and p6 for each sub-block come solely from the
// |left_ptr| buffer, using shifts to arrange things to fit reusable shuffle
// vectors.
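// Block schedule: the first two 4x2 blocks (rows 0-3) are always produced;
// for height 16 the |height == 16| branch below produces rows 4-11; heights 8
// and 16 then share the final two 4x2 blocks.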
| inline void Filter4xH(uint8_t* dest, ptrdiff_t stride, |
| const uint8_t* const top_ptr, |
| const uint8_t* const left_ptr, FilterIntraPredictor pred, |
| const int height) { |
| // Two filter kernels per vector. |
| const __m128i taps_0_1 = LoadAligned16(kFilterIntraTaps[pred][0]); |
| const __m128i taps_2_3 = LoadAligned16(kFilterIntraTaps[pred][2]); |
| const __m128i taps_4_5 = LoadAligned16(kFilterIntraTaps[pred][4]); |
| const __m128i taps_6_7 = LoadAligned16(kFilterIntraTaps[pred][6]); |
| __m128i top = Load4(top_ptr - 1); |
| __m128i pixels = _mm_insert_epi8(top, top_ptr[3], 4); |
| __m128i left = (height == 4 ? Load4(left_ptr) : LoadLo8(left_ptr)); |
| left = _mm_slli_si128(left, 5); |
| |
| // Relative pixels: top[-1], top[0], top[1], top[2], top[3], left[0], left[1], |
| // left[2], left[3], left[4], left[5], left[6], left[7] |
| // Let rn represent a pixel usable as pn for the 4x2 after this one. We get: |
| // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 |
| // p0 p1 p2 p3 p4 p5 p6 r5 r6 ... |
  // r0 occupies the same byte as p6: the next 4x2's top-left is this block's
  // p6 pixel.
| pixels = _mm_or_si128(left, pixels); |
| |
| // Two sets of the same input pixels to apply two filters at once. |
| pixels = _mm_shuffle_epi32(pixels, kDuplicateFirstHalf); |
| Filter4x2_SSE4_1(dest, stride, pixels, taps_0_1, taps_2_3, taps_4_5, |
| taps_6_7); |
| dest += stride; // Move to y = 1. |
| pixels = Load4(dest); |
| |
| // Relative pixels: top[0], top[1], top[2], top[3], empty, left[-2], left[-1], |
| // left[0], left[1], ... |
| // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 |
| // p1 p2 p3 p4 xx xx p0 p5 p6 r5 r6 ... |
  // r0 occupies the same byte as p6.
| pixels = _mm_or_si128(left, pixels); |
| |
| // This mask rearranges bytes in the order: 6, 0, 1, 2, 3, 7, 8, 15. The last |
| // byte is an unused value, which shall be multiplied by 0 when we apply the |
| // filter. |
| constexpr int64_t kInsertTopLeftFirstMask = 0x0F08070302010006; |
| |
| // Insert left[-1] in front as TL and put left[0] and left[1] at the end. |
| const __m128i pixel_order1 = _mm_set1_epi64x(kInsertTopLeftFirstMask); |
| pixels = _mm_shuffle_epi8(pixels, pixel_order1); |
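  // Note that because _mm_set1_epi64x replicates the 8-byte control into both
  // halves, this shuffle writes the same p0-p6 pattern (plus one zero-tap
  // byte) into both halves of |pixels|, so no kDuplicateFirstHalf step is
  // needed on this path.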
| dest += stride; // Move to y = 2. |
| Filter4x2_SSE4_1(dest, stride, pixels, taps_0_1, taps_2_3, taps_4_5, |
| taps_6_7); |
| dest += stride; // Move to y = 3. |
| |
  // Handle the middle 8 rows here so that the common code below for the final
  // 4 rows can rely on |left| having the next TL at position 8.
| if (height == 16) { |
    // This extra byte shift lets |pixel_order2| be reused after |left| is
    // shifted right by 2 further down.
| left = _mm_slli_si128(left, 1); |
| pixels = Load4(dest); |
| |
| // Relative pixels: top[0], top[1], top[2], top[3], empty, empty, left[-4], |
| // left[-3], left[-2], left[-1], left[0], left[1], left[2], left[3] |
| // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 |
| // p1 p2 p3 p4 xx xx xx xx xx p0 p5 p6 r5 r6 ... |
    // r0 occupies the same byte as p6.
| pixels = _mm_or_si128(left, pixels); |
| |
    // This mask rearranges bytes in the order: 9, 0, 1, 2, 3, 10, 11, 15. The
    // last byte is an unused value, as above. The extra shift applied to
    // |left| above put the top-left at position nine, which also lets this
    // mask be reused after |left| is shifted right by 2 below.
| constexpr int64_t kInsertTopLeftSecondMask = 0x0F0B0A0302010009; |
| |
| // Insert (relative) left[-1] in front as TL and put left[0] and left[1] at |
| // the end. |
| const __m128i pixel_order2 = _mm_set1_epi64x(kInsertTopLeftSecondMask); |
| pixels = _mm_shuffle_epi8(pixels, pixel_order2); |
| dest += stride; // Move to y = 4. |
| |
| // First 4x2 in the if body. |
| Filter4x2_SSE4_1(dest, stride, pixels, taps_0_1, taps_2_3, taps_4_5, |
| taps_6_7); |
| |
    // Clear all but the final pixel of the first 8 left-column pixels.
| __m128i keep_top_left = _mm_srli_si128(left, 13); |
| dest += stride; // Move to y = 5. |
| pixels = Load4(dest); |
| left = _mm_srli_si128(left, 2); |
| |
| // Relative pixels: top[0], top[1], top[2], top[3], left[-6], |
| // left[-5], left[-4], left[-3], left[-2], left[-1], left[0], left[1] |
| // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 |
| // p1 p2 p3 p4 xx xx xx xx xx p0 p5 p6 r5 r6 ... |
    // r0 occupies the same byte as p6.
| pixels = _mm_or_si128(left, pixels); |
| left = LoadLo8(left_ptr + 8); |
| |
| pixels = _mm_shuffle_epi8(pixels, pixel_order2); |
| dest += stride; // Move to y = 6. |
| |
| // Second 4x2 in the if body. |
| Filter4x2_SSE4_1(dest, stride, pixels, taps_0_1, taps_2_3, taps_4_5, |
| taps_6_7); |
| |
| // Position TL value so we can use pixel_order1. |
| keep_top_left = _mm_slli_si128(keep_top_left, 6); |
| dest += stride; // Move to y = 7. |
| pixels = Load4(dest); |
| left = _mm_slli_si128(left, 7); |
| left = _mm_or_si128(left, keep_top_left); |
| |
| // Relative pixels: top[0], top[1], top[2], top[3], empty, empty, |
| // left[-1], left[0], left[1], left[2], left[3], ... |
| // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 |
| // p1 p2 p3 p4 xx xx p0 p5 p6 r5 r6 ... |
    // r0 occupies the same byte as p6.
| pixels = _mm_or_si128(left, pixels); |
| pixels = _mm_shuffle_epi8(pixels, pixel_order1); |
| dest += stride; // Move to y = 8. |
| |
| // Third 4x2 in the if body. |
| Filter4x2_SSE4_1(dest, stride, pixels, taps_0_1, taps_2_3, taps_4_5, |
| taps_6_7); |
| dest += stride; // Move to y = 9. |
| |
| // Prepare final inputs. |
| pixels = Load4(dest); |
| left = _mm_srli_si128(left, 2); |
| |
| // Relative pixels: top[0], top[1], top[2], top[3], left[-3], left[-2] |
| // left[-1], left[0], left[1], left[2], left[3], ... |
| // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 |
| // p1 p2 p3 p4 xx xx p0 p5 p6 r5 r6 ... |
    // r0 occupies the same byte as p6.
| pixels = _mm_or_si128(left, pixels); |
| pixels = _mm_shuffle_epi8(pixels, pixel_order1); |
| dest += stride; // Move to y = 10. |
| |
| // Fourth 4x2 in the if body. |
| Filter4x2_SSE4_1(dest, stride, pixels, taps_0_1, taps_2_3, taps_4_5, |
| taps_6_7); |
| dest += stride; // Move to y = 11. |
| } |
| |
  // At this point, in both the height 8 and height 16 cases, |left| has the
  // next TL at position 8.
| if (height > 4) { |
| // Erase prior left pixels by shifting TL to position 0. |
| left = _mm_srli_si128(left, 8); |
| left = _mm_slli_si128(left, 6); |
| pixels = Load4(dest); |
| |
| // Relative pixels: top[0], top[1], top[2], top[3], empty, empty, |
| // left[-1], left[0], left[1], left[2], left[3], ... |
| // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 |
| // p1 p2 p3 p4 xx xx p0 p5 p6 r5 r6 ... |
    // r0 occupies the same byte as p6.
| pixels = _mm_or_si128(left, pixels); |
| pixels = _mm_shuffle_epi8(pixels, pixel_order1); |
| dest += stride; // Move to y = 12 or 4. |
| |
| // First of final two 4x2 blocks. |
| Filter4x2_SSE4_1(dest, stride, pixels, taps_0_1, taps_2_3, taps_4_5, |
| taps_6_7); |
| dest += stride; // Move to y = 13 or 5. |
| pixels = Load4(dest); |
| left = _mm_srli_si128(left, 2); |
| |
| // Relative pixels: top[0], top[1], top[2], top[3], left[-3], left[-2] |
| // left[-1], left[0], left[1], left[2], left[3], ... |
| // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 |
| // p1 p2 p3 p4 xx xx p0 p5 p6 r5 r6 ... |
    // r0 occupies the same byte as p6.
| pixels = _mm_or_si128(left, pixels); |
| pixels = _mm_shuffle_epi8(pixels, pixel_order1); |
| dest += stride; // Move to y = 14 or 6. |
| |
| // Last of final two 4x2 blocks. |
| Filter4x2_SSE4_1(dest, stride, pixels, taps_0_1, taps_2_3, taps_4_5, |
| taps_6_7); |
| } |
| } |
| |
| void FilterIntraPredictor_SSE4_1(void* const dest, ptrdiff_t stride, |
| const void* const top_row, |
| const void* const left_column, |
| FilterIntraPredictor pred, const int width, |
| const int height) { |
| const auto* const top_ptr = static_cast<const uint8_t*>(top_row); |
| const auto* const left_ptr = static_cast<const uint8_t*>(left_column); |
| auto* dst = static_cast<uint8_t*>(dest); |
| if (width == 4) { |
| Filter4xH(dst, stride, top_ptr, left_ptr, pred, height); |
| return; |
| } |
| |
  // There is one set of 7 taps for each of the 8 output pixels of a 4x2 block.
| const __m128i taps_0_1 = LoadAligned16(kFilterIntraTaps[pred][0]); |
| const __m128i taps_2_3 = LoadAligned16(kFilterIntraTaps[pred][2]); |
| const __m128i taps_4_5 = LoadAligned16(kFilterIntraTaps[pred][4]); |
| const __m128i taps_6_7 = LoadAligned16(kFilterIntraTaps[pred][6]); |
| |
| // This mask rearranges bytes in the order: 0, 1, 2, 3, 4, 8, 9, 15. The 15 at |
| // the end is an unused value, which shall be multiplied by 0 when we apply |
| // the filter. |
| constexpr int64_t kCondenseLeftMask = 0x0F09080403020100; |
| |
| // Takes the "left section" and puts it right after p0-p4. |
| const __m128i pixel_order1 = _mm_set1_epi64x(kCondenseLeftMask); |
| |
| // This mask rearranges bytes in the order: 8, 0, 1, 2, 3, 9, 10, 15. The last |
| // byte is unused as above. |
| constexpr int64_t kInsertTopLeftMask = 0x0F0A090302010008; |
| |
  // Shuffles the "top left" from the left section to the front. Used when
  // grabbing data from left_column and not top_row.
| const __m128i pixel_order2 = _mm_set1_epi64x(kInsertTopLeftMask); |
| |
| // This first pass takes care of the cases where the top left pixel comes from |
| // top_row. |
| __m128i pixels = LoadLo8(top_ptr - 1); |
| __m128i left = _mm_slli_si128(Load4(left_column), 8); |
| pixels = _mm_or_si128(pixels, left); |
| |
| // Two sets of the same pixels to multiply with two sets of taps. |
| pixels = _mm_shuffle_epi8(pixels, pixel_order1); |
| Filter4x2_SSE4_1(dst, stride, pixels, taps_0_1, taps_2_3, taps_4_5, taps_6_7); |
| left = _mm_srli_si128(left, 1); |
| |
  // Load the row 1 outputs written above (f4-f7 of the first 4x2); they serve
  // as p1-p4 for the 4x2 at rows 2 and 3.
| pixels = Load4(dst + stride); |
| |
  // Because of the shift above, this OR writes left[0] into byte 7, the final
  // byte of the low half of |pixels|. This is harmless: |pixel_order2| never
  // selects byte 7, and the byte it routes to the 8th filter position is
  // multiplied by a zero tap anyway.
| pixels = _mm_or_si128(pixels, left); |
| pixels = _mm_shuffle_epi8(pixels, pixel_order2); |
| const ptrdiff_t stride2 = stride << 1; |
| const ptrdiff_t stride4 = stride << 2; |
| Filter4x2_SSE4_1(dst + stride2, stride, pixels, taps_0_1, taps_2_3, taps_4_5, |
| taps_6_7); |
| dst += 4; |
| for (int x = 3; x < width - 4; x += 4) { |
| pixels = Load4(top_ptr + x); |
| pixels = _mm_insert_epi8(pixels, top_ptr[x + 4], 4); |
| pixels = _mm_insert_epi8(pixels, dst[-1], 5); |
| pixels = _mm_insert_epi8(pixels, dst[stride - 1], 6); |
| |
| // Duplicate bottom half into upper half. |
| pixels = _mm_shuffle_epi32(pixels, kDuplicateFirstHalf); |
| Filter4x2_SSE4_1(dst, stride, pixels, taps_0_1, taps_2_3, taps_4_5, |
| taps_6_7); |
| pixels = Load4(dst + stride - 1); |
| pixels = _mm_insert_epi8(pixels, dst[stride + 3], 4); |
| pixels = _mm_insert_epi8(pixels, dst[stride2 - 1], 5); |
| pixels = _mm_insert_epi8(pixels, dst[stride + stride2 - 1], 6); |
| |
| // Duplicate bottom half into upper half. |
| pixels = _mm_shuffle_epi32(pixels, kDuplicateFirstHalf); |
| Filter4x2_SSE4_1(dst + stride2, stride, pixels, taps_0_1, taps_2_3, |
| taps_4_5, taps_6_7); |
| dst += 4; |
| } |
| |
  // The remaining rows take their top inputs from previously predicted rows of
  // |dst| rather than from |top_row|.
| for (int y = 4; y < height; y += 4) { |
| // Leftmost 4x4 block for this height. |
| dst -= width; |
| dst += stride4; |
| |
    // The top-left pixel cannot be read from |dst| in these leftmost blocks;
    // it comes from |left_ptr| instead.
| pixels = Load4(dst - stride); |
| left = _mm_slli_si128(Load4(left_ptr + y - 1), 8); |
| left = _mm_insert_epi8(left, left_ptr[y + 3], 12); |
| pixels = _mm_or_si128(pixels, left); |
| pixels = _mm_shuffle_epi8(pixels, pixel_order2); |
| Filter4x2_SSE4_1(dst, stride, pixels, taps_0_1, taps_2_3, taps_4_5, |
| taps_6_7); |
| |
| // The bytes shifted into positions 6 and 7 will be ignored by the shuffle. |
| left = _mm_srli_si128(left, 2); |
| pixels = Load4(dst + stride); |
| pixels = _mm_or_si128(pixels, left); |
| pixels = _mm_shuffle_epi8(pixels, pixel_order2); |
| Filter4x2_SSE4_1(dst + stride2, stride, pixels, taps_0_1, taps_2_3, |
| taps_4_5, taps_6_7); |
| |
| dst += 4; |
| |
| // Remaining 4x4 blocks for this height. |
| for (int x = 4; x < width; x += 4) { |
| pixels = Load4(dst - stride - 1); |
| pixels = _mm_insert_epi8(pixels, dst[-stride + 3], 4); |
| pixels = _mm_insert_epi8(pixels, dst[-1], 5); |
| pixels = _mm_insert_epi8(pixels, dst[stride - 1], 6); |
| |
| // Duplicate bottom half into upper half. |
| pixels = _mm_shuffle_epi32(pixels, kDuplicateFirstHalf); |
| Filter4x2_SSE4_1(dst, stride, pixels, taps_0_1, taps_2_3, taps_4_5, |
| taps_6_7); |
| pixels = Load4(dst + stride - 1); |
| pixels = _mm_insert_epi8(pixels, dst[stride + 3], 4); |
| pixels = _mm_insert_epi8(pixels, dst[stride2 - 1], 5); |
| pixels = _mm_insert_epi8(pixels, dst[stride2 + stride - 1], 6); |
| |
| // Duplicate bottom half into upper half. |
| pixels = _mm_shuffle_epi32(pixels, kDuplicateFirstHalf); |
| Filter4x2_SSE4_1(dst + stride2, stride, pixels, taps_0_1, taps_2_3, |
| taps_4_5, taps_6_7); |
| dst += 4; |
| } |
| } |
| } |
| |
| void Init8bpp() { |
| Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8); |
| assert(dsp != nullptr); |
| static_cast<void>(dsp); |
| // These guards check if this version of the function was not superseded by |
| // a higher optimization level, such as AVX. The corresponding #define also |
| // prevents the C version from being added to the table. |
| #if DSP_ENABLED_8BPP_SSE4_1(FilterIntraPredictor) |
| dsp->filter_intra_predictor = FilterIntraPredictor_SSE4_1; |
| #endif |
| } |
| |
| } // namespace |
| |
| void IntraPredFilterInit_SSE4_1() { Init8bpp(); } |
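
// Usage sketch (assuming the usual libgav1 initialization flow, which this
// file does not itself enforce): dsp::DspInit() invokes
// IntraPredFilterInit_SSE4_1() when the CPU supports SSE4.1, after which a
// caller reaches the function above through the dispatch table:
//   const Dsp* const dsp = GetDspTable(kBitdepth8);
//   dsp->filter_intra_predictor(dest, stride, top_row, left_column,
//                               kFilterIntraPredictorDc, width, height);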
| |
| } // namespace dsp |
| } // namespace libgav1 |
| |
| #else // !LIBGAV1_TARGETING_SSE4_1 |
| namespace libgav1 { |
| namespace dsp { |
| |
| void IntraPredFilterInit_SSE4_1() {} |
| |
| } // namespace dsp |
| } // namespace libgav1 |
| #endif // LIBGAV1_TARGETING_SSE4_1 |