| #include "src/dsp/x86/loop_restoration_sse4.h" |
| |
| #if LIBGAV1_ENABLE_SSE4_1 |
| #include <smmintrin.h> |
| |
| #include <cassert> |
| #include <cstddef> |
| #include <cstdint> |
| #include <cstring> |
| |
| #include "src/dsp/common.h" |
| #include "src/dsp/x86/common_sse4.h" |
| #include "src/utils/common.h" |
| #include "src/utils/compiler_attributes.h" |
| #include "src/utils/constants.h" |
| |
| namespace libgav1 { |
| namespace dsp { |
| namespace low_bitdepth { |
| namespace { |
| |
// Precision of a division table (mtable).
constexpr int kSgrProjScaleBits = 20;
constexpr int kSgrProjReciprocalBits = 12;
// Core self-guided restoration precision bits.
constexpr int kSgrProjSgrBits = 8;
// Precision bits of generated values higher than source before projection.
constexpr int kSgrProjRestoreBits = 4;

// Note: range of Wiener filter coefficients.
// Wiener filter coefficients are symmetric, and their sum is 128 (i.e. 1.0
// in units of 1/128). The range of each coefficient:
// filter[0] = filter[6], 4 bits, min = -5, max = 10.
// filter[1] = filter[5], 5 bits, min = -23, max = 8.
// filter[2] = filter[4], 6 bits, min = -17, max = 46.
// filter[3] = 128 - (filter[0] + filter[1] + filter[2]) * 2.
// int8_t is used for the SSE4 code, so in order to fit in an int8_t, the 128
// offset must be removed from filter[3]:
// filter[3] = 0 - (filter[0] + filter[1] + filter[2]) * 2.
// The 128 offset will be added back in the loop.
inline void PopulateWienerCoefficients(
    const RestorationUnitInfo& restoration_info, int direction,
    int8_t* const filter) {
  filter[3] = 0;
  for (int i = 0; i < 3; ++i) {
    const int8_t coeff = restoration_info.wiener_info.filter[direction][i];
    filter[i] = coeff;
    filter[6 - i] = coeff;
    filter[3] -= MultiplyBy2(coeff);
  }

  // The Wiener filter has only 7 coefficients, but we run it as an 8-tap
  // filter in SIMD. The 8th coefficient of the filter must be set to 0.
  filter[7] = 0;
}
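
// For illustration only: with coefficients {3, -7, 15} for one direction,
// PopulateWienerCoefficients() produces
//   filter[] = {3, -7, 15, -22, 15, -7, 3, 0}
// since filter[3] = 0 - (3 - 7 + 15) * 2 = -22. With the 128 offset added
// back, the effective middle tap is 106 and the taps sum to 128.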

// This function calls LoadUnaligned16() to read 10 bytes from the |source|
// buffer. Since the LoadUnaligned16() call over-reads 6 bytes, the |source|
// buffer must be at least (height + kSubPixelTaps - 2) * source_stride + 6
// bytes long.
void WienerFilter_SSE4_1(const void* source, void* const dest,
                         const RestorationUnitInfo& restoration_info,
                         ptrdiff_t source_stride, ptrdiff_t dest_stride,
                         int width, int height,
                         RestorationBuffer* const buffer) {
  const int* const inter_round_bits = buffer->inter_round_bits;
  int8_t filter[kSubPixelTaps];
  const int limit =
      (1 << (8 + 1 + kWienerFilterBits - inter_round_bits[0])) - 1;
  const auto* src = static_cast<const uint8_t*>(source);
  auto* dst = static_cast<uint8_t*>(dest);
  const ptrdiff_t buffer_stride = buffer->wiener_buffer_stride;
  auto* wiener_buffer = buffer->wiener_buffer;
  // Horizontal filtering.
  PopulateWienerCoefficients(restoration_info, WienerInfo::kHorizontal, filter);
  const int center_tap = 3;
  src -= center_tap * source_stride + center_tap;

  const int horizontal_rounding =
      1 << (8 + kWienerFilterBits - inter_round_bits[0] - 1);
  const __m128i v_horizontal_rounding =
      _mm_shufflelo_epi16(_mm_cvtsi32_si128(horizontal_rounding), 0);
  const __m128i v_limit = _mm_shufflelo_epi16(_mm_cvtsi32_si128(limit), 0);
  const __m128i v_horizontal_filter = LoadLo8(filter);
  __m128i v_k1k0 = _mm_shufflelo_epi16(v_horizontal_filter, 0x0);
  __m128i v_k3k2 = _mm_shufflelo_epi16(v_horizontal_filter, 0x55);
  __m128i v_k5k4 = _mm_shufflelo_epi16(v_horizontal_filter, 0xaa);
  __m128i v_k7k6 = _mm_shufflelo_epi16(v_horizontal_filter, 0xff);
  const __m128i v_round_0 =
      _mm_shufflelo_epi16(_mm_cvtsi32_si128(1 << (inter_round_bits[0] - 1)), 0);
  const __m128i v_round_0_shift = _mm_cvtsi32_si128(inter_round_bits[0]);
  const __m128i v_offset_shift = _mm_cvtsi32_si128(7 - inter_round_bits[0]);
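
  // For reference, a scalar sketch (pseudocode) of the horizontal stage
  // below, with r0 = inter_round_bits[0] and kWienerFilterBits = 7:
  //   wiener_buffer[x] = Clip3(RightShiftWithRounding(sum, r0) +
  //                                (1 << (14 - r0)) +         // rounding
  //                                (src[x + 3] << (7 - r0)),  // 128 offset
  //                            0, limit);
  // where sum is the 7-tap convolution of src[x .. x + 6] with the
  // offset-free filter[].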

  for (int y = 0; y < height + kSubPixelTaps - 2; ++y) {
    for (int x = 0; x < width; x += 4) {
      // Run the Wiener filter on four sets of source samples at a time:
      //   src[x + 0] ... src[x + 6]
      //   src[x + 1] ... src[x + 7]
      //   src[x + 2] ... src[x + 8]
      //   src[x + 3] ... src[x + 9]

      // Read 10 bytes (from src[x] to src[x + 9]). We over-read 6 bytes but
      // their results are discarded.
      const __m128i v_src = LoadUnaligned16(&src[x]);
      const __m128i v_src_dup_lo = _mm_unpacklo_epi8(v_src, v_src);
      const __m128i v_src_dup_hi = _mm_unpackhi_epi8(v_src, v_src);
      const __m128i v_src_10 = _mm_alignr_epi8(v_src_dup_hi, v_src_dup_lo, 1);
      const __m128i v_src_32 = _mm_alignr_epi8(v_src_dup_hi, v_src_dup_lo, 5);
      const __m128i v_src_54 = _mm_alignr_epi8(v_src_dup_hi, v_src_dup_lo, 9);
      // Shift right by 12 bytes instead of 13 bytes so that src[x + 10] is not
      // shifted into the low 8 bytes of v_src_66.
      const __m128i v_src_66 = _mm_alignr_epi8(v_src_dup_hi, v_src_dup_lo, 12);
      const __m128i v_madd_10 = _mm_maddubs_epi16(v_src_10, v_k1k0);
      const __m128i v_madd_32 = _mm_maddubs_epi16(v_src_32, v_k3k2);
      const __m128i v_madd_54 = _mm_maddubs_epi16(v_src_54, v_k5k4);
      const __m128i v_madd_76 = _mm_maddubs_epi16(v_src_66, v_k7k6);
      const __m128i v_sum_3210 = _mm_add_epi16(v_madd_10, v_madd_32);
      const __m128i v_sum_7654 = _mm_add_epi16(v_madd_54, v_madd_76);
      // The sum range here is [-128 * 255, 90 * 255].
      const __m128i v_sum_76543210 = _mm_add_epi16(v_sum_7654, v_sum_3210);
      const __m128i v_sum = _mm_add_epi16(v_sum_76543210, v_round_0);
      const __m128i v_rounded_sum0 = _mm_sra_epi16(v_sum, v_round_0_shift);
      // Add the scaled-down horizontal rounding here to prevent signed 16-bit
      // overflow.
      const __m128i v_rounded_sum1 =
          _mm_add_epi16(v_rounded_sum0, v_horizontal_rounding);
      // Zero out the even bytes, calculate the scaled-down offset correction,
      // and add it to the sum here to prevent signed 16-bit overflow:
      //   (src[x + 3] * 128) >> inter_round_bits[0]
      const __m128i v_src_3x128 =
          _mm_sll_epi16(_mm_srli_epi16(v_src_32, 8), v_offset_shift);
      const __m128i v_rounded_sum = _mm_add_epi16(v_rounded_sum1, v_src_3x128);
      const __m128i v_a = _mm_max_epi16(v_rounded_sum, _mm_setzero_si128());
      const __m128i v_b = _mm_min_epi16(v_a, v_limit);
      StoreLo8(&wiener_buffer[x], v_b);
    }
    src += source_stride;
    wiener_buffer += buffer_stride;
  }

  wiener_buffer = buffer->wiener_buffer;
  // Vertical filtering.
  PopulateWienerCoefficients(restoration_info, WienerInfo::kVertical, filter);

  const int vertical_rounding = -(1 << (8 + inter_round_bits[1] - 1));
  const __m128i v_vertical_rounding =
      _mm_shuffle_epi32(_mm_cvtsi32_si128(vertical_rounding), 0);
  const __m128i v_offset_correction = _mm_set_epi16(0, 0, 0, 0, 128, 0, 0, 0);
  const __m128i v_round_1 =
      _mm_shuffle_epi32(_mm_cvtsi32_si128(1 << (inter_round_bits[1] - 1)), 0);
  const __m128i v_round_1_shift = _mm_cvtsi32_si128(inter_round_bits[1]);
  const __m128i v_vertical_filter0 = _mm_cvtepi8_epi16(LoadLo8(filter));
  const __m128i v_vertical_filter =
      _mm_add_epi16(v_vertical_filter0, v_offset_correction);
  v_k1k0 = _mm_shuffle_epi32(v_vertical_filter, 0x0);
  v_k3k2 = _mm_shuffle_epi32(v_vertical_filter, 0x55);
  v_k5k4 = _mm_shuffle_epi32(v_vertical_filter, 0xaa);
  v_k7k6 = _mm_shuffle_epi32(v_vertical_filter, 0xff);
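
  // For reference, a scalar sketch (pseudocode) of the vertical stage below,
  // with r1 = inter_round_bits[1] and filter'[3] = filter[3] + 128 (the
  // offset correction re-applied):
  //   dst[x] = RightShiftWithRounding(sum - (1 << (8 + r1 - 1)), r1);
  // where sum is the 7-tap convolution of the wiener_buffer column with
  // filter'[], and the final clip to [0, 255] is done by the packs/packus
  // saturation at the end of the loop.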
  for (int y = 0; y < height; ++y) {
    for (int x = 0; x < width; x += 4) {
      const __m128i v_wb_0 = LoadLo8(&wiener_buffer[0 * buffer_stride + x]);
      const __m128i v_wb_1 = LoadLo8(&wiener_buffer[1 * buffer_stride + x]);
      const __m128i v_wb_2 = LoadLo8(&wiener_buffer[2 * buffer_stride + x]);
      const __m128i v_wb_3 = LoadLo8(&wiener_buffer[3 * buffer_stride + x]);
      const __m128i v_wb_4 = LoadLo8(&wiener_buffer[4 * buffer_stride + x]);
      const __m128i v_wb_5 = LoadLo8(&wiener_buffer[5 * buffer_stride + x]);
      const __m128i v_wb_6 = LoadLo8(&wiener_buffer[6 * buffer_stride + x]);
      const __m128i v_wb_10 = _mm_unpacklo_epi16(v_wb_0, v_wb_1);
      const __m128i v_wb_32 = _mm_unpacklo_epi16(v_wb_2, v_wb_3);
      const __m128i v_wb_54 = _mm_unpacklo_epi16(v_wb_4, v_wb_5);
      const __m128i v_wb_76 = _mm_unpacklo_epi16(v_wb_6, _mm_setzero_si128());
      const __m128i v_madd_10 = _mm_madd_epi16(v_wb_10, v_k1k0);
      const __m128i v_madd_32 = _mm_madd_epi16(v_wb_32, v_k3k2);
      const __m128i v_madd_54 = _mm_madd_epi16(v_wb_54, v_k5k4);
      const __m128i v_madd_76 = _mm_madd_epi16(v_wb_76, v_k7k6);
      const __m128i v_sum_3210 = _mm_add_epi32(v_madd_10, v_madd_32);
      const __m128i v_sum_7654 = _mm_add_epi32(v_madd_54, v_madd_76);
      const __m128i v_sum_76543210 = _mm_add_epi32(v_sum_7654, v_sum_3210);
      const __m128i v_sum = _mm_add_epi32(v_sum_76543210, v_vertical_rounding);
      const __m128i v_rounded_sum =
          _mm_sra_epi32(_mm_add_epi32(v_sum, v_round_1), v_round_1_shift);
      const __m128i v_a = _mm_packs_epi32(v_rounded_sum, v_rounded_sum);
      const __m128i v_b = _mm_packus_epi16(v_a, v_a);
      Store4(&dst[x], v_b);
    }
    dst += dest_stride;
    wiener_buffer += buffer_stride;
  }
}

// Section 7.17.3.
// a2: range [1, 256].
// if (z >= 255)
//   a2 = 256;
// else if (z == 0)
//   a2 = 1;
// else
//   a2 = ((z << kSgrProjSgrBits) + (z >> 1)) / (z + 1);
constexpr int x_by_xplus1[256] = {
    1,   128, 171, 192, 205, 213, 219, 224, 228, 230, 233, 235, 236, 238, 239,
    240, 241, 242, 243, 243, 244, 244, 245, 245, 246, 246, 247, 247, 247, 247,
    248, 248, 248, 248, 249, 249, 249, 249, 249, 250, 250, 250, 250, 250, 250,
    250, 251, 251, 251, 251, 251, 251, 251, 251, 251, 251, 252, 252, 252, 252,
    252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 253, 253,
    253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253,
    253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 254, 254, 254,
    254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254,
    254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254,
    254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254,
    254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254,
    254, 254, 254, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
    256};
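
// For example, for z = 4 the formula above gives
// ((4 << 8) + (4 >> 1)) / (4 + 1) = (1024 + 2) / 5 = 205, which matches
// x_by_xplus1[4].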

inline __m128i HorizontalAddVerticalSumsRadius1(const uint32_t* vert_sums) {
  // Horizontally add vertical sums to get the total box sum.
  const __m128i v_sums_3210 = LoadUnaligned16(&vert_sums[0]);
  const __m128i v_sums_7654 = LoadUnaligned16(&vert_sums[4]);
  const __m128i v_sums_4321 = _mm_alignr_epi8(v_sums_7654, v_sums_3210, 4);
  const __m128i v_sums_5432 = _mm_alignr_epi8(v_sums_7654, v_sums_3210, 8);
  const __m128i v_s0 = _mm_add_epi32(v_sums_3210, v_sums_4321);
  const __m128i v_s1 = _mm_add_epi32(v_s0, v_sums_5432);
  return v_s1;
}
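
// Each output lane i of HorizontalAddVerticalSumsRadius1() above is
// vert_sums[i] + vert_sums[i + 1] + vert_sums[i + 2], i.e. a 3-wide
// horizontal box sum.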

inline __m128i HorizontalAddVerticalSumsRadius2(const uint32_t* vert_sums) {
  // Horizontally add vertical sums to get the total box sum.
  const __m128i v_sums_3210 = LoadUnaligned16(&vert_sums[0]);
  const __m128i v_sums_7654 = LoadUnaligned16(&vert_sums[4]);
  const __m128i v_sums_4321 = _mm_alignr_epi8(v_sums_7654, v_sums_3210, 4);
  const __m128i v_sums_5432 = _mm_alignr_epi8(v_sums_7654, v_sums_3210, 8);
  const __m128i v_sums_6543 = _mm_alignr_epi8(v_sums_7654, v_sums_3210, 12);
  const __m128i v_s0 = _mm_add_epi32(v_sums_3210, v_sums_4321);
  const __m128i v_s1 = _mm_add_epi32(v_s0, v_sums_5432);
  const __m128i v_s2 = _mm_add_epi32(v_s1, v_sums_6543);
  const __m128i v_s3 = _mm_add_epi32(v_s2, v_sums_7654);
  return v_s3;
}
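
// Similarly, each output lane i of HorizontalAddVerticalSumsRadius2() above
// is vert_sums[i] + ... + vert_sums[i + 4], a 5-wide horizontal box sum.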

void BoxFilterPreProcessRadius1_SSE4_1(
    const uint8_t* const src, ptrdiff_t stride, int width, int height,
    uint32_t s, uint32_t* intermediate_result[2], ptrdiff_t array_stride,
    uint32_t* vertical_sums, uint32_t* vertical_sum_of_squares) {
  assert(s != 0);
  const uint32_t n = 9;
  const uint32_t one_over_n = ((1 << kSgrProjReciprocalBits) + (n >> 1)) / n;
  const __m128i v_one_over_n =
      _mm_shuffle_epi32(_mm_cvtsi32_si128(one_over_n), 0);
  const __m128i v_sgrbits =
      _mm_shuffle_epi32(_mm_cvtsi32_si128(1 << kSgrProjSgrBits), 0);
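
  // For n = 9, one_over_n = ((1 << 12) + 4) / 9 = 455; multiplying by
  // one_over_n and right-shifting by kSgrProjReciprocalBits approximates
  // division by n.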

#if LIBGAV1_MSAN
  // Over-reads occur in the x loop, so set to a known value.
  memset(&vertical_sums[width], 0, 8 * sizeof(vertical_sums[0]));
  memset(&vertical_sum_of_squares[width], 0,
         8 * sizeof(vertical_sum_of_squares[0]));
#endif

  // Calculate intermediate results, including a one-pixel border. For
  // example, if the unit size is 64x64, we calculate 66x66 pixels.
  for (int y = -1; y <= height; ++y) {
    const uint8_t* top_left = &src[(y - 1) * stride - 2];
    // Calculate the box vertical sums for each x position.
    for (int vsx = -2; vsx <= width + 1; vsx += 4, top_left += 4) {
      const __m128i v_box0 = _mm_cvtepu8_epi32(Load4(top_left));
      const __m128i v_box1 = _mm_cvtepu8_epi32(Load4(top_left + stride));
      const __m128i v_box2 = _mm_cvtepu8_epi32(Load4(top_left + stride * 2));
      const __m128i v_sqr0 = _mm_mullo_epi32(v_box0, v_box0);
      const __m128i v_sqr1 = _mm_mullo_epi32(v_box1, v_box1);
      const __m128i v_sqr2 = _mm_mullo_epi32(v_box2, v_box2);
      const __m128i v_a01 = _mm_add_epi32(v_sqr0, v_sqr1);
      const __m128i v_a012 = _mm_add_epi32(v_a01, v_sqr2);
      const __m128i v_b01 = _mm_add_epi32(v_box0, v_box1);
      const __m128i v_b012 = _mm_add_epi32(v_b01, v_box2);
      StoreUnaligned16(&vertical_sum_of_squares[vsx], v_a012);
      StoreUnaligned16(&vertical_sums[vsx], v_b012);
    }

    for (int x = -1; x <= width; x += 4) {
      const __m128i v_a =
          HorizontalAddVerticalSumsRadius1(&vertical_sum_of_squares[x - 1]);
      const __m128i v_b =
          HorizontalAddVerticalSumsRadius1(&vertical_sums[x - 1]);
      // -----------------------
      // calc p, z, a2
      // -----------------------
      const __m128i v_255 = _mm_shuffle_epi32(_mm_cvtsi32_si128(255), 0);
      const __m128i v_n = _mm_shuffle_epi32(_mm_cvtsi32_si128(n), 0);
      const __m128i v_s = _mm_shuffle_epi32(_mm_cvtsi32_si128(s), 0);
      const __m128i v_dxd = _mm_mullo_epi32(v_b, v_b);
      const __m128i v_axn = _mm_mullo_epi32(v_a, v_n);
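      // By the Cauchy-Schwarz inequality, n * (sum of squares) >= (sum)^2,
      // so v_p below never underflows and no clamp to zero is needed.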
      const __m128i v_p = _mm_sub_epi32(v_axn, v_dxd);
      const __m128i v_z = _mm_min_epi32(
          v_255, RightShiftWithRounding_U32(_mm_mullo_epi32(v_p, v_s),
                                            kSgrProjScaleBits));
      const __m128i v_a2 =
          _mm_set_epi32(x_by_xplus1[_mm_extract_epi32(v_z, 3)],
                        x_by_xplus1[_mm_extract_epi32(v_z, 2)],
                        x_by_xplus1[_mm_extract_epi32(v_z, 1)],
                        x_by_xplus1[_mm_extract_epi32(v_z, 0)]);
      // -----------------------
      // calc b2 and store
      // -----------------------
      const __m128i v_sgrbits_sub_a2 = _mm_sub_epi32(v_sgrbits, v_a2);
      const __m128i v_b2 =
          _mm_mullo_epi32(v_sgrbits_sub_a2, _mm_mullo_epi32(v_b, v_one_over_n));
      StoreUnaligned16(&intermediate_result[0][x], v_a2);
      StoreUnaligned16(
          &intermediate_result[1][x],
          RightShiftWithRounding_U32(v_b2, kSgrProjReciprocalBits));
    }
    intermediate_result[0] += array_stride;
    intermediate_result[1] += array_stride;
  }
}

void BoxFilterPreProcessRadius2_SSE4_1(
    const uint8_t* const src, ptrdiff_t stride, int width, int height,
    uint32_t s, uint32_t* intermediate_result[2], ptrdiff_t array_stride,
    uint32_t* vertical_sums, uint32_t* vertical_sum_of_squares) {
  assert(s != 0);
  const uint32_t n = 25;
  const uint32_t one_over_n = ((1 << kSgrProjReciprocalBits) + (n >> 1)) / n;
  const __m128i v_one_over_n =
      _mm_shuffle_epi32(_mm_cvtsi32_si128(one_over_n), 0);
  const __m128i v_sgrbits =
      _mm_shuffle_epi32(_mm_cvtsi32_si128(1 << kSgrProjSgrBits), 0);

  // Calculate intermediate results, including a one-pixel border. For
  // example, if the unit size is 64x64, we calculate 66x66 pixels.
  for (int y = -1; y <= height; y += 2) {
    // Calculate the box vertical sums for each x position.
    const uint8_t* top_left = &src[(y - 2) * stride - 3];
    for (int vsx = -3; vsx <= width + 2; vsx += 4, top_left += 4) {
      const __m128i v_box0 = _mm_cvtepu8_epi32(Load4(top_left));
      const __m128i v_box1 = _mm_cvtepu8_epi32(Load4(top_left + stride));
      const __m128i v_box2 = _mm_cvtepu8_epi32(Load4(top_left + stride * 2));
      const __m128i v_box3 = _mm_cvtepu8_epi32(Load4(top_left + stride * 3));
      const __m128i v_box4 = _mm_cvtepu8_epi32(Load4(top_left + stride * 4));
      const __m128i v_sqr0 = _mm_mullo_epi32(v_box0, v_box0);
      const __m128i v_sqr1 = _mm_mullo_epi32(v_box1, v_box1);
      const __m128i v_sqr2 = _mm_mullo_epi32(v_box2, v_box2);
      const __m128i v_sqr3 = _mm_mullo_epi32(v_box3, v_box3);
      const __m128i v_sqr4 = _mm_mullo_epi32(v_box4, v_box4);
      const __m128i v_a01 = _mm_add_epi32(v_sqr0, v_sqr1);
      const __m128i v_a012 = _mm_add_epi32(v_a01, v_sqr2);
      const __m128i v_a0123 = _mm_add_epi32(v_a012, v_sqr3);
      const __m128i v_a01234 = _mm_add_epi32(v_a0123, v_sqr4);
      const __m128i v_b01 = _mm_add_epi32(v_box0, v_box1);
      const __m128i v_b012 = _mm_add_epi32(v_b01, v_box2);
      const __m128i v_b0123 = _mm_add_epi32(v_b012, v_box3);
      const __m128i v_b01234 = _mm_add_epi32(v_b0123, v_box4);
      StoreUnaligned16(&vertical_sum_of_squares[vsx], v_a01234);
      StoreUnaligned16(&vertical_sums[vsx], v_b01234);
    }

    for (int x = -1; x <= width; x += 4) {
      const __m128i v_a =
          HorizontalAddVerticalSumsRadius2(&vertical_sum_of_squares[x - 2]);
      const __m128i v_b =
          HorizontalAddVerticalSumsRadius2(&vertical_sums[x - 2]);
      // -----------------------
      // calc p, z, a2
      // -----------------------
      const __m128i v_255 = _mm_shuffle_epi32(_mm_cvtsi32_si128(255), 0);
      const __m128i v_n = _mm_shuffle_epi32(_mm_cvtsi32_si128(n), 0);
      const __m128i v_s = _mm_shuffle_epi32(_mm_cvtsi32_si128(s), 0);
      const __m128i v_dxd = _mm_mullo_epi32(v_b, v_b);
      const __m128i v_axn = _mm_mullo_epi32(v_a, v_n);
      const __m128i v_p = _mm_sub_epi32(v_axn, v_dxd);
      const __m128i v_z = _mm_min_epi32(
          v_255, RightShiftWithRounding_U32(_mm_mullo_epi32(v_p, v_s),
                                            kSgrProjScaleBits));
      const __m128i v_a2 =
          _mm_set_epi32(x_by_xplus1[_mm_extract_epi32(v_z, 3)],
                        x_by_xplus1[_mm_extract_epi32(v_z, 2)],
                        x_by_xplus1[_mm_extract_epi32(v_z, 1)],
                        x_by_xplus1[_mm_extract_epi32(v_z, 0)]);
      // -----------------------
      // calc b2 and store
      // -----------------------
      const __m128i v_sgrbits_sub_a2 = _mm_sub_epi32(v_sgrbits, v_a2);
      const __m128i v_b2 =
          _mm_mullo_epi32(v_sgrbits_sub_a2, _mm_mullo_epi32(v_b, v_one_over_n));
      StoreUnaligned16(&intermediate_result[0][x], v_a2);
      StoreUnaligned16(
          &intermediate_result[1][x],
          RightShiftWithRounding_U32(v_b2, kSgrProjReciprocalBits));
    }
    intermediate_result[0] += 2 * array_stride;
    intermediate_result[1] += 2 * array_stride;
  }
}

void BoxFilterPreProcess_SSE4_1(const RestorationUnitInfo& restoration_info,
                                const uint8_t* const src, ptrdiff_t stride,
                                int width, int height, int pass,
                                RestorationBuffer* const buffer) {
  uint32_t vertical_sums_buf[kRestorationProcessingUnitSize +
                             2 * kRestorationBorder + kRestorationPadding];
  uint32_t vertical_sum_of_squares_buf[kRestorationProcessingUnitSize +
                                       2 * kRestorationBorder +
                                       kRestorationPadding];
  uint32_t* vertical_sums = &vertical_sums_buf[4];
  uint32_t* vertical_sum_of_squares = &vertical_sum_of_squares_buf[4];
  const ptrdiff_t array_stride = buffer->box_filter_process_intermediate_stride;
  // The intermediate result buffer is the size of the filter area plus
  // horizontal (3) and vertical (3) padding. Processing starts one row and
  // one column before the filter area, so offset the intermediate_result
  // pointers to that start point.
  const ptrdiff_t intermediate_buffer_offset =
      kRestorationBorder * array_stride + kRestorationBorder;
  uint32_t* intermediate_result[2] = {
      buffer->box_filter_process_intermediate[0] + intermediate_buffer_offset -
          array_stride,
      buffer->box_filter_process_intermediate[1] + intermediate_buffer_offset -
          array_stride};
  const int sgr_proj_index = restoration_info.sgr_proj_info.index;
  if (pass == 0) {
    assert(kSgrProjParams[sgr_proj_index][0] == 2);
    BoxFilterPreProcessRadius2_SSE4_1(src, stride, width, height,
                                      kSgrScaleParameter[sgr_proj_index][0],
                                      intermediate_result, array_stride,
                                      vertical_sums, vertical_sum_of_squares);
  } else {
    assert(kSgrProjParams[sgr_proj_index][2] == 1);
    BoxFilterPreProcessRadius1_SSE4_1(src, stride, width, height,
                                      kSgrScaleParameter[sgr_proj_index][1],
                                      intermediate_result, array_stride,
                                      vertical_sums, vertical_sum_of_squares);
  }
}

inline __m128i Sum565Row(const __m128i v_DBCA, const __m128i v_XXFE) {
  __m128i v_sum = v_DBCA;
  const __m128i v_EDCB = _mm_alignr_epi8(v_XXFE, v_DBCA, 4);
  v_sum = _mm_add_epi32(v_sum, v_EDCB);
  const __m128i v_FEDC = _mm_alignr_epi8(v_XXFE, v_DBCA, 8);
  v_sum = _mm_add_epi32(v_sum, v_FEDC);
  //   D C B A  x4
  // + E D C B  x4
  // + F E D C  x4
  v_sum = _mm_slli_epi32(v_sum, 2);
  // + D C B A
  v_sum = _mm_add_epi32(v_sum, v_DBCA);  // 5
  // + E D C B  x2
  v_sum = _mm_add_epi32(v_sum, _mm_slli_epi32(v_EDCB, 1));  // 6
  // + F E D C
  return _mm_add_epi32(v_sum, v_FEDC);  // 5
}
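
// Each output lane of Sum565Row() above is 5 * A + 6 * B + 5 * C for three
// consecutive inputs A, B, C (weight sum 16). Two such rows give a weight sum
// of 32, matching the shift of 5 on even rows in pass 0 below; a single row
// gives 16, matching the shift of 4 on odd rows.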

inline __m128i Process3x3Block_565_Odd(const uint32_t* src, ptrdiff_t stride) {
  // 0 0 0
  // 5 6 5
  // 0 0 0
  const uint32_t* top_left = src - 1;
  const __m128i v_src1_lo = LoadUnaligned16(top_left + stride);
  const __m128i v_src1_hi = LoadLo8(top_left + stride + 4);
  return Sum565Row(v_src1_lo, v_src1_hi);
}

inline __m128i Process3x3Block_565_Even(const uint32_t* src, ptrdiff_t stride) {
  // 5 6 5
  // 0 0 0
  // 5 6 5
  const uint32_t* top_left = src - 1;
  const __m128i v_src0_lo = LoadUnaligned16(top_left);
  const __m128i v_src0_hi = LoadLo8(top_left + 4);
  const __m128i v_src2_lo = LoadUnaligned16(top_left + stride * 2);
  const __m128i v_src2_hi = LoadLo8(top_left + stride * 2 + 4);
  const __m128i v_a0 = Sum565Row(v_src0_lo, v_src0_hi);
  const __m128i v_a2 = Sum565Row(v_src2_lo, v_src2_hi);
  return _mm_add_epi32(v_a0, v_a2);
}

inline __m128i Sum343Row(const __m128i v_DBCA, const __m128i v_XXFE) {
  __m128i v_sum = v_DBCA;
  const __m128i v_EDCB = _mm_alignr_epi8(v_XXFE, v_DBCA, 4);
  v_sum = _mm_add_epi32(v_sum, v_EDCB);
  const __m128i v_FEDC = _mm_alignr_epi8(v_XXFE, v_DBCA, 8);
  v_sum = _mm_add_epi32(v_sum, v_FEDC);
  //   D C B A  x4
  // + E D C B  x4
  // + F E D C  x4
  v_sum = _mm_slli_epi32(v_sum, 2);  // 4
  // - D C B A
  v_sum = _mm_sub_epi32(v_sum, v_DBCA);  // 3
  // - F E D C
  return _mm_sub_epi32(v_sum, v_FEDC);  // 3
}

inline __m128i Sum444Row(const __m128i v_DBCA, const __m128i v_XXFE) {
  __m128i v_sum = v_DBCA;
  const __m128i v_EDCB = _mm_alignr_epi8(v_XXFE, v_DBCA, 4);
  v_sum = _mm_add_epi32(v_sum, v_EDCB);
  const __m128i v_FEDC = _mm_alignr_epi8(v_XXFE, v_DBCA, 8);
  v_sum = _mm_add_epi32(v_sum, v_FEDC);
  //   D C B A  x4
  // + E D C B  x4
  // + F E D C  x4
  return _mm_slli_epi32(v_sum, 2);  // 4
}

inline __m128i Process3x3Block_343(const uint32_t* src, ptrdiff_t stride) {
  const uint32_t* top_left = src - 1;
  const __m128i v_ir0_lo = LoadUnaligned16(top_left);
  const __m128i v_ir0_hi = LoadLo8(top_left + 4);
  const __m128i v_ir1_lo = LoadUnaligned16(top_left + stride);
  const __m128i v_ir1_hi = LoadLo8(top_left + stride + 4);
  const __m128i v_ir2_lo = LoadUnaligned16(top_left + stride * 2);
  const __m128i v_ir2_hi = LoadLo8(top_left + stride * 2 + 4);
  const __m128i v_a0 = Sum343Row(v_ir0_lo, v_ir0_hi);
  const __m128i v_a1 = Sum444Row(v_ir1_lo, v_ir1_hi);
  const __m128i v_a2 = Sum343Row(v_ir2_lo, v_ir2_hi);
  return _mm_add_epi32(v_a0, _mm_add_epi32(v_a1, v_a2));
}
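
// Per lane, Sum343Row() yields 3 * A + 4 * B + 3 * C and Sum444Row() yields
// 4 * (A + B + C), so Process3x3Block_343() above applies the 3x3 weights
//   3 4 3
//   4 4 4
//   3 4 3
// with weight sum 32, matching the shift of 5 in the pass 1 loop below.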

void BoxFilterProcess_SSE4_1(const RestorationUnitInfo& restoration_info,
                             const uint8_t* src, ptrdiff_t stride, int width,
                             int height, RestorationBuffer* const buffer) {
  const int sgr_proj_index = restoration_info.sgr_proj_info.index;
  for (int pass = 0; pass < 2; ++pass) {
    const uint8_t radius = kSgrProjParams[sgr_proj_index][pass * 2];
    const uint8_t* src_ptr = src;
    if (radius == 0) continue;

    BoxFilterPreProcess_SSE4_1(restoration_info, src_ptr, stride, width,
                               height, pass, buffer);

    int* filtered_output = buffer->box_filter_process_output[pass];
    const ptrdiff_t filtered_output_stride =
        buffer->box_filter_process_output_stride;
    const ptrdiff_t intermediate_stride =
        buffer->box_filter_process_intermediate_stride;
    // Set the intermediate buffer start point to the actual start point of
    // filtering.
    const ptrdiff_t intermediate_buffer_offset =
        kRestorationBorder * intermediate_stride + kRestorationBorder;

    if (pass == 0) {
      for (int y = 0; y < height; ++y) {
        const int shift = ((y & 1) != 0) ? 4 : 5;
        uint32_t* const array_start[2] = {
            buffer->box_filter_process_intermediate[0] +
                intermediate_buffer_offset + y * intermediate_stride,
            buffer->box_filter_process_intermediate[1] +
                intermediate_buffer_offset + y * intermediate_stride};
        uint32_t* intermediate_result2[2] = {
            array_start[0] - intermediate_stride,
            array_start[1] - intermediate_stride};
        if ((y & 1) == 0) {  // Even row.
          for (int x = 0; x < width; x += 4) {
            // 5 6 5
            // 0 0 0
            // 5 6 5
            const __m128i v_A = Process3x3Block_565_Even(
                &intermediate_result2[0][x], intermediate_stride);
            const __m128i v_B = Process3x3Block_565_Even(
                &intermediate_result2[1][x], intermediate_stride);
            const __m128i v_src = _mm_cvtepu8_epi32(Load4(src_ptr + x));
            const __m128i v_v0 = _mm_mullo_epi32(v_A, v_src);
            const __m128i v_v = _mm_add_epi32(v_v0, v_B);
            const __m128i v_filtered = RightShiftWithRounding_U32(
                v_v, kSgrProjSgrBits + shift - kSgrProjRestoreBits);

            StoreUnaligned16(&filtered_output[x], v_filtered);
          }
        } else {
          for (int x = 0; x < width; x += 4) {
            // 0 0 0
            // 5 6 5
            // 0 0 0
            const __m128i v_A = Process3x3Block_565_Odd(
                &intermediate_result2[0][x], intermediate_stride);
            const __m128i v_B = Process3x3Block_565_Odd(
                &intermediate_result2[1][x], intermediate_stride);
            const __m128i v_src = _mm_cvtepu8_epi32(Load4(src_ptr + x));
            const __m128i v_v0 = _mm_mullo_epi32(v_A, v_src);
            const __m128i v_v = _mm_add_epi32(v_v0, v_B);
            const __m128i v_filtered = RightShiftWithRounding_U32(
                v_v, kSgrProjSgrBits + shift - kSgrProjRestoreBits);

            StoreUnaligned16(&filtered_output[x], v_filtered);
          }
        }
        src_ptr += stride;
        filtered_output += filtered_output_stride;
      }
    } else {
      for (int y = 0; y < height; ++y) {
        const int shift = 5;
        uint32_t* const array_start[2] = {
            buffer->box_filter_process_intermediate[0] +
                intermediate_buffer_offset + y * intermediate_stride,
            buffer->box_filter_process_intermediate[1] +
                intermediate_buffer_offset + y * intermediate_stride};
        uint32_t* intermediate_result2[2] = {
            array_start[0] - intermediate_stride,
            array_start[1] - intermediate_stride};
        for (int x = 0; x < width; x += 4) {
          const __m128i v_A = Process3x3Block_343(&intermediate_result2[0][x],
                                                  intermediate_stride);
          const __m128i v_B = Process3x3Block_343(&intermediate_result2[1][x],
                                                  intermediate_stride);
          const __m128i v_src = _mm_cvtepu8_epi32(Load4(src_ptr + x));
          const __m128i v_v0 = _mm_mullo_epi32(v_A, v_src);
          const __m128i v_v = _mm_add_epi32(v_v0, v_B);
          const __m128i v_filtered = RightShiftWithRounding_U32(
              v_v, kSgrProjSgrBits + shift - kSgrProjRestoreBits);

          StoreUnaligned16(&filtered_output[x], v_filtered);
        }
        src_ptr += stride;
        filtered_output += filtered_output_stride;
      }
    }
  }
}

void SelfGuidedFilter_SSE4_1(const void* source, void* dest,
                             const RestorationUnitInfo& restoration_info,
                             ptrdiff_t source_stride, ptrdiff_t dest_stride,
                             int width, int height,
                             RestorationBuffer* const buffer) {
  const auto* src = static_cast<const uint8_t*>(source);
  auto* dst = static_cast<uint8_t*>(dest);
  const int w0 = restoration_info.sgr_proj_info.multiplier[0];
  const int w1 = restoration_info.sgr_proj_info.multiplier[1];
  const int w2 = (1 << kSgrProjPrecisionBits) - w0 - w1;
  const int index = restoration_info.sgr_proj_info.index;
  const uint8_t r0 = kSgrProjParams[index][0];
  const uint8_t r1 = kSgrProjParams[index][2];
  const ptrdiff_t array_stride = buffer->box_filter_process_output_stride;
  int* box_filter_process_output[2] = {buffer->box_filter_process_output[0],
                                       buffer->box_filter_process_output[1]};

  BoxFilterProcess_SSE4_1(restoration_info, src, source_stride, width, height,
                          buffer);

  const __m128i v_w0 = _mm_shuffle_epi32(_mm_cvtsi32_si128(w0), 0);
  const __m128i v_w1 = _mm_shuffle_epi32(_mm_cvtsi32_si128(w1), 0);
  const __m128i v_w2 = _mm_shuffle_epi32(_mm_cvtsi32_si128(w2), 0);
  const __m128i v_r0 = _mm_shuffle_epi32(_mm_cvtsi32_si128(r0), 0);
  const __m128i v_r1 = _mm_shuffle_epi32(_mm_cvtsi32_si128(r1), 0);
  const __m128i zero = _mm_setzero_si128();
  // Create masks used to select between src and box_filter_process_output.
  const __m128i v_r0_mask = _mm_cmpeq_epi32(v_r0, zero);
  const __m128i v_r1_mask = _mm_cmpeq_epi32(v_r1, zero);
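
  // For reference, a scalar sketch (pseudocode) of the projection below:
  //   u = src[x] << kSgrProjRestoreBits;
  //   flt0 = (r0 == 0) ? u : box_filter_process_output[0][x];
  //   flt1 = (r1 == 0) ? u : box_filter_process_output[1][x];
  //   v = w0 * flt0 + w1 * u + w2 * flt1;
  //   dst[x] = RightShiftWithRounding(v, kSgrProjRestoreBits +
  //                                          kSgrProjPrecisionBits);
  // with the final clip to [0, 255] done by the packs/packus saturation.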

  for (int y = 0; y < height; ++y) {
    for (int x = 0; x < width; x += 4) {
      const __m128i v_src = _mm_cvtepu8_epi32(Load4(src + x));
      const __m128i v_u = _mm_slli_epi32(v_src, kSgrProjRestoreBits);
      const __m128i v_v_a = _mm_mullo_epi32(v_w1, v_u);
      const __m128i v_bfp_out0 =
          LoadUnaligned16(&box_filter_process_output[0][x]);
      // Select u or box_filter_process_output[0][x].
      const __m128i v_r0_mult = _mm_blendv_epi8(v_bfp_out0, v_u, v_r0_mask);
      const __m128i v_v_b = _mm_mullo_epi32(v_w0, v_r0_mult);
      const __m128i v_v_c = _mm_add_epi32(v_v_a, v_v_b);
      const __m128i v_bfp_out1 =
          LoadUnaligned16(&box_filter_process_output[1][x]);
      // Select u or box_filter_process_output[1][x].
      const __m128i v_r1_mult = _mm_blendv_epi8(v_bfp_out1, v_u, v_r1_mask);
      const __m128i v_v_d = _mm_mullo_epi32(v_w2, v_r1_mult);
      const __m128i v_v_e = _mm_add_epi32(v_v_c, v_v_d);
      __m128i v_s = RightShiftWithRounding_S32(
          v_v_e, kSgrProjRestoreBits + kSgrProjPrecisionBits);
      v_s = _mm_packs_epi32(v_s, v_s);
      v_s = _mm_packus_epi16(v_s, v_s);
      Store4(&dst[x], v_s);
    }

    src += source_stride;
    dst += dest_stride;
    box_filter_process_output[0] += array_stride;
    box_filter_process_output[1] += array_stride;
  }
}

void Init8bpp() {
  Dsp* const dsp = dsp_internal::GetWritableDspTable(8);
  assert(dsp != nullptr);
#if DSP_ENABLED_8BPP_SSE4_1(WienerFilter)
  dsp->loop_restorations[0] = WienerFilter_SSE4_1;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(SelfGuidedFilter)
  dsp->loop_restorations[1] = SelfGuidedFilter_SSE4_1;
#endif
}

}  // namespace
}  // namespace low_bitdepth

void LoopRestorationInit_SSE4_1() { low_bitdepth::Init8bpp(); }

}  // namespace dsp
}  // namespace libgav1

#else   // !LIBGAV1_ENABLE_SSE4_1
namespace libgav1 {
namespace dsp {

void LoopRestorationInit_SSE4_1() {}

}  // namespace dsp
}  // namespace libgav1
#endif  // LIBGAV1_ENABLE_SSE4_1