| // Copyright 2021 The libgav1 Authors |
| // |
| // Licensed under the Apache License, Version 2.0 (the "License"); |
| // you may not use this file except in compliance with the License. |
| // You may obtain a copy of the License at |
| // |
| // http://www.apache.org/licenses/LICENSE-2.0 |
| // |
| // Unless required by applicable law or agreed to in writing, software |
| // distributed under the License is distributed on an "AS IS" BASIS, |
| // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| // See the License for the specific language governing permissions and |
| // limitations under the License. |
| |
| #include "src/dsp/convolve.h" |
| #include "src/utils/cpu.h" |
| |
| #if LIBGAV1_ENABLE_NEON && LIBGAV1_MAX_BITDEPTH >= 10 |
| #include <arm_neon.h> |
| |
| #include <algorithm> |
| #include <cassert> |
| #include <cstdint> |
| |
| #include "src/dsp/arm/common_neon.h" |
| #include "src/dsp/constants.h" |
| #include "src/dsp/dsp.h" |
| #include "src/utils/common.h" |
| #include "src/utils/compiler_attributes.h" |
| #include "src/utils/constants.h" |
| |
| namespace libgav1 { |
| namespace dsp { |
| namespace { |
| |
| // Include the constants and utility functions inside the anonymous namespace. |
| #include "src/dsp/convolve.inc" |
| |
| // Output of ConvolveTest.ShowRange below. |
| // Bitdepth: 10 Input range: [ 0, 1023] |
| // Horizontal base upscaled range: [ -28644, 94116] |
// Horizontal halved upscaled range: [ -14322, 47058]
| // Horizontal downscaled range: [ -7161, 23529] |
| // Vertical upscaled range: [-1317624, 2365176] |
| // Pixel output range: [ 0, 1023] |
| // Compound output range: [ 3988, 61532] |
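// As a cross-check on the table: the halved ranges are exactly half the
// base upscaled ranges (the taps are halved), and the downscaled ranges are
// half of those again, e.g. 94116 / 2 == 47058 and 47058 / 2 == 23529.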
| |
| template <int filter_index> |
| int32x4x2_t SumOnePassTaps(const uint16x8_t* const src, |
| const int16x4_t* const taps) { |
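// The inputs are 10-bit pixel values, so reinterpreting the lanes as signed
// 16-bit is value-preserving and lets vmull_s16/vmlal_s16 apply the signed
// taps directly.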
| const auto* ssrc = reinterpret_cast<const int16x8_t*>(src); |
| int32x4x2_t sum; |
| if (filter_index < 2) { |
| // 6 taps. |
| sum.val[0] = vmull_s16(vget_low_s16(ssrc[0]), taps[0]); |
| sum.val[0] = vmlal_s16(sum.val[0], vget_low_s16(ssrc[1]), taps[1]); |
| sum.val[0] = vmlal_s16(sum.val[0], vget_low_s16(ssrc[2]), taps[2]); |
| sum.val[0] = vmlal_s16(sum.val[0], vget_low_s16(ssrc[3]), taps[3]); |
| sum.val[0] = vmlal_s16(sum.val[0], vget_low_s16(ssrc[4]), taps[4]); |
| sum.val[0] = vmlal_s16(sum.val[0], vget_low_s16(ssrc[5]), taps[5]); |
| |
| sum.val[1] = vmull_s16(vget_high_s16(ssrc[0]), taps[0]); |
| sum.val[1] = vmlal_s16(sum.val[1], vget_high_s16(ssrc[1]), taps[1]); |
| sum.val[1] = vmlal_s16(sum.val[1], vget_high_s16(ssrc[2]), taps[2]); |
| sum.val[1] = vmlal_s16(sum.val[1], vget_high_s16(ssrc[3]), taps[3]); |
| sum.val[1] = vmlal_s16(sum.val[1], vget_high_s16(ssrc[4]), taps[4]); |
| sum.val[1] = vmlal_s16(sum.val[1], vget_high_s16(ssrc[5]), taps[5]); |
| } else if (filter_index == 2) { |
| // 8 taps. |
| sum.val[0] = vmull_s16(vget_low_s16(ssrc[0]), taps[0]); |
| sum.val[0] = vmlal_s16(sum.val[0], vget_low_s16(ssrc[1]), taps[1]); |
| sum.val[0] = vmlal_s16(sum.val[0], vget_low_s16(ssrc[2]), taps[2]); |
| sum.val[0] = vmlal_s16(sum.val[0], vget_low_s16(ssrc[3]), taps[3]); |
| sum.val[0] = vmlal_s16(sum.val[0], vget_low_s16(ssrc[4]), taps[4]); |
| sum.val[0] = vmlal_s16(sum.val[0], vget_low_s16(ssrc[5]), taps[5]); |
| sum.val[0] = vmlal_s16(sum.val[0], vget_low_s16(ssrc[6]), taps[6]); |
| sum.val[0] = vmlal_s16(sum.val[0], vget_low_s16(ssrc[7]), taps[7]); |
| |
| sum.val[1] = vmull_s16(vget_high_s16(ssrc[0]), taps[0]); |
| sum.val[1] = vmlal_s16(sum.val[1], vget_high_s16(ssrc[1]), taps[1]); |
| sum.val[1] = vmlal_s16(sum.val[1], vget_high_s16(ssrc[2]), taps[2]); |
| sum.val[1] = vmlal_s16(sum.val[1], vget_high_s16(ssrc[3]), taps[3]); |
| sum.val[1] = vmlal_s16(sum.val[1], vget_high_s16(ssrc[4]), taps[4]); |
| sum.val[1] = vmlal_s16(sum.val[1], vget_high_s16(ssrc[5]), taps[5]); |
| sum.val[1] = vmlal_s16(sum.val[1], vget_high_s16(ssrc[6]), taps[6]); |
| sum.val[1] = vmlal_s16(sum.val[1], vget_high_s16(ssrc[7]), taps[7]); |
| } else if (filter_index == 3) { |
| // 2 taps. |
| sum.val[0] = vmull_s16(vget_low_s16(ssrc[0]), taps[0]); |
| sum.val[0] = vmlal_s16(sum.val[0], vget_low_s16(ssrc[1]), taps[1]); |
| |
| sum.val[1] = vmull_s16(vget_high_s16(ssrc[0]), taps[0]); |
| sum.val[1] = vmlal_s16(sum.val[1], vget_high_s16(ssrc[1]), taps[1]); |
| } else { |
| // 4 taps. |
| sum.val[0] = vmull_s16(vget_low_s16(ssrc[0]), taps[0]); |
| sum.val[0] = vmlal_s16(sum.val[0], vget_low_s16(ssrc[1]), taps[1]); |
| sum.val[0] = vmlal_s16(sum.val[0], vget_low_s16(ssrc[2]), taps[2]); |
| sum.val[0] = vmlal_s16(sum.val[0], vget_low_s16(ssrc[3]), taps[3]); |
| |
| sum.val[1] = vmull_s16(vget_high_s16(ssrc[0]), taps[0]); |
| sum.val[1] = vmlal_s16(sum.val[1], vget_high_s16(ssrc[1]), taps[1]); |
| sum.val[1] = vmlal_s16(sum.val[1], vget_high_s16(ssrc[2]), taps[2]); |
| sum.val[1] = vmlal_s16(sum.val[1], vget_high_s16(ssrc[3]), taps[3]); |
| } |
| return sum; |
| } |
| |
| template <int filter_index> |
| int32x4_t SumOnePassTaps(const uint16x4_t* const src, |
| const int16x4_t* const taps) { |
| const auto* ssrc = reinterpret_cast<const int16x4_t*>(src); |
| int32x4_t sum; |
| if (filter_index < 2) { |
| // 6 taps. |
| sum = vmull_s16(ssrc[0], taps[0]); |
| sum = vmlal_s16(sum, ssrc[1], taps[1]); |
| sum = vmlal_s16(sum, ssrc[2], taps[2]); |
| sum = vmlal_s16(sum, ssrc[3], taps[3]); |
| sum = vmlal_s16(sum, ssrc[4], taps[4]); |
| sum = vmlal_s16(sum, ssrc[5], taps[5]); |
| } else if (filter_index == 2) { |
| // 8 taps. |
| sum = vmull_s16(ssrc[0], taps[0]); |
| sum = vmlal_s16(sum, ssrc[1], taps[1]); |
| sum = vmlal_s16(sum, ssrc[2], taps[2]); |
| sum = vmlal_s16(sum, ssrc[3], taps[3]); |
| sum = vmlal_s16(sum, ssrc[4], taps[4]); |
| sum = vmlal_s16(sum, ssrc[5], taps[5]); |
| sum = vmlal_s16(sum, ssrc[6], taps[6]); |
| sum = vmlal_s16(sum, ssrc[7], taps[7]); |
| } else if (filter_index == 3) { |
| // 2 taps. |
| sum = vmull_s16(ssrc[0], taps[0]); |
| sum = vmlal_s16(sum, ssrc[1], taps[1]); |
| } else { |
| // 4 taps. |
| sum = vmull_s16(ssrc[0], taps[0]); |
| sum = vmlal_s16(sum, ssrc[1], taps[1]); |
| sum = vmlal_s16(sum, ssrc[2], taps[2]); |
| sum = vmlal_s16(sum, ssrc[3], taps[3]); |
| } |
| return sum; |
| } |
| |
| template <int filter_index, bool is_compound, bool is_2d> |
| void FilterHorizontalWidth8AndUp(const uint16_t* LIBGAV1_RESTRICT src, |
| const ptrdiff_t src_stride, |
| void* LIBGAV1_RESTRICT const dest, |
| const ptrdiff_t pred_stride, const int width, |
| const int height, |
| const int16x4_t* const v_tap) { |
| auto* dest16 = static_cast<uint16_t*>(dest); |
| const uint16x4_t v_max_bitdepth = vdup_n_u16((1 << kBitdepth10) - 1); |
| if (is_2d) { |
| int x = 0; |
| do { |
| const uint16_t* s = src + x; |
| int y = height; |
| do { // Increasing loop counter x is better. |
| const uint16x8_t src_long = vld1q_u16(s); |
| const uint16x8_t src_long_hi = vld1q_u16(s + 8); |
| uint16x8_t v_src[8]; |
| int32x4x2_t v_sum; |
| if (filter_index < 2) { |
| v_src[0] = src_long; |
| v_src[1] = vextq_u16(src_long, src_long_hi, 1); |
| v_src[2] = vextq_u16(src_long, src_long_hi, 2); |
| v_src[3] = vextq_u16(src_long, src_long_hi, 3); |
| v_src[4] = vextq_u16(src_long, src_long_hi, 4); |
| v_src[5] = vextq_u16(src_long, src_long_hi, 5); |
| v_sum = SumOnePassTaps<filter_index>(v_src, v_tap + 1); |
| } else if (filter_index == 2) { |
| v_src[0] = src_long; |
| v_src[1] = vextq_u16(src_long, src_long_hi, 1); |
| v_src[2] = vextq_u16(src_long, src_long_hi, 2); |
| v_src[3] = vextq_u16(src_long, src_long_hi, 3); |
| v_src[4] = vextq_u16(src_long, src_long_hi, 4); |
| v_src[5] = vextq_u16(src_long, src_long_hi, 5); |
| v_src[6] = vextq_u16(src_long, src_long_hi, 6); |
| v_src[7] = vextq_u16(src_long, src_long_hi, 7); |
| v_sum = SumOnePassTaps<filter_index>(v_src, v_tap); |
| } else if (filter_index == 3) { |
| v_src[0] = src_long; |
| v_src[1] = vextq_u16(src_long, src_long_hi, 1); |
| v_sum = SumOnePassTaps<filter_index>(v_src, v_tap + 3); |
| } else { // filter_index > 3 |
| v_src[0] = src_long; |
| v_src[1] = vextq_u16(src_long, src_long_hi, 1); |
| v_src[2] = vextq_u16(src_long, src_long_hi, 2); |
| v_src[3] = vextq_u16(src_long, src_long_hi, 3); |
| v_sum = SumOnePassTaps<filter_index>(v_src, v_tap + 2); |
| } |
| |
| const int16x4_t d0 = |
| vqrshrn_n_s32(v_sum.val[0], kInterRoundBitsHorizontal - 1); |
| const int16x4_t d1 = |
| vqrshrn_n_s32(v_sum.val[1], kInterRoundBitsHorizontal - 1); |
| vst1_u16(&dest16[0], vreinterpret_u16_s16(d0)); |
| vst1_u16(&dest16[4], vreinterpret_u16_s16(d1)); |
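// |dest16| advances by 8 below because the 2D intermediate buffer stores
// all |height| rows of each 8-wide column block contiguously;
// Filter2DVerticalWidth8AndUp() reads it back in the same order.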
| s += src_stride; |
| dest16 += 8; |
| } while (--y != 0); |
| x += 8; |
| } while (x < width); |
| return; |
| } |
| int y = height; |
| do { |
| int x = 0; |
| do { |
| const uint16x8_t src_long = vld1q_u16(src + x); |
| const uint16x8_t src_long_hi = vld1q_u16(src + x + 8); |
| uint16x8_t v_src[8]; |
| int32x4x2_t v_sum; |
| if (filter_index < 2) { |
| v_src[0] = src_long; |
| v_src[1] = vextq_u16(src_long, src_long_hi, 1); |
| v_src[2] = vextq_u16(src_long, src_long_hi, 2); |
| v_src[3] = vextq_u16(src_long, src_long_hi, 3); |
| v_src[4] = vextq_u16(src_long, src_long_hi, 4); |
| v_src[5] = vextq_u16(src_long, src_long_hi, 5); |
| v_sum = SumOnePassTaps<filter_index>(v_src, v_tap + 1); |
| } else if (filter_index == 2) { |
| v_src[0] = src_long; |
| v_src[1] = vextq_u16(src_long, src_long_hi, 1); |
| v_src[2] = vextq_u16(src_long, src_long_hi, 2); |
| v_src[3] = vextq_u16(src_long, src_long_hi, 3); |
| v_src[4] = vextq_u16(src_long, src_long_hi, 4); |
| v_src[5] = vextq_u16(src_long, src_long_hi, 5); |
| v_src[6] = vextq_u16(src_long, src_long_hi, 6); |
| v_src[7] = vextq_u16(src_long, src_long_hi, 7); |
| v_sum = SumOnePassTaps<filter_index>(v_src, v_tap); |
| } else if (filter_index == 3) { |
| v_src[0] = src_long; |
| v_src[1] = vextq_u16(src_long, src_long_hi, 1); |
| v_sum = SumOnePassTaps<filter_index>(v_src, v_tap + 3); |
| } else { // filter_index > 3 |
| v_src[0] = src_long; |
| v_src[1] = vextq_u16(src_long, src_long_hi, 1); |
| v_src[2] = vextq_u16(src_long, src_long_hi, 2); |
| v_src[3] = vextq_u16(src_long, src_long_hi, 3); |
| v_sum = SumOnePassTaps<filter_index>(v_src, v_tap + 2); |
| } |
| if (is_compound) { |
| const int16x4_t v_compound_offset = vdup_n_s16(kCompoundOffset); |
| const int16x4_t d0 = |
| vqrshrn_n_s32(v_sum.val[0], kInterRoundBitsHorizontal - 1); |
| const int16x4_t d1 = |
| vqrshrn_n_s32(v_sum.val[1], kInterRoundBitsHorizontal - 1); |
| vst1_u16(&dest16[x], |
| vreinterpret_u16_s16(vadd_s16(d0, v_compound_offset))); |
| vst1_u16(&dest16[x + 4], |
| vreinterpret_u16_s16(vadd_s16(d1, v_compound_offset))); |
| } else { |
| // Normally the Horizontal pass does the downshift in two passes: |
| // kInterRoundBitsHorizontal - 1 and then (kFilterBits - |
| // kInterRoundBitsHorizontal). Each one uses a rounding shift. |
| // Combining them requires adding the rounding offset from the skipped |
| // shift. |
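// Concretely, two rounding shifts by s1 and then s2 match one rounding
// shift by s1 + s2 once the first rounding bit is pre-added:
//   (((x + (1 << (s1 - 1))) >> s1) + (1 << (s2 - 1))) >> s2
//       == (x + (1 << (s1 - 1)) + (1 << (s1 + s2 - 1))) >> (s1 + s2)
// with s1 == kInterRoundBitsHorizontal - 1 and s1 + s2 == kFilterBits - 1;
// the vqrshrun below supplies the second rounding bit.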
| const int32x4_t v_first_shift_rounding_bit = |
| vdupq_n_s32(1 << (kInterRoundBitsHorizontal - 2)); |
| v_sum.val[0] = vaddq_s32(v_sum.val[0], v_first_shift_rounding_bit); |
| v_sum.val[1] = vaddq_s32(v_sum.val[1], v_first_shift_rounding_bit); |
| const uint16x4_t d0 = vmin_u16( |
| vqrshrun_n_s32(v_sum.val[0], kFilterBits - 1), v_max_bitdepth); |
| const uint16x4_t d1 = vmin_u16( |
| vqrshrun_n_s32(v_sum.val[1], kFilterBits - 1), v_max_bitdepth); |
| vst1_u16(&dest16[x], d0); |
| vst1_u16(&dest16[x + 4], d1); |
| } |
| x += 8; |
| } while (x < width); |
| src += src_stride; |
| dest16 += pred_stride; |
| } while (--y != 0); |
| } |
| |
| template <int filter_index, bool is_compound, bool is_2d> |
| void FilterHorizontalWidth4(const uint16_t* LIBGAV1_RESTRICT src, |
| const ptrdiff_t src_stride, |
| void* LIBGAV1_RESTRICT const dest, |
| const ptrdiff_t pred_stride, const int height, |
| const int16x4_t* const v_tap) { |
| auto* dest16 = static_cast<uint16_t*>(dest); |
| const uint16x4_t v_max_bitdepth = vdup_n_u16((1 << kBitdepth10) - 1); |
| int y = height; |
| do { |
| const uint16x8_t v_zero = vdupq_n_u16(0); |
| uint16x4_t v_src[4]; |
| int32x4_t v_sum; |
| const uint16x8_t src_long = vld1q_u16(src); |
| v_src[0] = vget_low_u16(src_long); |
| if (filter_index == 3) { |
| v_src[1] = vget_low_u16(vextq_u16(src_long, v_zero, 1)); |
| v_sum = SumOnePassTaps<filter_index>(v_src, v_tap + 3); |
| } else { |
| v_src[1] = vget_low_u16(vextq_u16(src_long, v_zero, 1)); |
| v_src[2] = vget_low_u16(vextq_u16(src_long, v_zero, 2)); |
| v_src[3] = vget_low_u16(vextq_u16(src_long, v_zero, 3)); |
| v_sum = SumOnePassTaps<filter_index>(v_src, v_tap + 2); |
| } |
| if (is_compound || is_2d) { |
| const int16x4_t d0 = vqrshrn_n_s32(v_sum, kInterRoundBitsHorizontal - 1); |
| if (is_compound && !is_2d) { |
| vst1_u16(&dest16[0], vreinterpret_u16_s16( |
| vadd_s16(d0, vdup_n_s16(kCompoundOffset)))); |
| } else { |
| vst1_u16(&dest16[0], vreinterpret_u16_s16(d0)); |
| } |
| } else { |
| const int32x4_t v_first_shift_rounding_bit = |
| vdupq_n_s32(1 << (kInterRoundBitsHorizontal - 2)); |
| v_sum = vaddq_s32(v_sum, v_first_shift_rounding_bit); |
| const uint16x4_t d0 = |
| vmin_u16(vqrshrun_n_s32(v_sum, kFilterBits - 1), v_max_bitdepth); |
| vst1_u16(&dest16[0], d0); |
| } |
| src += src_stride; |
| dest16 += pred_stride; |
| } while (--y != 0); |
| } |
| |
| template <int filter_index, bool is_2d> |
| void FilterHorizontalWidth2(const uint16_t* LIBGAV1_RESTRICT src, |
| const ptrdiff_t src_stride, |
| void* LIBGAV1_RESTRICT const dest, |
| const ptrdiff_t pred_stride, const int height, |
| const int16x4_t* const v_tap) { |
| auto* dest16 = static_cast<uint16_t*>(dest); |
| const uint16x4_t v_max_bitdepth = vdup_n_u16((1 << kBitdepth10) - 1); |
| int y = height >> 1; |
| do { |
| const int16x8_t v_zero = vdupq_n_s16(0); |
| const int16x8_t input0 = vreinterpretq_s16_u16(vld1q_u16(src)); |
| const int16x8_t input1 = vreinterpretq_s16_u16(vld1q_u16(src + src_stride)); |
| const int16x8x2_t input = vzipq_s16(input0, input1); |
| int32x4_t v_sum; |
| if (filter_index == 3) { |
| v_sum = vmull_s16(vget_low_s16(input.val[0]), v_tap[3]); |
| v_sum = vmlal_s16(v_sum, |
| vget_low_s16(vextq_s16(input.val[0], input.val[1], 2)), |
| v_tap[4]); |
| } else { |
| v_sum = vmull_s16(vget_low_s16(input.val[0]), v_tap[2]); |
| v_sum = vmlal_s16(v_sum, vget_low_s16(vextq_s16(input.val[0], v_zero, 2)), |
| v_tap[3]); |
| v_sum = vmlal_s16(v_sum, vget_low_s16(vextq_s16(input.val[0], v_zero, 4)), |
| v_tap[4]); |
| v_sum = vmlal_s16(v_sum, |
| vget_low_s16(vextq_s16(input.val[0], input.val[1], 6)), |
| v_tap[5]); |
| } |
| if (is_2d) { |
| const uint16x4_t d0 = vreinterpret_u16_s16( |
| vqrshrn_n_s32(v_sum, kInterRoundBitsHorizontal - 1)); |
| dest16[0] = vget_lane_u16(d0, 0); |
| dest16[1] = vget_lane_u16(d0, 2); |
| dest16 += pred_stride; |
| dest16[0] = vget_lane_u16(d0, 1); |
| dest16[1] = vget_lane_u16(d0, 3); |
| dest16 += pred_stride; |
| } else { |
| // Normally the Horizontal pass does the downshift in two passes: |
| // kInterRoundBitsHorizontal - 1 and then (kFilterBits - |
| // kInterRoundBitsHorizontal). Each one uses a rounding shift. |
| // Combining them requires adding the rounding offset from the skipped |
| // shift. |
| const int32x4_t v_first_shift_rounding_bit = |
| vdupq_n_s32(1 << (kInterRoundBitsHorizontal - 2)); |
| v_sum = vaddq_s32(v_sum, v_first_shift_rounding_bit); |
| const uint16x4_t d0 = |
| vmin_u16(vqrshrun_n_s32(v_sum, kFilterBits - 1), v_max_bitdepth); |
| dest16[0] = vget_lane_u16(d0, 0); |
| dest16[1] = vget_lane_u16(d0, 2); |
| dest16 += pred_stride; |
| dest16[0] = vget_lane_u16(d0, 1); |
| dest16[1] = vget_lane_u16(d0, 3); |
| dest16 += pred_stride; |
| } |
| src += src_stride << 1; |
| } while (--y != 0); |
| |
| // The 2d filters have an odd |height| because the horizontal pass |
| // generates context for the vertical pass. |
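// For example, a 4-tap vertical filter over a block of even |height| needs
// height + 3 rows of horizontal output, which is odd.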
| if (is_2d) { |
| assert(height % 2 == 1); |
| const int16x8_t input = vreinterpretq_s16_u16(vld1q_u16(src)); |
| int32x4_t v_sum; |
| if (filter_index == 3) { |
| v_sum = vmull_s16(vget_low_s16(input), v_tap[3]); |
| v_sum = |
| vmlal_s16(v_sum, vget_low_s16(vextq_s16(input, input, 1)), v_tap[4]); |
| } else { |
| v_sum = vmull_s16(vget_low_s16(input), v_tap[2]); |
| v_sum = |
| vmlal_s16(v_sum, vget_low_s16(vextq_s16(input, input, 1)), v_tap[3]); |
| v_sum = |
| vmlal_s16(v_sum, vget_low_s16(vextq_s16(input, input, 2)), v_tap[4]); |
| v_sum = |
| vmlal_s16(v_sum, vget_low_s16(vextq_s16(input, input, 3)), v_tap[5]); |
| } |
| const uint16x4_t d0 = vreinterpret_u16_s16( |
| vqrshrn_n_s32(v_sum, kInterRoundBitsHorizontal - 1)); |
| Store2<0>(dest16, d0); |
| } |
| } |
| |
| template <int filter_index, bool is_compound, bool is_2d> |
| void FilterHorizontal(const uint16_t* LIBGAV1_RESTRICT const src, |
| const ptrdiff_t src_stride, |
| void* LIBGAV1_RESTRICT const dest, |
| const ptrdiff_t pred_stride, const int width, |
| const int height, const int16x4_t* const v_tap) { |
| assert(width < 8 || filter_index <= 3); |
| // Don't simplify the redundant if conditions with the template parameters, |
| // which helps the compiler generate compact code. |
| if (width >= 8 && filter_index <= 3) { |
| FilterHorizontalWidth8AndUp<filter_index, is_compound, is_2d>( |
| src, src_stride, dest, pred_stride, width, height, v_tap); |
| return; |
| } |
| |
// The horizontal pass only needs to handle the 2 tap and 4 tap filters when
// |width| <= 4.
| assert(width <= 4); |
| assert(filter_index >= 3 && filter_index <= 5); |
| if (filter_index >= 3 && filter_index <= 5) { |
| if (width == 4) { |
| FilterHorizontalWidth4<filter_index, is_compound, is_2d>( |
| src, src_stride, dest, pred_stride, height, v_tap); |
| return; |
| } |
| assert(width == 2); |
| if (!is_compound) { |
| FilterHorizontalWidth2<filter_index, is_2d>(src, src_stride, dest, |
| pred_stride, height, v_tap); |
| } |
| } |
| } |
| |
| template <bool is_compound = false, bool is_2d = false> |
| LIBGAV1_ALWAYS_INLINE void DoHorizontalPass( |
| const uint16_t* LIBGAV1_RESTRICT const src, const ptrdiff_t src_stride, |
| void* LIBGAV1_RESTRICT const dst, const ptrdiff_t dst_stride, |
| const int width, const int height, const int filter_id, |
| const int filter_index) { |
// Broadcast each tap across a vector. The taps are kept signed;
// vmull_s16/vmlal_s16 handle negative taps directly, so no sign correction
// is needed.
| int16x4_t v_tap[kSubPixelTaps]; |
| assert(filter_id != 0); |
| |
| for (int k = 0; k < kSubPixelTaps; ++k) { |
| v_tap[k] = vdup_n_s16(kHalfSubPixelFilters[filter_index][filter_id][k]); |
| } |
| |
| if (filter_index == 2) { // 8 tap. |
| FilterHorizontal<2, is_compound, is_2d>(src, src_stride, dst, dst_stride, |
| width, height, v_tap); |
| } else if (filter_index == 1) { // 6 tap. |
| FilterHorizontal<1, is_compound, is_2d>(src + 1, src_stride, dst, |
| dst_stride, width, height, v_tap); |
| } else if (filter_index == 0) { // 6 tap. |
| FilterHorizontal<0, is_compound, is_2d>(src + 1, src_stride, dst, |
| dst_stride, width, height, v_tap); |
| } else if (filter_index == 4) { // 4 tap. |
| FilterHorizontal<4, is_compound, is_2d>(src + 2, src_stride, dst, |
| dst_stride, width, height, v_tap); |
| } else if (filter_index == 5) { // 4 tap. |
| FilterHorizontal<5, is_compound, is_2d>(src + 2, src_stride, dst, |
| dst_stride, width, height, v_tap); |
| } else { // 2 tap. |
| FilterHorizontal<3, is_compound, is_2d>(src + 3, src_stride, dst, |
| dst_stride, width, height, v_tap); |
| } |
| } |
| |
| void ConvolveHorizontal_NEON( |
| const void* LIBGAV1_RESTRICT const reference, |
| const ptrdiff_t reference_stride, const int horizontal_filter_index, |
| const int /*vertical_filter_index*/, const int horizontal_filter_id, |
| const int /*vertical_filter_id*/, const int width, const int height, |
| void* LIBGAV1_RESTRICT const prediction, const ptrdiff_t pred_stride) { |
| const int filter_index = GetFilterIndex(horizontal_filter_index, width); |
| // Set |src| to the outermost tap. |
| const auto* const src = |
| static_cast<const uint16_t*>(reference) - kHorizontalOffset; |
| auto* const dest = static_cast<uint16_t*>(prediction); |
| const ptrdiff_t src_stride = reference_stride >> 1; |
| const ptrdiff_t dst_stride = pred_stride >> 1; |
| |
| DoHorizontalPass(src, src_stride, dest, dst_stride, width, height, |
| horizontal_filter_id, filter_index); |
| } |
| |
| void ConvolveCompoundHorizontal_NEON( |
| const void* LIBGAV1_RESTRICT const reference, |
| const ptrdiff_t reference_stride, const int horizontal_filter_index, |
| const int /*vertical_filter_index*/, const int horizontal_filter_id, |
| const int /*vertical_filter_id*/, const int width, const int height, |
| void* LIBGAV1_RESTRICT const prediction, const ptrdiff_t /*pred_stride*/) { |
| const int filter_index = GetFilterIndex(horizontal_filter_index, width); |
| const auto* const src = |
| static_cast<const uint16_t*>(reference) - kHorizontalOffset; |
| auto* const dest = static_cast<uint16_t*>(prediction); |
| const ptrdiff_t src_stride = reference_stride >> 1; |
| |
| DoHorizontalPass</*is_compound=*/true>(src, src_stride, dest, width, width, |
| height, horizontal_filter_id, |
| filter_index); |
| } |
| |
| template <int filter_index, bool is_compound = false> |
| void FilterVertical(const uint16_t* LIBGAV1_RESTRICT const src, |
| const ptrdiff_t src_stride, |
| void* LIBGAV1_RESTRICT const dst, |
| const ptrdiff_t dst_stride, const int width, |
| const int height, const int16x4_t* const taps) { |
| const int num_taps = GetNumTapsInFilter(filter_index); |
| const int next_row = num_taps - 1; |
| const uint16x4_t v_max_bitdepth = vdup_n_u16((1 << kBitdepth10) - 1); |
| auto* const dst16 = static_cast<uint16_t*>(dst); |
| assert(width >= 8); |
| |
| int x = 0; |
| do { |
| const uint16_t* src_x = src + x; |
| uint16x8_t srcs[8]; |
| srcs[0] = vld1q_u16(src_x); |
| src_x += src_stride; |
| if (num_taps >= 4) { |
| srcs[1] = vld1q_u16(src_x); |
| src_x += src_stride; |
| srcs[2] = vld1q_u16(src_x); |
| src_x += src_stride; |
| if (num_taps >= 6) { |
| srcs[3] = vld1q_u16(src_x); |
| src_x += src_stride; |
| srcs[4] = vld1q_u16(src_x); |
| src_x += src_stride; |
| if (num_taps == 8) { |
| srcs[5] = vld1q_u16(src_x); |
| src_x += src_stride; |
| srcs[6] = vld1q_u16(src_x); |
| src_x += src_stride; |
| } |
| } |
| } |
| |
| // Decreasing the y loop counter produces worse code with clang. |
| // Don't unroll this loop since it generates too much code and the decoder |
| // is even slower. |
| int y = 0; |
| do { |
| srcs[next_row] = vld1q_u16(src_x); |
| src_x += src_stride; |
| |
| const int32x4x2_t v_sum = SumOnePassTaps<filter_index>(srcs, taps); |
| if (is_compound) { |
| const int16x4_t v_compound_offset = vdup_n_s16(kCompoundOffset); |
| const int16x4_t d0 = |
| vqrshrn_n_s32(v_sum.val[0], kInterRoundBitsHorizontal - 1); |
| const int16x4_t d1 = |
| vqrshrn_n_s32(v_sum.val[1], kInterRoundBitsHorizontal - 1); |
| vst1_u16(dst16 + x + y * dst_stride, |
| vreinterpret_u16_s16(vadd_s16(d0, v_compound_offset))); |
| vst1_u16(dst16 + x + 4 + y * dst_stride, |
| vreinterpret_u16_s16(vadd_s16(d1, v_compound_offset))); |
| } else { |
| const uint16x4_t d0 = vmin_u16( |
| vqrshrun_n_s32(v_sum.val[0], kFilterBits - 1), v_max_bitdepth); |
| const uint16x4_t d1 = vmin_u16( |
| vqrshrun_n_s32(v_sum.val[1], kFilterBits - 1), v_max_bitdepth); |
| vst1_u16(dst16 + x + y * dst_stride, d0); |
| vst1_u16(dst16 + x + 4 + y * dst_stride, d1); |
| } |
| |
| srcs[0] = srcs[1]; |
| if (num_taps >= 4) { |
| srcs[1] = srcs[2]; |
| srcs[2] = srcs[3]; |
| if (num_taps >= 6) { |
| srcs[3] = srcs[4]; |
| srcs[4] = srcs[5]; |
| if (num_taps == 8) { |
| srcs[5] = srcs[6]; |
| srcs[6] = srcs[7]; |
| } |
| } |
| } |
| } while (++y < height); |
| x += 8; |
| } while (x < width); |
| } |
| |
| template <int filter_index, bool is_compound = false> |
| void FilterVertical4xH(const uint16_t* LIBGAV1_RESTRICT src, |
| const ptrdiff_t src_stride, |
| void* LIBGAV1_RESTRICT const dst, |
| const ptrdiff_t dst_stride, const int height, |
| const int16x4_t* const taps) { |
| const int num_taps = GetNumTapsInFilter(filter_index); |
| const int next_row = num_taps - 1; |
| const uint16x4_t v_max_bitdepth = vdup_n_u16((1 << kBitdepth10) - 1); |
| auto* dst16 = static_cast<uint16_t*>(dst); |
| |
| uint16x4_t srcs[9]; |
| srcs[0] = vld1_u16(src); |
| src += src_stride; |
| if (num_taps >= 4) { |
| srcs[1] = vld1_u16(src); |
| src += src_stride; |
| srcs[2] = vld1_u16(src); |
| src += src_stride; |
| if (num_taps >= 6) { |
| srcs[3] = vld1_u16(src); |
| src += src_stride; |
| srcs[4] = vld1_u16(src); |
| src += src_stride; |
| if (num_taps == 8) { |
| srcs[5] = vld1_u16(src); |
| src += src_stride; |
| srcs[6] = vld1_u16(src); |
| src += src_stride; |
| } |
| } |
| } |
| |
| int y = height; |
| do { |
| srcs[next_row] = vld1_u16(src); |
| src += src_stride; |
| srcs[num_taps] = vld1_u16(src); |
| src += src_stride; |
| |
| const int32x4_t v_sum = SumOnePassTaps<filter_index>(srcs, taps); |
| const int32x4_t v_sum_1 = SumOnePassTaps<filter_index>(srcs + 1, taps); |
| if (is_compound) { |
| const int16x4_t d0 = vqrshrn_n_s32(v_sum, kInterRoundBitsHorizontal - 1); |
| const int16x4_t d1 = |
| vqrshrn_n_s32(v_sum_1, kInterRoundBitsHorizontal - 1); |
| vst1_u16(dst16, |
| vreinterpret_u16_s16(vadd_s16(d0, vdup_n_s16(kCompoundOffset)))); |
| dst16 += dst_stride; |
| vst1_u16(dst16, |
| vreinterpret_u16_s16(vadd_s16(d1, vdup_n_s16(kCompoundOffset)))); |
| dst16 += dst_stride; |
| } else { |
| const uint16x4_t d0 = |
| vmin_u16(vqrshrun_n_s32(v_sum, kFilterBits - 1), v_max_bitdepth); |
| const uint16x4_t d1 = |
| vmin_u16(vqrshrun_n_s32(v_sum_1, kFilterBits - 1), v_max_bitdepth); |
| vst1_u16(dst16, d0); |
| dst16 += dst_stride; |
| vst1_u16(dst16, d1); |
| dst16 += dst_stride; |
| } |
| |
| srcs[0] = srcs[2]; |
| if (num_taps >= 4) { |
| srcs[1] = srcs[3]; |
| srcs[2] = srcs[4]; |
| if (num_taps >= 6) { |
| srcs[3] = srcs[5]; |
| srcs[4] = srcs[6]; |
| if (num_taps == 8) { |
| srcs[5] = srcs[7]; |
| srcs[6] = srcs[8]; |
| } |
| } |
| } |
| y -= 2; |
| } while (y != 0); |
| } |
| |
| template <int filter_index> |
| void FilterVertical2xH(const uint16_t* LIBGAV1_RESTRICT src, |
| const ptrdiff_t src_stride, |
| void* LIBGAV1_RESTRICT const dst, |
| const ptrdiff_t dst_stride, const int height, |
| const int16x4_t* const taps) { |
| const int num_taps = GetNumTapsInFilter(filter_index); |
| const int next_row = num_taps - 1; |
| const uint16x4_t v_max_bitdepth = vdup_n_u16((1 << kBitdepth10) - 1); |
| auto* dst16 = static_cast<uint16_t*>(dst); |
| const uint16x4_t v_zero = vdup_n_u16(0); |
| |
| uint16x4_t srcs[9]; |
| srcs[0] = Load2<0>(src, v_zero); |
| src += src_stride; |
| if (num_taps >= 4) { |
| srcs[0] = Load2<1>(src, srcs[0]); |
| src += src_stride; |
| srcs[2] = Load2<0>(src, v_zero); |
| src += src_stride; |
| srcs[1] = vext_u16(srcs[0], srcs[2], 2); |
| if (num_taps >= 6) { |
| srcs[2] = Load2<1>(src, srcs[2]); |
| src += src_stride; |
| srcs[4] = Load2<0>(src, v_zero); |
| src += src_stride; |
| srcs[3] = vext_u16(srcs[2], srcs[4], 2); |
| if (num_taps == 8) { |
| srcs[4] = Load2<1>(src, srcs[4]); |
| src += src_stride; |
| srcs[6] = Load2<0>(src, v_zero); |
| src += src_stride; |
| srcs[5] = vext_u16(srcs[4], srcs[6], 2); |
| } |
| } |
| } |
| |
| int y = height; |
| do { |
| srcs[next_row - 1] = Load2<1>(src, srcs[next_row - 1]); |
| src += src_stride; |
| srcs[num_taps] = Load2<0>(src, v_zero); |
| src += src_stride; |
| srcs[next_row] = vext_u16(srcs[next_row - 1], srcs[num_taps], 2); |
| |
| const int32x4_t v_sum = SumOnePassTaps<filter_index>(srcs, taps); |
| const uint16x4_t d0 = |
| vmin_u16(vqrshrun_n_s32(v_sum, kFilterBits - 1), v_max_bitdepth); |
| Store2<0>(dst16, d0); |
| dst16 += dst_stride; |
| Store2<1>(dst16, d0); |
| dst16 += dst_stride; |
| |
| srcs[0] = srcs[2]; |
| if (num_taps >= 4) { |
| srcs[1] = srcs[3]; |
| srcs[2] = srcs[4]; |
| if (num_taps >= 6) { |
| srcs[3] = srcs[5]; |
| srcs[4] = srcs[6]; |
| if (num_taps == 8) { |
| srcs[5] = srcs[7]; |
| srcs[6] = srcs[8]; |
| } |
| } |
| } |
| y -= 2; |
| } while (y != 0); |
| } |
| |
| template <int num_taps, bool is_compound> |
| int16x8_t SimpleSum2DVerticalTaps(const int16x8_t* const src, |
| const int16x8_t taps) { |
| const int16x4_t taps_lo = vget_low_s16(taps); |
| const int16x4_t taps_hi = vget_high_s16(taps); |
| int32x4_t sum_lo, sum_hi; |
| if (num_taps == 8) { |
| sum_lo = vmull_lane_s16(vget_low_s16(src[0]), taps_lo, 0); |
| sum_hi = vmull_lane_s16(vget_high_s16(src[0]), taps_lo, 0); |
| sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[1]), taps_lo, 1); |
| sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[1]), taps_lo, 1); |
| sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[2]), taps_lo, 2); |
| sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[2]), taps_lo, 2); |
| sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[3]), taps_lo, 3); |
| sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[3]), taps_lo, 3); |
| |
| sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[4]), taps_hi, 0); |
| sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[4]), taps_hi, 0); |
| sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[5]), taps_hi, 1); |
| sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[5]), taps_hi, 1); |
| sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[6]), taps_hi, 2); |
| sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[6]), taps_hi, 2); |
| sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[7]), taps_hi, 3); |
| sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[7]), taps_hi, 3); |
| } else if (num_taps == 6) { |
| sum_lo = vmull_lane_s16(vget_low_s16(src[0]), taps_lo, 1); |
| sum_hi = vmull_lane_s16(vget_high_s16(src[0]), taps_lo, 1); |
| sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[1]), taps_lo, 2); |
| sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[1]), taps_lo, 2); |
| sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[2]), taps_lo, 3); |
| sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[2]), taps_lo, 3); |
| |
| sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[3]), taps_hi, 0); |
| sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[3]), taps_hi, 0); |
| sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[4]), taps_hi, 1); |
| sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[4]), taps_hi, 1); |
| sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[5]), taps_hi, 2); |
| sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[5]), taps_hi, 2); |
| } else if (num_taps == 4) { |
| sum_lo = vmull_lane_s16(vget_low_s16(src[0]), taps_lo, 2); |
| sum_hi = vmull_lane_s16(vget_high_s16(src[0]), taps_lo, 2); |
| sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[1]), taps_lo, 3); |
| sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[1]), taps_lo, 3); |
| |
| sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[2]), taps_hi, 0); |
| sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[2]), taps_hi, 0); |
| sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[3]), taps_hi, 1); |
| sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[3]), taps_hi, 1); |
| } else if (num_taps == 2) { |
| sum_lo = vmull_lane_s16(vget_low_s16(src[0]), taps_lo, 3); |
| sum_hi = vmull_lane_s16(vget_high_s16(src[0]), taps_lo, 3); |
| |
| sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[1]), taps_hi, 0); |
| sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[1]), taps_hi, 0); |
| } |
| |
| if (is_compound) { |
| // Output is compound, so leave signed and do not saturate. Offset will |
| // accurately bring the value back into positive range. |
| return vcombine_s16( |
| vrshrn_n_s32(sum_lo, kInterRoundBitsCompoundVertical - 1), |
| vrshrn_n_s32(sum_hi, kInterRoundBitsCompoundVertical - 1)); |
| } |
| |
| // Output is pixel, so saturate to clip at 0. |
| return vreinterpretq_s16_u16( |
| vcombine_u16(vqrshrun_n_s32(sum_lo, kInterRoundBitsVertical - 1), |
| vqrshrun_n_s32(sum_hi, kInterRoundBitsVertical - 1))); |
| } |
| |
| template <int num_taps, bool is_compound = false> |
| void Filter2DVerticalWidth8AndUp(const int16_t* LIBGAV1_RESTRICT src, |
| void* LIBGAV1_RESTRICT const dst, |
| const ptrdiff_t dst_stride, const int width, |
| const int height, const int16x8_t taps) { |
| assert(width >= 8); |
| constexpr int next_row = num_taps - 1; |
| const uint16x8_t v_max_bitdepth = vdupq_n_u16((1 << kBitdepth10) - 1); |
| auto* const dst16 = static_cast<uint16_t*>(dst); |
| |
| int x = 0; |
| do { |
| int16x8_t srcs[9]; |
| srcs[0] = vld1q_s16(src); |
| src += 8; |
| if (num_taps >= 4) { |
| srcs[1] = vld1q_s16(src); |
| src += 8; |
| srcs[2] = vld1q_s16(src); |
| src += 8; |
| if (num_taps >= 6) { |
| srcs[3] = vld1q_s16(src); |
| src += 8; |
| srcs[4] = vld1q_s16(src); |
| src += 8; |
| if (num_taps == 8) { |
| srcs[5] = vld1q_s16(src); |
| src += 8; |
| srcs[6] = vld1q_s16(src); |
| src += 8; |
| } |
| } |
| } |
| |
| uint16_t* d16 = dst16 + x; |
| int y = height; |
| do { |
| srcs[next_row] = vld1q_s16(src); |
| src += 8; |
| srcs[next_row + 1] = vld1q_s16(src); |
| src += 8; |
| const int16x8_t sum0 = |
| SimpleSum2DVerticalTaps<num_taps, is_compound>(srcs + 0, taps); |
| const int16x8_t sum1 = |
| SimpleSum2DVerticalTaps<num_taps, is_compound>(srcs + 1, taps); |
| if (is_compound) { |
| const int16x8_t v_compound_offset = vdupq_n_s16(kCompoundOffset); |
| vst1q_u16(d16, |
| vreinterpretq_u16_s16(vaddq_s16(sum0, v_compound_offset))); |
| d16 += dst_stride; |
| vst1q_u16(d16, |
| vreinterpretq_u16_s16(vaddq_s16(sum1, v_compound_offset))); |
| d16 += dst_stride; |
| } else { |
| vst1q_u16(d16, vminq_u16(vreinterpretq_u16_s16(sum0), v_max_bitdepth)); |
| d16 += dst_stride; |
| vst1q_u16(d16, vminq_u16(vreinterpretq_u16_s16(sum1), v_max_bitdepth)); |
| d16 += dst_stride; |
| } |
| srcs[0] = srcs[2]; |
| if (num_taps >= 4) { |
| srcs[1] = srcs[3]; |
| srcs[2] = srcs[4]; |
| if (num_taps >= 6) { |
| srcs[3] = srcs[5]; |
| srcs[4] = srcs[6]; |
| if (num_taps == 8) { |
| srcs[5] = srcs[7]; |
| srcs[6] = srcs[8]; |
| } |
| } |
| } |
| y -= 2; |
| } while (y != 0); |
| x += 8; |
| } while (x < width); |
| } |
| |
| // Take advantage of |src_stride| == |width| to process two rows at a time. |
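// Two 4-wide rows of the intermediate result are contiguous, so each 8-lane
// load covers a pair of rows.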
| template <int num_taps, bool is_compound = false> |
| void Filter2DVerticalWidth4(const int16_t* LIBGAV1_RESTRICT src, |
| void* LIBGAV1_RESTRICT const dst, |
| const ptrdiff_t dst_stride, const int height, |
| const int16x8_t taps) { |
| const uint16x8_t v_max_bitdepth = vdupq_n_u16((1 << kBitdepth10) - 1); |
| auto* dst16 = static_cast<uint16_t*>(dst); |
| |
| int16x8_t srcs[9]; |
| srcs[0] = vld1q_s16(src); |
| src += 8; |
| if (num_taps >= 4) { |
| srcs[2] = vld1q_s16(src); |
| src += 8; |
| srcs[1] = vcombine_s16(vget_high_s16(srcs[0]), vget_low_s16(srcs[2])); |
| if (num_taps >= 6) { |
| srcs[4] = vld1q_s16(src); |
| src += 8; |
| srcs[3] = vcombine_s16(vget_high_s16(srcs[2]), vget_low_s16(srcs[4])); |
| if (num_taps == 8) { |
| srcs[6] = vld1q_s16(src); |
| src += 8; |
| srcs[5] = vcombine_s16(vget_high_s16(srcs[4]), vget_low_s16(srcs[6])); |
| } |
| } |
| } |
| |
| int y = height; |
| do { |
| srcs[num_taps] = vld1q_s16(src); |
| src += 8; |
| srcs[num_taps - 1] = vcombine_s16(vget_high_s16(srcs[num_taps - 2]), |
| vget_low_s16(srcs[num_taps])); |
| |
| const int16x8_t sum = |
| SimpleSum2DVerticalTaps<num_taps, is_compound>(srcs, taps); |
| if (is_compound) { |
| const int16x8_t v_compound_offset = vdupq_n_s16(kCompoundOffset); |
| vst1q_u16(dst16, |
| vreinterpretq_u16_s16(vaddq_s16(sum, v_compound_offset))); |
| dst16 += 4 << 1; |
| } else { |
| const uint16x8_t d0 = |
| vminq_u16(vreinterpretq_u16_s16(sum), v_max_bitdepth); |
| vst1_u16(dst16, vget_low_u16(d0)); |
| dst16 += dst_stride; |
| vst1_u16(dst16, vget_high_u16(d0)); |
| dst16 += dst_stride; |
| } |
| |
| srcs[0] = srcs[2]; |
| if (num_taps >= 4) { |
| srcs[1] = srcs[3]; |
| srcs[2] = srcs[4]; |
| if (num_taps >= 6) { |
| srcs[3] = srcs[5]; |
| srcs[4] = srcs[6]; |
| if (num_taps == 8) { |
| srcs[5] = srcs[7]; |
| srcs[6] = srcs[8]; |
| } |
| } |
| } |
| y -= 2; |
| } while (y != 0); |
| } |
| |
| // Take advantage of |src_stride| == |width| to process four rows at a time. |
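// One 8-lane load covers four 2-wide rows, so each iteration below filters
// and stores four output rows.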
| template <int num_taps> |
| void Filter2DVerticalWidth2(const int16_t* LIBGAV1_RESTRICT src, |
| void* LIBGAV1_RESTRICT const dst, |
| const ptrdiff_t dst_stride, const int height, |
| const int16x8_t taps) { |
| constexpr int next_row = (num_taps < 6) ? 4 : 8; |
| const uint16x8_t v_max_bitdepth = vdupq_n_u16((1 << kBitdepth10) - 1); |
| auto* dst16 = static_cast<uint16_t*>(dst); |
| |
| int16x8_t srcs[9]; |
| srcs[0] = vld1q_s16(src); |
| src += 8; |
| if (num_taps >= 6) { |
| srcs[4] = vld1q_s16(src); |
| src += 8; |
| srcs[1] = vextq_s16(srcs[0], srcs[4], 2); |
| if (num_taps == 8) { |
| srcs[2] = vcombine_s16(vget_high_s16(srcs[0]), vget_low_s16(srcs[4])); |
| srcs[3] = vextq_s16(srcs[0], srcs[4], 6); |
| } |
| } |
| |
| int y = height; |
| do { |
| srcs[next_row] = vld1q_s16(src); |
| src += 8; |
| if (num_taps == 2) { |
| srcs[1] = vextq_s16(srcs[0], srcs[4], 2); |
| } else if (num_taps == 4) { |
| srcs[1] = vextq_s16(srcs[0], srcs[4], 2); |
| srcs[2] = vcombine_s16(vget_high_s16(srcs[0]), vget_low_s16(srcs[4])); |
| srcs[3] = vextq_s16(srcs[0], srcs[4], 6); |
| } else if (num_taps == 6) { |
| srcs[2] = vcombine_s16(vget_high_s16(srcs[0]), vget_low_s16(srcs[4])); |
| srcs[3] = vextq_s16(srcs[0], srcs[4], 6); |
| srcs[5] = vextq_s16(srcs[4], srcs[8], 2); |
| } else if (num_taps == 8) { |
| srcs[5] = vextq_s16(srcs[4], srcs[8], 2); |
| srcs[6] = vcombine_s16(vget_high_s16(srcs[4]), vget_low_s16(srcs[8])); |
| srcs[7] = vextq_s16(srcs[4], srcs[8], 6); |
| } |
| const int16x8_t sum = |
| SimpleSum2DVerticalTaps<num_taps, /*is_compound=*/false>(srcs, taps); |
| const uint16x8_t d0 = vminq_u16(vreinterpretq_u16_s16(sum), v_max_bitdepth); |
| Store2<0>(dst16, d0); |
| dst16 += dst_stride; |
| Store2<1>(dst16, d0); |
| // When |height| <= 4 the taps are restricted to 2 and 4 tap variants. |
| // Therefore we don't need to check this condition when |height| > 4. |
| if (num_taps <= 4 && height == 2) return; |
| dst16 += dst_stride; |
| Store2<2>(dst16, d0); |
| dst16 += dst_stride; |
| Store2<3>(dst16, d0); |
| dst16 += dst_stride; |
| |
| srcs[0] = srcs[4]; |
| if (num_taps == 6) { |
| srcs[1] = srcs[5]; |
| srcs[4] = srcs[8]; |
| } else if (num_taps == 8) { |
| srcs[1] = srcs[5]; |
| srcs[2] = srcs[6]; |
| srcs[3] = srcs[7]; |
| srcs[4] = srcs[8]; |
| } |
| |
| y -= 4; |
| } while (y != 0); |
| } |
| |
| template <int vertical_taps> |
| void Filter2DVertical(const int16_t* LIBGAV1_RESTRICT const intermediate_result, |
| const int width, const int height, const int16x8_t taps, |
| void* LIBGAV1_RESTRICT const prediction, |
| const ptrdiff_t pred_stride) { |
| auto* const dest = static_cast<uint16_t*>(prediction); |
| if (width >= 8) { |
| Filter2DVerticalWidth8AndUp<vertical_taps>( |
| intermediate_result, dest, pred_stride, width, height, taps); |
| } else if (width == 4) { |
| Filter2DVerticalWidth4<vertical_taps>(intermediate_result, dest, |
| pred_stride, height, taps); |
| } else { |
| assert(width == 2); |
| Filter2DVerticalWidth2<vertical_taps>(intermediate_result, dest, |
| pred_stride, height, taps); |
| } |
| } |
| |
| void Convolve2D_NEON(const void* LIBGAV1_RESTRICT const reference, |
| const ptrdiff_t reference_stride, |
| const int horizontal_filter_index, |
| const int vertical_filter_index, |
| const int horizontal_filter_id, |
| const int vertical_filter_id, const int width, |
| const int height, void* LIBGAV1_RESTRICT const prediction, |
| const ptrdiff_t pred_stride) { |
| const int horiz_filter_index = GetFilterIndex(horizontal_filter_index, width); |
| const int vert_filter_index = GetFilterIndex(vertical_filter_index, height); |
| const int vertical_taps = GetNumTapsInFilter(vert_filter_index); |
| // The output of the horizontal filter is guaranteed to fit in 16 bits. |
| int16_t intermediate_result[kMaxSuperBlockSizeInPixels * |
| (kMaxSuperBlockSizeInPixels + kSubPixelTaps - 1)]; |
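// The vertical pass needs |vertical_taps| - 1 extra rows of horizontal
// output as filter context.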
| const int intermediate_height = height + vertical_taps - 1; |
| const ptrdiff_t src_stride = reference_stride >> 1; |
| const auto* const src = static_cast<const uint16_t*>(reference) - |
| (vertical_taps / 2 - 1) * src_stride - |
| kHorizontalOffset; |
| const ptrdiff_t dest_stride = pred_stride >> 1; |
| |
| DoHorizontalPass</*is_compound=*/false, /*is_2d=*/true>( |
| src, src_stride, intermediate_result, width, width, intermediate_height, |
| horizontal_filter_id, horiz_filter_index); |
| |
| assert(vertical_filter_id != 0); |
| const int16x8_t taps = vmovl_s8( |
| vld1_s8(kHalfSubPixelFilters[vert_filter_index][vertical_filter_id])); |
| if (vertical_taps == 8) { |
| Filter2DVertical<8>(intermediate_result, width, height, taps, prediction, |
| dest_stride); |
| } else if (vertical_taps == 6) { |
| Filter2DVertical<6>(intermediate_result, width, height, taps, prediction, |
| dest_stride); |
| } else if (vertical_taps == 4) { |
| Filter2DVertical<4>(intermediate_result, width, height, taps, prediction, |
| dest_stride); |
| } else { // |vertical_taps| == 2 |
| Filter2DVertical<2>(intermediate_result, width, height, taps, prediction, |
| dest_stride); |
| } |
| } |
| |
| template <int vertical_taps> |
| void Compound2DVertical( |
| const int16_t* LIBGAV1_RESTRICT const intermediate_result, const int width, |
| const int height, const int16x8_t taps, |
| void* LIBGAV1_RESTRICT const prediction) { |
| auto* const dest = static_cast<uint16_t*>(prediction); |
| if (width == 4) { |
| Filter2DVerticalWidth4<vertical_taps, /*is_compound=*/true>( |
| intermediate_result, dest, width, height, taps); |
| } else { |
| Filter2DVerticalWidth8AndUp<vertical_taps, /*is_compound=*/true>( |
| intermediate_result, dest, width, width, height, taps); |
| } |
| } |
| |
| void ConvolveCompound2D_NEON( |
| const void* LIBGAV1_RESTRICT const reference, |
| const ptrdiff_t reference_stride, const int horizontal_filter_index, |
| const int vertical_filter_index, const int horizontal_filter_id, |
| const int vertical_filter_id, const int width, const int height, |
| void* LIBGAV1_RESTRICT const prediction, const ptrdiff_t /*pred_stride*/) { |
| // The output of the horizontal filter, i.e. the intermediate_result, is |
| // guaranteed to fit in int16_t. |
| int16_t |
| intermediate_result[(kMaxSuperBlockSizeInPixels * |
| (kMaxSuperBlockSizeInPixels + kSubPixelTaps - 1))]; |
| |
| // Horizontal filter. |
| // Filter types used for width <= 4 are different from those for width > 4. |
| // When width > 4, the valid filter index range is always [0, 3]. |
// When width <= 4, the valid filter index range is always [3, 5].
| // Similarly for height. |
| const int horiz_filter_index = GetFilterIndex(horizontal_filter_index, width); |
| const int vert_filter_index = GetFilterIndex(vertical_filter_index, height); |
| const int vertical_taps = GetNumTapsInFilter(vert_filter_index); |
| const int intermediate_height = height + vertical_taps - 1; |
| const ptrdiff_t src_stride = reference_stride >> 1; |
| const auto* const src = static_cast<const uint16_t*>(reference) - |
| (vertical_taps / 2 - 1) * src_stride - |
| kHorizontalOffset; |
| |
DoHorizontalPass</*is_compound=*/true, /*is_2d=*/true>(
| src, src_stride, intermediate_result, width, width, intermediate_height, |
| horizontal_filter_id, horiz_filter_index); |
| |
| // Vertical filter. |
| assert(vertical_filter_id != 0); |
| const int16x8_t taps = vmovl_s8( |
| vld1_s8(kHalfSubPixelFilters[vert_filter_index][vertical_filter_id])); |
| if (vertical_taps == 8) { |
| Compound2DVertical<8>(intermediate_result, width, height, taps, prediction); |
| } else if (vertical_taps == 6) { |
| Compound2DVertical<6>(intermediate_result, width, height, taps, prediction); |
| } else if (vertical_taps == 4) { |
| Compound2DVertical<4>(intermediate_result, width, height, taps, prediction); |
| } else { // |vertical_taps| == 2 |
| Compound2DVertical<2>(intermediate_result, width, height, taps, prediction); |
| } |
| } |
| |
| void ConvolveVertical_NEON( |
| const void* LIBGAV1_RESTRICT const reference, |
| const ptrdiff_t reference_stride, const int /*horizontal_filter_index*/, |
| const int vertical_filter_index, const int /*horizontal_filter_id*/, |
| const int vertical_filter_id, const int width, const int height, |
| void* LIBGAV1_RESTRICT const prediction, const ptrdiff_t pred_stride) { |
| const int filter_index = GetFilterIndex(vertical_filter_index, height); |
| const int vertical_taps = GetNumTapsInFilter(filter_index); |
| const ptrdiff_t src_stride = reference_stride >> 1; |
| const auto* src = static_cast<const uint16_t*>(reference) - |
| (vertical_taps / 2 - 1) * src_stride; |
| auto* const dest = static_cast<uint16_t*>(prediction); |
| const ptrdiff_t dest_stride = pred_stride >> 1; |
| assert(vertical_filter_id != 0); |
| |
| int16x4_t taps[8]; |
| for (int k = 0; k < kSubPixelTaps; ++k) { |
| taps[k] = |
| vdup_n_s16(kHalfSubPixelFilters[filter_index][vertical_filter_id][k]); |
| } |
| |
| if (filter_index == 0) { // 6 tap. |
| if (width == 2) { |
| FilterVertical2xH<0>(src, src_stride, dest, dest_stride, height, |
| taps + 1); |
| } else if (width == 4) { |
| FilterVertical4xH<0>(src, src_stride, dest, dest_stride, height, |
| taps + 1); |
| } else { |
| FilterVertical<0>(src, src_stride, dest, dest_stride, width, height, |
| taps + 1); |
| } |
| } else if ((static_cast<int>(filter_index == 1) & |
| (static_cast<int>(vertical_filter_id == 1) | |
| static_cast<int>(vertical_filter_id == 7) | |
| static_cast<int>(vertical_filter_id == 8) | |
| static_cast<int>(vertical_filter_id == 9) | |
| static_cast<int>(vertical_filter_id == 15))) != 0) { // 6 tap. |
| if (width == 2) { |
| FilterVertical2xH<1>(src, src_stride, dest, dest_stride, height, |
| taps + 1); |
| } else if (width == 4) { |
| FilterVertical4xH<1>(src, src_stride, dest, dest_stride, height, |
| taps + 1); |
| } else { |
| FilterVertical<1>(src, src_stride, dest, dest_stride, width, height, |
| taps + 1); |
| } |
| } else if (filter_index == 2) { // 8 tap. |
| if (width == 2) { |
| FilterVertical2xH<2>(src, src_stride, dest, dest_stride, height, taps); |
| } else if (width == 4) { |
| FilterVertical4xH<2>(src, src_stride, dest, dest_stride, height, taps); |
| } else { |
| FilterVertical<2>(src, src_stride, dest, dest_stride, width, height, |
| taps); |
| } |
| } else if (filter_index == 3) { // 2 tap. |
| if (width == 2) { |
| FilterVertical2xH<3>(src, src_stride, dest, dest_stride, height, |
| taps + 3); |
| } else if (width == 4) { |
| FilterVertical4xH<3>(src, src_stride, dest, dest_stride, height, |
| taps + 3); |
| } else { |
| FilterVertical<3>(src, src_stride, dest, dest_stride, width, height, |
| taps + 3); |
| } |
| } else { |
| // 4 tap. When |filter_index| == 1 the |vertical_filter_id| values listed |
| // below map to 4 tap filters. |
| assert(filter_index == 5 || filter_index == 4 || |
| (filter_index == 1 && |
(vertical_filter_id == 2 || vertical_filter_id == 3 ||
 vertical_filter_id == 4 || vertical_filter_id == 5 ||
 vertical_filter_id == 6 || vertical_filter_id == 10 ||
 vertical_filter_id == 11 || vertical_filter_id == 12 ||
 vertical_filter_id == 13 || vertical_filter_id == 14)));
| // According to GetNumTapsInFilter() this has 6 taps but here we are |
| // treating it as though it has 4. |
| if (filter_index == 1) src += src_stride; |
| if (width == 2) { |
| FilterVertical2xH<5>(src, src_stride, dest, dest_stride, height, |
| taps + 2); |
| } else if (width == 4) { |
| FilterVertical4xH<5>(src, src_stride, dest, dest_stride, height, |
| taps + 2); |
| } else { |
| FilterVertical<5>(src, src_stride, dest, dest_stride, width, height, |
| taps + 2); |
| } |
| } |
| } |
| |
| void ConvolveCompoundVertical_NEON( |
| const void* LIBGAV1_RESTRICT const reference, |
| const ptrdiff_t reference_stride, const int /*horizontal_filter_index*/, |
| const int vertical_filter_index, const int /*horizontal_filter_id*/, |
| const int vertical_filter_id, const int width, const int height, |
| void* LIBGAV1_RESTRICT const prediction, const ptrdiff_t /*pred_stride*/) { |
| const int filter_index = GetFilterIndex(vertical_filter_index, height); |
| const int vertical_taps = GetNumTapsInFilter(filter_index); |
| const ptrdiff_t src_stride = reference_stride >> 1; |
| const auto* src = static_cast<const uint16_t*>(reference) - |
| (vertical_taps / 2 - 1) * src_stride; |
| auto* const dest = static_cast<uint16_t*>(prediction); |
| assert(vertical_filter_id != 0); |
| |
| int16x4_t taps[8]; |
| for (int k = 0; k < kSubPixelTaps; ++k) { |
| taps[k] = |
| vdup_n_s16(kHalfSubPixelFilters[filter_index][vertical_filter_id][k]); |
| } |
| |
| if (filter_index == 0) { // 6 tap. |
| if (width == 4) { |
| FilterVertical4xH<0, /*is_compound=*/true>(src, src_stride, dest, 4, |
| height, taps + 1); |
| } else { |
| FilterVertical<0, /*is_compound=*/true>(src, src_stride, dest, width, |
| width, height, taps + 1); |
| } |
| } else if ((static_cast<int>(filter_index == 1) & |
| (static_cast<int>(vertical_filter_id == 1) | |
| static_cast<int>(vertical_filter_id == 7) | |
| static_cast<int>(vertical_filter_id == 8) | |
| static_cast<int>(vertical_filter_id == 9) | |
| static_cast<int>(vertical_filter_id == 15))) != 0) { // 6 tap. |
| if (width == 4) { |
| FilterVertical4xH<1, /*is_compound=*/true>(src, src_stride, dest, 4, |
| height, taps + 1); |
| } else { |
| FilterVertical<1, /*is_compound=*/true>(src, src_stride, dest, width, |
| width, height, taps + 1); |
| } |
| } else if (filter_index == 2) { // 8 tap. |
| if (width == 4) { |
| FilterVertical4xH<2, /*is_compound=*/true>(src, src_stride, dest, 4, |
| height, taps); |
| } else { |
| FilterVertical<2, /*is_compound=*/true>(src, src_stride, dest, width, |
| width, height, taps); |
| } |
| } else if (filter_index == 3) { // 2 tap. |
| if (width == 4) { |
| FilterVertical4xH<3, /*is_compound=*/true>(src, src_stride, dest, 4, |
| height, taps + 3); |
| } else { |
| FilterVertical<3, /*is_compound=*/true>(src, src_stride, dest, width, |
| width, height, taps + 3); |
| } |
| } else { |
| // 4 tap. When |filter_index| == 1 the |filter_id| values listed below map |
| // to 4 tap filters. |
| assert(filter_index == 5 || filter_index == 4 || |
| (filter_index == 1 && |
| (vertical_filter_id == 2 || vertical_filter_id == 3 || |
| vertical_filter_id == 4 || vertical_filter_id == 5 || |
| vertical_filter_id == 6 || vertical_filter_id == 10 || |
| vertical_filter_id == 11 || vertical_filter_id == 12 || |
| vertical_filter_id == 13 || vertical_filter_id == 14))); |
| // According to GetNumTapsInFilter() this has 6 taps but here we are |
| // treating it as though it has 4. |
| if (filter_index == 1) src += src_stride; |
| if (width == 4) { |
| FilterVertical4xH<5, /*is_compound=*/true>(src, src_stride, dest, 4, |
| height, taps + 2); |
| } else { |
| FilterVertical<5, /*is_compound=*/true>(src, src_stride, dest, width, |
| width, height, taps + 2); |
| } |
| } |
| } |
| |
| void ConvolveCompoundCopy_NEON( |
| const void* const reference, const ptrdiff_t reference_stride, |
| const int /*horizontal_filter_index*/, const int /*vertical_filter_index*/, |
| const int /*horizontal_filter_id*/, const int /*vertical_filter_id*/, |
| const int width, const int height, void* const prediction, |
| const ptrdiff_t /*pred_stride*/) { |
| const auto* src = static_cast<const uint16_t*>(reference); |
| const ptrdiff_t src_stride = reference_stride >> 1; |
| auto* dest = static_cast<uint16_t*>(prediction); |
| constexpr int final_shift = |
| kInterRoundBitsVertical - kInterRoundBitsCompoundVertical; |
| const uint16x8_t offset = |
| vdupq_n_u16((1 << kBitdepth10) + (1 << (kBitdepth10 - 1))); |
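// Assuming kInterRoundBitsVertical == 11 and kInterRoundBitsCompoundVertical
// == 7, |final_shift| is 4 and the largest possible output is
// (1023 + 1536) << 4 == 40944, comfortably within uint16_t.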
| |
| if (width >= 16) { |
| int y = height; |
| do { |
| int x = 0; |
| int w = width; |
| do { |
| const uint16x8_t v_src_lo = vld1q_u16(&src[x]); |
| const uint16x8_t v_src_hi = vld1q_u16(&src[x + 8]); |
| const uint16x8_t v_sum_lo = vaddq_u16(v_src_lo, offset); |
| const uint16x8_t v_sum_hi = vaddq_u16(v_src_hi, offset); |
| const uint16x8_t v_dest_lo = vshlq_n_u16(v_sum_lo, final_shift); |
| const uint16x8_t v_dest_hi = vshlq_n_u16(v_sum_hi, final_shift); |
| vst1q_u16(&dest[x], v_dest_lo); |
| vst1q_u16(&dest[x + 8], v_dest_hi); |
| x += 16; |
| w -= 16; |
| } while (w != 0); |
| src += src_stride; |
| dest += width; |
| } while (--y != 0); |
| } else if (width == 8) { |
| int y = height; |
| do { |
| const uint16x8_t v_src_lo = vld1q_u16(&src[0]); |
| const uint16x8_t v_src_hi = vld1q_u16(&src[src_stride]); |
| const uint16x8_t v_sum_lo = vaddq_u16(v_src_lo, offset); |
| const uint16x8_t v_sum_hi = vaddq_u16(v_src_hi, offset); |
| const uint16x8_t v_dest_lo = vshlq_n_u16(v_sum_lo, final_shift); |
| const uint16x8_t v_dest_hi = vshlq_n_u16(v_sum_hi, final_shift); |
| vst1q_u16(&dest[0], v_dest_lo); |
| vst1q_u16(&dest[8], v_dest_hi); |
| src += src_stride << 1; |
| dest += 16; |
| y -= 2; |
| } while (y != 0); |
| } else { // width == 4 |
| int y = height; |
| do { |
| const uint16x4_t v_src_lo = vld1_u16(&src[0]); |
| const uint16x4_t v_src_hi = vld1_u16(&src[src_stride]); |
| const uint16x4_t v_sum_lo = vadd_u16(v_src_lo, vget_low_u16(offset)); |
| const uint16x4_t v_sum_hi = vadd_u16(v_src_hi, vget_low_u16(offset)); |
| const uint16x4_t v_dest_lo = vshl_n_u16(v_sum_lo, final_shift); |
| const uint16x4_t v_dest_hi = vshl_n_u16(v_sum_hi, final_shift); |
| vst1_u16(&dest[0], v_dest_lo); |
| vst1_u16(&dest[4], v_dest_hi); |
| src += src_stride << 1; |
| dest += 8; |
| y -= 2; |
| } while (y != 0); |
| } |
| } |
| |
| inline void HalfAddHorizontal(const uint16_t* LIBGAV1_RESTRICT const src, |
| uint16_t* LIBGAV1_RESTRICT const dst) { |
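// vrhadd computes the rounded average (a + b + 1) >> 1 of each pixel and
// its right neighbor, i.e. the half-pel sample used by intra block copy.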
| const uint16x8_t left = vld1q_u16(src); |
| const uint16x8_t right = vld1q_u16(src + 1); |
| vst1q_u16(dst, vrhaddq_u16(left, right)); |
| } |
| |
| inline void HalfAddHorizontal16(const uint16_t* LIBGAV1_RESTRICT const src, |
| uint16_t* LIBGAV1_RESTRICT const dst) { |
| HalfAddHorizontal(src, dst); |
| HalfAddHorizontal(src + 8, dst + 8); |
| } |
| |
| template <int width> |
| inline void IntraBlockCopyHorizontal(const uint16_t* LIBGAV1_RESTRICT src, |
| const ptrdiff_t src_stride, |
| const int height, |
| uint16_t* LIBGAV1_RESTRICT dst, |
| const ptrdiff_t dst_stride) { |
| const ptrdiff_t src_remainder_stride = src_stride - (width - 16); |
| const ptrdiff_t dst_remainder_stride = dst_stride - (width - 16); |
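// The pointers advance |width| - 16 within each row below, so the remainder
// strides return them to the start of the next row.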
| |
| int y = height; |
| do { |
| HalfAddHorizontal16(src, dst); |
| if (width >= 32) { |
| src += 16; |
| dst += 16; |
| HalfAddHorizontal16(src, dst); |
| if (width >= 64) { |
| src += 16; |
| dst += 16; |
| HalfAddHorizontal16(src, dst); |
| src += 16; |
| dst += 16; |
| HalfAddHorizontal16(src, dst); |
| if (width == 128) { |
| src += 16; |
| dst += 16; |
| HalfAddHorizontal16(src, dst); |
| src += 16; |
| dst += 16; |
| HalfAddHorizontal16(src, dst); |
| src += 16; |
| dst += 16; |
| HalfAddHorizontal16(src, dst); |
| src += 16; |
| dst += 16; |
| HalfAddHorizontal16(src, dst); |
| } |
| } |
| } |
| src += src_remainder_stride; |
| dst += dst_remainder_stride; |
| } while (--y != 0); |
| } |
| |
| void ConvolveIntraBlockCopyHorizontal_NEON( |
| const void* LIBGAV1_RESTRICT const reference, |
| const ptrdiff_t reference_stride, const int /*horizontal_filter_index*/, |
const int /*vertical_filter_index*/, const int /*horizontal_filter_id*/,
const int /*vertical_filter_id*/, const int width, const int height,
| void* LIBGAV1_RESTRICT const prediction, const ptrdiff_t pred_stride) { |
| assert(width >= 4 && width <= kMaxSuperBlockSizeInPixels); |
| assert(height >= 4 && height <= kMaxSuperBlockSizeInPixels); |
| const auto* src = static_cast<const uint16_t*>(reference); |
| auto* dest = static_cast<uint16_t*>(prediction); |
| const ptrdiff_t src_stride = reference_stride >> 1; |
| const ptrdiff_t dst_stride = pred_stride >> 1; |
| |
| if (width == 128) { |
| IntraBlockCopyHorizontal<128>(src, src_stride, height, dest, dst_stride); |
| } else if (width == 64) { |
| IntraBlockCopyHorizontal<64>(src, src_stride, height, dest, dst_stride); |
| } else if (width == 32) { |
| IntraBlockCopyHorizontal<32>(src, src_stride, height, dest, dst_stride); |
| } else if (width == 16) { |
| IntraBlockCopyHorizontal<16>(src, src_stride, height, dest, dst_stride); |
| } else if (width == 8) { |
| int y = height; |
| do { |
| HalfAddHorizontal(src, dest); |
| src += src_stride; |
| dest += dst_stride; |
| } while (--y != 0); |
| } else { // width == 4 |
| int y = height; |
| do { |
| uint16x4x2_t left; |
| uint16x4x2_t right; |
| left.val[0] = vld1_u16(src); |
| right.val[0] = vld1_u16(src + 1); |
| src += src_stride; |
| left.val[1] = vld1_u16(src); |
| right.val[1] = vld1_u16(src + 1); |
| src += src_stride; |
| |
| vst1_u16(dest, vrhadd_u16(left.val[0], right.val[0])); |
| dest += dst_stride; |
| vst1_u16(dest, vrhadd_u16(left.val[1], right.val[1])); |
| dest += dst_stride; |
| y -= 2; |
| } while (y != 0); |
| } |
| } |
| |
| template <int width> |
| inline void IntraBlockCopyVertical(const uint16_t* LIBGAV1_RESTRICT src, |
| const ptrdiff_t src_stride, const int height, |
| uint16_t* LIBGAV1_RESTRICT dst, |
| const ptrdiff_t dst_stride) { |
| const ptrdiff_t src_remainder_stride = src_stride - (width - 8); |
| const ptrdiff_t dst_remainder_stride = dst_stride - (width - 8); |
| uint16x8_t row[8], below[8]; |
| |
| row[0] = vld1q_u16(src); |
| if (width >= 16) { |
| src += 8; |
| row[1] = vld1q_u16(src); |
| if (width >= 32) { |
| src += 8; |
| row[2] = vld1q_u16(src); |
| src += 8; |
| row[3] = vld1q_u16(src); |
| if (width == 64) { |
| src += 8; |
| row[4] = vld1q_u16(src); |
| src += 8; |
| row[5] = vld1q_u16(src); |
| src += 8; |
| row[6] = vld1q_u16(src); |
| src += 8; |
| row[7] = vld1q_u16(src); |
| } |
| } |
| } |
| src += src_remainder_stride; |
| |
| int y = height; |
| do { |
| below[0] = vld1q_u16(src); |
| if (width >= 16) { |
| src += 8; |
| below[1] = vld1q_u16(src); |
| if (width >= 32) { |
| src += 8; |
| below[2] = vld1q_u16(src); |
| src += 8; |
| below[3] = vld1q_u16(src); |
| if (width == 64) { |
| src += 8; |
| below[4] = vld1q_u16(src); |
| src += 8; |
| below[5] = vld1q_u16(src); |
| src += 8; |
| below[6] = vld1q_u16(src); |
| src += 8; |
| below[7] = vld1q_u16(src); |
| } |
| } |
| } |
| src += src_remainder_stride; |
| |
| vst1q_u16(dst, vrhaddq_u16(row[0], below[0])); |
| row[0] = below[0]; |
| if (width >= 16) { |
| dst += 8; |
| vst1q_u16(dst, vrhaddq_u16(row[1], below[1])); |
| row[1] = below[1]; |
| if (width >= 32) { |
| dst += 8; |
| vst1q_u16(dst, vrhaddq_u16(row[2], below[2])); |
| row[2] = below[2]; |
| dst += 8; |
| vst1q_u16(dst, vrhaddq_u16(row[3], below[3])); |
| row[3] = below[3]; |
        if (width == 64) {
| dst += 8; |
| vst1q_u16(dst, vrhaddq_u16(row[4], below[4])); |
| row[4] = below[4]; |
| dst += 8; |
| vst1q_u16(dst, vrhaddq_u16(row[5], below[5])); |
| row[5] = below[5]; |
| dst += 8; |
| vst1q_u16(dst, vrhaddq_u16(row[6], below[6])); |
| row[6] = below[6]; |
| dst += 8; |
| vst1q_u16(dst, vrhaddq_u16(row[7], below[7])); |
| row[7] = below[7]; |
| } |
| } |
| } |
| dst += dst_remainder_stride; |
| } while (--y != 0); |
| } |
| |
| void ConvolveIntraBlockCopyVertical_NEON( |
| const void* LIBGAV1_RESTRICT const reference, |
| const ptrdiff_t reference_stride, const int /*horizontal_filter_index*/, |
| const int /*vertical_filter_index*/, const int /*horizontal_filter_id*/, |
| const int /*vertical_filter_id*/, const int width, const int height, |
| void* LIBGAV1_RESTRICT const prediction, const ptrdiff_t pred_stride) { |
| assert(width >= 4 && width <= kMaxSuperBlockSizeInPixels); |
| assert(height >= 4 && height <= kMaxSuperBlockSizeInPixels); |
| const auto* src = static_cast<const uint16_t*>(reference); |
| auto* dest = static_cast<uint16_t*>(prediction); |
| const ptrdiff_t src_stride = reference_stride >> 1; |
| const ptrdiff_t dst_stride = pred_stride >> 1; |
| |
| if (width == 128) { |
    // Due to register pressure, process the block as two side-by-side 64xH
    // halves.
| for (int i = 0; i < 2; ++i) { |
| IntraBlockCopyVertical<64>(src, src_stride, height, dest, dst_stride); |
| src += 64; |
| dest += 64; |
| } |
| } else if (width == 64) { |
| IntraBlockCopyVertical<64>(src, src_stride, height, dest, dst_stride); |
| } else if (width == 32) { |
| IntraBlockCopyVertical<32>(src, src_stride, height, dest, dst_stride); |
| } else if (width == 16) { |
| IntraBlockCopyVertical<16>(src, src_stride, height, dest, dst_stride); |
| } else if (width == 8) { |
| IntraBlockCopyVertical<8>(src, src_stride, height, dest, dst_stride); |
| } else { // width == 4 |
| uint16x4_t row = vld1_u16(src); |
| src += src_stride; |
| int y = height; |
| do { |
| const uint16x4_t below = vld1_u16(src); |
| src += src_stride; |
| vst1_u16(dest, vrhadd_u16(row, below)); |
| dest += dst_stride; |
| row = below; |
| } while (--y != 0); |
| } |
| } |
| |
| template <int width> |
| inline void IntraBlockCopy2D(const uint16_t* LIBGAV1_RESTRICT src, |
| const ptrdiff_t src_stride, const int height, |
| uint16_t* LIBGAV1_RESTRICT dst, |
| const ptrdiff_t dst_stride) { |
| const ptrdiff_t src_remainder_stride = src_stride - (width - 8); |
| const ptrdiff_t dst_remainder_stride = dst_stride - (width - 8); |
| uint16x8_t row[16]; |
| row[0] = vaddq_u16(vld1q_u16(src), vld1q_u16(src + 1)); |
| if (width >= 16) { |
| src += 8; |
| row[1] = vaddq_u16(vld1q_u16(src), vld1q_u16(src + 1)); |
| if (width >= 32) { |
| src += 8; |
| row[2] = vaddq_u16(vld1q_u16(src), vld1q_u16(src + 1)); |
| src += 8; |
| row[3] = vaddq_u16(vld1q_u16(src), vld1q_u16(src + 1)); |
| if (width >= 64) { |
| src += 8; |
| row[4] = vaddq_u16(vld1q_u16(src), vld1q_u16(src + 1)); |
| src += 8; |
| row[5] = vaddq_u16(vld1q_u16(src), vld1q_u16(src + 1)); |
| src += 8; |
| row[6] = vaddq_u16(vld1q_u16(src), vld1q_u16(src + 1)); |
| src += 8; |
| row[7] = vaddq_u16(vld1q_u16(src), vld1q_u16(src + 1)); |
| if (width == 128) { |
| src += 8; |
| row[8] = vaddq_u16(vld1q_u16(src), vld1q_u16(src + 1)); |
| src += 8; |
| row[9] = vaddq_u16(vld1q_u16(src), vld1q_u16(src + 1)); |
| src += 8; |
| row[10] = vaddq_u16(vld1q_u16(src), vld1q_u16(src + 1)); |
| src += 8; |
| row[11] = vaddq_u16(vld1q_u16(src), vld1q_u16(src + 1)); |
| src += 8; |
| row[12] = vaddq_u16(vld1q_u16(src), vld1q_u16(src + 1)); |
| src += 8; |
| row[13] = vaddq_u16(vld1q_u16(src), vld1q_u16(src + 1)); |
| src += 8; |
| row[14] = vaddq_u16(vld1q_u16(src), vld1q_u16(src + 1)); |
| src += 8; |
| row[15] = vaddq_u16(vld1q_u16(src), vld1q_u16(src + 1)); |
| } |
| } |
| } |
| } |
| src += src_remainder_stride; |
| |
| int y = height; |
| do { |
| const uint16x8_t below_0 = vaddq_u16(vld1q_u16(src), vld1q_u16(src + 1)); |
| vst1q_u16(dst, vrshrq_n_u16(vaddq_u16(row[0], below_0), 2)); |
| row[0] = below_0; |
| if (width >= 16) { |
| src += 8; |
| dst += 8; |
| |
| const uint16x8_t below_1 = vaddq_u16(vld1q_u16(src), vld1q_u16(src + 1)); |
| vst1q_u16(dst, vrshrq_n_u16(vaddq_u16(row[1], below_1), 2)); |
| row[1] = below_1; |
| if (width >= 32) { |
| src += 8; |
| dst += 8; |
| |
| const uint16x8_t below_2 = |
| vaddq_u16(vld1q_u16(src), vld1q_u16(src + 1)); |
| vst1q_u16(dst, vrshrq_n_u16(vaddq_u16(row[2], below_2), 2)); |
| row[2] = below_2; |
| src += 8; |
| dst += 8; |
| |
| const uint16x8_t below_3 = |
| vaddq_u16(vld1q_u16(src), vld1q_u16(src + 1)); |
| vst1q_u16(dst, vrshrq_n_u16(vaddq_u16(row[3], below_3), 2)); |
| row[3] = below_3; |
| if (width >= 64) { |
| src += 8; |
| dst += 8; |
| |
| const uint16x8_t below_4 = |
| vaddq_u16(vld1q_u16(src), vld1q_u16(src + 1)); |
| vst1q_u16(dst, vrshrq_n_u16(vaddq_u16(row[4], below_4), 2)); |
| row[4] = below_4; |
| src += 8; |
| dst += 8; |
| |
| const uint16x8_t below_5 = |
| vaddq_u16(vld1q_u16(src), vld1q_u16(src + 1)); |
| vst1q_u16(dst, vrshrq_n_u16(vaddq_u16(row[5], below_5), 2)); |
| row[5] = below_5; |
| src += 8; |
| dst += 8; |
| |
| const uint16x8_t below_6 = |
| vaddq_u16(vld1q_u16(src), vld1q_u16(src + 1)); |
| vst1q_u16(dst, vrshrq_n_u16(vaddq_u16(row[6], below_6), 2)); |
| row[6] = below_6; |
| src += 8; |
| dst += 8; |
| |
| const uint16x8_t below_7 = |
| vaddq_u16(vld1q_u16(src), vld1q_u16(src + 1)); |
| vst1q_u16(dst, vrshrq_n_u16(vaddq_u16(row[7], below_7), 2)); |
| row[7] = below_7; |
| if (width == 128) { |
| src += 8; |
| dst += 8; |
| |
| const uint16x8_t below_8 = |
| vaddq_u16(vld1q_u16(src), vld1q_u16(src + 1)); |
| vst1q_u16(dst, vrshrq_n_u16(vaddq_u16(row[8], below_8), 2)); |
| row[8] = below_8; |
| src += 8; |
| dst += 8; |
| |
| const uint16x8_t below_9 = |
| vaddq_u16(vld1q_u16(src), vld1q_u16(src + 1)); |
| vst1q_u16(dst, vrshrq_n_u16(vaddq_u16(row[9], below_9), 2)); |
| row[9] = below_9; |
| src += 8; |
| dst += 8; |
| |
| const uint16x8_t below_10 = |
| vaddq_u16(vld1q_u16(src), vld1q_u16(src + 1)); |
| vst1q_u16(dst, vrshrq_n_u16(vaddq_u16(row[10], below_10), 2)); |
| row[10] = below_10; |
| src += 8; |
| dst += 8; |
| |
| const uint16x8_t below_11 = |
| vaddq_u16(vld1q_u16(src), vld1q_u16(src + 1)); |
| vst1q_u16(dst, vrshrq_n_u16(vaddq_u16(row[11], below_11), 2)); |
| row[11] = below_11; |
| src += 8; |
| dst += 8; |
| |
| const uint16x8_t below_12 = |
| vaddq_u16(vld1q_u16(src), vld1q_u16(src + 1)); |
| vst1q_u16(dst, vrshrq_n_u16(vaddq_u16(row[12], below_12), 2)); |
| row[12] = below_12; |
| src += 8; |
| dst += 8; |
| |
| const uint16x8_t below_13 = |
| vaddq_u16(vld1q_u16(src), vld1q_u16(src + 1)); |
| vst1q_u16(dst, vrshrq_n_u16(vaddq_u16(row[13], below_13), 2)); |
| row[13] = below_13; |
| src += 8; |
| dst += 8; |
| |
| const uint16x8_t below_14 = |
| vaddq_u16(vld1q_u16(src), vld1q_u16(src + 1)); |
| vst1q_u16(dst, vrshrq_n_u16(vaddq_u16(row[14], below_14), 2)); |
| row[14] = below_14; |
| src += 8; |
| dst += 8; |
| |
| const uint16x8_t below_15 = |
| vaddq_u16(vld1q_u16(src), vld1q_u16(src + 1)); |
| vst1q_u16(dst, vrshrq_n_u16(vaddq_u16(row[15], below_15), 2)); |
| row[15] = below_15; |
| } |
| } |
| } |
| } |
| src += src_remainder_stride; |
| dst += dst_remainder_stride; |
| } while (--y != 0); |
| } |
| |
| void ConvolveIntraBlockCopy2D_NEON( |
| const void* LIBGAV1_RESTRICT const reference, |
| const ptrdiff_t reference_stride, const int /*horizontal_filter_index*/, |
| const int /*vertical_filter_index*/, const int /*horizontal_filter_id*/, |
| const int /*vertical_filter_id*/, const int width, const int height, |
| void* LIBGAV1_RESTRICT const prediction, const ptrdiff_t pred_stride) { |
| assert(width >= 4 && width <= kMaxSuperBlockSizeInPixels); |
| assert(height >= 4 && height <= kMaxSuperBlockSizeInPixels); |
| const auto* src = static_cast<const uint16_t*>(reference); |
| auto* dest = static_cast<uint16_t*>(prediction); |
| const ptrdiff_t src_stride = reference_stride >> 1; |
| const ptrdiff_t dst_stride = pred_stride >> 1; |
| |
  // Note: vertical access of |height| + 1 rows is allowed. Because this
  // function is only used for the u/v planes of intra block copy, such access
  // is guaranteed to be within the prediction block.
| |
| if (width == 128) { |
| IntraBlockCopy2D<128>(src, src_stride, height, dest, dst_stride); |
| } else if (width == 64) { |
| IntraBlockCopy2D<64>(src, src_stride, height, dest, dst_stride); |
| } else if (width == 32) { |
| IntraBlockCopy2D<32>(src, src_stride, height, dest, dst_stride); |
| } else if (width == 16) { |
| IntraBlockCopy2D<16>(src, src_stride, height, dest, dst_stride); |
| } else if (width == 8) { |
| IntraBlockCopy2D<8>(src, src_stride, height, dest, dst_stride); |
| } else { // width == 4 |
| uint16x4_t row0 = vadd_u16(vld1_u16(src), vld1_u16(src + 1)); |
| src += src_stride; |
| |
| int y = height; |
| do { |
| const uint16x4_t row1 = vadd_u16(vld1_u16(src), vld1_u16(src + 1)); |
| src += src_stride; |
| const uint16x4_t row2 = vadd_u16(vld1_u16(src), vld1_u16(src + 1)); |
| src += src_stride; |
| const uint16x4_t result_01 = vrshr_n_u16(vadd_u16(row0, row1), 2); |
| const uint16x4_t result_12 = vrshr_n_u16(vadd_u16(row1, row2), 2); |
| vst1_u16(dest, result_01); |
| dest += dst_stride; |
| vst1_u16(dest, result_12); |
| dest += dst_stride; |
| row0 = row2; |
| y -= 2; |
| } while (y != 0); |
| } |
| } |
| |
| // ----------------------------------------------------------------------------- |
| // Scaled Convolve |
| |
// There are many opportunities for overreading in scaled convolve, because the
// range of starting points for filter windows is anywhere from 0 to 16 for 8
// destination pixels, and the window sizes range from 2 to 8. To accommodate
// this range concisely, |grade_x| means the maximum number of whole source
// steps that can be traversed in a single |step_x| increment, i.e. 1 or 2.
// When grade_x is 2, we are guaranteed to exceed 8 whole steps in src for
// every 8 |step_x| increments. The first load covers the initial elements of
// src_x, while the final load covers the taps.
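// For example, with kScaleSubPixelBits == 10, step_x == 1280 advances
// 8 * 1280 = 10240 subpel units, i.e. 10 whole source pixels, per 8
// destination pixels, so the grade_x == 2 path applies; step_x <= 1024
// advances at most 8 whole source pixels per 8 destination pixels, so
// grade_x == 1 suffices.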
| template <int grade_x> |
| inline uint8x16x3_t LoadSrcVals(const uint16_t* const src_x) { |
| uint8x16x3_t ret; |
| // When fractional step size is less than or equal to 1, the rightmost |
| // starting value for a filter may be at position 7. For an 8-tap filter, the |
| // rightmost value for the final tap may be at position 14. Therefore we load |
| // 2 vectors of eight 16-bit values. |
| ret.val[0] = vreinterpretq_u8_u16(vld1q_u16(src_x)); |
| ret.val[1] = vreinterpretq_u8_u16(vld1q_u16(src_x + 8)); |
| if (grade_x > 1) { |
| // When fractional step size is greater than 1 (up to 2), the rightmost |
| // starting value for a filter may be at position 15. For an 8-tap filter, |
| // the rightmost value for the final tap may be at position 22. Therefore we |
| // load 3 vectors of eight 16-bit values. |
| ret.val[2] = vreinterpretq_u8_u16(vld1q_u16(src_x + 16)); |
| } |
| return ret; |
| } |
| |
// Assemble 4 values corresponding to one tap position across multiple filters.
// This is a simple case because the maximum offset is 8 and only the smaller
// filters operate on 4xH blocks.
| inline uint16x4_t PermuteSrcVals(const uint8x16x3_t src_bytes, |
| const uint8x8_t indices) { |
| const uint8x16x2_t src_bytes2 = {src_bytes.val[0], src_bytes.val[1]}; |
| return vreinterpret_u16_u8(VQTbl2U8(src_bytes2, indices)); |
| } |
| |
// Assemble 8 values corresponding to one tap position across multiple filters.
// This requires significant workarounds on A32 architectures, so it may be
// worth using an overall different algorithm there.
| template <int grade_x> |
| inline uint16x8_t PermuteSrcVals(const uint8x16x3_t src_bytes, |
| const uint8x16_t indices) { |
| if (grade_x == 1) { |
| const uint8x16x2_t src_bytes2 = {src_bytes.val[0], src_bytes.val[1]}; |
| return vreinterpretq_u16_u8(VQTbl2QU8(src_bytes2, indices)); |
| } |
| return vreinterpretq_u16_u8(VQTbl3QU8(src_bytes, indices)); |
| } |
| |
// Pre-transpose the 2 tap filters in |kAbsHalfSubPixelFilters|[3].
// Although the taps ultimately need to be 16-bit values, they must be arranged
// by table lookup, which is more expensive for larger types than widening
// in-loop after the lookup. |tap_index| refers to the index within a kernel
// applied to a single value.
| inline int8x16_t GetPositive2TapFilter(const int tap_index) { |
| assert(tap_index < 2); |
| alignas( |
| 16) static constexpr int8_t kAbsHalfSubPixel2TapFilterColumns[2][16] = { |
| {64, 60, 56, 52, 48, 44, 40, 36, 32, 28, 24, 20, 16, 12, 8, 4}, |
| {0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60}}; |
| |
| return vld1q_s8(kAbsHalfSubPixel2TapFilterColumns[tap_index]); |
| } |
| |
| template <int grade_x> |
| inline void ConvolveKernelHorizontal2Tap( |
| const uint8_t* LIBGAV1_RESTRICT const src, const ptrdiff_t src_stride, |
| const int width, const int subpixel_x, const int step_x, |
| const int intermediate_height, int16_t* LIBGAV1_RESTRICT intermediate) { |
| // Account for the 0-taps that precede the 2 nonzero taps in the spec. |
| const int kernel_offset = 3; |
| const int ref_x = subpixel_x >> kScaleSubPixelBits; |
| const int step_x8 = step_x << 3; |
| const int8x16_t filter_taps0 = GetPositive2TapFilter(0); |
| const int8x16_t filter_taps1 = GetPositive2TapFilter(1); |
| const uint16x8_t index_steps = vmulq_n_u16( |
| vmovl_u8(vcreate_u8(0x0706050403020100)), static_cast<uint16_t>(step_x)); |
| const uint8x8_t filter_index_mask = vdup_n_u8(kSubPixelMask); |
| |
| int p = subpixel_x; |
| if (width <= 4) { |
| const uint8_t* src_y = src; |
| // Only add steps to the 10-bit truncated p to avoid overflow. |
| const uint16x8_t p_fraction = vdupq_n_u16(p & 1023); |
| const uint16x8_t subpel_index_offsets = vaddq_u16(index_steps, p_fraction); |
| const uint8x8_t filter_indices = |
| vand_u8(vshrn_n_u16(subpel_index_offsets, 6), filter_index_mask); |
    // Each lane of taps[k] corresponds to one output value along the row,
    // containing kSubPixelFilters[filter_index][filter_id][k], where
    // filter_id depends on x.
| const int16x4_t taps[2] = { |
| vget_low_s16(vmovl_s8(VQTbl1S8(filter_taps0, filter_indices))), |
| vget_low_s16(vmovl_s8(VQTbl1S8(filter_taps1, filter_indices)))}; |
| // Lower byte of Nth value is at position 2*N. |
| // Narrowing shift is not available here because the maximum shift |
| // parameter is 8. |
| const uint8x8_t src_indices0 = vshl_n_u8( |
| vmovn_u16(vshrq_n_u16(subpel_index_offsets, kScaleSubPixelBits)), 1); |
| // Upper byte of Nth value is at position 2*N+1. |
| const uint8x8_t src_indices1 = vadd_u8(src_indices0, vdup_n_u8(1)); |
| // Only 4 values needed. |
| const uint8x8_t src_indices = InterleaveLow8(src_indices0, src_indices1); |
| const uint8x8_t src_lookup[2] = {src_indices, |
| vadd_u8(src_indices, vdup_n_u8(2))}; |
| |
| int y = intermediate_height; |
| do { |
| const uint16_t* src_x = reinterpret_cast<const uint16_t*>(src_y) + |
| (p >> kScaleSubPixelBits) - ref_x + kernel_offset; |
| // Load a pool of samples to select from using stepped indices. |
| const uint8x16x3_t src_bytes = LoadSrcVals<1>(src_x); |
| // Each lane corresponds to a different filter kernel. |
| const uint16x4_t src[2] = {PermuteSrcVals(src_bytes, src_lookup[0]), |
| PermuteSrcVals(src_bytes, src_lookup[1])}; |
| |
| vst1_s16(intermediate, |
| vrshrn_n_s32(SumOnePassTaps</*filter_index=*/3>(src, taps), |
| kInterRoundBitsHorizontal - 1)); |
| src_y += src_stride; |
| intermediate += kIntermediateStride; |
| } while (--y != 0); |
| return; |
| } |
| |
| // |width| >= 8 |
| int x = 0; |
| do { |
| const uint16_t* src_x = reinterpret_cast<const uint16_t*>(src) + |
| (p >> kScaleSubPixelBits) - ref_x + kernel_offset; |
| int16_t* intermediate_x = intermediate + x; |
| // Only add steps to the 10-bit truncated p to avoid overflow. |
| const uint16x8_t p_fraction = vdupq_n_u16(p & 1023); |
| const uint16x8_t subpel_index_offsets = vaddq_u16(index_steps, p_fraction); |
| const uint8x8_t filter_indices = |
| vand_u8(vshrn_n_u16(subpel_index_offsets, kFilterIndexShift), |
| filter_index_mask); |
    // Each lane of taps[k] corresponds to one output value along the row,
    // containing kSubPixelFilters[filter_index][filter_id][k], where
    // filter_id depends on x.
| const int16x8_t taps[2] = { |
| vmovl_s8(VQTbl1S8(filter_taps0, filter_indices)), |
| vmovl_s8(VQTbl1S8(filter_taps1, filter_indices))}; |
| const int16x4_t taps_low[2] = {vget_low_s16(taps[0]), |
| vget_low_s16(taps[1])}; |
| const int16x4_t taps_high[2] = {vget_high_s16(taps[0]), |
| vget_high_s16(taps[1])}; |
| // Lower byte of Nth value is at position 2*N. |
| const uint8x8_t src_indices0 = vshl_n_u8( |
| vmovn_u16(vshrq_n_u16(subpel_index_offsets, kScaleSubPixelBits)), 1); |
| // Upper byte of Nth value is at position 2*N+1. |
| const uint8x8_t src_indices1 = vadd_u8(src_indices0, vdup_n_u8(1)); |
| const uint8x8x2_t src_indices_zip = vzip_u8(src_indices0, src_indices1); |
| const uint8x16_t src_indices = |
| vcombine_u8(src_indices_zip.val[0], src_indices_zip.val[1]); |
| const uint8x16_t src_lookup[2] = {src_indices, |
| vaddq_u8(src_indices, vdupq_n_u8(2))}; |
| |
| int y = intermediate_height; |
| do { |
| // Load a pool of samples to select from using stepped indices. |
| const uint8x16x3_t src_bytes = LoadSrcVals<grade_x>(src_x); |
| // Each lane corresponds to a different filter kernel. |
| const uint16x8_t src[2] = { |
| PermuteSrcVals<grade_x>(src_bytes, src_lookup[0]), |
| PermuteSrcVals<grade_x>(src_bytes, src_lookup[1])}; |
| const uint16x4_t src_low[2] = {vget_low_u16(src[0]), |
| vget_low_u16(src[1])}; |
| const uint16x4_t src_high[2] = {vget_high_u16(src[0]), |
| vget_high_u16(src[1])}; |
| |
| vst1_s16(intermediate_x, vrshrn_n_s32(SumOnePassTaps</*filter_index=*/3>( |
| src_low, taps_low), |
| kInterRoundBitsHorizontal - 1)); |
| vst1_s16( |
| intermediate_x + 4, |
| vrshrn_n_s32(SumOnePassTaps</*filter_index=*/3>(src_high, taps_high), |
| kInterRoundBitsHorizontal - 1)); |
| // Avoid right shifting the stride. |
| src_x = reinterpret_cast<const uint16_t*>( |
| reinterpret_cast<const uint8_t*>(src_x) + src_stride); |
| intermediate_x += kIntermediateStride; |
| } while (--y != 0); |
| x += 8; |
| p += step_x8; |
| } while (x < width); |
| } |
| |
| // Pre-transpose the 4 tap filters in |kAbsHalfSubPixelFilters|[5]. |
| inline int8x16_t GetPositive4TapFilter(const int tap_index) { |
| assert(tap_index < 4); |
| alignas( |
| 16) static constexpr int8_t kSubPixel4TapPositiveFilterColumns[4][16] = { |
| {0, 15, 13, 11, 10, 9, 8, 7, 6, 6, 5, 4, 3, 2, 2, 1}, |
| {64, 31, 31, 31, 30, 29, 28, 27, 26, 24, 23, 22, 21, 20, 18, 17}, |
| {0, 17, 18, 20, 21, 22, 23, 24, 26, 27, 28, 29, 30, 31, 31, 31}, |
| {0, 1, 2, 2, 3, 4, 5, 6, 6, 7, 8, 9, 10, 11, 13, 15}}; |
| |
| return vld1q_s8(kSubPixel4TapPositiveFilterColumns[tap_index]); |
| } |
| |
| // This filter is only possible when width <= 4. |
| inline void ConvolveKernelHorizontalPositive4Tap( |
| const uint8_t* LIBGAV1_RESTRICT const src, const ptrdiff_t src_stride, |
| const int subpixel_x, const int step_x, const int intermediate_height, |
| int16_t* LIBGAV1_RESTRICT intermediate) { |
  // Account for the 0-taps that precede the 4 nonzero taps in the spec.
| const int kernel_offset = 2; |
| const int ref_x = subpixel_x >> kScaleSubPixelBits; |
| const int8x16_t filter_taps0 = GetPositive4TapFilter(0); |
| const int8x16_t filter_taps1 = GetPositive4TapFilter(1); |
| const int8x16_t filter_taps2 = GetPositive4TapFilter(2); |
| const int8x16_t filter_taps3 = GetPositive4TapFilter(3); |
| const uint16x8_t index_steps = vmulq_n_u16( |
| vmovl_u8(vcreate_u8(0x0706050403020100)), static_cast<uint16_t>(step_x)); |
| const uint8x8_t filter_index_mask = vdup_n_u8(kSubPixelMask); |
| |
| int p = subpixel_x; |
| // Only add steps to the 10-bit truncated p to avoid overflow. |
| const uint16x8_t p_fraction = vdupq_n_u16(p & 1023); |
| const uint16x8_t subpel_index_offsets = vaddq_u16(index_steps, p_fraction); |
| const uint8x8_t filter_indices = |
| vand_u8(vshrn_n_u16(subpel_index_offsets, 6), filter_index_mask); |
  // Each lane of taps[k] corresponds to one output value along the row,
  // containing kSubPixelFilters[filter_index][filter_id][k], where filter_id
  // depends on x.
| const int16x4_t taps[4] = { |
| vget_low_s16(vmovl_s8(VQTbl1S8(filter_taps0, filter_indices))), |
| vget_low_s16(vmovl_s8(VQTbl1S8(filter_taps1, filter_indices))), |
| vget_low_s16(vmovl_s8(VQTbl1S8(filter_taps2, filter_indices))), |
| vget_low_s16(vmovl_s8(VQTbl1S8(filter_taps3, filter_indices)))}; |
| // Lower byte of Nth value is at position 2*N. |
| // Narrowing shift is not available here because the maximum shift |
| // parameter is 8. |
| const uint8x8_t src_indices0 = vshl_n_u8( |
| vmovn_u16(vshrq_n_u16(subpel_index_offsets, kScaleSubPixelBits)), 1); |
| // Upper byte of Nth value is at position 2*N+1. |
| const uint8x8_t src_indices1 = vadd_u8(src_indices0, vdup_n_u8(1)); |
| // Only 4 values needed. |
| const uint8x8_t src_indices_base = InterleaveLow8(src_indices0, src_indices1); |
| |
| uint8x8_t src_lookup[4]; |
| const uint8x8_t two = vdup_n_u8(2); |
| src_lookup[0] = src_indices_base; |
| for (int i = 1; i < 4; ++i) { |
| src_lookup[i] = vadd_u8(src_lookup[i - 1], two); |
| } |
| |
| const uint16_t* src_y = reinterpret_cast<const uint16_t*>(src) + |
| (p >> kScaleSubPixelBits) - ref_x + kernel_offset; |
| int y = intermediate_height; |
| do { |
| // Load a pool of samples to select from using stepped indices. |
| const uint8x16x3_t src_bytes = LoadSrcVals<1>(src_y); |
| // Each lane corresponds to a different filter kernel. |
| const uint16x4_t src[4] = {PermuteSrcVals(src_bytes, src_lookup[0]), |
| PermuteSrcVals(src_bytes, src_lookup[1]), |
| PermuteSrcVals(src_bytes, src_lookup[2]), |
| PermuteSrcVals(src_bytes, src_lookup[3])}; |
| |
| vst1_s16(intermediate, |
| vrshrn_n_s32(SumOnePassTaps</*filter_index=*/5>(src, taps), |
| kInterRoundBitsHorizontal - 1)); |
| src_y = reinterpret_cast<const uint16_t*>( |
| reinterpret_cast<const uint8_t*>(src_y) + src_stride); |
| intermediate += kIntermediateStride; |
| } while (--y != 0); |
| } |
| |
| // Pre-transpose the 4 tap filters in |kAbsHalfSubPixelFilters|[4]. |
| inline int8x16_t GetSigned4TapFilter(const int tap_index) { |
| assert(tap_index < 4); |
| alignas(16) static constexpr int8_t |
| kAbsHalfSubPixel4TapSignedFilterColumns[4][16] = { |
| {-0, -2, -4, -5, -6, -6, -7, -6, -6, -5, -5, -5, -4, -3, -2, -1}, |
| {64, 63, 61, 58, 55, 51, 47, 42, 38, 33, 29, 24, 19, 14, 9, 4}, |
| {0, 4, 9, 14, 19, 24, 29, 33, 38, 42, 47, 51, 55, 58, 61, 63}, |
| {-0, -1, -2, -3, -4, -5, -5, -5, -6, -6, -7, -6, -6, -5, -4, -2}}; |
| |
| return vld1q_s8(kAbsHalfSubPixel4TapSignedFilterColumns[tap_index]); |
| } |
| |
| // This filter is only possible when width <= 4. |
| inline void ConvolveKernelHorizontalSigned4Tap( |
| const uint8_t* LIBGAV1_RESTRICT const src, const ptrdiff_t src_stride, |
| const int subpixel_x, const int step_x, const int intermediate_height, |
| int16_t* LIBGAV1_RESTRICT intermediate) { |
| const int kernel_offset = 2; |
| const int ref_x = subpixel_x >> kScaleSubPixelBits; |
| const uint8x8_t filter_index_mask = vdup_n_u8(kSubPixelMask); |
| const int8x16_t filter_taps0 = GetSigned4TapFilter(0); |
| const int8x16_t filter_taps1 = GetSigned4TapFilter(1); |
| const int8x16_t filter_taps2 = GetSigned4TapFilter(2); |
| const int8x16_t filter_taps3 = GetSigned4TapFilter(3); |
| const uint16x8_t index_steps = vmulq_n_u16( |
| vmovl_u8(vcreate_u8(0x0706050403020100)), static_cast<uint16_t>(step_x)); |
| |
| const int p = subpixel_x; |
| // Only add steps to the 10-bit truncated p to avoid overflow. |
| const uint16x8_t p_fraction = vdupq_n_u16(p & 1023); |
| const uint16x8_t subpel_index_offsets = vaddq_u16(index_steps, p_fraction); |
| const uint8x8_t filter_indices = |
| vand_u8(vshrn_n_u16(subpel_index_offsets, 6), filter_index_mask); |
  // Each lane of taps[k] corresponds to one output value along the row,
  // containing kSubPixelFilters[filter_index][filter_id][k], where filter_id
  // depends on x.
| const int16x4_t taps[4] = { |
| vget_low_s16(vmovl_s8(VQTbl1S8(filter_taps0, filter_indices))), |
| vget_low_s16(vmovl_s8(VQTbl1S8(filter_taps1, filter_indices))), |
| vget_low_s16(vmovl_s8(VQTbl1S8(filter_taps2, filter_indices))), |
| vget_low_s16(vmovl_s8(VQTbl1S8(filter_taps3, filter_indices)))}; |
| // Lower byte of Nth value is at position 2*N. |
| // Narrowing shift is not available here because the maximum shift |
| // parameter is 8. |
| const uint8x8_t src_indices0 = vshl_n_u8( |
| vmovn_u16(vshrq_n_u16(subpel_index_offsets, kScaleSubPixelBits)), 1); |
| // Upper byte of Nth value is at position 2*N+1. |
| const uint8x8_t src_indices1 = vadd_u8(src_indices0, vdup_n_u8(1)); |
| // Only 4 values needed. |
| const uint8x8_t src_indices_base = InterleaveLow8(src_indices0, src_indices1); |
| |
| uint8x8_t src_lookup[4]; |
| const uint8x8_t two = vdup_n_u8(2); |
| src_lookup[0] = src_indices_base; |
| for (int i = 1; i < 4; ++i) { |
| src_lookup[i] = vadd_u8(src_lookup[i - 1], two); |
| } |
| |
| const uint16_t* src_y = reinterpret_cast<const uint16_t*>(src) + |
| (p >> kScaleSubPixelBits) - ref_x + kernel_offset; |
| int y = intermediate_height; |
| do { |
| // Load a pool of samples to select from using stepped indices. |
| const uint8x16x3_t src_bytes = LoadSrcVals<1>(src_y); |
| // Each lane corresponds to a different filter kernel. |
| const uint16x4_t src[4] = {PermuteSrcVals(src_bytes, src_lookup[0]), |
| PermuteSrcVals(src_bytes, src_lookup[1]), |
| PermuteSrcVals(src_bytes, src_lookup[2]), |
| PermuteSrcVals(src_bytes, src_lookup[3])}; |
| |
| vst1_s16(intermediate, |
| vrshrn_n_s32(SumOnePassTaps</*filter_index=*/4>(src, taps), |
| kInterRoundBitsHorizontal - 1)); |
| src_y = reinterpret_cast<const uint16_t*>( |
| reinterpret_cast<const uint8_t*>(src_y) + src_stride); |
| intermediate += kIntermediateStride; |
| } while (--y != 0); |
| } |
| |
| // Pre-transpose the 6 tap filters in |kAbsHalfSubPixelFilters|[0]. |
| inline int8x16_t GetSigned6TapFilter(const int tap_index) { |
| assert(tap_index < 6); |
| alignas(16) static constexpr int8_t |
| kAbsHalfSubPixel6TapSignedFilterColumns[6][16] = { |
| {0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0}, |
| {-0, -3, -5, -6, -7, -7, -8, -7, -7, -6, -6, -6, -5, -4, -2, -1}, |
| {64, 63, 61, 58, 55, 51, 47, 42, 38, 33, 29, 24, 19, 14, 9, 4}, |
| {0, 4, 9, 14, 19, 24, 29, 33, 38, 42, 47, 51, 55, 58, 61, 63}, |
| {-0, -1, -2, -4, -5, -6, -6, -6, -7, -7, -8, -7, -7, -6, -5, -3}, |
| {0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}}; |
| |
| return vld1q_s8(kAbsHalfSubPixel6TapSignedFilterColumns[tap_index]); |
| } |
| |
| // This filter is only possible when width >= 8. |
| template <int grade_x> |
| inline void ConvolveKernelHorizontalSigned6Tap( |
| const uint8_t* LIBGAV1_RESTRICT const src, const ptrdiff_t src_stride, |
| const int width, const int subpixel_x, const int step_x, |
| const int intermediate_height, |
| int16_t* LIBGAV1_RESTRICT const intermediate) { |
| const int kernel_offset = 1; |
| const uint8x8_t filter_index_mask = vdup_n_u8(kSubPixelMask); |
| const int ref_x = subpixel_x >> kScaleSubPixelBits; |
| const int step_x8 = step_x << 3; |
| int8x16_t filter_taps[6]; |
| for (int i = 0; i < 6; ++i) { |
| filter_taps[i] = GetSigned6TapFilter(i); |
| } |
| const uint16x8_t index_steps = vmulq_n_u16( |
| vmovl_u8(vcreate_u8(0x0706050403020100)), static_cast<uint16_t>(step_x)); |
| |
| int x = 0; |
| int p = subpixel_x; |
| do { |
| const uint16_t* src_x = reinterpret_cast<const uint16_t*>(src) + |
| (p >> kScaleSubPixelBits) - ref_x + kernel_offset; |
| int16_t* intermediate_x = intermediate + x; |
| // Only add steps to the 10-bit truncated p to avoid overflow. |
| const uint16x8_t p_fraction = vdupq_n_u16(p & 1023); |
| const uint16x8_t subpel_index_offsets = vaddq_u16(index_steps, p_fraction); |
| const uint8x8_t filter_indices = |
| vand_u8(vshrn_n_u16(subpel_index_offsets, kFilterIndexShift), |
| filter_index_mask); |
| |
    // Each lane of taps_(low|high)[k] corresponds to one output value along
    // the row, containing kSubPixelFilters[filter_index][filter_id][k], where
    // filter_id depends on x.
| int16x4_t taps_low[6]; |
| int16x4_t taps_high[6]; |
| for (int i = 0; i < 6; ++i) { |
| const int16x8_t taps_i = |
| vmovl_s8(VQTbl1S8(filter_taps[i], filter_indices)); |
| taps_low[i] = vget_low_s16(taps_i); |
| taps_high[i] = vget_high_s16(taps_i); |
| } |
| |
| // Lower byte of Nth value is at position 2*N. |
| const uint8x8_t src_indices0 = vshl_n_u8( |
| vmovn_u16(vshrq_n_u16(subpel_index_offsets, kScaleSubPixelBits)), 1); |
| // Upper byte of Nth value is at position 2*N+1. |
| const uint8x8_t src_indices1 = vadd_u8(src_indices0, vdup_n_u8(1)); |
| const uint8x8x2_t src_indices_zip = vzip_u8(src_indices0, src_indices1); |
| const uint8x16_t src_indices_base = |
| vcombine_u8(src_indices_zip.val[0], src_indices_zip.val[1]); |
| |
| uint8x16_t src_lookup[6]; |
| const uint8x16_t two = vdupq_n_u8(2); |
| src_lookup[0] = src_indices_base; |
| for (int i = 1; i < 6; ++i) { |
| src_lookup[i] = vaddq_u8(src_lookup[i - 1], two); |
| } |
| |
| int y = intermediate_height; |
| do { |
| // Load a pool of samples to select from using stepped indices. |
| const uint8x16x3_t src_bytes = LoadSrcVals<grade_x>(src_x); |
| |
| uint16x4_t src_low[6]; |
| uint16x4_t src_high[6]; |
| for (int i = 0; i < 6; ++i) { |
| const uint16x8_t src_i = |
| PermuteSrcVals<grade_x>(src_bytes, src_lookup[i]); |
| src_low[i] = vget_low_u16(src_i); |
| src_high[i] = vget_high_u16(src_i); |
| } |
| |
| vst1_s16(intermediate_x, vrshrn_n_s32(SumOnePassTaps</*filter_index=*/0>( |
| src_low, taps_low), |
| kInterRoundBitsHorizontal - 1)); |
| vst1_s16( |
| intermediate_x + 4, |
| vrshrn_n_s32(SumOnePassTaps</*filter_index=*/0>(src_high, taps_high), |
| kInterRoundBitsHorizontal - 1)); |
| // Avoid right shifting the stride. |
| src_x = reinterpret_cast<const uint16_t*>( |
| reinterpret_cast<const uint8_t*>(src_x) + src_stride); |
| intermediate_x += kIntermediateStride; |
| } while (--y != 0); |
| x += 8; |
| p += step_x8; |
| } while (x < width); |
| } |
| |
| // Pre-transpose the 6 tap filters in |kAbsHalfSubPixelFilters|[1]. This filter |
| // has mixed positive and negative outer taps depending on the filter id. |
| inline int8x16_t GetMixed6TapFilter(const int tap_index) { |
| assert(tap_index < 6); |
| alignas(16) static constexpr int8_t |
| kAbsHalfSubPixel6TapMixedFilterColumns[6][16] = { |
| {0, 1, 0, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, 0, 0, 0}, |
| {0, 14, 13, 11, 10, 9, 8, 8, 7, 6, 5, 4, 3, 2, 2, 1}, |
| {64, 31, 31, 31, 30, 29, 28, 27, 26, 24, 23, 22, 21, 20, 18, 17}, |
| {0, 17, 18, 20, 21, 22, 23, 24, 26, 27, 28, 29, 30, 31, 31, 31}, |
| {0, 1, 2, 2, 3, 4, 5, 6, 7, 8, 8, 9, 10, 11, 13, 14}, |
| {0, 0, 0, 0, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, 0, 1}}; |
| |
| return vld1q_s8(kAbsHalfSubPixel6TapMixedFilterColumns[tap_index]); |
| } |
| |
| // This filter is only possible when width >= 8. |
| template <int grade_x> |
| inline void ConvolveKernelHorizontalMixed6Tap( |
| const uint8_t* LIBGAV1_RESTRICT const src, const ptrdiff_t src_stride, |
| const int width, const int subpixel_x, const int step_x, |
| const int intermediate_height, |
| int16_t* LIBGAV1_RESTRICT const intermediate) { |
| const int kernel_offset = 1; |
| const uint8x8_t filter_index_mask = vdup_n_u8(kSubPixelMask); |
| const int ref_x = subpixel_x >> kScaleSubPixelBits; |
| const int step_x8 = step_x << 3; |
| int8x16_t filter_taps[6]; |
| for (int i = 0; i < 6; ++i) { |
| filter_taps[i] = GetMixed6TapFilter(i); |
| } |
| const uint16x8_t index_steps = vmulq_n_u16( |
| vmovl_u8(vcreate_u8(0x0706050403020100)), static_cast<uint16_t>(step_x)); |
| |
| int x = 0; |
| int p = subpixel_x; |
| do { |
| const uint16_t* src_x = reinterpret_cast<const uint16_t*>(src) + |
| (p >> kScaleSubPixelBits) - ref_x + kernel_offset; |
| int16_t* intermediate_x = intermediate + x; |
| // Only add steps to the 10-bit truncated p to avoid overflow. |
| const uint16x8_t p_fraction = vdupq_n_u16(p & 1023); |
| const uint16x8_t subpel_index_offsets = vaddq_u16(index_steps, p_fraction); |
| |
| const uint8x8_t filter_indices = |
| vand_u8(vshrn_n_u16(subpel_index_offsets, kFilterIndexShift), |
| filter_index_mask); |
    // Each lane of taps_(low|high)[k] corresponds to one output value along
    // the row, containing kSubPixelFilters[filter_index][filter_id][k], where
    // filter_id depends on x.
| int16x4_t taps_low[6]; |
| int16x4_t taps_high[6]; |
| for (int i = 0; i < 6; ++i) { |
| const int16x8_t taps = vmovl_s8(VQTbl1S8(filter_taps[i], filter_indices)); |
| taps_low[i] = vget_low_s16(taps); |
| taps_high[i] = vget_high_s16(taps); |
| } |
| |
| // Lower byte of Nth value is at position 2*N. |
| const uint8x8_t src_indices0 = vshl_n_u8( |
| vmovn_u16(vshrq_n_u16(subpel_index_offsets, kScaleSubPixelBits)), 1); |
| // Upper byte of Nth value is at position 2*N+1. |
| const uint8x8_t src_indices1 = vadd_u8(src_indices0, vdup_n_u8(1)); |
| const uint8x8x2_t src_indices_zip = vzip_u8(src_indices0, src_indices1); |
| const uint8x16_t src_indices_base = |
| vcombine_u8(src_indices_zip.val[0], src_indices_zip.val[1]); |
| |
| uint8x16_t src_lookup[6]; |
| const uint8x16_t two = vdupq_n_u8(2); |
| src_lookup[0] = src_indices_base; |
| for (int i = 1; i < 6; ++i) { |
| src_lookup[i] = vaddq_u8(src_lookup[i - 1], two); |
| } |
| |
| int y = intermediate_height; |
| do { |
| // Load a pool of samples to select from using stepped indices. |
| const uint8x16x3_t src_bytes = LoadSrcVals<grade_x>(src_x); |
| |
| uint16x4_t src_low[6]; |
| uint16x4_t src_high[6]; |
| for (int i = 0; i < 6; ++i) { |
| const uint16x8_t src_i = |
| PermuteSrcVals<grade_x>(src_bytes, src_lookup[i]); |
| src_low[i] = vget_low_u16(src_i); |
| src_high[i] = vget_high_u16(src_i); |
| } |
| |
| vst1_s16(intermediate_x, vrshrn_n_s32(SumOnePassTaps</*filter_index=*/0>( |
| src_low, taps_low), |
| kInterRoundBitsHorizontal - 1)); |
| vst1_s16( |
| intermediate_x + 4, |
| vrshrn_n_s32(SumOnePassTaps</*filter_index=*/0>(src_high, taps_high), |
| kInterRoundBitsHorizontal - 1)); |
| // Avoid right shifting the stride. |
| src_x = reinterpret_cast<const uint16_t*>( |
| reinterpret_cast<const uint8_t*>(src_x) + src_stride); |
| intermediate_x += kIntermediateStride; |
| } while (--y != 0); |
| x += 8; |
| p += step_x8; |
| } while (x < width); |
| } |
| |
| // Pre-transpose the 8 tap filters in |kAbsHalfSubPixelFilters|[2]. |
| inline int8x16_t GetSigned8TapFilter(const int tap_index) { |
| assert(tap_index < 8); |
| alignas(16) static constexpr int8_t |
| kAbsHalfSubPixel8TapSignedFilterColumns[8][16] = { |
| {-0, -1, -1, -1, -2, -2, -2, -2, -2, -1, -1, -1, -1, -1, -1, -0}, |
| {0, 1, 3, 4, 5, 5, 5, 5, 6, 5, 4, 4, 3, 3, 2, 1}, |
| {-0, -3, -6, -9, -11, -11, -12, -12, -12, -11, -10, -9, -7, -5, -3, |
| -1}, |
| {64, 63, 62, 60, 58, 54, 50, 45, 40, 35, 30, 24, 19, 13, 8, 4}, |
| {0, 4, 8, 13, 19, 24, 30, 35, 40, 45, 50, 54, 58, 60, 62, 63}, |
| {-0, -1, -3, -5, -7, -9, -10, -11, -12, -12, -12, -11, -11, -9, -6, |
| -3}, |
| {0, 1, 2, 3, 3, 4, 4, 5, 6, 5, 5, 5, 5, 4, 3, 1}, |
| {-0, -0, -1, -1, -1, -1, -1, -1, -2, -2, -2, -2, -2, -1, -1, -1}}; |
| |
| return vld1q_s8(kAbsHalfSubPixel8TapSignedFilterColumns[tap_index]); |
| } |
| |
| // This filter is only possible when width >= 8. |
| template <int grade_x> |
| inline void ConvolveKernelHorizontalSigned8Tap( |
| const uint8_t* LIBGAV1_RESTRICT const src, const ptrdiff_t src_stride, |
| const int width, const int subpixel_x, const int step_x, |
| const int intermediate_height, |
| int16_t* LIBGAV1_RESTRICT const intermediate) { |
| const uint8x8_t filter_index_mask = vdup_n_u8(kSubPixelMask); |
| const int ref_x = subpixel_x >> kScaleSubPixelBits; |
| const int step_x8 = step_x << 3; |
| int8x16_t filter_taps[8]; |
| for (int i = 0; i < 8; ++i) { |
| filter_taps[i] = GetSigned8TapFilter(i); |
| } |
| const uint16x8_t index_steps = vmulq_n_u16( |
| vmovl_u8(vcreate_u8(0x0706050403020100)), static_cast<uint16_t>(step_x)); |
| int x = 0; |
| int p = subpixel_x; |
| do { |
| const uint16_t* src_x = reinterpret_cast<const uint16_t*>(src) + |
| (p >> kScaleSubPixelBits) - ref_x; |
| int16_t* intermediate_x = intermediate + x; |
| // Only add steps to the 10-bit truncated p to avoid overflow. |
| const uint16x8_t p_fraction = vdupq_n_u16(p & 1023); |
| const uint16x8_t subpel_index_offsets = vaddq_u16(index_steps, p_fraction); |
| |
| const uint8x8_t filter_indices = |
| vand_u8(vshrn_n_u16(subpel_index_offsets, kFilterIndexShift), |
| filter_index_mask); |
| |
| // Lower byte of Nth value is at position 2*N. |
| const uint8x8_t src_indices0 = vshl_n_u8( |
| vmovn_u16(vshrq_n_u16(subpel_index_offsets, kScaleSubPixelBits)), 1); |
| // Upper byte of Nth value is at position 2*N+1. |
| const uint8x8_t src_indices1 = vadd_u8(src_indices0, vdup_n_u8(1)); |
| const uint8x8x2_t src_indices_zip = vzip_u8(src_indices0, src_indices1); |
| const uint8x16_t src_indices_base = |
| vcombine_u8(src_indices_zip.val[0], src_indices_zip.val[1]); |
| |
| uint8x16_t src_lookup[8]; |
| const uint8x16_t two = vdupq_n_u8(2); |
| src_lookup[0] = src_indices_base; |
| for (int i = 1; i < 8; ++i) { |
| src_lookup[i] = vaddq_u8(src_lookup[i - 1], two); |
| } |
    // Each lane of taps_(low|high)[k] corresponds to one output value along
    // the row, containing kSubPixelFilters[filter_index][filter_id][k], where
    // filter_id depends on x.
| int16x4_t taps_low[8]; |
| int16x4_t taps_high[8]; |
| for (int i = 0; i < 8; ++i) { |
| const int16x8_t taps = vmovl_s8(VQTbl1S8(filter_taps[i], filter_indices)); |
| taps_low[i] = vget_low_s16(taps); |
| taps_high[i] = vget_high_s16(taps); |
| } |
| |
| int y = intermediate_height; |
| do { |
| // Load a pool of samples to select from using stepped indices. |
| const uint8x16x3_t src_bytes = LoadSrcVals<grade_x>(src_x); |
| |
| uint16x4_t src_low[8]; |
| uint16x4_t src_high[8]; |
| for (int i = 0; i < 8; ++i) { |
| const uint16x8_t src_i = |
| PermuteSrcVals<grade_x>(src_bytes, src_lookup[i]); |
| src_low[i] = vget_low_u16(src_i); |
| src_high[i] = vget_high_u16(src_i); |
| } |
| |
| vst1_s16(intermediate_x, vrshrn_n_s32(SumOnePassTaps</*filter_index=*/2>( |
| src_low, taps_low), |
| kInterRoundBitsHorizontal - 1)); |
| vst1_s16( |
| intermediate_x + 4, |
| vrshrn_n_s32(SumOnePassTaps</*filter_index=*/2>(src_high, taps_high), |
| kInterRoundBitsHorizontal - 1)); |
| // Avoid right shifting the stride. |
| src_x = reinterpret_cast<const uint16_t*>( |
| reinterpret_cast<const uint8_t*>(src_x) + src_stride); |
| intermediate_x += kIntermediateStride; |
| } while (--y != 0); |
| x += 8; |
| p += step_x8; |
| } while (x < width); |
| } |
| |
// Sum 16-bit inputs into 32-bit accumulators, then round and narrow back to
// 16 bits.
| template <int num_taps, bool is_compound> |
| inline int16x4_t Sum2DVerticalTaps4(const int16x4_t* const src, |
| const int16x8_t taps) { |
| const int16x4_t taps_lo = vget_low_s16(taps); |
| const int16x4_t taps_hi = vget_high_s16(taps); |
| int32x4_t sum; |
| if (num_taps == 8) { |
| sum = vmull_lane_s16(src[0], taps_lo, 0); |
| sum = vmlal_lane_s16(sum, src[1], taps_lo, 1); |
| sum = vmlal_lane_s16(sum, src[2], taps_lo, 2); |
| sum = vmlal_lane_s16(sum, src[3], taps_lo, 3); |
| sum = vmlal_lane_s16(sum, src[4], taps_hi, 0); |
| sum = vmlal_lane_s16(sum, src[5], taps_hi, 1); |
| sum = vmlal_lane_s16(sum, src[6], taps_hi, 2); |
| sum = vmlal_lane_s16(sum, src[7], taps_hi, 3); |
| } else if (num_taps == 6) { |
| sum = vmull_lane_s16(src[0], taps_lo, 1); |
| sum = vmlal_lane_s16(sum, src[1], taps_lo, 2); |
| sum = vmlal_lane_s16(sum, src[2], taps_lo, 3); |
| sum = vmlal_lane_s16(sum, src[3], taps_hi, 0); |
| sum = vmlal_lane_s16(sum, src[4], taps_hi, 1); |
| sum = vmlal_lane_s16(sum, src[5], taps_hi, 2); |
| } else if (num_taps == 4) { |
| sum = vmull_lane_s16(src[0], taps_lo, 2); |
| sum = vmlal_lane_s16(sum, src[1], taps_lo, 3); |
| sum = vmlal_lane_s16(sum, src[2], taps_hi, 0); |
| sum = vmlal_lane_s16(sum, src[3], taps_hi, 1); |
| } else if (num_taps == 2) { |
| sum = vmull_lane_s16(src[0], taps_lo, 3); |
| sum = vmlal_lane_s16(sum, src[1], taps_hi, 0); |
| } |
| |
| if (is_compound) { |
| return vrshrn_n_s32(sum, kInterRoundBitsCompoundVertical - 1); |
| } |
| |
| return vreinterpret_s16_u16(vqrshrun_n_s32(sum, kInterRoundBitsVertical - 1)); |
| } |
| |
| template <int num_taps, int grade_y, int width, bool is_compound> |
| void ConvolveVerticalScale2Or4xH(const int16_t* LIBGAV1_RESTRICT const src, |
| const int subpixel_y, const int filter_index, |
| const int step_y, const int height, |
| void* LIBGAV1_RESTRICT const dest, |
| const ptrdiff_t dest_stride) { |
| static_assert(width == 2 || width == 4, ""); |
| // We increment stride with the 8-bit pointer and then reinterpret to avoid |
| // shifting |dest_stride|. |
| auto* dest_y = static_cast<uint8_t*>(dest); |
| // In compound mode, |dest_stride| is based on the size of uint16_t, rather |
| // than bytes. |
| auto* compound_dest_y = static_cast<uint16_t*>(dest); |
| // This stride always corresponds to int16_t. |
| constexpr ptrdiff_t src_stride = kIntermediateStride; |
| const int16_t* src_y = src; |
| int16x4_t s[num_taps + grade_y]; |
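  // |s| holds |num_taps| source rows plus |grade_y| lookahead rows. Between
  // the two output rows of each iteration the filter window origin advances by
  // 0, 1 or (when grade_y == 2) 2 source rows, so the second output row reuses
  // the loaded values starting at s[p_diff] instead of reloading.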
| |
| int p = subpixel_y & 1023; |
| int prev_p = p; |
| int y = height; |
| do { |
| for (int i = 0; i < num_taps; ++i) { |
| s[i] = vld1_s16(src_y + i * src_stride); |
| } |
| int filter_id = (p >> 6) & kSubPixelMask; |
| int16x8_t filter = |
| vmovl_s8(vld1_s8(kHalfSubPixelFilters[filter_index][filter_id])); |
| int16x4_t sums = Sum2DVerticalTaps4<num_taps, is_compound>(s, filter); |
| if (is_compound) { |
| assert(width != 2); |
| // This offset potentially overflows into the sign bit, but should yield |
| // the correct unsigned value. |
| const uint16x4_t result = |
| vreinterpret_u16_s16(vadd_s16(sums, vdup_n_s16(kCompoundOffset))); |
| vst1_u16(compound_dest_y, result); |
| compound_dest_y += dest_stride; |
| } else { |
| const uint16x4_t result = vmin_u16(vreinterpret_u16_s16(sums), |
| vdup_n_u16((1 << kBitdepth10) - 1)); |
| if (width == 2) { |
| Store2<0>(reinterpret_cast<uint16_t*>(dest_y), result); |
| } else { |
| vst1_u16(reinterpret_cast<uint16_t*>(dest_y), result); |
| } |
| dest_y += dest_stride; |
| } |
| p += step_y; |
| const int p_diff = |
| (p >> kScaleSubPixelBits) - (prev_p >> kScaleSubPixelBits); |
| prev_p = p; |
| // Here we load extra source in case it is needed. If |p_diff| == 0, these |
| // values will be unused, but it's faster to load than to branch. |
| s[num_taps] = vld1_s16(src_y + num_taps * src_stride); |
| if (grade_y > 1) { |
| s[num_taps + 1] = vld1_s16(src_y + (num_taps + 1) * src_stride); |
| } |
| |
| filter_id = (p >> 6) & kSubPixelMask; |
| filter = vmovl_s8(vld1_s8(kHalfSubPixelFilters[filter_index][filter_id])); |
| sums = Sum2DVerticalTaps4<num_taps, is_compound>(&s[p_diff], filter); |
| if (is_compound) { |
| assert(width != 2); |
| const uint16x4_t result = |
| vreinterpret_u16_s16(vadd_s16(sums, vdup_n_s16(kCompoundOffset))); |
| vst1_u16(compound_dest_y, result); |
| compound_dest_y += dest_stride; |
| } else { |
| const uint16x4_t result = vmin_u16(vreinterpret_u16_s16(sums), |
| vdup_n_u16((1 << kBitdepth10) - 1)); |
| if (width == 2) { |
| Store2<0>(reinterpret_cast<uint16_t*>(dest_y), result); |
| } else { |
| vst1_u16(reinterpret_cast<uint16_t*>(dest_y), result); |
| } |
| dest_y += dest_stride; |
| } |
| p += step_y; |
| src_y = src + (p >> kScaleSubPixelBits) * src_stride; |
| prev_p = p; |
| y -= 2; |
| } while (y != 0); |
| } |
| |
| template <int num_taps, int grade_y, bool is_compound> |
| void ConvolveVerticalScale(const int16_t* LIBGAV1_RESTRICT const src, |
| const int width, const int subpixel_y, |
| const int filter_index, const int step_y, |
| const int height, void* LIBGAV1_RESTRICT const dest, |
| const ptrdiff_t dest_stride) { |
| // This stride always corresponds to int16_t. |
| constexpr ptrdiff_t src_stride = kIntermediateStride; |
| |
| int16x8_t s[num_taps + 2]; |
| |
| int x = 0; |
| do { |
| const int16_t* const src_x = src + x; |
| const int16_t* src_y = src_x; |
| int p = subpixel_y & 1023; |
| int prev_p = p; |
| int y = height; |
| // We increment stride with the 8-bit pointer and then reinterpret to avoid |
| // shifting |dest_stride|. |
| auto* dest_y = reinterpret_cast<uint8_t*>(static_cast<uint16_t*>(dest) + x); |
| // In compound mode, |dest_stride| is based on the size of uint16_t, rather |
| // than bytes. |
| auto* compound_dest_y = static_cast<uint16_t*>(dest) + x; |
| do { |
| for (int i = 0; i < num_taps; ++i) { |
| s[i] = vld1q_s16(src_y + i * src_stride); |
| } |
| int filter_id = (p >> 6) & kSubPixelMask; |
| int16x8_t filter = |
| vmovl_s8(vld1_s8(kHalfSubPixelFilters[filter_index][filter_id])); |
| int16x8_t sums = |
| SimpleSum2DVerticalTaps<num_taps, is_compound>(s, filter); |
| if (is_compound) { |
| // This offset potentially overflows int16_t, but should yield the |
| // correct unsigned value. |
| const uint16x8_t result = vreinterpretq_u16_s16( |
| vaddq_s16(sums, vdupq_n_s16(kCompoundOffset))); |
| vst1q_u16(compound_dest_y, result); |
| compound_dest_y += dest_stride; |
| } else { |
| const uint16x8_t result = vminq_u16( |
| vreinterpretq_u16_s16(sums), vdupq_n_u16((1 << kBitdepth10) - 1)); |
| vst1q_u16(reinterpret_cast<uint16_t*>(dest_y), result); |
| dest_y += dest_stride; |
| } |
| p += step_y; |
| const int p_diff = |
| (p >> kScaleSubPixelBits) - (prev_p >> kScaleSubPixelBits); |
| prev_p = p; |
| // Here we load extra source in case it is needed. If |p_diff| == 0, these |
| // values will be unused, but it's faster to load than to branch. |
| s[num_taps] = vld1q_s16(src_y + num_taps * src_stride); |
| if (grade_y > 1) { |
| s[num_taps + 1] = vld1q_s16(src_y + (num_taps + 1) * src_stride); |
| } |
| |
| filter_id = (p >> 6) & kSubPixelMask; |
| filter = vmovl_s8(vld1_s8(kHalfSubPixelFilters[filter_index][filter_id])); |
| sums = SimpleSum2DVerticalTaps<num_taps, is_compound>(&s[p_diff], filter); |
| if (is_compound) { |
| assert(width != 2); |
| const uint16x8_t result = vreinterpretq_u16_s16( |
| vaddq_s16(sums, vdupq_n_s16(kCompoundOffset))); |
| vst1q_u16(compound_dest_y, result); |
| compound_dest_y += dest_stride; |
| } else { |
| const uint16x8_t result = vminq_u16( |
| vreinterpretq_u16_s16(sums), vdupq_n_u16((1 << kBitdepth10) - 1)); |
| vst1q_u16(reinterpret_cast<uint16_t*>(dest_y), result); |
| dest_y += dest_stride; |
| } |
| p += step_y; |
| src_y = src_x + (p >> kScaleSubPixelBits) * src_stride; |
| prev_p = p; |
| |
| y -= 2; |
| } while (y != 0); |
| x += 8; |
| } while (x < width); |
| } |
| |
| template <bool is_compound> |
| void ConvolveScale2D_NEON(const void* LIBGAV1_RESTRICT const reference, |
| const ptrdiff_t reference_stride, |
| const int horizontal_filter_index, |
| const int vertical_filter_index, const int subpixel_x, |
| const int subpixel_y, const int step_x, |
| const int step_y, const int width, const int height, |
| void* LIBGAV1_RESTRICT const prediction, |
| const ptrdiff_t pred_stride) { |
| const int horiz_filter_index = GetFilterIndex(horizontal_filter_index, width); |
| const int vert_filter_index = GetFilterIndex(vertical_filter_index, height); |
| assert(step_x <= 2048); |
| assert(step_y <= 2048); |
| const int num_vert_taps = GetNumTapsInFilter(vert_filter_index); |
| const int intermediate_height = |
| (((height - 1) * step_y + (1 << kScaleSubPixelBits) - 1) >> |
| kScaleSubPixelBits) + |
| num_vert_taps; |
| int16_t intermediate_result[kMaxSuperBlockSizeInPixels * |
| (2 * kMaxSuperBlockSizeInPixels + 8)]; |
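  // A sizing sketch, assuming kIntermediateStride equals
  // kMaxSuperBlockSizeInPixels (128): the worst case is height == 128 with
  // step_y == 2048, giving ((127 * 2048 + 1023) >> 10) + 8 = 262 intermediate
  // rows, which fits in the 2 * 128 + 8 = 264 rows allocated above.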
| // Horizontal filter. |
| // Filter types used for width <= 4 are different from those for width > 4. |
| // When width > 4, the valid filter index range is always [0, 3]. |
| // When width <= 4, the valid filter index range is always [3, 5]. |
| // The same applies to height and vertical filter index. |
  int filter_index = horiz_filter_index;
| int16_t* intermediate = intermediate_result; |
| const ptrdiff_t src_stride = reference_stride; |
| const auto* src = static_cast<const uint8_t*>(reference); |
| const int vert_kernel_offset = (8 - num_vert_taps) / 2; |
| src += vert_kernel_offset * src_stride; |
| |
  // Derive the maximum value of |step_x| at which all source values fit in the
  // two guaranteed 8-value loads (16 values): the final source index,
  // src_x + |num_taps| - 1, must be less than 16. step_x * 7 is the final base
  // subpel index for the shuffle mask for filter inputs in each iteration on
  // large blocks. When step_x is large, we need a larger structure and use a
  // larger table lookup in order to gather all filter inputs.
| const int num_horiz_taps = GetNumTapsInFilter(horiz_filter_index); |
| // |num_taps| - 1 is the shuffle index of the final filter input. |
| const int kernel_start_ceiling = 16 - num_horiz_taps; |
  // This truncated quotient |grade_x_threshold| selects |step_x| such that
  // (step_x * 7) >> kScaleSubPixelBits < |kernel_start_ceiling|.
| const int grade_x_threshold = |
| (kernel_start_ceiling << kScaleSubPixelBits) / 7; |
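  // For example, with an 8-tap horizontal kernel, kernel_start_ceiling is 8
  // and grade_x_threshold is (8 << 10) / 7 = 1170, so any |step_x| above 1170
  // is (conservatively, because of the truncation) routed to the grade_x == 2
  // paths below, which use the wider 3-vector table lookup.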
| |
| switch (filter_index) { |
| case 0: |
| if (step_x > grade_x_threshold) { |
| ConvolveKernelHorizontalSigned6Tap<2>( |
| src, src_stride, width, subpixel_x, step_x, intermediate_height, |
| intermediate); |
| } else { |
| ConvolveKernelHorizontalSigned6Tap<1>( |
| src, src_stride, width, subpixel_x, step_x, intermediate_height, |
| intermediate); |
| } |
| break; |
| case 1: |
| if (step_x > grade_x_threshold) { |
| ConvolveKernelHorizontalMixed6Tap<2>(src, src_stride, width, subpixel_x, |
| step_x, intermediate_height, |
| intermediate); |
| |
| } else { |
| ConvolveKernelHorizontalMixed6Tap<1>(src, src_stride, width, subpixel_x, |
| step_x, intermediate_height, |
| intermediate); |
| } |
| break; |
| case 2: |
| if (step_x > grade_x_threshold) { |
| ConvolveKernelHorizontalSigned8Tap<2>( |
| src, src_stride, width, subpixel_x, step_x, intermediate_height, |
| intermediate); |
| } else { |
| ConvolveKernelHorizontalSigned8Tap<1>( |
| src, src_stride, width, subpixel_x, step_x, intermediate_height, |
| intermediate); |
| } |
| break; |
| case 3: |
| if (step_x > grade_x_threshold) { |
| ConvolveKernelHorizontal2Tap<2>(src, src_stride, width, subpixel_x, |
| step_x, intermediate_height, |
| intermediate); |
| } else { |
| ConvolveKernelHorizontal2Tap<1>(src, src_stride, width, subpixel_x, |
| step_x, intermediate_height, |
| intermediate); |
| } |
| break; |
| case 4: |
| assert(width <= 4); |
| ConvolveKernelHorizontalSigned4Tap(src, src_stride, subpixel_x, step_x, |
| intermediate_height, intermediate); |
| break; |
| default: |
| assert(filter_index == 5); |
| ConvolveKernelHorizontalPositive4Tap(src, src_stride, subpixel_x, step_x, |
| intermediate_height, intermediate); |
| } |
| |
| // Vertical filter. |
  filter_index = vert_filter_index;
| intermediate = intermediate_result; |
| switch (filter_index) { |
| case 0: |
| case 1: |
| if (step_y <= 1024) { |
| if (!is_compound && width == 2) { |
| ConvolveVerticalScale2Or4xH<6, 1, 2, is_compound>( |
| intermediate, subpixel_y, filter_index, step_y, height, |
| prediction, pred_stride); |
| } else if (width == 4) { |
| ConvolveVerticalScale2Or4xH<6, 1, 4, is_compound>( |
| intermediate, subpixel_y, filter_index, step_y, height, |
| prediction, pred_stride); |
| } else { |
| ConvolveVerticalScale<6, 1, is_compound>( |
| intermediate, width, subpixel_y, filter_index, step_y, height, |
| prediction, pred_stride); |
| } |
| } else { |
| if (!is_compound && width == 2) { |
| ConvolveVerticalScale2Or4xH<6, 2, 2, is_compound>( |
| intermediate, subpixel_y, filter_index, step_y, height, |
| prediction, pred_stride); |
| } else if (width == 4) { |
| ConvolveVerticalScale2Or4xH<6, 2, 4, is_compound>( |
| intermediate, subpixel_y, filter_index, step_y, height, |
| prediction, pred_stride); |
| } else { |
| ConvolveVerticalScale<6, 2, is_compound>( |
| intermediate, width, subpixel_y, filter_index, step_y, height, |
| prediction, pred_stride); |
| } |
| } |
| break; |
| case 2: |
| if (step_y <= 1024) { |
| if (!is_compound && width == 2) { |
| ConvolveVerticalScale2Or4xH<8, 1, 2, is_compound>( |
| intermediate, subpixel_y, filter_index, step_y, height, |
| prediction, pred_stride); |
| } else if (width == 4) { |
| ConvolveVerticalScale2Or4xH<8, 1, 4, is_compound>( |
| intermediate, subpixel_y, filter_index, step_y, height, |
| prediction, pred_stride); |
| } else { |
| ConvolveVerticalScale<8, 1, is_compound>( |
| intermediate, width, subpixel_y, filter_index, step_y, height, |
| prediction, pred_stride); |
| } |
| } else { |
| if (!is_compound && width == 2) { |
| ConvolveVerticalScale2Or4xH<8, 2, 2, is_compound>( |
| intermediate, subpixel_y, filter_index, step_y, height, |
| prediction, pred_stride); |
| } else if (width == 4) { |
| ConvolveVerticalScale2Or4xH<8, 2, 4, is_compound>( |
| intermediate, subpixel_y, filter_index, step_y, height, |
| prediction, pred_stride); |
| } else { |
| ConvolveVerticalScale<8, 2, is_compound>( |
| intermediate, width, subpixel_y, filter_index, step_y, height, |
| prediction, pred_stride); |
| } |
| } |
| break; |
| case 3: |
| if (step_y <= 1024) { |
| if (!is_compound && width == 2) { |
| ConvolveVerticalScale2Or4xH<2, 1, 2, is_compound>( |
| intermediate, subpixel_y, filter_index, step_y, height, |
| prediction, pred_stride); |
| } else if (width == 4) { |
| ConvolveVerticalScale2Or4xH<2, 1, 4, is_compound>( |
| intermediate, subpixel_y, filter_index, step_y, height, |
| prediction, pred_stride); |
| } else { |
| ConvolveVerticalScale<2, 1, is_compound>( |
| intermediate, width, subpixel_y, filter_index, step_y, height, |
| prediction, pred_stride); |
| } |
| } else { |
| if (!is_compound && width == 2) { |
| ConvolveVerticalScale2Or4xH<2, 2, 2, is_compound>( |
| intermediate, subpixel_y, filter_index, step_y, height, |
| prediction, pred_stride); |
| } else if (width == 4) { |
| ConvolveVerticalScale2Or4xH<2, 2, 4, is_compound>( |
| intermediate, subpixel_y, filter_index, step_y, height, |
| prediction, pred_stride); |
| } else { |
| ConvolveVerticalScale<2, 2, is_compound>( |
| intermediate, width, subpixel_y, filter_index, step_y, height, |
| prediction, pred_stride); |
| } |
| } |
| break; |
| default: |
| assert(filter_index == 4 || filter_index == 5); |
| assert(height <= 4); |
| if (step_y <= 1024) { |
| if (!is_compound && width == 2) { |
| ConvolveVerticalScale2Or4xH<4, 1, 2, is_compound>( |
| intermediate, subpixel_y, filter_index, step_y, height, |
| prediction, pred_stride); |
| } else if (width == 4) { |
| ConvolveVerticalScale2Or4xH<4, 1, 4, is_compound>( |
| intermediate, subpixel_y, filter_index, step_y, height, |
| prediction, pred_stride); |
| } else { |
| ConvolveVerticalScale<4, 1, is_compound>( |
| intermediate, width, subpixel_y, filter_index, step_y, height, |
| prediction, pred_stride); |
| } |
| } else { |
| if (!is_compound && width == 2) { |
| ConvolveVerticalScale2Or4xH<4, 2, 2, is_compound>( |
| intermediate, subpixel_y, filter_index, step_y, height, |
| prediction, pred_stride); |
| } else if (width == 4) { |
| ConvolveVerticalScale2Or4xH<4, 2, 4, is_compound>( |
| intermediate, subpixel_y, filter_index, step_y, height, |
| prediction, pred_stride); |
| } else { |
| ConvolveVerticalScale<4, 2, is_compound>( |
| intermediate, width, subpixel_y, filter_index, step_y, height, |
| prediction, pred_stride); |
| } |
| } |
| } |
| } |
| |
| void Init10bpp() { |
| Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10); |
| assert(dsp != nullptr); |
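  // The convolve table is indexed as [is_intra_block_copy][is_compound]
  // [has_vertical_filter][has_horizontal_filter]; e.g. [0][0][1][1] is the
  // single-reference 2D filter and [1][0][1][1] is the intra block copy 2D
  // filter.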
| dsp->convolve[0][0][0][1] = ConvolveHorizontal_NEON; |
| dsp->convolve[0][0][1][0] = ConvolveVertical_NEON; |
| dsp->convolve[0][0][1][1] = Convolve2D_NEON; |
| |
| dsp->convolve[0][1][0][0] = ConvolveCompoundCopy_NEON; |
| dsp->convolve[0][1][0][1] = ConvolveCompoundHorizontal_NEON; |
| dsp->convolve[0][1][1][0] = ConvolveCompoundVertical_NEON; |
| dsp->convolve[0][1][1][1] = ConvolveCompound2D_NEON; |
| |
| dsp->convolve[1][0][0][1] = ConvolveIntraBlockCopyHorizontal_NEON; |
| dsp->convolve[1][0][1][0] = ConvolveIntraBlockCopyVertical_NEON; |
| dsp->convolve[1][0][1][1] = ConvolveIntraBlockCopy2D_NEON; |
| |
| dsp->convolve_scale[0] = ConvolveScale2D_NEON<false>; |
| dsp->convolve_scale[1] = ConvolveScale2D_NEON<true>; |
| } |
| |
| } // namespace |
| |
| void ConvolveInit10bpp_NEON() { Init10bpp(); } |
| |
| } // namespace dsp |
| } // namespace libgav1 |
| |
| #else // !(LIBGAV1_ENABLE_NEON && LIBGAV1_MAX_BITDEPTH >= 10) |
| |
| namespace libgav1 { |
| namespace dsp { |
| |
| void ConvolveInit10bpp_NEON() {} |
| |
| } // namespace dsp |
| } // namespace libgav1 |
| #endif // LIBGAV1_ENABLE_NEON && LIBGAV1_MAX_BITDEPTH >= 10 |