| #include "src/dsp/dsp.h" |
| #include "src/dsp/intrapred.h" |
| |
| #if LIBGAV1_ENABLE_NEON |
| |
| #include <arm_neon.h> |
| |
| #include <cassert> |
| #include <cstddef> |
| #include <cstdint> |
| |
| #include "src/dsp/arm/common_neon.h" |
| #include "src/utils/common.h" |
| |
| namespace libgav1 { |
| namespace dsp { |
| |
| namespace low_bitdepth { |
| namespace { |
| |
| // Transpose kFilterIntraTaps and convert the first row to unsigned values. |
| // |
| // With the previous orientation we were able to multiply all the input values |
| // by a single tap. This required that all the input values be in one vector |
| // which requires expensive set up operations (shifts, vext, vtbl). All the |
| // elements of the result needed to be summed (easy on A64 - vaddvq_s16) but |
| // then the shifting, rounding, and clamping was done in GP registers. |
| // |
| // Switching to unsigned values allows multiplying the 8 bit inputs directly. |
| // When one value was negative we needed to vmovl_u8 first so that the results |
| // maintained the proper sign. |
| // |
| // We take this into account when summing the values by subtracting the product |
| // of the first row. |
| const uint8_t kTransposedTaps[kNumFilterIntraPredictors][7][8] = { |
| {{6, 5, 3, 3, 4, 3, 3, 3}, // Original values are negative. |
| {10, 2, 1, 1, 6, 2, 2, 1}, |
| {0, 10, 1, 1, 0, 6, 2, 2}, |
| {0, 0, 10, 2, 0, 0, 6, 2}, |
| {0, 0, 0, 10, 0, 0, 0, 6}, |
| {12, 9, 7, 5, 2, 2, 2, 3}, |
| {0, 0, 0, 0, 12, 9, 7, 5}}, |
| {{10, 6, 4, 2, 10, 6, 4, 2}, // Original values are negative. |
| {16, 0, 0, 0, 16, 0, 0, 0}, |
| {0, 16, 0, 0, 0, 16, 0, 0}, |
| {0, 0, 16, 0, 0, 0, 16, 0}, |
| {0, 0, 0, 16, 0, 0, 0, 16}, |
| {10, 6, 4, 2, 0, 0, 0, 0}, |
| {0, 0, 0, 0, 10, 6, 4, 2}}, |
| {{8, 8, 8, 8, 4, 4, 4, 4}, // Original values are negative. |
| {8, 0, 0, 0, 4, 0, 0, 0}, |
| {0, 8, 0, 0, 0, 4, 0, 0}, |
| {0, 0, 8, 0, 0, 0, 4, 0}, |
| {0, 0, 0, 8, 0, 0, 0, 4}, |
| {16, 16, 16, 16, 0, 0, 0, 0}, |
| {0, 0, 0, 0, 16, 16, 16, 16}}, |
| {{2, 1, 1, 0, 1, 1, 1, 1}, // Original values are negative. |
| {8, 3, 2, 1, 4, 3, 2, 2}, |
| {0, 8, 3, 2, 0, 4, 3, 2}, |
| {0, 0, 8, 3, 0, 0, 4, 3}, |
| {0, 0, 0, 8, 0, 0, 0, 4}, |
| {10, 6, 4, 2, 3, 4, 4, 3}, |
| {0, 0, 0, 0, 10, 6, 4, 3}}, |
| {{12, 10, 9, 8, 10, 9, 8, 7}, // Original values are negative. |
| {14, 0, 0, 0, 12, 1, 0, 0}, |
| {0, 14, 0, 0, 0, 12, 0, 0}, |
| {0, 0, 14, 0, 0, 0, 12, 1}, |
| {0, 0, 0, 14, 0, 0, 0, 12}, |
| {14, 12, 11, 10, 0, 0, 1, 1}, |
| {0, 0, 0, 0, 14, 12, 11, 9}}}; |
| |
| void FilterIntraPredictor_NEON(void* const dest, ptrdiff_t stride, |
| const void* const top_row, |
| const void* const left_column, |
| FilterIntraPredictor pred, int width, |
| int height) { |
| const uint8_t* const top = static_cast<const uint8_t*>(top_row); |
| const uint8_t* const left = static_cast<const uint8_t*>(left_column); |
| |
| assert(width <= 32 && height <= 32); |
| |
| uint8_t* dst = static_cast<uint8_t*>(dest); |
| |
| uint8x8_t transposed_taps[7]; |
| for (int i = 0; i < 7; ++i) { |
| transposed_taps[i] = vld1_u8(kTransposedTaps[pred][i]); |
| } |
| |
| uint8_t relative_top_left = top[-1]; |
| const uint8_t* relative_top = top; |
| uint8_t relative_left[2] = {left[0], left[1]}; |
| |
| int y = 0; |
| do { |
| uint8_t* row_dst = dst; |
| int x = 0; |
| do { |
| uint16x8_t sum = vdupq_n_u16(0); |
| const uint16x8_t subtrahend = |
| vmull_u8(transposed_taps[0], vdup_n_u8(relative_top_left)); |
| for (int i = 1; i < 5; ++i) { |
| sum = vmlal_u8(sum, transposed_taps[i], vdup_n_u8(relative_top[i - 1])); |
| } |
| for (int i = 5; i < 7; ++i) { |
| sum = |
| vmlal_u8(sum, transposed_taps[i], vdup_n_u8(relative_left[i - 5])); |
| } |
| |
| const int16x8_t sum_signed = |
| vreinterpretq_s16_u16(vsubq_u16(sum, subtrahend)); |
| const int16x8_t sum_shifted = vrshrq_n_s16(sum_signed, 4); |
| |
| uint8x8_t sum_saturated = vqmovun_s16(sum_shifted); |
| |
| StoreLo4(row_dst, sum_saturated); |
| StoreHi4(row_dst + stride, sum_saturated); |
| |
| // Progress across |
| relative_top_left = relative_top[3]; |
| relative_top += 4; |
| relative_left[0] = row_dst[3]; |
| relative_left[1] = row_dst[3 + stride]; |
| row_dst += 4; |
| x += 4; |
| } while (x < width); |
| |
| // Progress down. |
| relative_top_left = left[y + 1]; |
| relative_top = dst + stride; |
| relative_left[0] = left[y + 2]; |
| relative_left[1] = left[y + 3]; |
| |
| dst += 2 * stride; |
| y += 2; |
| } while (y < height); |
| } |
| |
| void Init8bpp() { |
| Dsp* const dsp = dsp_internal::GetWritableDspTable(8); |
| assert(dsp != nullptr); |
| dsp->filter_intra_predictor = FilterIntraPredictor_NEON; |
| } |
| |
| } // namespace |
| } // namespace low_bitdepth |
| |
| void IntraPredFilterIntraInit_NEON() { low_bitdepth::Init8bpp(); } |
| |
| } // namespace dsp |
| } // namespace libgav1 |
| |
| #else // !LIBGAV1_ENABLE_NEON |
| namespace libgav1 { |
| namespace dsp { |
| |
// Stub used when LIBGAV1_ENABLE_NEON is off: there is nothing to register, but
// the symbol still exists so callers can invoke it unconditionally.
void IntraPredFilterIntraInit_NEON() {
  // Intentionally empty.
}
| |
| } // namespace dsp |
| } // namespace libgav1 |
| #endif // LIBGAV1_ENABLE_NEON |