| #include "src/dsp/dsp.h" |
| #include "src/dsp/intrapred.h" |
| |
| #if LIBGAV1_ENABLE_NEON |
| |
| #include <arm_neon.h> |
| |
| #include <cassert> |
| #include <cstddef> |
| #include <cstdint> |
| |
| #include "src/dsp/arm/common_neon.h" |
| #include "src/dsp/constants.h" |
| |
| namespace libgav1 { |
| namespace dsp { |
| |
| namespace low_bitdepth { |
| namespace { |
| |
// Smooth predictor weights for every supported block dimension, stored back
// to back. The weights for dimension |d| (4, 8, 16, 32 or 64) begin at offset
// |d| - 4, because the tables that precede them hold 4 + 8 + ... + d / 2 =
// d - 4 entries in total.
//
// Note these constants are duplicated from intrapred.cc to allow the compiler
// to have visibility of the values. This helps reduce loads and in the
// creation of the inverse weights.
constexpr uint8_t kSmoothWeights[] = {
    // block dimension = 4
    255, 149, 85, 64,
    // block dimension = 8
    255, 197, 146, 105, 73, 50, 37, 32,
    // block dimension = 16
    255, 225, 196, 170, 145, 123, 102, 84, 68, 54, 43, 33, 26, 20, 17, 16,
    // block dimension = 32
    255, 240, 225, 210, 196, 182, 169, 157, 145, 133, 122, 111, 101, 92, 83, 74,
    66, 59, 52, 45, 39, 34, 29, 25, 21, 17, 14, 12, 10, 9, 8, 8,
    // block dimension = 64
    255, 248, 240, 233, 225, 218, 210, 203, 196, 189, 182, 176, 169, 163, 156,
    150, 144, 138, 133, 127, 121, 116, 111, 106, 101, 96, 91, 86, 82, 77, 73,
    69, 65, 61, 57, 54, 50, 47, 44, 41, 38, 35, 32, 29, 27, 25, 22, 20, 18, 16,
    15, 13, 12, 10, 9, 8, 7, 6, 6, 5, 5, 4, 4, 4};

// The predictors index this table as kSmoothWeights + dimension - 4 and load
// whole vectors from it; the last 16-byte load for dimension 64 ends exactly
// at the end of the array. A dropped or duplicated entry would silently shift
// every later table or read out of bounds, so pin the element count.
static_assert(sizeof(kSmoothWeights) / sizeof(kSmoothWeights[0]) ==
                  4 + 8 + 16 + 32 + 64,
              "kSmoothWeights must hold the 4, 8, 16, 32 and 64 entry tables.");
| |
| inline uint16x4_t CalculatePred(const uint16x4_t weighted_top, |
| const uint16x4_t weighted_left, |
| const uint16x4_t weighted_bl, |
| const uint16x4_t weighted_tr) { |
| const uint32x4_t pred_0 = vaddl_u16(weighted_top, weighted_left); |
| const uint32x4_t pred_1 = vaddl_u16(weighted_bl, weighted_tr); |
| const uint32x4_t pred_2 = vaddq_u32(pred_0, pred_1); |
| return vrshrn_n_u32(pred_2, kSmoothWeightScale + 1); |
| } |
| |
| template <int width, int height> |
| inline void Smooth4Or8xN_NEON(void* const dest, ptrdiff_t stride, |
| const void* const top_row, |
| const void* const left_column) { |
| const uint8_t* const top = static_cast<const uint8_t*>(top_row); |
| const uint8_t* const left = static_cast<const uint8_t*>(left_column); |
| const uint8_t top_right = top[width - 1]; |
| const uint8_t bottom_left = left[height - 1]; |
| const uint8_t* const weights_y = kSmoothWeights + height - 4; |
| uint8_t* dst = static_cast<uint8_t*>(dest); |
| |
| uint8x8_t top_v; |
| // TODO(johannkoenig): Process 16 values (4x4 / 8x2) at a time. |
| if (width == 4) { |
| top_v = vdup_n_u8(0); |
| top_v = LoadLo4(top, top_v); |
| } else { // width == 8 |
| top_v = vld1_u8(top); |
| } |
| const uint8x8_t top_right_v = vdup_n_u8(top_right); |
| const uint8x8_t bottom_left_v = vdup_n_u8(bottom_left); |
| // Over-reads for 4xN but still within the array. |
| const uint8x8_t weights_x_v = vld1_u8(kSmoothWeights + width - 4); |
| // 256 - weights = vneg_s8(weights) |
| const uint8x8_t scaled_weights_x = |
| vreinterpret_u8_s8(vneg_s8(vreinterpret_s8_u8(weights_x_v))); |
| |
| for (int y = 0; y < height; ++y) { |
| const uint8x8_t left_v = vdup_n_u8(left[y]); |
| const uint8x8_t weights_y_v = vdup_n_u8(weights_y[y]); |
| const uint8x8_t scaled_weights_y = vdup_n_u8(256 - weights_y[y]); |
| const uint16x8_t weighted_bl = vmull_u8(scaled_weights_y, bottom_left_v); |
| |
| const uint16x8_t weighted_top = vmull_u8(weights_y_v, top_v); |
| const uint16x8_t weighted_left = vmull_u8(weights_x_v, left_v); |
| const uint16x8_t weighted_tr = vmull_u8(scaled_weights_x, top_right_v); |
| const uint16x4_t dest_0 = |
| CalculatePred(vget_low_u16(weighted_top), vget_low_u16(weighted_left), |
| vget_low_u16(weighted_tr), vget_low_u16(weighted_bl)); |
| |
| if (width == 4) { |
| StoreLo4(dst, vmovn_u16(vcombine_u16(dest_0, dest_0))); |
| } else { // width == 8 |
| const uint16x4_t dest_1 = CalculatePred( |
| vget_high_u16(weighted_top), vget_high_u16(weighted_left), |
| vget_high_u16(weighted_tr), vget_high_u16(weighted_bl)); |
| vst1_u8(dst, vmovn_u16(vcombine_u16(dest_0, dest_1))); |
| } |
| dst += stride; |
| } |
| } |
| |
| inline uint8x16_t CalculateWeightsAndPred( |
| const uint8x16_t top, const uint8x8_t left, const uint8x8_t top_right, |
| const uint8x8_t weights_y, const uint8x16_t weights_x, |
| const uint8x16_t scaled_weights_x, const uint16x8_t weighted_bl) { |
| const uint16x8_t weighted_top_low = vmull_u8(weights_y, vget_low_u8(top)); |
| const uint16x8_t weighted_left_low = vmull_u8(vget_low_u8(weights_x), left); |
| const uint16x8_t weighted_tr_low = |
| vmull_u8(vget_low_u8(scaled_weights_x), top_right); |
| const uint16x4_t dest_0 = CalculatePred( |
| vget_low_u16(weighted_top_low), vget_low_u16(weighted_left_low), |
| vget_low_u16(weighted_tr_low), vget_low_u16(weighted_bl)); |
| const uint16x4_t dest_1 = CalculatePred( |
| vget_high_u16(weighted_top_low), vget_high_u16(weighted_left_low), |
| vget_high_u16(weighted_tr_low), vget_high_u16(weighted_bl)); |
| const uint8x8_t dest_0_u8 = vmovn_u16(vcombine_u16(dest_0, dest_1)); |
| |
| const uint16x8_t weighted_top_high = vmull_u8(weights_y, vget_high_u8(top)); |
| const uint16x8_t weighted_left_high = vmull_u8(vget_high_u8(weights_x), left); |
| const uint16x8_t weighted_tr_high = |
| vmull_u8(vget_high_u8(scaled_weights_x), top_right); |
| const uint16x4_t dest_2 = CalculatePred( |
| vget_low_u16(weighted_top_high), vget_low_u16(weighted_left_high), |
| vget_low_u16(weighted_tr_high), vget_low_u16(weighted_bl)); |
| const uint16x4_t dest_3 = CalculatePred( |
| vget_high_u16(weighted_top_high), vget_high_u16(weighted_left_high), |
| vget_high_u16(weighted_tr_high), vget_high_u16(weighted_bl)); |
| const uint8x8_t dest_1_u8 = vmovn_u16(vcombine_u16(dest_2, dest_3)); |
| |
| return vcombine_u8(dest_0_u8, dest_1_u8); |
| } |
| |
| template <int width, int height> |
| inline void Smooth16PlusxN_NEON(void* const dest, ptrdiff_t stride, |
| const void* const top_row, |
| const void* const left_column) { |
| const uint8_t* const top = static_cast<const uint8_t*>(top_row); |
| const uint8_t* const left = static_cast<const uint8_t*>(left_column); |
| const uint8_t top_right = top[width - 1]; |
| const uint8_t bottom_left = left[height - 1]; |
| const uint8_t* const weights_y = kSmoothWeights + height - 4; |
| uint8_t* dst = static_cast<uint8_t*>(dest); |
| |
| uint8x16_t top_v[4]; |
| top_v[0] = vld1q_u8(top); |
| if (width > 16) { |
| top_v[1] = vld1q_u8(top + 16); |
| if (width == 64) { |
| top_v[2] = vld1q_u8(top + 32); |
| top_v[3] = vld1q_u8(top + 48); |
| } |
| } |
| |
| const uint8x8_t top_right_v = vdup_n_u8(top_right); |
| const uint8x8_t bottom_left_v = vdup_n_u8(bottom_left); |
| |
| // TODO(johannkoenig): Consider re-reading top_v and weights_x_v in the loop. |
| // This currently has a performance slope similar to Paeth so it does not |
| // appear to be register bound for arm64. |
| uint8x16_t weights_x_v[4]; |
| weights_x_v[0] = vld1q_u8(kSmoothWeights + width - 4); |
| if (width > 16) { |
| weights_x_v[1] = vld1q_u8(kSmoothWeights + width + 16 - 4); |
| if (width == 64) { |
| weights_x_v[2] = vld1q_u8(kSmoothWeights + width + 32 - 4); |
| weights_x_v[3] = vld1q_u8(kSmoothWeights + width + 48 - 4); |
| } |
| } |
| |
| uint8x16_t scaled_weights_x[4]; |
| scaled_weights_x[0] = |
| vreinterpretq_u8_s8(vnegq_s8(vreinterpretq_s8_u8(weights_x_v[0]))); |
| if (width > 16) { |
| scaled_weights_x[1] = |
| vreinterpretq_u8_s8(vnegq_s8(vreinterpretq_s8_u8(weights_x_v[1]))); |
| if (width == 64) { |
| scaled_weights_x[2] = |
| vreinterpretq_u8_s8(vnegq_s8(vreinterpretq_s8_u8(weights_x_v[2]))); |
| scaled_weights_x[3] = |
| vreinterpretq_u8_s8(vnegq_s8(vreinterpretq_s8_u8(weights_x_v[3]))); |
| } |
| } |
| |
| for (int y = 0; y < height; ++y) { |
| const uint8x8_t left_v = vdup_n_u8(left[y]); |
| const uint8x8_t weights_y_v = vdup_n_u8(weights_y[y]); |
| const uint8x8_t scaled_weights_y = vdup_n_u8(256 - weights_y[y]); |
| const uint16x8_t weighted_bl = vmull_u8(scaled_weights_y, bottom_left_v); |
| |
| vst1q_u8(dst, CalculateWeightsAndPred(top_v[0], left_v, top_right_v, |
| weights_y_v, weights_x_v[0], |
| scaled_weights_x[0], weighted_bl)); |
| |
| if (width > 16) { |
| vst1q_u8(dst + 16, CalculateWeightsAndPred( |
| top_v[1], left_v, top_right_v, weights_y_v, |
| weights_x_v[1], scaled_weights_x[1], weighted_bl)); |
| if (width == 64) { |
| vst1q_u8(dst + 32, |
| CalculateWeightsAndPred(top_v[2], left_v, top_right_v, |
| weights_y_v, weights_x_v[2], |
| scaled_weights_x[2], weighted_bl)); |
| vst1q_u8(dst + 48, |
| CalculateWeightsAndPred(top_v[3], left_v, top_right_v, |
| weights_y_v, weights_x_v[3], |
| scaled_weights_x[3], weighted_bl)); |
| } |
| } |
| |
| dst += stride; |
| } |
| } |
| |
| template <int width, int height> |
| inline void SmoothVertical4Or8xN_NEON(void* const dest, ptrdiff_t stride, |
| const void* const top_row, |
| const void* const left_column) { |
| const uint8_t* const top = static_cast<const uint8_t*>(top_row); |
| const uint8_t* const left = static_cast<const uint8_t*>(left_column); |
| const uint8_t bottom_left = left[height - 1]; |
| const uint8_t* const weights_y = kSmoothWeights + height - 4; |
| uint8_t* dst = static_cast<uint8_t*>(dest); |
| |
| uint8x8_t top_v; |
| if (width == 4) { |
| top_v = vdup_n_u8(0); |
| top_v = LoadLo4(top, top_v); |
| } else { // width == 8 |
| top_v = vld1_u8(top); |
| } |
| |
| const uint8x8_t bottom_left_v = vdup_n_u8(bottom_left); |
| |
| for (int y = 0; y < height; ++y) { |
| const uint8x8_t weights_y_v = vdup_n_u8(weights_y[y]); |
| const uint8x8_t scaled_weights_y = vdup_n_u8(256 - weights_y[y]); |
| |
| const uint16x8_t weighted_top = vmull_u8(weights_y_v, top_v); |
| const uint16x8_t weighted_bl = vmull_u8(scaled_weights_y, bottom_left_v); |
| const uint16x8_t pred = vaddq_u16(weighted_top, weighted_bl); |
| const uint8x8_t pred_scaled = vrshrn_n_u16(pred, kSmoothWeightScale); |
| |
| if (width == 4) { |
| StoreLo4(dst, pred_scaled); |
| } else { // width == 8 |
| vst1_u8(dst, pred_scaled); |
| } |
| dst += stride; |
| } |
| } |
| |
| inline uint8x16_t CalculateVerticalWeightsAndPred( |
| const uint8x16_t top, const uint8x8_t weights_y, |
| const uint16x8_t weighted_bl) { |
| const uint16x8_t weighted_top_low = vmull_u8(weights_y, vget_low_u8(top)); |
| const uint16x8_t weighted_top_high = vmull_u8(weights_y, vget_high_u8(top)); |
| const uint16x8_t pred_low = vaddq_u16(weighted_top_low, weighted_bl); |
| const uint16x8_t pred_high = vaddq_u16(weighted_top_high, weighted_bl); |
| const uint8x8_t pred_scaled_low = vrshrn_n_u16(pred_low, kSmoothWeightScale); |
| const uint8x8_t pred_scaled_high = |
| vrshrn_n_u16(pred_high, kSmoothWeightScale); |
| return vcombine_u8(pred_scaled_low, pred_scaled_high); |
| } |
| |
| template <int width, int height> |
| inline void SmoothVertical16PlusxN_NEON(void* const dest, ptrdiff_t stride, |
| const void* const top_row, |
| const void* const left_column) { |
| const uint8_t* const top = static_cast<const uint8_t*>(top_row); |
| const uint8_t* const left = static_cast<const uint8_t*>(left_column); |
| const uint8_t bottom_left = left[height - 1]; |
| const uint8_t* const weights_y = kSmoothWeights + height - 4; |
| uint8_t* dst = static_cast<uint8_t*>(dest); |
| |
| uint8x16_t top_v[4]; |
| top_v[0] = vld1q_u8(top); |
| if (width > 16) { |
| top_v[1] = vld1q_u8(top + 16); |
| if (width == 64) { |
| top_v[2] = vld1q_u8(top + 32); |
| top_v[3] = vld1q_u8(top + 48); |
| } |
| } |
| |
| const uint8x8_t bottom_left_v = vdup_n_u8(bottom_left); |
| |
| for (int y = 0; y < height; ++y) { |
| const uint8x8_t weights_y_v = vdup_n_u8(weights_y[y]); |
| const uint8x8_t scaled_weights_y = vdup_n_u8(256 - weights_y[y]); |
| const uint16x8_t weighted_bl = vmull_u8(scaled_weights_y, bottom_left_v); |
| |
| const uint8x16_t pred_0 = |
| CalculateVerticalWeightsAndPred(top_v[0], weights_y_v, weighted_bl); |
| vst1q_u8(dst, pred_0); |
| |
| if (width > 16) { |
| const uint8x16_t pred_1 = |
| CalculateVerticalWeightsAndPred(top_v[1], weights_y_v, weighted_bl); |
| vst1q_u8(dst + 16, pred_1); |
| |
| if (width == 64) { |
| const uint8x16_t pred_2 = |
| CalculateVerticalWeightsAndPred(top_v[2], weights_y_v, weighted_bl); |
| vst1q_u8(dst + 32, pred_2); |
| |
| const uint8x16_t pred_3 = |
| CalculateVerticalWeightsAndPred(top_v[3], weights_y_v, weighted_bl); |
| vst1q_u8(dst + 48, pred_3); |
| } |
| } |
| |
| dst += stride; |
| } |
| } |
| |
| template <int width, int height> |
| inline void SmoothHorizontal4Or8xN_NEON(void* const dest, ptrdiff_t stride, |
| const void* const top_row, |
| const void* const left_column) { |
| const uint8_t* const top = static_cast<const uint8_t*>(top_row); |
| const uint8_t* const left = static_cast<const uint8_t*>(left_column); |
| const uint8_t top_right = top[width - 1]; |
| uint8_t* dst = static_cast<uint8_t*>(dest); |
| |
| const uint8x8_t top_right_v = vdup_n_u8(top_right); |
| // Over-reads for 4xN but still within the array. |
| const uint8x8_t weights_x = vld1_u8(kSmoothWeights + width - 4); |
| // 256 - weights = vneg_s8(weights) |
| const uint8x8_t scaled_weights_x = |
| vreinterpret_u8_s8(vneg_s8(vreinterpret_s8_u8(weights_x))); |
| |
| for (int y = 0; y < height; ++y) { |
| const uint8x8_t left_v = vdup_n_u8(left[y]); |
| |
| const uint16x8_t weighted_left = vmull_u8(weights_x, left_v); |
| const uint16x8_t weighted_tr = vmull_u8(scaled_weights_x, top_right_v); |
| const uint16x8_t pred = vaddq_u16(weighted_left, weighted_tr); |
| const uint8x8_t pred_scaled = vrshrn_n_u16(pred, kSmoothWeightScale); |
| |
| if (width == 4) { |
| StoreLo4(dst, pred_scaled); |
| } else { // width == 8 |
| vst1_u8(dst, pred_scaled); |
| } |
| dst += stride; |
| } |
| } |
| |
| inline uint8x16_t CalculateHorizontalWeightsAndPred( |
| const uint8x8_t left, const uint8x8_t top_right, const uint8x16_t weights_x, |
| const uint8x16_t scaled_weights_x) { |
| const uint16x8_t weighted_left_low = vmull_u8(vget_low_u8(weights_x), left); |
| const uint16x8_t weighted_tr_low = |
| vmull_u8(vget_low_u8(scaled_weights_x), top_right); |
| const uint16x8_t pred_low = vaddq_u16(weighted_left_low, weighted_tr_low); |
| const uint8x8_t pred_scaled_low = vrshrn_n_u16(pred_low, kSmoothWeightScale); |
| |
| const uint16x8_t weighted_left_high = vmull_u8(vget_high_u8(weights_x), left); |
| const uint16x8_t weighted_tr_high = |
| vmull_u8(vget_high_u8(scaled_weights_x), top_right); |
| const uint16x8_t pred_high = vaddq_u16(weighted_left_high, weighted_tr_high); |
| const uint8x8_t pred_scaled_high = |
| vrshrn_n_u16(pred_high, kSmoothWeightScale); |
| |
| return vcombine_u8(pred_scaled_low, pred_scaled_high); |
| } |
| |
| template <int width, int height> |
| inline void SmoothHorizontal16PlusxN_NEON(void* const dest, ptrdiff_t stride, |
| const void* const top_row, |
| const void* const left_column) { |
| const uint8_t* const top = static_cast<const uint8_t*>(top_row); |
| const uint8_t* const left = static_cast<const uint8_t*>(left_column); |
| const uint8_t top_right = top[width - 1]; |
| uint8_t* dst = static_cast<uint8_t*>(dest); |
| |
| const uint8x8_t top_right_v = vdup_n_u8(top_right); |
| |
| uint8x16_t weights_x[4]; |
| weights_x[0] = vld1q_u8(kSmoothWeights + width - 4); |
| if (width > 16) { |
| weights_x[1] = vld1q_u8(kSmoothWeights + width + 16 - 4); |
| if (width == 64) { |
| weights_x[2] = vld1q_u8(kSmoothWeights + width + 32 - 4); |
| weights_x[3] = vld1q_u8(kSmoothWeights + width + 48 - 4); |
| } |
| } |
| |
| uint8x16_t scaled_weights_x[4]; |
| scaled_weights_x[0] = |
| vreinterpretq_u8_s8(vnegq_s8(vreinterpretq_s8_u8(weights_x[0]))); |
| if (width > 16) { |
| scaled_weights_x[1] = |
| vreinterpretq_u8_s8(vnegq_s8(vreinterpretq_s8_u8(weights_x[1]))); |
| if (width == 64) { |
| scaled_weights_x[2] = |
| vreinterpretq_u8_s8(vnegq_s8(vreinterpretq_s8_u8(weights_x[2]))); |
| scaled_weights_x[3] = |
| vreinterpretq_u8_s8(vnegq_s8(vreinterpretq_s8_u8(weights_x[3]))); |
| } |
| } |
| |
| for (int y = 0; y < height; ++y) { |
| const uint8x8_t left_v = vdup_n_u8(left[y]); |
| |
| const uint8x16_t pred_0 = CalculateHorizontalWeightsAndPred( |
| left_v, top_right_v, weights_x[0], scaled_weights_x[0]); |
| vst1q_u8(dst, pred_0); |
| |
| if (width > 16) { |
| const uint8x16_t pred_1 = CalculateHorizontalWeightsAndPred( |
| left_v, top_right_v, weights_x[1], scaled_weights_x[1]); |
| vst1q_u8(dst + 16, pred_1); |
| |
| if (width == 64) { |
| const uint8x16_t pred_2 = CalculateHorizontalWeightsAndPred( |
| left_v, top_right_v, weights_x[2], scaled_weights_x[2]); |
| vst1q_u8(dst + 32, pred_2); |
| |
| const uint8x16_t pred_3 = CalculateHorizontalWeightsAndPred( |
| left_v, top_right_v, weights_x[3], scaled_weights_x[3]); |
| vst1q_u8(dst + 48, pred_3); |
| } |
| } |
| dst += stride; |
| } |
| } |
| |
| void Init8bpp() { |
| Dsp* const dsp = dsp_internal::GetWritableDspTable(8); |
| assert(dsp != nullptr); |
| // 4x4 |
| dsp->intra_predictors[kTransformSize4x4][kIntraPredictorSmooth] = |
| Smooth4Or8xN_NEON<4, 4>; |
| dsp->intra_predictors[kTransformSize4x4][kIntraPredictorSmoothVertical] = |
| SmoothVertical4Or8xN_NEON<4, 4>; |
| dsp->intra_predictors[kTransformSize4x4][kIntraPredictorSmoothHorizontal] = |
| SmoothHorizontal4Or8xN_NEON<4, 4>; |
| |
| // 4x8 |
| dsp->intra_predictors[kTransformSize4x8][kIntraPredictorSmooth] = |
| Smooth4Or8xN_NEON<4, 8>; |
| dsp->intra_predictors[kTransformSize4x8][kIntraPredictorSmoothVertical] = |
| SmoothVertical4Or8xN_NEON<4, 8>; |
| dsp->intra_predictors[kTransformSize4x8][kIntraPredictorSmoothHorizontal] = |
| SmoothHorizontal4Or8xN_NEON<4, 8>; |
| |
| // 4x16 |
| dsp->intra_predictors[kTransformSize4x16][kIntraPredictorSmooth] = |
| Smooth4Or8xN_NEON<4, 16>; |
| dsp->intra_predictors[kTransformSize4x16][kIntraPredictorSmoothVertical] = |
| SmoothVertical4Or8xN_NEON<4, 16>; |
| dsp->intra_predictors[kTransformSize4x16][kIntraPredictorSmoothHorizontal] = |
| SmoothHorizontal4Or8xN_NEON<4, 16>; |
| |
| // 8x4 |
| dsp->intra_predictors[kTransformSize8x4][kIntraPredictorSmooth] = |
| Smooth4Or8xN_NEON<8, 4>; |
| dsp->intra_predictors[kTransformSize8x4][kIntraPredictorSmoothVertical] = |
| SmoothVertical4Or8xN_NEON<8, 4>; |
| dsp->intra_predictors[kTransformSize8x4][kIntraPredictorSmoothHorizontal] = |
| SmoothHorizontal4Or8xN_NEON<8, 4>; |
| |
| // 8x8 |
| dsp->intra_predictors[kTransformSize8x8][kIntraPredictorSmooth] = |
| Smooth4Or8xN_NEON<8, 8>; |
| dsp->intra_predictors[kTransformSize8x8][kIntraPredictorSmoothVertical] = |
| SmoothVertical4Or8xN_NEON<8, 8>; |
| dsp->intra_predictors[kTransformSize8x8][kIntraPredictorSmoothHorizontal] = |
| SmoothHorizontal4Or8xN_NEON<8, 8>; |
| |
| // 8x16 |
| dsp->intra_predictors[kTransformSize8x16][kIntraPredictorSmooth] = |
| Smooth4Or8xN_NEON<8, 16>; |
| dsp->intra_predictors[kTransformSize8x16][kIntraPredictorSmoothVertical] = |
| SmoothVertical4Or8xN_NEON<8, 16>; |
| dsp->intra_predictors[kTransformSize8x16][kIntraPredictorSmoothHorizontal] = |
| SmoothHorizontal4Or8xN_NEON<8, 16>; |
| |
| // 8x32 |
| dsp->intra_predictors[kTransformSize8x32][kIntraPredictorSmooth] = |
| Smooth4Or8xN_NEON<8, 32>; |
| dsp->intra_predictors[kTransformSize8x32][kIntraPredictorSmoothVertical] = |
| SmoothVertical4Or8xN_NEON<8, 32>; |
| dsp->intra_predictors[kTransformSize8x32][kIntraPredictorSmoothHorizontal] = |
| SmoothHorizontal4Or8xN_NEON<8, 32>; |
| |
| // 16x4 |
| dsp->intra_predictors[kTransformSize16x4][kIntraPredictorSmooth] = |
| Smooth16PlusxN_NEON<16, 4>; |
| dsp->intra_predictors[kTransformSize16x4][kIntraPredictorSmoothVertical] = |
| SmoothVertical16PlusxN_NEON<16, 4>; |
| dsp->intra_predictors[kTransformSize16x4][kIntraPredictorSmoothHorizontal] = |
| SmoothHorizontal16PlusxN_NEON<16, 4>; |
| |
| // 16x8 |
| dsp->intra_predictors[kTransformSize16x8][kIntraPredictorSmooth] = |
| Smooth16PlusxN_NEON<16, 8>; |
| dsp->intra_predictors[kTransformSize16x8][kIntraPredictorSmoothVertical] = |
| SmoothVertical16PlusxN_NEON<16, 8>; |
| dsp->intra_predictors[kTransformSize16x8][kIntraPredictorSmoothHorizontal] = |
| SmoothHorizontal16PlusxN_NEON<16, 8>; |
| |
| // 16x16 |
| dsp->intra_predictors[kTransformSize16x16][kIntraPredictorSmooth] = |
| Smooth16PlusxN_NEON<16, 16>; |
| dsp->intra_predictors[kTransformSize16x16][kIntraPredictorSmoothVertical] = |
| SmoothVertical16PlusxN_NEON<16, 16>; |
| dsp->intra_predictors[kTransformSize16x16][kIntraPredictorSmoothHorizontal] = |
| SmoothHorizontal16PlusxN_NEON<16, 16>; |
| |
| // 16x32 |
| dsp->intra_predictors[kTransformSize16x32][kIntraPredictorSmooth] = |
| Smooth16PlusxN_NEON<16, 32>; |
| dsp->intra_predictors[kTransformSize16x32][kIntraPredictorSmoothVertical] = |
| SmoothVertical16PlusxN_NEON<16, 32>; |
| dsp->intra_predictors[kTransformSize16x32][kIntraPredictorSmoothHorizontal] = |
| SmoothHorizontal16PlusxN_NEON<16, 32>; |
| |
| // 16x64 |
| dsp->intra_predictors[kTransformSize16x64][kIntraPredictorSmooth] = |
| Smooth16PlusxN_NEON<16, 64>; |
| dsp->intra_predictors[kTransformSize16x64][kIntraPredictorSmoothVertical] = |
| SmoothVertical16PlusxN_NEON<16, 64>; |
| dsp->intra_predictors[kTransformSize16x64][kIntraPredictorSmoothHorizontal] = |
| SmoothHorizontal16PlusxN_NEON<16, 64>; |
| |
| // 32x8 |
| dsp->intra_predictors[kTransformSize32x8][kIntraPredictorSmooth] = |
| Smooth16PlusxN_NEON<32, 8>; |
| dsp->intra_predictors[kTransformSize32x8][kIntraPredictorSmoothVertical] = |
| SmoothVertical16PlusxN_NEON<32, 8>; |
| dsp->intra_predictors[kTransformSize32x8][kIntraPredictorSmoothHorizontal] = |
| SmoothHorizontal16PlusxN_NEON<32, 8>; |
| |
| // 32x16 |
| dsp->intra_predictors[kTransformSize32x16][kIntraPredictorSmooth] = |
| Smooth16PlusxN_NEON<32, 16>; |
| dsp->intra_predictors[kTransformSize32x16][kIntraPredictorSmoothVertical] = |
| SmoothVertical16PlusxN_NEON<32, 16>; |
| dsp->intra_predictors[kTransformSize32x16][kIntraPredictorSmoothHorizontal] = |
| SmoothHorizontal16PlusxN_NEON<32, 16>; |
| |
| // 32x32 |
| dsp->intra_predictors[kTransformSize32x32][kIntraPredictorSmooth] = |
| Smooth16PlusxN_NEON<32, 32>; |
| dsp->intra_predictors[kTransformSize32x32][kIntraPredictorSmoothVertical] = |
| SmoothVertical16PlusxN_NEON<32, 32>; |
| dsp->intra_predictors[kTransformSize32x32][kIntraPredictorSmoothHorizontal] = |
| SmoothHorizontal16PlusxN_NEON<32, 32>; |
| |
| // 32x64 |
| dsp->intra_predictors[kTransformSize32x64][kIntraPredictorSmooth] = |
| Smooth16PlusxN_NEON<32, 64>; |
| dsp->intra_predictors[kTransformSize32x64][kIntraPredictorSmoothVertical] = |
| SmoothVertical16PlusxN_NEON<32, 64>; |
| dsp->intra_predictors[kTransformSize32x64][kIntraPredictorSmoothHorizontal] = |
| SmoothHorizontal16PlusxN_NEON<32, 64>; |
| |
| // 64x16 |
| dsp->intra_predictors[kTransformSize64x16][kIntraPredictorSmooth] = |
| Smooth16PlusxN_NEON<64, 16>; |
| dsp->intra_predictors[kTransformSize64x16][kIntraPredictorSmoothVertical] = |
| SmoothVertical16PlusxN_NEON<64, 16>; |
| dsp->intra_predictors[kTransformSize64x16][kIntraPredictorSmoothHorizontal] = |
| SmoothHorizontal16PlusxN_NEON<64, 16>; |
| |
| // 64x32 |
| dsp->intra_predictors[kTransformSize64x32][kIntraPredictorSmooth] = |
| Smooth16PlusxN_NEON<64, 32>; |
| dsp->intra_predictors[kTransformSize64x32][kIntraPredictorSmoothVertical] = |
| SmoothVertical16PlusxN_NEON<64, 32>; |
| dsp->intra_predictors[kTransformSize64x32][kIntraPredictorSmoothHorizontal] = |
| SmoothHorizontal16PlusxN_NEON<64, 32>; |
| |
| // 64x64 |
| dsp->intra_predictors[kTransformSize64x64][kIntraPredictorSmooth] = |
| Smooth16PlusxN_NEON<64, 64>; |
| dsp->intra_predictors[kTransformSize64x64][kIntraPredictorSmoothVertical] = |
| SmoothVertical16PlusxN_NEON<64, 64>; |
| dsp->intra_predictors[kTransformSize64x64][kIntraPredictorSmoothHorizontal] = |
| SmoothHorizontal16PlusxN_NEON<64, 64>; |
| } |
| |
| } // namespace |
| } // namespace low_bitdepth |
| |
| void IntraPredSmoothInit_NEON() { low_bitdepth::Init8bpp(); } |
| |
| } // namespace dsp |
| } // namespace libgav1 |
| |
| #else // !LIBGAV1_ENABLE_NEON |
| namespace libgav1 { |
| namespace dsp { |
| |
// LIBGAV1_ENABLE_NEON is off for this build, so there is nothing to register.
void IntraPredSmoothInit_NEON() {
  // Intentionally empty.
}
| |
| } // namespace dsp |
| } // namespace libgav1 |
| #endif // LIBGAV1_ENABLE_NEON |