| // Copyright 2019 The libgav1 Authors |
| // |
| // Licensed under the Apache License, Version 2.0 (the "License"); |
| // you may not use this file except in compliance with the License. |
| // You may obtain a copy of the License at |
| // |
| // http://www.apache.org/licenses/LICENSE-2.0 |
| // |
| // Unless required by applicable law or agreed to in writing, software |
| // distributed under the License is distributed on an "AS IS" BASIS, |
| // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| // See the License for the specific language governing permissions and |
| // limitations under the License. |
| |
| #include "src/dsp/cdef.h" |
| #include "src/utils/cpu.h" |
| |
| #if LIBGAV1_ENABLE_NEON |
| |
| #include <arm_neon.h> |
| |
| #include <algorithm> |
| #include <cassert> |
| #include <cstddef> |
| #include <cstdint> |
| #include <cstdlib> |
| |
| #include "src/dsp/arm/common_neon.h" |
| #include "src/dsp/constants.h" |
| #include "src/dsp/dsp.h" |
| #include "src/utils/common.h" |
| #include "src/utils/constants.h" |
| |
| namespace libgav1 { |
| namespace dsp { |
| namespace low_bitdepth { |
| namespace { |
| |
| #include "src/dsp/cdef.inc" |
| |
| // ---------------------------------------------------------------------------- |
| // Refer to CdefDirection_C(). |
| // |
| // int32_t partial[8][15] = {}; |
| // for (int i = 0; i < 8; ++i) { |
| // for (int j = 0; j < 8; ++j) { |
| // const int x = 1; |
| // partial[0][i + j] += x; |
| // partial[1][i + j / 2] += x; |
| // partial[2][i] += x; |
| // partial[3][3 + i - j / 2] += x; |
| // partial[4][7 + i - j] += x; |
| // partial[5][3 - i / 2 + j] += x; |
| // partial[6][j] += x; |
| // partial[7][i / 2 + j] += x; |
| // } |
| // } |
| // |
| // Using the code above, generate the position count for partial[8][15]. |
| // |
| // partial[0]: 1 2 3 4 5 6 7 8 7 6 5 4 3 2 1 |
| // partial[1]: 2 4 6 8 8 8 8 8 6 4 2 0 0 0 0 |
| // partial[2]: 8 8 8 8 8 8 8 8 0 0 0 0 0 0 0 |
| // partial[3]: 2 4 6 8 8 8 8 8 6 4 2 0 0 0 0 |
| // partial[4]: 1 2 3 4 5 6 7 8 7 6 5 4 3 2 1 |
| // partial[5]: 2 4 6 8 8 8 8 8 6 4 2 0 0 0 0 |
| // partial[6]: 8 8 8 8 8 8 8 8 0 0 0 0 0 0 0 |
| // partial[7]: 2 4 6 8 8 8 8 8 6 4 2 0 0 0 0 |
| // |
| // The SIMD code shifts the input horizontally, then adds vertically to get the |
| // correct partial value for the given position. |
| // ---------------------------------------------------------------------------- |
| |
| // ---------------------------------------------------------------------------- |
| // partial[0][i + j] += x; |
| // |
| // 00 01 02 03 04 05 06 07 00 00 00 00 00 00 00 |
| // 00 10 11 12 13 14 15 16 17 00 00 00 00 00 00 |
| // 00 00 20 21 22 23 24 25 26 27 00 00 00 00 00 |
| // 00 00 00 30 31 32 33 34 35 36 37 00 00 00 00 |
| // 00 00 00 00 40 41 42 43 44 45 46 47 00 00 00 |
| // 00 00 00 00 00 50 51 52 53 54 55 56 57 00 00 |
| // 00 00 00 00 00 00 60 61 62 63 64 65 66 67 00 |
| // 00 00 00 00 00 00 00 70 71 72 73 74 75 76 77 |
| // |
| // partial[4] is the same except the source is reversed. |
| LIBGAV1_ALWAYS_INLINE void AddPartial_D0_D4(uint8x8_t* v_src, |
| uint16x8_t* partial_lo, |
| uint16x8_t* partial_hi) { |
| const uint8x8_t v_zero = vdup_n_u8(0); |
| // 00 01 02 03 04 05 06 07 |
| // 00 10 11 12 13 14 15 16 |
| *partial_lo = vaddl_u8(v_src[0], vext_u8(v_zero, v_src[1], 7)); |
| |
| // 00 00 20 21 22 23 24 25 |
| *partial_lo = vaddw_u8(*partial_lo, vext_u8(v_zero, v_src[2], 6)); |
| // 17 00 00 00 00 00 00 00 |
| // 26 27 00 00 00 00 00 00 |
| *partial_hi = |
| vaddl_u8(vext_u8(v_src[1], v_zero, 7), vext_u8(v_src[2], v_zero, 6)); |
| |
| // 00 00 00 30 31 32 33 34 |
| *partial_lo = vaddw_u8(*partial_lo, vext_u8(v_zero, v_src[3], 5)); |
| // 35 36 37 00 00 00 00 00 |
| *partial_hi = vaddw_u8(*partial_hi, vext_u8(v_src[3], v_zero, 5)); |
| |
| // 00 00 00 00 40 41 42 43 |
| *partial_lo = vaddw_u8(*partial_lo, vext_u8(v_zero, v_src[4], 4)); |
| // 44 45 46 47 00 00 00 00 |
| *partial_hi = vaddw_u8(*partial_hi, vext_u8(v_src[4], v_zero, 4)); |
| |
| // 00 00 00 00 00 50 51 52 |
| *partial_lo = vaddw_u8(*partial_lo, vext_u8(v_zero, v_src[5], 3)); |
| // 53 54 55 56 57 00 00 00 |
| *partial_hi = vaddw_u8(*partial_hi, vext_u8(v_src[5], v_zero, 3)); |
| |
| // 00 00 00 00 00 00 60 61 |
| *partial_lo = vaddw_u8(*partial_lo, vext_u8(v_zero, v_src[6], 2)); |
| // 62 63 64 65 66 67 00 00 |
| *partial_hi = vaddw_u8(*partial_hi, vext_u8(v_src[6], v_zero, 2)); |
| |
| // 00 00 00 00 00 00 00 70 |
| *partial_lo = vaddw_u8(*partial_lo, vext_u8(v_zero, v_src[7], 1)); |
| // 71 72 73 74 75 76 77 00 |
| *partial_hi = vaddw_u8(*partial_hi, vext_u8(v_src[7], v_zero, 1)); |
| } |
| |
| // ---------------------------------------------------------------------------- |
| // partial[1][i + j / 2] += x; |
| // |
| // A0 = src[0] + src[1], A1 = src[2] + src[3], ... |
| // |
| // A0 A1 A2 A3 00 00 00 00 00 00 00 00 00 00 00 |
| // 00 B0 B1 B2 B3 00 00 00 00 00 00 00 00 00 00 |
| // 00 00 C0 C1 C2 C3 00 00 00 00 00 00 00 00 00 |
| // 00 00 00 D0 D1 D2 D3 00 00 00 00 00 00 00 00 |
| // 00 00 00 00 E0 E1 E2 E3 00 00 00 00 00 00 00 |
| // 00 00 00 00 00 F0 F1 F2 F3 00 00 00 00 00 00 |
| // 00 00 00 00 00 00 G0 G1 G2 G3 00 00 00 00 00 |
| // 00 00 00 00 00 00 00 H0 H1 H2 H3 00 00 00 00 |
| // |
| // partial[3] is the same except the source is reversed. |
| LIBGAV1_ALWAYS_INLINE void AddPartial_D1_D3(uint8x8_t* v_src, |
| uint16x8_t* partial_lo, |
| uint16x8_t* partial_hi) { |
| uint8x16_t v_d1_temp[8]; |
| const uint8x8_t v_zero = vdup_n_u8(0); |
| const uint8x16_t v_zero_16 = vdupq_n_u8(0); |
| |
| for (int i = 0; i < 8; ++i) { |
| v_d1_temp[i] = vcombine_u8(v_src[i], v_zero); |
| } |
| |
| *partial_lo = *partial_hi = vdupq_n_u16(0); |
| // A0 A1 A2 A3 00 00 00 00 |
| *partial_lo = vpadalq_u8(*partial_lo, v_d1_temp[0]); |
| |
| // 00 B0 B1 B2 B3 00 00 00 |
| *partial_lo = vpadalq_u8(*partial_lo, vextq_u8(v_zero_16, v_d1_temp[1], 14)); |
| |
| // 00 00 C0 C1 C2 C3 00 00 |
| *partial_lo = vpadalq_u8(*partial_lo, vextq_u8(v_zero_16, v_d1_temp[2], 12)); |
| // 00 00 00 D0 D1 D2 D3 00 |
| *partial_lo = vpadalq_u8(*partial_lo, vextq_u8(v_zero_16, v_d1_temp[3], 10)); |
| // 00 00 00 00 E0 E1 E2 E3 |
| *partial_lo = vpadalq_u8(*partial_lo, vextq_u8(v_zero_16, v_d1_temp[4], 8)); |
| |
| // 00 00 00 00 00 F0 F1 F2 |
| *partial_lo = vpadalq_u8(*partial_lo, vextq_u8(v_zero_16, v_d1_temp[5], 6)); |
| // F3 00 00 00 00 00 00 00 |
| *partial_hi = vpadalq_u8(*partial_hi, vextq_u8(v_d1_temp[5], v_zero_16, 6)); |
| |
| // 00 00 00 00 00 00 G0 G1 |
| *partial_lo = vpadalq_u8(*partial_lo, vextq_u8(v_zero_16, v_d1_temp[6], 4)); |
| // G2 G3 00 00 00 00 00 00 |
| *partial_hi = vpadalq_u8(*partial_hi, vextq_u8(v_d1_temp[6], v_zero_16, 4)); |
| |
| // 00 00 00 00 00 00 00 H0 |
| *partial_lo = vpadalq_u8(*partial_lo, vextq_u8(v_zero_16, v_d1_temp[7], 2)); |
| // H1 H2 H3 00 00 00 00 00 |
| *partial_hi = vpadalq_u8(*partial_hi, vextq_u8(v_d1_temp[7], v_zero_16, 2)); |
| } |
| |
| // ---------------------------------------------------------------------------- |
| // partial[7][i / 2 + j] += x; |
| // |
| // 00 01 02 03 04 05 06 07 00 00 00 00 00 00 00 |
| // 10 11 12 13 14 15 16 17 00 00 00 00 00 00 00 |
| // 00 20 21 22 23 24 25 26 27 00 00 00 00 00 00 |
| // 00 30 31 32 33 34 35 36 37 00 00 00 00 00 00 |
| // 00 00 40 41 42 43 44 45 46 47 00 00 00 00 00 |
| // 00 00 50 51 52 53 54 55 56 57 00 00 00 00 00 |
| // 00 00 00 60 61 62 63 64 65 66 67 00 00 00 00 |
| // 00 00 00 70 71 72 73 74 75 76 77 00 00 00 00 |
| // |
| // partial[5] is the same except the source is reversed. |
| LIBGAV1_ALWAYS_INLINE void AddPartial_D5_D7(uint8x8_t* v_src, |
| uint16x8_t* partial_lo, |
| uint16x8_t* partial_hi) { |
| const uint16x8_t v_zero = vdupq_n_u16(0); |
| uint16x8_t v_pair_add[4]; |
| // Add vertical source pairs. |
| v_pair_add[0] = vaddl_u8(v_src[0], v_src[1]); |
| v_pair_add[1] = vaddl_u8(v_src[2], v_src[3]); |
| v_pair_add[2] = vaddl_u8(v_src[4], v_src[5]); |
| v_pair_add[3] = vaddl_u8(v_src[6], v_src[7]); |
| |
| // 00 01 02 03 04 05 06 07 |
| // 10 11 12 13 14 15 16 17 |
| *partial_lo = v_pair_add[0]; |
| // 00 00 00 00 00 00 00 00 |
| // 00 00 00 00 00 00 00 00 |
| *partial_hi = vdupq_n_u16(0); |
| |
| // 00 20 21 22 23 24 25 26 |
| // 00 30 31 32 33 34 35 36 |
| *partial_lo = vaddq_u16(*partial_lo, vextq_u16(v_zero, v_pair_add[1], 7)); |
| // 27 00 00 00 00 00 00 00 |
| // 37 00 00 00 00 00 00 00 |
| *partial_hi = vaddq_u16(*partial_hi, vextq_u16(v_pair_add[1], v_zero, 7)); |
| |
| // 00 00 40 41 42 43 44 45 |
| // 00 00 50 51 52 53 54 55 |
| *partial_lo = vaddq_u16(*partial_lo, vextq_u16(v_zero, v_pair_add[2], 6)); |
| // 46 47 00 00 00 00 00 00 |
| // 56 57 00 00 00 00 00 00 |
| *partial_hi = vaddq_u16(*partial_hi, vextq_u16(v_pair_add[2], v_zero, 6)); |
| |
| // 00 00 00 60 61 62 63 64 |
| // 00 00 00 70 71 72 73 74 |
| *partial_lo = vaddq_u16(*partial_lo, vextq_u16(v_zero, v_pair_add[3], 5)); |
| // 65 66 67 00 00 00 00 00 |
| // 75 76 77 00 00 00 00 00 |
| *partial_hi = vaddq_u16(*partial_hi, vextq_u16(v_pair_add[3], v_zero, 5)); |
| } |
| |
| LIBGAV1_ALWAYS_INLINE void AddPartial(const void* const source, |
| ptrdiff_t stride, uint16x8_t* partial_lo, |
| uint16x8_t* partial_hi) { |
| const auto* src = static_cast<const uint8_t*>(source); |
| |
| // 8x8 input |
| // 00 01 02 03 04 05 06 07 |
| // 10 11 12 13 14 15 16 17 |
| // 20 21 22 23 24 25 26 27 |
| // 30 31 32 33 34 35 36 37 |
| // 40 41 42 43 44 45 46 47 |
| // 50 51 52 53 54 55 56 57 |
| // 60 61 62 63 64 65 66 67 |
| // 70 71 72 73 74 75 76 77 |
| uint8x8_t v_src[8]; |
| for (int i = 0; i < 8; ++i) { |
| v_src[i] = vld1_u8(src); |
| src += stride; |
| } |
| |
| // partial for direction 2 |
| // -------------------------------------------------------------------------- |
| // partial[2][i] += x; |
| // 00 10 20 30 40 50 60 70 00 00 00 00 00 00 00 00 |
| // 01 11 21 33 41 51 61 71 00 00 00 00 00 00 00 00 |
| // 02 12 22 33 42 52 62 72 00 00 00 00 00 00 00 00 |
| // 03 13 23 33 43 53 63 73 00 00 00 00 00 00 00 00 |
| // 04 14 24 34 44 54 64 74 00 00 00 00 00 00 00 00 |
| // 05 15 25 35 45 55 65 75 00 00 00 00 00 00 00 00 |
| // 06 16 26 36 46 56 66 76 00 00 00 00 00 00 00 00 |
| // 07 17 27 37 47 57 67 77 00 00 00 00 00 00 00 00 |
| partial_lo[2] = vsetq_lane_u16(SumVector(v_src[0]), partial_lo[2], 0); |
| partial_lo[2] = vsetq_lane_u16(SumVector(v_src[1]), partial_lo[2], 1); |
| partial_lo[2] = vsetq_lane_u16(SumVector(v_src[2]), partial_lo[2], 2); |
| partial_lo[2] = vsetq_lane_u16(SumVector(v_src[3]), partial_lo[2], 3); |
| partial_lo[2] = vsetq_lane_u16(SumVector(v_src[4]), partial_lo[2], 4); |
| partial_lo[2] = vsetq_lane_u16(SumVector(v_src[5]), partial_lo[2], 5); |
| partial_lo[2] = vsetq_lane_u16(SumVector(v_src[6]), partial_lo[2], 6); |
| partial_lo[2] = vsetq_lane_u16(SumVector(v_src[7]), partial_lo[2], 7); |
| |
| // partial for direction 6 |
| // -------------------------------------------------------------------------- |
| // partial[6][j] += x; |
| // 00 01 02 03 04 05 06 07 00 00 00 00 00 00 00 00 |
| // 10 11 12 13 14 15 16 17 00 00 00 00 00 00 00 00 |
| // 20 21 22 23 24 25 26 27 00 00 00 00 00 00 00 00 |
| // 30 31 32 33 34 35 36 37 00 00 00 00 00 00 00 00 |
| // 40 41 42 43 44 45 46 47 00 00 00 00 00 00 00 00 |
| // 50 51 52 53 54 55 56 57 00 00 00 00 00 00 00 00 |
| // 60 61 62 63 64 65 66 67 00 00 00 00 00 00 00 00 |
| // 70 71 72 73 74 75 76 77 00 00 00 00 00 00 00 00 |
| const uint8x8_t v_zero = vdup_n_u8(0); |
| partial_lo[6] = vaddl_u8(v_zero, v_src[0]); |
| for (int i = 1; i < 8; ++i) { |
| partial_lo[6] = vaddw_u8(partial_lo[6], v_src[i]); |
| } |
| |
| // partial for direction 0 |
| AddPartial_D0_D4(v_src, &partial_lo[0], &partial_hi[0]); |
| |
| // partial for direction 1 |
| AddPartial_D1_D3(v_src, &partial_lo[1], &partial_hi[1]); |
| |
| // partial for direction 7 |
| AddPartial_D5_D7(v_src, &partial_lo[7], &partial_hi[7]); |
| |
| uint8x8_t v_src_reverse[8]; |
| for (int i = 0; i < 8; ++i) { |
| v_src_reverse[i] = vrev64_u8(v_src[i]); |
| } |
| |
| // partial for direction 4 |
| AddPartial_D0_D4(v_src_reverse, &partial_lo[4], &partial_hi[4]); |
| |
| // partial for direction 3 |
| AddPartial_D1_D3(v_src_reverse, &partial_lo[3], &partial_hi[3]); |
| |
| // partial for direction 5 |
| AddPartial_D5_D7(v_src_reverse, &partial_lo[5], &partial_hi[5]); |
| } |
| |
| uint32x4_t Square(uint16x4_t a) { return vmull_u16(a, a); } |
| |
| uint32x4_t SquareAccumulate(uint32x4_t a, uint16x4_t b) { |
| return vmlal_u16(a, b, b); |
| } |
| |
| // |cost[0]| and |cost[4]| square the input and sum with the corresponding |
| // element from the other end of the vector: |
| // |kCdefDivisionTable[]| element: |
| // cost[0] += (Square(partial[0][i]) + Square(partial[0][14 - i])) * |
| // kCdefDivisionTable[i + 1]; |
| // cost[0] += Square(partial[0][7]) * kCdefDivisionTable[8]; |
| // Because everything is being summed into a single value the distributive |
| // property allows us to mirror the division table and accumulate once. |
| uint32_t Cost0Or4(const uint16x8_t a, const uint16x8_t b, |
| const uint32x4_t division_table[4]) { |
| uint32x4_t c = vmulq_u32(Square(vget_low_u16(a)), division_table[0]); |
| c = vmlaq_u32(c, Square(vget_high_u16(a)), division_table[1]); |
| c = vmlaq_u32(c, Square(vget_low_u16(b)), division_table[2]); |
| c = vmlaq_u32(c, Square(vget_high_u16(b)), division_table[3]); |
| return SumVector(c); |
| } |
| |
| // |cost[2]| and |cost[6]| square the input and accumulate: |
| // cost[2] += Square(partial[2][i]) |
| uint32_t SquareAccumulate(const uint16x8_t a) { |
| uint32x4_t c = Square(vget_low_u16(a)); |
| c = SquareAccumulate(c, vget_high_u16(a)); |
| c = vmulq_n_u32(c, kCdefDivisionTable[7]); |
| return SumVector(c); |
| } |
| |
| uint32_t CostOdd(const uint16x8_t a, const uint16x8_t b, const uint32x4_t mask, |
| const uint32x4_t division_table[2]) { |
| // Remove elements 0-2. |
| uint32x4_t c = vandq_u32(mask, Square(vget_low_u16(a))); |
| c = vaddq_u32(c, Square(vget_high_u16(a))); |
| c = vmulq_n_u32(c, kCdefDivisionTable[7]); |
| |
| c = vmlaq_u32(c, Square(vget_low_u16(a)), division_table[0]); |
| c = vmlaq_u32(c, Square(vget_low_u16(b)), division_table[1]); |
| return SumVector(c); |
| } |
| |
| void CdefDirection_NEON(const void* const source, ptrdiff_t stride, |
| int* const direction, int* const variance) { |
| assert(direction != nullptr); |
| assert(variance != nullptr); |
| const auto* src = static_cast<const uint8_t*>(source); |
| uint32_t cost[8]; |
| uint16x8_t partial_lo[8], partial_hi[8]; |
| |
| AddPartial(src, stride, partial_lo, partial_hi); |
| |
| cost[2] = SquareAccumulate(partial_lo[2]); |
| cost[6] = SquareAccumulate(partial_lo[6]); |
| |
| const uint32x4_t division_table[4] = { |
| vld1q_u32(kCdefDivisionTable), vld1q_u32(kCdefDivisionTable + 4), |
| vld1q_u32(kCdefDivisionTable + 8), vld1q_u32(kCdefDivisionTable + 12)}; |
| |
| cost[0] = Cost0Or4(partial_lo[0], partial_hi[0], division_table); |
| cost[4] = Cost0Or4(partial_lo[4], partial_hi[4], division_table); |
| |
| const uint32x4_t division_table_odd[2] = { |
| vld1q_u32(kCdefDivisionTableOdd), vld1q_u32(kCdefDivisionTableOdd + 4)}; |
| |
| const uint32x4_t element_3_mask = {0, 0, 0, static_cast<uint32_t>(-1)}; |
| |
| cost[1] = |
| CostOdd(partial_lo[1], partial_hi[1], element_3_mask, division_table_odd); |
| cost[3] = |
| CostOdd(partial_lo[3], partial_hi[3], element_3_mask, division_table_odd); |
| cost[5] = |
| CostOdd(partial_lo[5], partial_hi[5], element_3_mask, division_table_odd); |
| cost[7] = |
| CostOdd(partial_lo[7], partial_hi[7], element_3_mask, division_table_odd); |
| |
| uint32_t best_cost = 0; |
| *direction = 0; |
| for (int i = 0; i < 8; ++i) { |
| if (cost[i] > best_cost) { |
| best_cost = cost[i]; |
| *direction = i; |
| } |
| } |
| *variance = (best_cost - cost[(*direction + 4) & 7]) >> 10; |
| } |
| |
| // ------------------------------------------------------------------------- |
| // CdefFilter |
| |
| // Load 4 vectors based on the given |direction|. |
| void LoadDirection(const uint16_t* const src, const ptrdiff_t stride, |
| uint16x8_t* output, const int direction) { |
| // Each |direction| describes a different set of source values. Expand this |
| // set by negating each set. For |direction| == 0 this gives a diagonal line |
| // from top right to bottom left. The first value is y, the second x. Negative |
| // y values move up. |
| // a b c d |
| // {-1, 1}, {1, -1}, {-2, 2}, {2, -2} |
| // c |
| // a |
| // 0 |
| // b |
| // d |
| const int y_0 = kCdefDirections[direction][0][0]; |
| const int x_0 = kCdefDirections[direction][0][1]; |
| const int y_1 = kCdefDirections[direction][1][0]; |
| const int x_1 = kCdefDirections[direction][1][1]; |
| output[0] = vld1q_u16(src + y_0 * stride + x_0); |
| output[1] = vld1q_u16(src - y_0 * stride - x_0); |
| output[2] = vld1q_u16(src + y_1 * stride + x_1); |
| output[3] = vld1q_u16(src - y_1 * stride - x_1); |
| } |
| |
| // Load 4 vectors based on the given |direction|. Use when |block_width| == 4 to |
| // do 2 rows at a time. |
| void LoadDirection4(const uint16_t* const src, const ptrdiff_t stride, |
| uint16x8_t* output, const int direction) { |
| const int y_0 = kCdefDirections[direction][0][0]; |
| const int x_0 = kCdefDirections[direction][0][1]; |
| const int y_1 = kCdefDirections[direction][1][0]; |
| const int x_1 = kCdefDirections[direction][1][1]; |
| output[0] = vcombine_u16(vld1_u16(src + y_0 * stride + x_0), |
| vld1_u16(src + y_0 * stride + stride + x_0)); |
| output[1] = vcombine_u16(vld1_u16(src - y_0 * stride - x_0), |
| vld1_u16(src - y_0 * stride + stride - x_0)); |
| output[2] = vcombine_u16(vld1_u16(src + y_1 * stride + x_1), |
| vld1_u16(src + y_1 * stride + stride + x_1)); |
| output[3] = vcombine_u16(vld1_u16(src - y_1 * stride - x_1), |
| vld1_u16(src - y_1 * stride + stride - x_1)); |
| } |
| |
| int16x8_t Constrain(const uint16x8_t pixel, const uint16x8_t reference, |
| const uint16x8_t threshold, const int16x8_t damping) { |
| // If reference > pixel, the difference will be negative, so covert to 0 or |
| // -1. |
| const uint16x8_t sign = vcgtq_u16(reference, pixel); |
| const uint16x8_t abs_diff = vabdq_u16(pixel, reference); |
| const uint16x8_t shifted_diff = vshlq_u16(abs_diff, damping); |
| // For bitdepth == 8, the threshold range is [0, 15] and the damping range is |
| // [3, 6]. If pixel == kCdefLargeValue(0x4000), shifted_diff will always be |
| // larger than threshold. Subtract using saturation will return 0 when pixel |
| // == kCdefLargeValue. |
| static_assert(kCdefLargeValue == 0x4000, "Invalid kCdefLargeValue"); |
| const uint16x8_t thresh_minus_shifted_diff = |
| vqsubq_u16(threshold, shifted_diff); |
| const uint16x8_t clamp_abs_diff = |
| vminq_u16(thresh_minus_shifted_diff, abs_diff); |
| // Restore the sign. |
| return vreinterpretq_s16_u16( |
| vsubq_u16(veorq_u16(clamp_abs_diff, sign), sign)); |
| } |
| |
| template <int width, bool enable_primary = true, bool enable_secondary = true> |
| void CdefFilter_NEON(const uint16_t* src, const ptrdiff_t src_stride, |
| const int height, const int primary_strength, |
| const int secondary_strength, const int damping, |
| const int direction, void* dest, |
| const ptrdiff_t dst_stride) { |
| static_assert(width == 8 || width == 4, ""); |
| static_assert(enable_primary || enable_secondary, ""); |
| constexpr bool clipping_required = enable_primary && enable_secondary; |
| auto* dst = static_cast<uint8_t*>(dest); |
| const uint16x8_t cdef_large_value_mask = |
| vdupq_n_u16(static_cast<uint16_t>(~kCdefLargeValue)); |
| const uint16x8_t primary_threshold = vdupq_n_u16(primary_strength); |
| const uint16x8_t secondary_threshold = vdupq_n_u16(secondary_strength); |
| |
| int16x8_t primary_damping_shift, secondary_damping_shift; |
| |
| // FloorLog2() requires input to be > 0. |
| // 8-bit damping range: Y: [3, 6], UV: [2, 5]. |
| if (enable_primary) { |
| // primary_strength: [0, 15] -> FloorLog2: [0, 3] so a clamp is necessary |
| // for UV filtering. |
| primary_damping_shift = |
| vdupq_n_s16(-std::max(0, damping - FloorLog2(primary_strength))); |
| } |
| if (enable_secondary) { |
| // secondary_strength: [0, 4] -> FloorLog2: [0, 2] so no clamp to 0 is |
| // necessary. |
| assert(damping - FloorLog2(secondary_strength) >= 0); |
| secondary_damping_shift = |
| vdupq_n_s16(-(damping - FloorLog2(secondary_strength))); |
| } |
| |
| const int primary_tap_0 = kCdefPrimaryTaps[primary_strength & 1][0]; |
| const int primary_tap_1 = kCdefPrimaryTaps[primary_strength & 1][1]; |
| |
| int y = height; |
| do { |
| uint16x8_t pixel; |
| if (width == 8) { |
| pixel = vld1q_u16(src); |
| } else { |
| pixel = vcombine_u16(vld1_u16(src), vld1_u16(src + src_stride)); |
| } |
| |
| uint16x8_t min = pixel; |
| uint16x8_t max = pixel; |
| int16x8_t sum; |
| |
| if (enable_primary) { |
| // Primary |direction|. |
| uint16x8_t primary_val[4]; |
| if (width == 8) { |
| LoadDirection(src, src_stride, primary_val, direction); |
| } else { |
| LoadDirection4(src, src_stride, primary_val, direction); |
| } |
| |
| if (clipping_required) { |
| min = vminq_u16(min, primary_val[0]); |
| min = vminq_u16(min, primary_val[1]); |
| min = vminq_u16(min, primary_val[2]); |
| min = vminq_u16(min, primary_val[3]); |
| |
| // The source is 16 bits, however, we only really care about the lower |
| // 8 bits. The upper 8 bits contain the "large" flag. After the final |
| // primary max has been calculated, zero out the upper 8 bits. Use this |
| // to find the "16 bit" max. |
| const uint8x16_t max_p01 = |
| vmaxq_u8(vreinterpretq_u8_u16(primary_val[0]), |
| vreinterpretq_u8_u16(primary_val[1])); |
| const uint8x16_t max_p23 = |
| vmaxq_u8(vreinterpretq_u8_u16(primary_val[2]), |
| vreinterpretq_u8_u16(primary_val[3])); |
| const uint16x8_t max_p = |
| vreinterpretq_u16_u8(vmaxq_u8(max_p01, max_p23)); |
| max = vmaxq_u16(max, vandq_u16(max_p, cdef_large_value_mask)); |
| } |
| |
| sum = Constrain(primary_val[0], pixel, primary_threshold, |
| primary_damping_shift); |
| sum = vmulq_n_s16(sum, primary_tap_0); |
| sum = vmlaq_n_s16(sum, |
| Constrain(primary_val[1], pixel, primary_threshold, |
| primary_damping_shift), |
| primary_tap_0); |
| sum = vmlaq_n_s16(sum, |
| Constrain(primary_val[2], pixel, primary_threshold, |
| primary_damping_shift), |
| primary_tap_1); |
| sum = vmlaq_n_s16(sum, |
| Constrain(primary_val[3], pixel, primary_threshold, |
| primary_damping_shift), |
| primary_tap_1); |
| } else { |
| sum = vdupq_n_s16(0); |
| } |
| |
| if (enable_secondary) { |
| // Secondary |direction| values (+/- 2). Clamp |direction|. |
| uint16x8_t secondary_val[8]; |
| if (width == 8) { |
| LoadDirection(src, src_stride, secondary_val, direction + 2); |
| LoadDirection(src, src_stride, secondary_val + 4, direction - 2); |
| } else { |
| LoadDirection4(src, src_stride, secondary_val, direction + 2); |
| LoadDirection4(src, src_stride, secondary_val + 4, direction - 2); |
| } |
| |
| if (clipping_required) { |
| min = vminq_u16(min, secondary_val[0]); |
| min = vminq_u16(min, secondary_val[1]); |
| min = vminq_u16(min, secondary_val[2]); |
| min = vminq_u16(min, secondary_val[3]); |
| min = vminq_u16(min, secondary_val[4]); |
| min = vminq_u16(min, secondary_val[5]); |
| min = vminq_u16(min, secondary_val[6]); |
| min = vminq_u16(min, secondary_val[7]); |
| |
| const uint8x16_t max_s01 = |
| vmaxq_u8(vreinterpretq_u8_u16(secondary_val[0]), |
| vreinterpretq_u8_u16(secondary_val[1])); |
| const uint8x16_t max_s23 = |
| vmaxq_u8(vreinterpretq_u8_u16(secondary_val[2]), |
| vreinterpretq_u8_u16(secondary_val[3])); |
| const uint8x16_t max_s45 = |
| vmaxq_u8(vreinterpretq_u8_u16(secondary_val[4]), |
| vreinterpretq_u8_u16(secondary_val[5])); |
| const uint8x16_t max_s67 = |
| vmaxq_u8(vreinterpretq_u8_u16(secondary_val[6]), |
| vreinterpretq_u8_u16(secondary_val[7])); |
| const uint16x8_t max_s = vreinterpretq_u16_u8( |
| vmaxq_u8(vmaxq_u8(max_s01, max_s23), vmaxq_u8(max_s45, max_s67))); |
| max = vmaxq_u16(max, vandq_u16(max_s, cdef_large_value_mask)); |
| } |
| |
| sum = vmlaq_n_s16(sum, |
| Constrain(secondary_val[0], pixel, secondary_threshold, |
| secondary_damping_shift), |
| kCdefSecondaryTap0); |
| sum = vmlaq_n_s16(sum, |
| Constrain(secondary_val[1], pixel, secondary_threshold, |
| secondary_damping_shift), |
| kCdefSecondaryTap0); |
| sum = vmlaq_n_s16(sum, |
| Constrain(secondary_val[2], pixel, secondary_threshold, |
| secondary_damping_shift), |
| kCdefSecondaryTap1); |
| sum = vmlaq_n_s16(sum, |
| Constrain(secondary_val[3], pixel, secondary_threshold, |
| secondary_damping_shift), |
| kCdefSecondaryTap1); |
| sum = vmlaq_n_s16(sum, |
| Constrain(secondary_val[4], pixel, secondary_threshold, |
| secondary_damping_shift), |
| kCdefSecondaryTap0); |
| sum = vmlaq_n_s16(sum, |
| Constrain(secondary_val[5], pixel, secondary_threshold, |
| secondary_damping_shift), |
| kCdefSecondaryTap0); |
| sum = vmlaq_n_s16(sum, |
| Constrain(secondary_val[6], pixel, secondary_threshold, |
| secondary_damping_shift), |
| kCdefSecondaryTap1); |
| sum = vmlaq_n_s16(sum, |
| Constrain(secondary_val[7], pixel, secondary_threshold, |
| secondary_damping_shift), |
| kCdefSecondaryTap1); |
| } |
| // Clip3(pixel + ((8 + sum - (sum < 0)) >> 4), min, max)) |
| const int16x8_t sum_lt_0 = vshrq_n_s16(sum, 15); |
| sum = vaddq_s16(sum, sum_lt_0); |
| int16x8_t result = vrsraq_n_s16(vreinterpretq_s16_u16(pixel), sum, 4); |
| if (clipping_required) { |
| result = vminq_s16(result, vreinterpretq_s16_u16(max)); |
| result = vmaxq_s16(result, vreinterpretq_s16_u16(min)); |
| } |
| |
| const uint8x8_t dst_pixel = vqmovun_s16(result); |
| if (width == 8) { |
| src += src_stride; |
| vst1_u8(dst, dst_pixel); |
| dst += dst_stride; |
| --y; |
| } else { |
| src += src_stride << 1; |
| StoreLo4(dst, dst_pixel); |
| dst += dst_stride; |
| StoreHi4(dst, dst_pixel); |
| dst += dst_stride; |
| y -= 2; |
| } |
| } while (y != 0); |
| } |
| |
| void Init8bpp() { |
| Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8); |
| assert(dsp != nullptr); |
| dsp->cdef_direction = CdefDirection_NEON; |
| dsp->cdef_filters[0][0] = CdefFilter_NEON<4>; |
| dsp->cdef_filters[0][1] = |
| CdefFilter_NEON<4, /*enable_primary=*/true, /*enable_secondary=*/false>; |
| dsp->cdef_filters[0][2] = CdefFilter_NEON<4, /*enable_primary=*/false>; |
| dsp->cdef_filters[1][0] = CdefFilter_NEON<8>; |
| dsp->cdef_filters[1][1] = |
| CdefFilter_NEON<8, /*enable_primary=*/true, /*enable_secondary=*/false>; |
| dsp->cdef_filters[1][2] = CdefFilter_NEON<8, /*enable_primary=*/false>; |
| } |
| |
| } // namespace |
| } // namespace low_bitdepth |
| |
| void CdefInit_NEON() { low_bitdepth::Init8bpp(); } |
| |
| } // namespace dsp |
| } // namespace libgav1 |
| #else // !LIBGAV1_ENABLE_NEON |
| namespace libgav1 { |
| namespace dsp { |
| |
| void CdefInit_NEON() {} |
| |
| } // namespace dsp |
| } // namespace libgav1 |
| #endif // LIBGAV1_ENABLE_NEON |