// Copyright 2019 The libgav1 Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "src/dsp/loop_filter.h"
#include "src/utils/cpu.h"
#if LIBGAV1_ENABLE_NEON
#include <arm_neon.h>
#include <cassert>
#include <cstddef>
#include <cstdint>
#include "src/dsp/arm/common_neon.h"
#include "src/dsp/constants.h"
#include "src/dsp/dsp.h"
namespace libgav1 {
namespace dsp {
namespace low_bitdepth {
namespace {
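// Layout convention used throughout this file: each 8-byte vector packs the
// four |p| samples of the edge being filtered in its low 32 bits and the four
// matching |q| samples in its high 32 bits (e.g. |p0q0| holds the four p0
// values followed by the four q0 values), so one instruction handles both
// sides of the edge. Transpose32() swaps the two halves, RightShift<32>()
// moves the high half down, and InterleaveLow32() broadcasts the low half
// (see common_neon.h).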
// (abs(p1 - p0) > thresh) || (abs(q1 - q0) > thresh)
inline uint8x8_t Hev(const uint8x8_t abd_p0p1_q0q1, const uint8_t thresh) {
const uint8x8_t a = vcgt_u8(abd_p0p1_q0q1, vdup_n_u8(thresh));
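// |a| has the p-side comparisons in its low 32 bits and the q-side
// comparisons in its high 32 bits; or them together so the low half holds the
// combined per-pixel decision.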
return vorr_u8(a, RightShift<32>(a));
}
// abs(p0 - q0) * 2 + abs(p1 - q1) / 2 <= outer_thresh
inline uint8x8_t OuterThreshold(const uint8x8_t p0q0, const uint8x8_t p1q1,
const uint8_t outer_thresh) {
const uint8x8x2_t a = Interleave32(p0q0, p1q1);
const uint8x8_t b = vabd_u8(a.val[0], a.val[1]);
const uint8x8_t p0q0_double = vqadd_u8(b, b);
const uint8x8_t p1q1_half = RightShift<32>(vshr_n_u8(b, 1));
const uint8x8_t c = vqadd_u8(p0q0_double, p1q1_half);
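// The saturating adds keep large differences pinned at 255 rather than
// wrapping, so they still exceed |outer_thresh|.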
return vcle_u8(c, vdup_n_u8(outer_thresh));
}
// abs(p1 - p0) <= inner_thresh && abs(q1 - q0) <= inner_thresh &&
// OuterThreshold()
inline uint8x8_t NeedsFilter4(const uint8x8_t abd_p0p1_q0q1,
const uint8x8_t p0q0, const uint8x8_t p1q1,
const uint8_t inner_thresh,
const uint8_t outer_thresh) {
const uint8x8_t a = vcle_u8(abd_p0p1_q0q1, vdup_n_u8(inner_thresh));
const uint8x8_t inner_mask = vand_u8(a, RightShift<32>(a));
const uint8x8_t outer_mask = OuterThreshold(p0q0, p1q1, outer_thresh);
return vand_u8(inner_mask, outer_mask);
}
inline void Filter4Masks(const uint8x8_t p0q0, const uint8x8_t p1q1,
const uint8_t hev_thresh, const uint8_t outer_thresh,
const uint8_t inner_thresh, uint8x8_t* const hev_mask,
uint8x8_t* const needs_filter4_mask) {
const uint8x8_t p0p1_q0q1 = vabd_u8(p0q0, p1q1);
// This includes cases where NeedsFilter4() is not true and so Filter2() will
// not be applied.
const uint8x8_t hev_tmp_mask = Hev(p0p1_q0q1, hev_thresh);
*needs_filter4_mask =
NeedsFilter4(p0p1_q0q1, p0q0, p1q1, inner_thresh, outer_thresh);
// Filter2() will only be applied if both NeedsFilter4() and Hev() are true.
*hev_mask = vand_u8(hev_tmp_mask, *needs_filter4_mask);
}
// Calculate Filter4() or Filter2() based on |hev_mask|.
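// In scalar terms, per pixel (a restatement of the steps below; Clip3()
// clamps to the signed 8-bit range and the final narrowing stores clamp the
// results to [0, 255]):
//   a  = 3 * (q0 - p0) + Clip3(p1 - q1)   // p1 - q1 only when |hev_mask| set
//   a1 = Clip3(a + 4) >> 3
//   a2 = Clip3(a + 3) >> 3
//   a3 = (a1 + 1) >> 1
//   p0' = p0 + a2    q0' = q0 - a1
//   p1' = p1 + a3    q1' = q1 - a3        // only selected when Hev() is false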
inline void Filter4(const uint8x8_t q0p1, const uint8x8_t p0q1,
const uint8x8_t hev_mask, uint8x8_t* const p1q1_result,
uint8x8_t* const p0q0_result) {
const int16x4_t zero = vdup_n_s16(0);
// a = 3 * (q0 - p0) + Clip3(p1 - q1, min_signed_val, max_signed_val);
const int16x8_t q0mp0_p1mq1 = vreinterpretq_s16_u16(vsubl_u8(q0p1, p0q1));
const int16x4_t q0mp0_3 = vmul_n_s16(vget_low_s16(q0mp0_p1mq1), 3);
// If this is for Filter2() then include |p1mq1|. Otherwise zero it.
const int16x4_t p1mq1 = vget_high_s16(q0mp0_p1mq1);
const int8x8_t p1mq1_saturated = vqmovn_s16(vcombine_s16(p1mq1, zero));
const int8x8_t hev_option =
vand_s8(vreinterpret_s8_u8(hev_mask), p1mq1_saturated);
const int16x4_t a =
vget_low_s16(vaddw_s8(vcombine_s16(q0mp0_3, zero), hev_option));
// We cannot shift with rounding because the clamp comes *before* the shift.
// a1 = Clip3(a + 4, min_signed_val, max_signed_val) >> 3;
// a2 = Clip3(a + 3, min_signed_val, max_signed_val) >> 3;
const int16x4_t plus_four = vadd_s16(a, vdup_n_s16(4));
const int16x4_t plus_three = vadd_s16(a, vdup_n_s16(3));
const int8x8_t a2_a1 =
vshr_n_s8(vqmovn_s16(vcombine_s16(plus_three, plus_four)), 3);
// a3 is in the high 4 values.
// a3 = (a1 + 1) >> 1;
const int8x8_t a3 = vrshr_n_s8(a2_a1, 1);
const int16x8_t p0q1_l = vreinterpretq_s16_u16(vmovl_u8(p0q1));
const int16x8_t q0p1_l = vreinterpretq_s16_u16(vmovl_u8(q0p1));
const int16x8_t p1q1_l =
vcombine_s16(vget_high_s16(q0p1_l), vget_high_s16(p0q1_l));
const int8x8_t a3_ma3 = InterleaveHigh32(a3, vneg_s8(a3));
const int16x8_t p1q1_a3 = vaddw_s8(p1q1_l, a3_ma3);
const int16x8_t p0q0_l =
vcombine_s16(vget_low_s16(p0q1_l), vget_low_s16(q0p1_l));
// Need to shift the second term or we end up with a2_ma2.
const int8x8_t a2_ma1 =
InterleaveLow32(a2_a1, RightShift<32>(vneg_s8(a2_a1)));
const int16x8_t p0q0_a = vaddw_s8(p0q0_l, a2_ma1);
*p1q1_result = vqmovun_s16(p1q1_a3);
*p0q0_result = vqmovun_s16(p0q0_a);
}
void Horizontal4_NEON(void* const dest, const ptrdiff_t stride,
const int outer_thresh, const int inner_thresh,
const int hev_thresh) {
uint8_t* dst = static_cast<uint8_t*>(dest);
const uint8x8_t p1_v = Load4(dst - 2 * stride);
const uint8x8_t p0_v = Load4(dst - stride);
const uint8x8_t p0q0 = Load4<1>(dst, p0_v);
const uint8x8_t p1q1 = Load4<1>(dst + stride, p1_v);
uint8x8_t hev_mask;
uint8x8_t needs_filter4_mask;
Filter4Masks(p0q0, p1q1, hev_thresh, outer_thresh, inner_thresh, &hev_mask,
&needs_filter4_mask);
// Copy the masks to the high bits for packed comparisons later.
hev_mask = InterleaveLow32(hev_mask, hev_mask);
needs_filter4_mask = InterleaveLow32(needs_filter4_mask, needs_filter4_mask);
#if defined(__aarch64__)
// This provides a good speedup for the unit test. Not sure how applicable it
// is to valid streams though.
// Consider doing this on armv7 if there is a quick way to check if a vector
// is zero.
if (vaddv_u8(needs_filter4_mask) == 0) {
// None of the values will be filtered.
return;
}
#endif // defined(__aarch64__)
uint8x8_t f_p1q1;
uint8x8_t f_p0q0;
const uint8x8x2_t q0p1xp0q1 = Interleave32(Transpose32(p0q0), p1q1);
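// |q0p1xp0q1.val[0]| holds q0 in its low half and p1 in its high half;
// |val[1]| holds p0 and q1. This ordering lets Filter4() form q0 - p0 and
// p1 - q1 with a single vsubl_u8().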
Filter4(q0p1xp0q1.val[0], q0p1xp0q1.val[1], hev_mask, &f_p1q1, &f_p0q0);
// Already integrated the Hev mask when calculating the filtered values.
const uint8x8_t p0q0_output = vbsl_u8(needs_filter4_mask, f_p0q0, p0q0);
// p1/q1 are left unmodified when Hev() is true (the Filter2() case). The xor
// works because |hev_mask| was and'd with |needs_filter4_mask| previously.
const uint8x8_t p1q1_mask = veor_u8(hev_mask, needs_filter4_mask);
const uint8x8_t p1q1_output = vbsl_u8(p1q1_mask, f_p1q1, p1q1);
StoreLo4(dst - 2 * stride, p1q1_output);
StoreLo4(dst - stride, p0q0_output);
StoreHi4(dst, p0q0_output);
StoreHi4(dst + stride, p1q1_output);
}
void Vertical4_NEON(void* const dest, const ptrdiff_t stride,
const int outer_thresh, const int inner_thresh,
const int hev_thresh) {
uint8_t* dst = static_cast<uint8_t*>(dest);
// Move |dst| to the left side of the filter window.
dst -= 2;
// |p1q0| and |p0q1| are named for the values they will contain after the
// transpose.
const uint8x8_t row0 = Load4(dst);
uint8x8_t p1q0 = Load4<1>(dst + stride, row0);
const uint8x8_t row2 = Load4(dst + 2 * stride);
uint8x8_t p0q1 = Load4<1>(dst + 3 * stride, row2);
Transpose4x4(&p1q0, &p0q1);
// Rearrange.
const uint8x8x2_t p1q1xq0p0 = Interleave32(p1q0, Transpose32(p0q1));
const uint8x8x2_t p1q1xp0q0 = {p1q1xq0p0.val[0],
Transpose32(p1q1xq0p0.val[1])};
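// |p1q1xp0q0.val[0]| now holds p1/q1 and |val[1]| holds p0/q0 in the same
// packed layout used by the horizontal path.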
uint8x8_t hev_mask;
uint8x8_t needs_filter4_mask;
Filter4Masks(p1q1xp0q0.val[1], p1q1xp0q0.val[0], hev_thresh, outer_thresh,
inner_thresh, &hev_mask, &needs_filter4_mask);
// Copy the masks to the high bits for packed comparisons later.
hev_mask = InterleaveLow32(hev_mask, hev_mask);
needs_filter4_mask = InterleaveLow32(needs_filter4_mask, needs_filter4_mask);
#if defined(__aarch64__)
// This provides a good speedup for the unit test. Not sure how applicable it
// is to valid streams though.
// Consider doing this on armv7 if there is a quick way to check if a vector
// is zero.
if (vaddv_u8(needs_filter4_mask) == 0) {
// None of the values will be filtered.
return;
}
#endif // defined(__aarch64__)
uint8x8_t f_p1q1;
uint8x8_t f_p0q0;
Filter4(Transpose32(p1q0), p0q1, hev_mask, &f_p1q1, &f_p0q0);
// Already integrated the Hev mask when calculating the filtered values.
const uint8x8_t p0q0_output =
vbsl_u8(needs_filter4_mask, f_p0q0, p1q1xp0q0.val[1]);
// p1/q1 are left unmodified when Hev() is true (the Filter2() case). The xor
// works because |hev_mask| was and'd with |needs_filter4_mask| previously.
const uint8x8_t p1q1_mask = veor_u8(hev_mask, needs_filter4_mask);
const uint8x8_t p1q1_output = vbsl_u8(p1q1_mask, f_p1q1, p1q1xp0q0.val[0]);
// Put things back in order to reverse the transpose.
const uint8x8x2_t p1p0xq1q0 = Interleave32(p1q1_output, p0q0_output);
uint8x8_t output_0 = p1p0xq1q0.val[0],
output_1 = Transpose32(p1p0xq1q0.val[1]);
Transpose4x4(&output_0, &output_1);
StoreLo4(dst, output_0);
StoreLo4(dst + stride, output_1);
StoreHi4(dst + 2 * stride, output_0);
StoreHi4(dst + 3 * stride, output_1);
}
// abs(p1 - p0) <= flat_thresh && abs(q1 - q0) <= flat_thresh &&
// abs(p2 - p0) <= flat_thresh && abs(q2 - q0) <= flat_thresh
// |flat_thresh| == 1 for 8 bit decode.
inline uint8x8_t IsFlat3(const uint8x8_t abd_p0p1_q0q1,
const uint8x8_t abd_p0p2_q0q2) {
const uint8x8_t a = vmax_u8(abd_p0p1_q0q1, abd_p0p2_q0q2);
const uint8x8_t b = vcle_u8(a, vdup_n_u8(1));
return vand_u8(b, RightShift<32>(b));
}
// abs(p2 - p1) <= inner_thresh && abs(p1 - p0) <= inner_thresh &&
// abs(q1 - q0) <= inner_thresh && abs(q2 - q1) <= inner_thresh &&
// OuterThreshold()
inline uint8x8_t NeedsFilter6(const uint8x8_t abd_p0p1_q0q1,
const uint8x8_t abd_p1p2_q1q2,
const uint8x8_t p0q0, const uint8x8_t p1q1,
const uint8_t inner_thresh,
const uint8_t outer_thresh) {
const uint8x8_t a = vmax_u8(abd_p0p1_q0q1, abd_p1p2_q1q2);
const uint8x8_t b = vcle_u8(a, vdup_n_u8(inner_thresh));
const uint8x8_t inner_mask = vand_u8(b, RightShift<32>(b));
const uint8x8_t outer_mask = OuterThreshold(p0q0, p1q1, outer_thresh);
return vand_u8(inner_mask, outer_mask);
}
inline void Filter6Masks(const uint8x8_t p2q2, const uint8x8_t p1q1,
const uint8x8_t p0q0, const uint8_t hev_thresh,
const uint8_t outer_thresh, const uint8_t inner_thresh,
uint8x8_t* const needs_filter6_mask,
uint8x8_t* const is_flat3_mask,
uint8x8_t* const hev_mask) {
const uint8x8_t p0p1_q0q1 = vabd_u8(p0q0, p1q1);
*hev_mask = Hev(p0p1_q0q1, hev_thresh);
*is_flat3_mask = IsFlat3(p0p1_q0q1, vabd_u8(p0q0, p2q2));
*needs_filter6_mask = NeedsFilter6(p0p1_q0q1, vabd_u8(p1q1, p2q2), p0q0, p1q1,
inner_thresh, outer_thresh);
}
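// Filter6() computes the 6-tap outputs for both sides at once. In scalar
// terms:
//   p1' = (3 * p2 + 2 * p1 + 2 * p0 + q0 + 4) >> 3
//   p0' = (p2 + 2 * p1 + 2 * p0 + 2 * q0 + q1 + 4) >> 3
// with the mirrored expressions for q1'/q0'. The rounding shift by 3 supplies
// the + 4.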
inline void Filter6(const uint8x8_t p2q2, const uint8x8_t p1q1,
const uint8x8_t p0q0, uint8x8_t* const p1q1_output,
uint8x8_t* const p0q0_output) {
// Sum p1 and q1 output from opposite directions
// p1 = (3 * p2) + (2 * p1) + (2 * p0) + q0
// ^^^^^^^^
// q1 = p0 + (2 * q0) + (2 * q1) + (3 * q2)
// ^^^^^^^^
const uint16x8_t p2q2_double = vaddl_u8(p2q2, p2q2);
uint16x8_t sum = vaddw_u8(p2q2_double, p2q2);
// p1 = (3 * p2) + (2 * p1) + (2 * p0) + q0
// ^^^^^^^^
// q1 = p0 + (2 * q0) + (2 * q1) + (3 * q2)
// ^^^^^^^^
sum = vaddq_u16(vaddl_u8(p1q1, p1q1), sum);
// p1 = (3 * p2) + (2 * p1) + (2 * p0) + q0
// ^^^^^^^^
// q1 = p0 + (2 * q0) + (2 * q1) + (3 * q2)
// ^^^^^^^^
sum = vaddq_u16(vaddl_u8(p0q0, p0q0), sum);
// p1 = (3 * p2) + (2 * p1) + (2 * p0) + q0
// ^^
// q1 = p0 + (2 * q0) + (2 * q1) + (3 * q2)
// ^^
const uint8x8_t q0p0 = Transpose32(p0q0);
sum = vaddw_u8(sum, q0p0);
*p1q1_output = vrshrn_n_u16(sum, 3);
// Convert to p0 and q0 output:
// p0 = p1 - (2 * p2) + q0 + q1
// q0 = q1 - (2 * q2) + p0 + p1
sum = vsubq_u16(sum, p2q2_double);
const uint8x8_t q1p1 = Transpose32(p1q1);
sum = vaddq_u16(vaddl_u8(q0p0, q1p1), sum);
*p0q0_output = vrshrn_n_u16(sum, 3);
}
void Horizontal6_NEON(void* const dest, const ptrdiff_t stride,
const int outer_thresh, const int inner_thresh,
const int hev_thresh) {
auto* dst = static_cast<uint8_t*>(dest);
const uint8x8_t p2_v = Load4(dst - 3 * stride);
const uint8x8_t p1_v = Load4(dst - 2 * stride);
const uint8x8_t p0_v = Load4(dst - stride);
const uint8x8_t p0q0 = Load4<1>(dst, p0_v);
const uint8x8_t p1q1 = Load4<1>(dst + stride, p1_v);
const uint8x8_t p2q2 = Load4<1>(dst + 2 * stride, p2_v);
uint8x8_t needs_filter6_mask, is_flat3_mask, hev_mask;
Filter6Masks(p2q2, p1q1, p0q0, hev_thresh, outer_thresh, inner_thresh,
&needs_filter6_mask, &is_flat3_mask, &hev_mask);
needs_filter6_mask = InterleaveLow32(needs_filter6_mask, needs_filter6_mask);
is_flat3_mask = InterleaveLow32(is_flat3_mask, is_flat3_mask);
hev_mask = InterleaveLow32(hev_mask, hev_mask);
#if defined(__aarch64__)
// This provides a good speedup for the unit test. Not sure how applicable it
// is to valid streams though.
// Consider doing this on armv7 if there is a quick way to check if a vector
// is zero.
if (vaddv_u8(needs_filter6_mask) == 0) {
// None of the values will be filtered.
return;
}
#endif // defined(__aarch64__)
uint8x8_t f_p1q1;
uint8x8_t f_p0q0;
const uint8x8x2_t q0p1xp0q1 = Interleave32(Transpose32(p0q0), p1q1);
Filter4(q0p1xp0q1.val[0], q0p1xp0q1.val[1], hev_mask, &f_p1q1, &f_p0q0);
// Reset the outer values if only a Hev() mask was required.
f_p1q1 = vbsl_u8(hev_mask, p1q1, f_p1q1);
uint8x8_t f6_p1q1, f6_p0q0;
#if defined(__aarch64__)
if (vaddv_u8(vand_u8(is_flat3_mask, needs_filter6_mask)) == 0) {
// Filter6() does not apply.
const uint8x8_t zero = vdup_n_u8(0);
f6_p1q1 = zero;
f6_p0q0 = zero;
} else {
#endif // defined(__aarch64__)
Filter6(p2q2, p1q1, p0q0, &f6_p1q1, &f6_p0q0);
#if defined(__aarch64__)
}
#endif // defined(__aarch64__)
uint8x8_t p1q1_output = vbsl_u8(is_flat3_mask, f6_p1q1, f_p1q1);
p1q1_output = vbsl_u8(needs_filter6_mask, p1q1_output, p1q1);
StoreLo4(dst - 2 * stride, p1q1_output);
StoreHi4(dst + stride, p1q1_output);
uint8x8_t p0q0_output = vbsl_u8(is_flat3_mask, f6_p0q0, f_p0q0);
p0q0_output = vbsl_u8(needs_filter6_mask, p0q0_output, p0q0);
StoreLo4(dst - stride, p0q0_output);
StoreHi4(dst, p0q0_output);
}
void Vertical6_NEON(void* const dest, const ptrdiff_t stride,
const int outer_thresh, const int inner_thresh,
const int hev_thresh) {
auto* dst = static_cast<uint8_t*>(dest);
// Move |dst| to the left side of the filter window.
dst -= 3;
// |p2q1|, |p1q2|, |p0xx| and |q0xx| are named for the values they will
// contain after the transpose.
// These over-read by 2 bytes. We only need 6.
uint8x8_t p2q1 = vld1_u8(dst);
uint8x8_t p1q2 = vld1_u8(dst + stride);
uint8x8_t p0xx = vld1_u8(dst + 2 * stride);
uint8x8_t q0xx = vld1_u8(dst + 3 * stride);
Transpose8x4(&p2q1, &p1q2, &p0xx, &q0xx);
const uint8x8x2_t p2q2xq1p1 = Interleave32(p2q1, Transpose32(p1q2));
const uint8x8_t p2q2 = p2q2xq1p1.val[0];
const uint8x8_t p1q1 = Transpose32(p2q2xq1p1.val[1]);
const uint8x8_t p0q0 = InterleaveLow32(p0xx, q0xx);
uint8x8_t needs_filter6_mask, is_flat3_mask, hev_mask;
Filter6Masks(p2q2, p1q1, p0q0, hev_thresh, outer_thresh, inner_thresh,
&needs_filter6_mask, &is_flat3_mask, &hev_mask);
needs_filter6_mask = InterleaveLow32(needs_filter6_mask, needs_filter6_mask);
is_flat3_mask = InterleaveLow32(is_flat3_mask, is_flat3_mask);
hev_mask = InterleaveLow32(hev_mask, hev_mask);
#if defined(__aarch64__)
// This provides a good speedup for the unit test. Not sure how applicable it
// is to valid streams though.
// Consider doing this on armv7 if there is a quick way to check if a vector
// is zero.
if (vaddv_u8(needs_filter6_mask) == 0) {
// None of the values will be filtered.
return;
}
#endif // defined(__aarch64__)
uint8x8_t f_p1q1;
uint8x8_t f_p0q0;
const uint8x8x2_t q0p1xp0q1 = Interleave32(Transpose32(p0q0), p1q1);
Filter4(q0p1xp0q1.val[0], q0p1xp0q1.val[1], hev_mask, &f_p1q1, &f_p0q0);
// Reset the outer values if only a Hev() mask was required.
f_p1q1 = vbsl_u8(hev_mask, p1q1, f_p1q1);
uint8x8_t f6_p1q1, f6_p0q0;
#if defined(__aarch64__)
if (vaddv_u8(vand_u8(is_flat3_mask, needs_filter6_mask)) == 0) {
// Filter6() does not apply.
const uint8x8_t zero = vdup_n_u8(0);
f6_p1q1 = zero;
f6_p0q0 = zero;
} else {
#endif // defined(__aarch64__)
Filter6(p2q2, p1q1, p0q0, &f6_p1q1, &f6_p0q0);
#if defined(__aarch64__)
}
#endif // defined(__aarch64__)
uint8x8_t p1q1_output = vbsl_u8(is_flat3_mask, f6_p1q1, f_p1q1);
p1q1_output = vbsl_u8(needs_filter6_mask, p1q1_output, p1q1);
uint8x8_t p0q0_output = vbsl_u8(is_flat3_mask, f6_p0q0, f_p0q0);
p0q0_output = vbsl_u8(needs_filter6_mask, p0q0_output, p0q0);
// The six tap filter only reads six input pixels, and the output is limited
// to p1-q1, so advance |dst| past the unmodified p2 column.
dst += 1;
// Put things back in order to reverse the transpose.
const uint8x8x2_t p1p0xq1q0 = Interleave32(p1q1_output, p0q0_output);
uint8x8_t output_0 = p1p0xq1q0.val[0];
uint8x8_t output_1 = Transpose32(p1p0xq1q0.val[1]);
Transpose4x4(&output_0, &output_1);
StoreLo4(dst, output_0);
StoreLo4(dst + stride, output_1);
StoreHi4(dst + 2 * stride, output_0);
StoreHi4(dst + 3 * stride, output_1);
}
// IsFlat4 uses N=1; the outer flatness check (|is_flat_outer4_mask|) reuses
// it with N=4.
// abs(p[N] - p0) <= flat_thresh && abs(q[N] - q0) <= flat_thresh &&
// abs(p[N+1] - p0) <= flat_thresh && abs(q[N+1] - q0) <= flat_thresh &&
// abs(p[N+2] - p0) <= flat_thresh && abs(q[N+2] - q0) <= flat_thresh
// |flat_thresh| == 1 for 8 bit decode.
inline uint8x8_t IsFlat4(const uint8x8_t abd_p0n0_q0n0,
const uint8x8_t abd_p0n1_q0n1,
const uint8x8_t abd_p0n2_q0n2) {
const uint8x8_t a = vmax_u8(abd_p0n0_q0n0, abd_p0n1_q0n1);
const uint8x8_t b = vmax_u8(a, abd_p0n2_q0n2);
const uint8x8_t c = vcle_u8(b, vdup_n_u8(1));
return vand_u8(c, RightShift<32>(c));
}
// abs(p3 - p2) <= inner_thresh && abs(p2 - p1) <= inner_thresh &&
// abs(p1 - p0) <= inner_thresh && abs(q1 - q0) <= inner_thresh &&
// abs(q2 - q1) <= inner_thresh && abs(q3 - q2) <= inner_thresh &&
// OuterThreshold()
inline uint8x8_t NeedsFilter8(const uint8x8_t abd_p0p1_q0q1,
const uint8x8_t abd_p1p2_q1q2,
const uint8x8_t abd_p2p3_q2q3,
const uint8x8_t p0q0, const uint8x8_t p1q1,
const uint8_t inner_thresh,
const uint8_t outer_thresh) {
const uint8x8_t a = vmax_u8(abd_p0p1_q0q1, abd_p1p2_q1q2);
const uint8x8_t b = vmax_u8(a, abd_p2p3_q2q3);
const uint8x8_t c = vcle_u8(b, vdup_n_u8(inner_thresh));
const uint8x8_t inner_mask = vand_u8(c, RightShift<32>(c));
const uint8x8_t outer_mask = OuterThreshold(p0q0, p1q1, outer_thresh);
return vand_u8(inner_mask, outer_mask);
}
inline void Filter8Masks(const uint8x8_t p3q3, const uint8x8_t p2q2,
const uint8x8_t p1q1, const uint8x8_t p0q0,
const uint8_t hev_thresh, const uint8_t outer_thresh,
const uint8_t inner_thresh,
uint8x8_t* const needs_filter8_mask,
uint8x8_t* const is_flat4_mask,
uint8x8_t* const hev_mask) {
const uint8x8_t p0p1_q0q1 = vabd_u8(p0q0, p1q1);
*hev_mask = Hev(p0p1_q0q1, hev_thresh);
*is_flat4_mask = IsFlat4(p0p1_q0q1, vabd_u8(p0q0, p2q2), vabd_u8(p0q0, p3q3));
*needs_filter8_mask =
NeedsFilter8(p0p1_q0q1, vabd_u8(p1q1, p2q2), vabd_u8(p2q2, p3q3), p0q0,
p1q1, inner_thresh, outer_thresh);
}
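// Filter8() computes the 8-tap outputs for both sides at once. In scalar
// terms:
//   p2' = (3 * p3 + 2 * p2 + p1 + p0 + q0 + 4) >> 3
//   p1' = (2 * p3 + p2 + 2 * p1 + p0 + q0 + q1 + 4) >> 3
//   p0' = (p3 + p2 + p1 + 2 * p0 + q0 + q1 + q2 + 4) >> 3
// with the mirrored expressions for the q side. The running sum built for
// p2'/q2' is reused for the narrower outputs by swapping a few terms.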
inline void Filter8(const uint8x8_t p3q3, const uint8x8_t p2q2,
const uint8x8_t p1q1, const uint8x8_t p0q0,
uint8x8_t* const p2q2_output, uint8x8_t* const p1q1_output,
uint8x8_t* const p0q0_output) {
// Sum p2 and q2 output from opposite directions
// p2 = (3 * p3) + (2 * p2) + p1 + p0 + q0
// ^^^^^^^^
// q2 = p0 + q0 + q1 + (2 * q2) + (3 * q3)
// ^^^^^^^^
uint16x8_t sum = vaddw_u8(vaddl_u8(p3q3, p3q3), p3q3);
// p2 = (3 * p3) + (2 * p2) + p1 + p0 + q0
// ^^^^^^^^
// q2 = p0 + q0 + q1 + (2 * q2) + (3 * q3)
// ^^^^^^^^
sum = vaddq_u16(vaddl_u8(p2q2, p2q2), sum);
// p2 = (3 * p3) + (2 * p2) + p1 + p0 + q0
// ^^^^^^^
// q2 = p0 + q0 + q1 + (2 * q2) + (3 * q3)
// ^^^^^^^
sum = vaddq_u16(vaddl_u8(p1q1, p0q0), sum);
// p2 = (3 * p3) + (2 * p2) + p1 + p0 + q0
// ^^
// q2 = p0 + q0 + q1 + (2 * q2) + (3 * q3)
// ^^
const uint8x8_t q0p0 = Transpose32(p0q0);
sum = vaddw_u8(sum, q0p0);
*p2q2_output = vrshrn_n_u16(sum, 3);
// Convert to p1 and q1 output:
// p1 = p2 - p3 - p2 + p1 + q1
// q1 = q2 - q3 - q2 + q1 + p1
sum = vsubq_u16(sum, vaddl_u8(p3q3, p2q2));
const uint8x8_t q1p1 = Transpose32(p1q1);
sum = vaddq_u16(vaddl_u8(p1q1, q1p1), sum);
*p1q1_output = vrshrn_n_u16(sum, 3);
// Convert to p0 and q0 output:
// p0 = p1 - p3 - p1 + p0 + q2
// q0 = q1 - q3 - q1 + q0 + p2
sum = vsubq_u16(sum, vaddl_u8(p3q3, p1q1));
const uint8x8_t q2p2 = Transpose32(p2q2);
sum = vaddq_u16(vaddl_u8(p0q0, q2p2), sum);
*p0q0_output = vrshrn_n_u16(sum, 3);
}
void Horizontal8_NEON(void* const dest, const ptrdiff_t stride,
const int outer_thresh, const int inner_thresh,
const int hev_thresh) {
auto* dst = static_cast<uint8_t*>(dest);
const uint8x8_t p3_v = Load4(dst - 4 * stride);
const uint8x8_t p2_v = Load4(dst - 3 * stride);
const uint8x8_t p1_v = Load4(dst - 2 * stride);
const uint8x8_t p0_v = Load4(dst - stride);
const uint8x8_t p0q0 = Load4<1>(dst, p0_v);
const uint8x8_t p1q1 = Load4<1>(dst + stride, p1_v);
const uint8x8_t p2q2 = Load4<1>(dst + 2 * stride, p2_v);
const uint8x8_t p3q3 = Load4<1>(dst + 3 * stride, p3_v);
uint8x8_t needs_filter8_mask, is_flat4_mask, hev_mask;
Filter8Masks(p3q3, p2q2, p1q1, p0q0, hev_thresh, outer_thresh, inner_thresh,
&needs_filter8_mask, &is_flat4_mask, &hev_mask);
needs_filter8_mask = InterleaveLow32(needs_filter8_mask, needs_filter8_mask);
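// Filter8() output is only selected where the filter applies at all, so fold
// |needs_filter8_mask| into |is_flat4_mask| before broadcasting it.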
is_flat4_mask = vand_u8(is_flat4_mask, needs_filter8_mask);
is_flat4_mask = InterleaveLow32(is_flat4_mask, is_flat4_mask);
hev_mask = InterleaveLow32(hev_mask, hev_mask);
#if defined(__aarch64__)
// This provides a good speedup for the unit test. Not sure how applicable it
// is to valid streams though.
// Consider doing this on armv7 if there is a quick way to check if a vector
// is zero.
if (vaddv_u8(needs_filter8_mask) == 0) {
// None of the values will be filtered.
return;
}
#endif // defined(__aarch64__)
uint8x8_t f_p1q1;
uint8x8_t f_p0q0;
const uint8x8x2_t q0p1xp0q1 = Interleave32(Transpose32(p0q0), p1q1);
Filter4(q0p1xp0q1.val[0], q0p1xp0q1.val[1], hev_mask, &f_p1q1, &f_p0q0);
// Reset the outer values if only a Hev() mask was required.
f_p1q1 = vbsl_u8(hev_mask, p1q1, f_p1q1);
uint8x8_t f8_p2q2, f8_p1q1, f8_p0q0;
#if defined(__aarch64__)
if (vaddv_u8(is_flat4_mask) == 0) {
// Filter8() does not apply.
const uint8x8_t zero = vdup_n_u8(0);
f8_p2q2 = zero;
f8_p1q1 = zero;
f8_p0q0 = zero;
} else {
#endif // defined(__aarch64__)
Filter8(p3q3, p2q2, p1q1, p0q0, &f8_p2q2, &f8_p1q1, &f8_p0q0);
const uint8x8_t p2q2_output = vbsl_u8(is_flat4_mask, f8_p2q2, p2q2);
StoreLo4(dst - 3 * stride, p2q2_output);
StoreHi4(dst + 2 * stride, p2q2_output);
#if defined(__aarch64__)
}
#endif // defined(__aarch64__)
uint8x8_t p1q1_output = vbsl_u8(is_flat4_mask, f8_p1q1, f_p1q1);
p1q1_output = vbsl_u8(needs_filter8_mask, p1q1_output, p1q1);
StoreLo4(dst - 2 * stride, p1q1_output);
StoreHi4(dst + stride, p1q1_output);
uint8x8_t p0q0_output = vbsl_u8(is_flat4_mask, f8_p0q0, f_p0q0);
p0q0_output = vbsl_u8(needs_filter8_mask, p0q0_output, p0q0);
StoreLo4(dst - stride, p0q0_output);
StoreHi4(dst, p0q0_output);
}
void Vertical8_NEON(void* const dest, const ptrdiff_t stride,
const int outer_thresh, const int inner_thresh,
const int hev_thresh) {
auto* dst = static_cast<uint8_t*>(dest);
// Move |dst| to the left side of the filter window.
dst -= 4;
// |p3q0|, |p2q1|, |p1q2| and |p0q3| are named for the values they will
// contain after the transpose.
uint8x8_t p3q0 = vld1_u8(dst);
uint8x8_t p2q1 = vld1_u8(dst + stride);
uint8x8_t p1q2 = vld1_u8(dst + 2 * stride);
uint8x8_t p0q3 = vld1_u8(dst + 3 * stride);
Transpose8x4(&p3q0, &p2q1, &p1q2, &p0q3);
const uint8x8x2_t p3q3xq0p0 = Interleave32(p3q0, Transpose32(p0q3));
const uint8x8_t p3q3 = p3q3xq0p0.val[0];
const uint8x8_t p0q0 = Transpose32(p3q3xq0p0.val[1]);
const uint8x8x2_t p2q2xq1p1 = Interleave32(p2q1, Transpose32(p1q2));
const uint8x8_t p2q2 = p2q2xq1p1.val[0];
const uint8x8_t p1q1 = Transpose32(p2q2xq1p1.val[1]);
uint8x8_t needs_filter8_mask, is_flat4_mask, hev_mask;
Filter8Masks(p3q3, p2q2, p1q1, p0q0, hev_thresh, outer_thresh, inner_thresh,
&needs_filter8_mask, &is_flat4_mask, &hev_mask);
needs_filter8_mask = InterleaveLow32(needs_filter8_mask, needs_filter8_mask);
is_flat4_mask = vand_u8(is_flat4_mask, needs_filter8_mask);
is_flat4_mask = InterleaveLow32(is_flat4_mask, is_flat4_mask);
hev_mask = InterleaveLow32(hev_mask, hev_mask);
#if defined(__aarch64__)
// This provides a good speedup for the unit test. Not sure how applicable it
// is to valid streams though.
// Consider doing this on armv7 if there is a quick way to check if a vector
// is zero.
if (vaddv_u8(needs_filter8_mask) == 0) {
// None of the values will be filtered.
return;
}
#endif // defined(__aarch64__)
uint8x8_t f_p1q1;
uint8x8_t f_p0q0;
const uint8x8x2_t q0p1xp0q1 = Interleave32(Transpose32(p0q0), p1q1);
Filter4(q0p1xp0q1.val[0], q0p1xp0q1.val[1], hev_mask, &f_p1q1, &f_p0q0);
// Reset the outer values if only a Hev() mask was required.
f_p1q1 = vbsl_u8(hev_mask, p1q1, f_p1q1);
uint8x8_t f8_p2q2, f8_p1q1, f8_p0q0;
#if defined(__aarch64__)
if (vaddv_u8(is_flat4_mask) == 0) {
// Filter8() does not apply.
const uint8x8_t zero = vdup_n_u8(0);
f8_p2q2 = zero;
f8_p1q1 = zero;
f8_p0q0 = zero;
} else {
#endif // defined(__aarch64__)
Filter8(p3q3, p2q2, p1q1, p0q0, &f8_p2q2, &f8_p1q1, &f8_p0q0);
#if defined(__aarch64__)
}
#endif // defined(__aarch64__)
// Always prepare and store p2/q2 because we need to transpose it anyway.
const uint8x8_t p2q2_output = vbsl_u8(is_flat4_mask, f8_p2q2, p2q2);
uint8x8_t p1q1_output = vbsl_u8(is_flat4_mask, f8_p1q1, f_p1q1);
p1q1_output = vbsl_u8(needs_filter8_mask, p1q1_output, p1q1);
uint8x8_t p0q0_output = vbsl_u8(is_flat4_mask, f8_p0q0, f_p0q0);
p0q0_output = vbsl_u8(needs_filter8_mask, p0q0_output, p0q0);
// Write out p3/q3 as well. There isn't a good way to write out 6 bytes.
// Variable names reflect the values before transposition.
const uint8x8x2_t p3q0xq3p0_output =
Interleave32(p3q3, Transpose32(p0q0_output));
uint8x8_t p3q0_output = p3q0xq3p0_output.val[0];
uint8x8_t p0q3_output = Transpose32(p3q0xq3p0_output.val[1]);
const uint8x8x2_t p2q1xq2p1_output =
Interleave32(p2q2_output, Transpose32(p1q1_output));
uint8x8_t p2q1_output = p2q1xq2p1_output.val[0];
uint8x8_t p1q2_output = Transpose32(p2q1xq2p1_output.val[1]);
Transpose8x4(&p3q0_output, &p2q1_output, &p1q2_output, &p0q3_output);
vst1_u8(dst, p3q0_output);
vst1_u8(dst + stride, p2q1_output);
vst1_u8(dst + 2 * stride, p1q2_output);
vst1_u8(dst + 3 * stride, p0q3_output);
}
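// Filter14() computes the six filtered outputs per side for the size 14 loop
// filter using a sliding sum: the widest output is built first and each later
// output subtracts two terms and adds two terms before the rounding shift by
// 4. In scalar terms the first and last outputs are:
//   p5' = (7 * p6 + 2 * p5 + 2 * p4 + p3 + p2 + p1 + p0 + q0 + 8) >> 4
//   p0' = (p6 + p5 + p4 + p3 + p2 + 2 * p1 + 2 * p0 + 2 * q0 + q1 + q2 + q3 +
//          q4 + q5 + 8) >> 4
// with the mirrored expressions for the q side.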
inline void Filter14(const uint8x8_t p6q6, const uint8x8_t p5q5,
const uint8x8_t p4q4, const uint8x8_t p3q3,
const uint8x8_t p2q2, const uint8x8_t p1q1,
const uint8x8_t p0q0, uint8x8_t* const p5q5_output,
uint8x8_t* const p4q4_output, uint8x8_t* const p3q3_output,
uint8x8_t* const p2q2_output, uint8x8_t* const p1q1_output,
uint8x8_t* const p0q0_output) {
// Sum p5 and q5 output from opposite directions
// p5 = (7 * p6) + (2 * p5) + (2 * p4) + p3 + p2 + p1 + p0 + q0
// ^^^^^^^^
// q5 = p0 + q0 + q1 + q2 + q3 + (2 * q4) + (2 * q5) + (7 * q6)
// ^^^^^^^^
uint16x8_t sum = vsubw_u8(vshll_n_u8(p6q6, 3), p6q6);
// p5 = (7 * p6) + (2 * p5) + (2 * p4) + p3 + p2 + p1 + p0 + q0
// ^^^^^^^^
// q5 = p0 + q0 + q1 + q2 + q3 + (2 * q4) + (2 * q5) + (7 * q6)
// ^^^^^^^^
sum = vaddq_u16(vaddl_u8(p5q5, p5q5), sum);
// p5 = (7 * p6) + (2 * p5) + (2 * p4) + p3 + p2 + p1 + p0 + q0
// ^^^^^^^^
// q5 = p0 + q0 + q1 + q2 + q3 + (2 * q4) + (2 * q5) + (7 * q6)
// ^^^^^^^^
sum = vaddq_u16(vaddl_u8(p4q4, p4q4), sum);
// p5 = (7 * p6) + (2 * p5) + (2 * p4) + p3 + p2 + p1 + p0 + q0
// ^^^^^^^
// q5 = p0 + q0 + q1 + q2 + q3 + (2 * q4) + (2 * q5) + (7 * q6)
// ^^^^^^^
sum = vaddq_u16(vaddl_u8(p3q3, p2q2), sum);
// p5 = (7 * p6) + (2 * p5) + (2 * p4) + p3 + p2 + p1 + p0 + q0
// ^^^^^^^
// q5 = p0 + q0 + q1 + q2 + q3 + (2 * q4) + (2 * q5) + (7 * q6)
// ^^^^^^^
sum = vaddq_u16(vaddl_u8(p1q1, p0q0), sum);
// p5 = (7 * p6) + (2 * p5) + (2 * p4) + p3 + p2 + p1 + p0 + q0
// ^^
// q5 = p0 + q0 + q1 + q2 + q3 + (2 * q4) + (2 * q5) + (7 * q6)
// ^^
const uint8x8_t q0p0 = Transpose32(p0q0);
sum = vaddw_u8(sum, q0p0);
*p5q5_output = vrshrn_n_u16(sum, 4);
// Convert to p4 and q4 output:
// p4 = p5 - (2 * p6) + p3 + q1
// q4 = q5 - (2 * q6) + q3 + p1
sum = vsubq_u16(sum, vaddl_u8(p6q6, p6q6));
const uint8x8_t q1p1 = Transpose32(p1q1);
sum = vaddq_u16(vaddl_u8(p3q3, q1p1), sum);
*p4q4_output = vrshrn_n_u16(sum, 4);
// Convert to p3 and q3 output:
// p3 = p4 - p6 - p5 + p2 + q2
// q3 = q4 - q6 - q5 + q2 + p2
sum = vsubq_u16(sum, vaddl_u8(p6q6, p5q5));
const uint8x8_t q2p2 = Transpose32(p2q2);
sum = vaddq_u16(vaddl_u8(p2q2, q2p2), sum);
*p3q3_output = vrshrn_n_u16(sum, 4);
// Convert to p2 and q2 output:
// p2 = p3 - p6 - p4 + p1 + q3
// q2 = q3 - q6 - q4 + q1 + p3
sum = vsubq_u16(sum, vaddl_u8(p6q6, p4q4));
const uint8x8_t q3p3 = Transpose32(p3q3);
sum = vaddq_u16(vaddl_u8(p1q1, q3p3), sum);
*p2q2_output = vrshrn_n_u16(sum, 4);
// Convert to p1 and q1 output:
// p1 = p2 - p6 - p3 + p0 + q4
// q1 = q2 - q6 - q3 + q0 + p4
sum = vsubq_u16(sum, vaddl_u8(p6q6, p3q3));
const uint8x8_t q4p4 = Transpose32(p4q4);
sum = vaddq_u16(vaddl_u8(p0q0, q4p4), sum);
*p1q1_output = vrshrn_n_u16(sum, 4);
// Convert to p0 and q0 output:
// p0 = p1 - p6 - p2 + q0 + q5
// q0 = q1 - q6 - q2 + p0 + p5
sum = vsubq_u16(sum, vaddl_u8(p6q6, p2q2));
const uint8x8_t q5p5 = Transpose32(p5q5);
sum = vaddq_u16(vaddl_u8(q0p0, q5p5), sum);
*p0q0_output = vrshrn_n_u16(sum, 4);
}
void Horizontal14_NEON(void* const dest, const ptrdiff_t stride,
const int outer_thresh, const int inner_thresh,
const int hev_thresh) {
auto* dst = static_cast<uint8_t*>(dest);
const uint8x8_t p6_v = Load4(dst - 7 * stride);
const uint8x8_t p5_v = Load4(dst - 6 * stride);
const uint8x8_t p4_v = Load4(dst - 5 * stride);
const uint8x8_t p3_v = Load4(dst - 4 * stride);
const uint8x8_t p2_v = Load4(dst - 3 * stride);
const uint8x8_t p1_v = Load4(dst - 2 * stride);
const uint8x8_t p0_v = Load4(dst - stride);
const uint8x8_t p0q0 = Load4<1>(dst, p0_v);
const uint8x8_t p1q1 = Load4<1>(dst + stride, p1_v);
const uint8x8_t p2q2 = Load4<1>(dst + 2 * stride, p2_v);
const uint8x8_t p3q3 = Load4<1>(dst + 3 * stride, p3_v);
const uint8x8_t p4q4 = Load4<1>(dst + 4 * stride, p4_v);
const uint8x8_t p5q5 = Load4<1>(dst + 5 * stride, p5_v);
const uint8x8_t p6q6 = Load4<1>(dst + 6 * stride, p6_v);
uint8x8_t needs_filter8_mask, is_flat4_mask, hev_mask;
Filter8Masks(p3q3, p2q2, p1q1, p0q0, hev_thresh, outer_thresh, inner_thresh,
&needs_filter8_mask, &is_flat4_mask, &hev_mask);
needs_filter8_mask = InterleaveLow32(needs_filter8_mask, needs_filter8_mask);
is_flat4_mask = vand_u8(is_flat4_mask, needs_filter8_mask);
is_flat4_mask = InterleaveLow32(is_flat4_mask, is_flat4_mask);
hev_mask = InterleaveLow32(hev_mask, hev_mask);
#if defined(__aarch64__)
// This provides a good speedup for the unit test. Not sure how applicable it
// is to valid streams though.
// Consider doing this on armv7 if there is a quick way to check if a vector
// is zero.
if (vaddv_u8(needs_filter8_mask) == 0) {
// None of the values will be filtered.
return;
}
#endif // defined(__aarch64__)
// Decide between Filter8() and Filter14().
uint8x8_t is_flat_outer4_mask =
IsFlat4(vabd_u8(p0q0, p4q4), vabd_u8(p0q0, p5q5), vabd_u8(p0q0, p6q6));
is_flat_outer4_mask = vand_u8(is_flat4_mask, is_flat_outer4_mask);
is_flat_outer4_mask =
InterleaveLow32(is_flat_outer4_mask, is_flat_outer4_mask);
uint8x8_t f_p1q1;
uint8x8_t f_p0q0;
const uint8x8x2_t q0p1xp0q1 = Interleave32(Transpose32(p0q0), p1q1);
Filter4(q0p1xp0q1.val[0], q0p1xp0q1.val[1], hev_mask, &f_p1q1, &f_p0q0);
// Reset the outer values if only a Hev() mask was required.
f_p1q1 = vbsl_u8(hev_mask, p1q1, f_p1q1);
uint8x8_t f8_p1q1, f8_p0q0;
uint8x8_t f14_p2q2, f14_p1q1, f14_p0q0;
#if defined(__aarch64__)
if (vaddv_u8(is_flat4_mask) == 0) {
// Filter8() and Filter14() do not apply.
const uint8x8_t zero = vdup_n_u8(0);
f8_p1q1 = zero;
f8_p0q0 = zero;
f14_p1q1 = zero;
f14_p0q0 = zero;
} else {
#endif // defined(__aarch64__)
uint8x8_t f8_p2q2;
Filter8(p3q3, p2q2, p1q1, p0q0, &f8_p2q2, &f8_p1q1, &f8_p0q0);
#if defined(__aarch64__)
if (vaddv_u8(is_flat_outer4_mask) == 0) {
// Filter14() does not apply.
const uint8x8_t zero = vdup_n_u8(0);
f14_p2q2 = zero;
f14_p1q1 = zero;
f14_p0q0 = zero;
} else {
#endif // defined(__aarch64__)
uint8x8_t f14_p5q5, f14_p4q4, f14_p3q3;
Filter14(p6q6, p5q5, p4q4, p3q3, p2q2, p1q1, p0q0, &f14_p5q5, &f14_p4q4,
&f14_p3q3, &f14_p2q2, &f14_p1q1, &f14_p0q0);
const uint8x8_t p5q5_output =
vbsl_u8(is_flat_outer4_mask, f14_p5q5, p5q5);
StoreLo4(dst - 6 * stride, p5q5_output);
StoreHi4(dst + 5 * stride, p5q5_output);
const uint8x8_t p4q4_output =
vbsl_u8(is_flat_outer4_mask, f14_p4q4, p4q4);
StoreLo4(dst - 5 * stride, p4q4_output);
StoreHi4(dst + 4 * stride, p4q4_output);
const uint8x8_t p3q3_output =
vbsl_u8(is_flat_outer4_mask, f14_p3q3, p3q3);
StoreLo4(dst - 4 * stride, p3q3_output);
StoreHi4(dst + 3 * stride, p3q3_output);
#if defined(__aarch64__)
}
#endif // defined(__aarch64__)
uint8x8_t p2q2_output = vbsl_u8(is_flat_outer4_mask, f14_p2q2, f8_p2q2);
p2q2_output = vbsl_u8(is_flat4_mask, p2q2_output, p2q2);
StoreLo4(dst - 3 * stride, p2q2_output);
StoreHi4(dst + 2 * stride, p2q2_output);
#if defined(__aarch64__)
}
#endif // defined(__aarch64__)
uint8x8_t p1q1_output = vbsl_u8(is_flat_outer4_mask, f14_p1q1, f8_p1q1);
p1q1_output = vbsl_u8(is_flat4_mask, p1q1_output, f_p1q1);
p1q1_output = vbsl_u8(needs_filter8_mask, p1q1_output, p1q1);
StoreLo4(dst - 2 * stride, p1q1_output);
StoreHi4(dst + stride, p1q1_output);
uint8x8_t p0q0_output = vbsl_u8(is_flat_outer4_mask, f14_p0q0, f8_p0q0);
p0q0_output = vbsl_u8(is_flat4_mask, p0q0_output, f_p0q0);
p0q0_output = vbsl_u8(needs_filter8_mask, p0q0_output, p0q0);
StoreLo4(dst - stride, p0q0_output);
StoreHi4(dst, p0q0_output);
}
void Vertical14_NEON(void* const dest, const ptrdiff_t stride,
const int outer_thresh, const int inner_thresh,
const int hev_thresh) {
auto* dst = static_cast<uint8_t*>(dest);
dst -= 8;
// input
// p7 p6 p5 p4 p3 p2 p1 p0 q0 q1 q2 q3 q4 q5 q6 q7
const uint8x16_t x0 = vld1q_u8(dst);
dst += stride;
const uint8x16_t x1 = vld1q_u8(dst);
dst += stride;
const uint8x16_t x2 = vld1q_u8(dst);
dst += stride;
const uint8x16_t x3 = vld1q_u8(dst);
dst -= (stride * 3);
// re-order input
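// The lookup indices below are byte lists read from the low byte up:
// |index_qp3toqp0| picks input bytes 7..4 and 8..11 (p0 p1 p2 p3 q0 q1 q2 q3)
// and |index_qp7toqp4| picks bytes 3..0 and 12..15 (p4 p5 p6 p7 q4 q5 q6 q7).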
#if defined(__aarch64__)
const uint8x8_t index_qp3toqp0 = vcreate_u8(0x0b0a090804050607);
const uint8x8_t index_qp7toqp4 = vcreate_u8(0x0f0e0d0c00010203);
const uint8x16_t index_qp7toqp0 = vcombine_u8(index_qp3toqp0, index_qp7toqp4);
uint8x16_t input_0 = vqtbl1q_u8(x0, index_qp7toqp0);
uint8x16_t input_1 = vqtbl1q_u8(x1, index_qp7toqp0);
uint8x16_t input_2 = vqtbl1q_u8(x2, index_qp7toqp0);
uint8x16_t input_3 = vqtbl1q_u8(x3, index_qp7toqp0);
#else
const uint8x8_t index_qp3toqp0 = vcreate_u8(0x0b0a090804050607);
const uint8x8_t index_qp7toqp4 = vcreate_u8(0x0f0e0d0c00010203);
const uint8x8_t x0_qp3qp0 = VQTbl1U8(x0, index_qp3toqp0);
const uint8x8_t x1_qp3qp0 = VQTbl1U8(x1, index_qp3toqp0);
const uint8x8_t x2_qp3qp0 = VQTbl1U8(x2, index_qp3toqp0);
const uint8x8_t x3_qp3qp0 = VQTbl1U8(x3, index_qp3toqp0);
const uint8x8_t x0_qp7qp4 = VQTbl1U8(x0, index_qp7toqp4);
const uint8x8_t x1_qp7qp4 = VQTbl1U8(x1, index_qp7toqp4);
const uint8x8_t x2_qp7qp4 = VQTbl1U8(x2, index_qp7toqp4);
const uint8x8_t x3_qp7qp4 = VQTbl1U8(x3, index_qp7toqp4);
const uint8x16_t input_0 = vcombine_u8(x0_qp3qp0, x0_qp7qp4);
const uint8x16_t input_1 = vcombine_u8(x1_qp3qp0, x1_qp7qp4);
const uint8x16_t input_2 = vcombine_u8(x2_qp3qp0, x2_qp7qp4);
const uint8x16_t input_3 = vcombine_u8(x3_qp3qp0, x3_qp7qp4);
#endif
// input after re-order
// p0 p1 p2 p3 q0 q1 q2 q3 p4 p5 p6 p7 q4 q5 q6 q7
const uint8x16x2_t in01 = vtrnq_u8(input_0, input_1);
const uint8x16x2_t in23 = vtrnq_u8(input_2, input_3);
const uint16x8x2_t in02 = vtrnq_u16(vreinterpretq_u16_u8(in01.val[0]),
vreinterpretq_u16_u8(in23.val[0]));
const uint16x8x2_t in13 = vtrnq_u16(vreinterpretq_u16_u8(in01.val[1]),
vreinterpretq_u16_u8(in23.val[1]));
const uint8x8_t p0q0 = vget_low_u8(vreinterpretq_u8_u16(in02.val[0]));
const uint8x8_t p1q1 = vget_low_u8(vreinterpretq_u8_u16(in13.val[0]));
const uint8x8_t p2q2 = vget_low_u8(vreinterpretq_u8_u16(in02.val[1]));
const uint8x8_t p3q3 = vget_low_u8(vreinterpretq_u8_u16(in13.val[1]));
const uint8x8_t p4q4 = vget_high_u8(vreinterpretq_u8_u16(in02.val[0]));
const uint8x8_t p5q5 = vget_high_u8(vreinterpretq_u8_u16(in13.val[0]));
const uint8x8_t p6q6 = vget_high_u8(vreinterpretq_u8_u16(in02.val[1]));
const uint8x8_t p7q7 = vget_high_u8(vreinterpretq_u8_u16(in13.val[1]));
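// p7/q7 are never filtered; |p7q7| is only kept so the final transpose can
// rebuild complete 16 byte rows for the stores.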
uint8x8_t needs_filter8_mask, is_flat4_mask, hev_mask;
Filter8Masks(p3q3, p2q2, p1q1, p0q0, hev_thresh, outer_thresh, inner_thresh,
&needs_filter8_mask, &is_flat4_mask, &hev_mask);
needs_filter8_mask = InterleaveLow32(needs_filter8_mask, needs_filter8_mask);
is_flat4_mask = vand_u8(is_flat4_mask, needs_filter8_mask);
is_flat4_mask = InterleaveLow32(is_flat4_mask, is_flat4_mask);
hev_mask = InterleaveLow32(hev_mask, hev_mask);
#if defined(__aarch64__)
// This provides a good speedup for the unit test. Not sure how applicable it
// is to valid streams though.
// Consider doing this on armv7 if there is a quick way to check if a vector
// is zero.
if (vaddv_u8(needs_filter8_mask) == 0) {
// None of the values will be filtered.
return;
}
#endif // defined(__aarch64__)
// Decide between Filter8() and Filter14().
uint8x8_t is_flat_outer4_mask =
IsFlat4(vabd_u8(p0q0, p4q4), vabd_u8(p0q0, p5q5), vabd_u8(p0q0, p6q6));
is_flat_outer4_mask = vand_u8(is_flat4_mask, is_flat_outer4_mask);
is_flat_outer4_mask =
InterleaveLow32(is_flat_outer4_mask, is_flat_outer4_mask);
uint8x8_t f_p0q0, f_p1q1;
const uint8x8x2_t q0p1xp0q1 = Interleave32(Transpose32(p0q0), p1q1);
Filter4(q0p1xp0q1.val[0], q0p1xp0q1.val[1], hev_mask, &f_p1q1, &f_p0q0);
// Reset the outer values if only a Hev() mask was required.
f_p1q1 = vbsl_u8(hev_mask, p1q1, f_p1q1);
uint8x8_t p1q1_output, p0q0_output;
uint8x8_t p5q5_output, p4q4_output, p3q3_output, p2q2_output;
#if defined(__aarch64__)
if (vaddv_u8(is_flat4_mask) == 0) {
// Filter8() and Filter14() do not apply.
p1q1_output = p1q1;
p0q0_output = p0q0;
p5q5_output = p5q5;
p4q4_output = p4q4;
p3q3_output = p3q3;
p2q2_output = p2q2;
} else {
#endif // defined(__aarch64__)
uint8x8_t f8_p2q2, f8_p1q1, f8_p0q0;
Filter8(p3q3, p2q2, p1q1, p0q0, &f8_p2q2, &f8_p1q1, &f8_p0q0);
#if defined(__aarch64__)
if (vaddv_u8(is_flat_outer4_mask) == 0) {
// Filter14() does not apply.
p5q5_output = p5q5;
p4q4_output = p4q4;
p3q3_output = p3q3;
p2q2_output = f8_p2q2;
p1q1_output = f8_p1q1;
p0q0_output = f8_p0q0;
} else {
#endif // defined(__aarch64__)
uint8x8_t f14_p5q5, f14_p4q4, f14_p3q3, f14_p2q2, f14_p1q1, f14_p0q0;
Filter14(p6q6, p5q5, p4q4, p3q3, p2q2, p1q1, p0q0, &f14_p5q5, &f14_p4q4,
&f14_p3q3, &f14_p2q2, &f14_p1q1, &f14_p0q0);
p5q5_output = vbsl_u8(is_flat_outer4_mask, f14_p5q5, p5q5);
p4q4_output = vbsl_u8(is_flat_outer4_mask, f14_p4q4, p4q4);
p3q3_output = vbsl_u8(is_flat_outer4_mask, f14_p3q3, p3q3);
p2q2_output = vbsl_u8(is_flat_outer4_mask, f14_p2q2, f8_p2q2);
p1q1_output = vbsl_u8(is_flat_outer4_mask, f14_p1q1, f8_p1q1);
p0q0_output = vbsl_u8(is_flat_outer4_mask, f14_p0q0, f8_p0q0);
#if defined(__aarch64__)
}
#endif // defined(__aarch64__)
p2q2_output = vbsl_u8(is_flat4_mask, p2q2_output, p2q2);
#if defined(__aarch64__)
}
#endif // defined(__aarch64__)
p1q1_output = vbsl_u8(is_flat4_mask, p1q1_output, f_p1q1);
p1q1_output = vbsl_u8(needs_filter8_mask, p1q1_output, p1q1);
p0q0_output = vbsl_u8(is_flat4_mask, p0q0_output, f_p0q0);
p0q0_output = vbsl_u8(needs_filter8_mask, p0q0_output, p0q0);
const uint8x16_t p0q0_p4q4 = vcombine_u8(p0q0_output, p4q4_output);
const uint8x16_t p2q2_p6q6 = vcombine_u8(p2q2_output, p6q6);
const uint8x16_t p1q1_p5q5 = vcombine_u8(p1q1_output, p5q5_output);
const uint8x16_t p3q3_p7q7 = vcombine_u8(p3q3_output, p7q7);
const uint16x8x2_t out02 = vtrnq_u16(vreinterpretq_u16_u8(p0q0_p4q4),
vreinterpretq_u16_u8(p2q2_p6q6));
const uint16x8x2_t out13 = vtrnq_u16(vreinterpretq_u16_u8(p1q1_p5q5),
vreinterpretq_u16_u8(p3q3_p7q7));
const uint8x16x2_t out01 = vtrnq_u8(vreinterpretq_u8_u16(out02.val[0]),
vreinterpretq_u8_u16(out13.val[0]));
const uint8x16x2_t out23 = vtrnq_u8(vreinterpretq_u8_u16(out02.val[1]),
vreinterpretq_u8_u16(out13.val[1]));
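// Undo the input re-ordering so each 16 byte row is stored back in memory
// order: p7 p6 ... p0 q0 q1 ... q7.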
#if defined(__aarch64__)
const uint8x8_t index_p7top0 = vcreate_u8(0x0001020308090a0b);
const uint8x8_t index_q7toq0 = vcreate_u8(0x0f0e0d0c07060504);
const uint8x16_t index_p7toq7 = vcombine_u8(index_p7top0, index_q7toq0);
const uint8x16_t output_0 = vqtbl1q_u8(out01.val[0], index_p7toq7);
const uint8x16_t output_1 = vqtbl1q_u8(out01.val[1], index_p7toq7);
const uint8x16_t output_2 = vqtbl1q_u8(out23.val[0], index_p7toq7);
const uint8x16_t output_3 = vqtbl1q_u8(out23.val[1], index_p7toq7);
#else
const uint8x8_t index_p7top0 = vcreate_u8(0x0001020308090a0b);
const uint8x8_t index_q7toq0 = vcreate_u8(0x0f0e0d0c07060504);
const uint8x8_t x0_p7p0 = VQTbl1U8(out01.val[0], index_p7top0);
const uint8x8_t x1_p7p0 = VQTbl1U8(out01.val[1], index_p7top0);
const uint8x8_t x2_p7p0 = VQTbl1U8(out23.val[0], index_p7top0);
const uint8x8_t x3_p7p0 = VQTbl1U8(out23.val[1], index_p7top0);
const uint8x8_t x0_q7q0 = VQTbl1U8(out01.val[0], index_q7toq0);
const uint8x8_t x1_q7q0 = VQTbl1U8(out01.val[1], index_q7toq0);
const uint8x8_t x2_q7q0 = VQTbl1U8(out23.val[0], index_q7toq0);
const uint8x8_t x3_q7q0 = VQTbl1U8(out23.val[1], index_q7toq0);
const uint8x16_t output_0 = vcombine_u8(x0_p7p0, x0_q7q0);
const uint8x16_t output_1 = vcombine_u8(x1_p7p0, x1_q7q0);
const uint8x16_t output_2 = vcombine_u8(x2_p7p0, x2_q7q0);
const uint8x16_t output_3 = vcombine_u8(x3_p7p0, x3_q7q0);
#endif
vst1q_u8(dst, output_0);
dst += stride;
vst1q_u8(dst, output_1);
dst += stride;
vst1q_u8(dst, output_2);
dst += stride;
vst1q_u8(dst, output_3);
}
void Init8bpp() {
Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
assert(dsp != nullptr);
dsp->loop_filters[kLoopFilterSize4][kLoopFilterTypeHorizontal] =
Horizontal4_NEON;
dsp->loop_filters[kLoopFilterSize4][kLoopFilterTypeVertical] = Vertical4_NEON;
dsp->loop_filters[kLoopFilterSize6][kLoopFilterTypeHorizontal] =
Horizontal6_NEON;
dsp->loop_filters[kLoopFilterSize6][kLoopFilterTypeVertical] = Vertical6_NEON;
dsp->loop_filters[kLoopFilterSize8][kLoopFilterTypeHorizontal] =
Horizontal8_NEON;
dsp->loop_filters[kLoopFilterSize8][kLoopFilterTypeVertical] = Vertical8_NEON;
dsp->loop_filters[kLoopFilterSize14][kLoopFilterTypeHorizontal] =
Horizontal14_NEON;
dsp->loop_filters[kLoopFilterSize14][kLoopFilterTypeVertical] =
Vertical14_NEON;
}
} // namespace
} // namespace low_bitdepth
void LoopFilterInit_NEON() { low_bitdepth::Init8bpp(); }
} // namespace dsp
} // namespace libgav1
#else // !LIBGAV1_ENABLE_NEON
namespace libgav1 {
namespace dsp {
void LoopFilterInit_NEON() {}
} // namespace dsp
} // namespace libgav1
#endif // LIBGAV1_ENABLE_NEON