Remove CDEF_FULL and CDEF_CAP defines
Change-Id: I562f1f59c16c1a0295cc141dbcdf122160aa3db0
diff --git a/av1/common/cdef_block.c b/av1/common/cdef_block.c
index be5d6bd..d3ed95e 100644
--- a/av1/common/cdef_block.c
+++ b/av1/common/cdef_block.c
@@ -21,18 +21,6 @@
#include "./cdef.h"
/* Generated from gen_filter_tables.c. */
-#if CDEF_FULL
-DECLARE_ALIGNED(16, const int, cdef_directions[8][3]) = {
- { -1 * CDEF_BSTRIDE + 1, -2 * CDEF_BSTRIDE + 2, -3 * CDEF_BSTRIDE + 3 },
- { 0 * CDEF_BSTRIDE + 1, -1 * CDEF_BSTRIDE + 2, -1 * CDEF_BSTRIDE + 3 },
- { 0 * CDEF_BSTRIDE + 1, 0 * CDEF_BSTRIDE + 2, 0 * CDEF_BSTRIDE + 3 },
- { 0 * CDEF_BSTRIDE + 1, 1 * CDEF_BSTRIDE + 2, 1 * CDEF_BSTRIDE + 3 },
- { 1 * CDEF_BSTRIDE + 1, 2 * CDEF_BSTRIDE + 2, 3 * CDEF_BSTRIDE + 3 },
- { 1 * CDEF_BSTRIDE + 0, 2 * CDEF_BSTRIDE + 1, 3 * CDEF_BSTRIDE + 1 },
- { 1 * CDEF_BSTRIDE + 0, 2 * CDEF_BSTRIDE + 0, 3 * CDEF_BSTRIDE + 0 },
- { 1 * CDEF_BSTRIDE + 0, 2 * CDEF_BSTRIDE - 1, 3 * CDEF_BSTRIDE - 1 }
-};
-#else
DECLARE_ALIGNED(16, const int, cdef_directions[8][2]) = {
{ -1 * CDEF_BSTRIDE + 1, -2 * CDEF_BSTRIDE + 2 },
{ 0 * CDEF_BSTRIDE + 1, -1 * CDEF_BSTRIDE + 2 },
@@ -43,7 +31,6 @@
{ 1 * CDEF_BSTRIDE + 0, 2 * CDEF_BSTRIDE + 0 },
{ 1 * CDEF_BSTRIDE + 0, 2 * CDEF_BSTRIDE - 1 }
};
-#endif
/* Detect direction. 0 means 45-degree up-right, 2 is horizontal, and so on.
The search minimizes the weighted variance along all the lines in a
@@ -123,27 +110,14 @@
return best_dir;
}
-#if CDEF_FULL
-const int cdef_pri_taps[2][3] = { { 3, 2, 1 }, { 2, 2, 2 } };
-const int cdef_sec_taps[2][2] = { { 3, 1 }, { 3, 1 } };
-#else
const int cdef_pri_taps[2][2] = { { 4, 2 }, { 3, 3 } };
const int cdef_sec_taps[2][2] = { { 2, 1 }, { 2, 1 } };
-#endif
/* Smooth in the direction detected. */
-#if CDEF_CAP
void cdef_filter_block_c(uint8_t *dst8, uint16_t *dst16, int dstride,
const uint16_t *in, int pri_strength, int sec_strength,
int dir, int pri_damping, int sec_damping, int bsize,
- AOM_UNUSED int max_unused, int coeff_shift)
-#else
-void cdef_filter_block_c(uint8_t *dst8, uint16_t *dst16, int dstride,
- const uint16_t *in, int pri_strength, int sec_strength,
- int dir, int pri_damping, int sec_damping, int bsize,
- int max, int coeff_shift)
-#endif
-{
+ AOM_UNUSED int max_unused, int coeff_shift) {
int i, j, k;
const int s = CDEF_BSTRIDE;
const int *pri_taps = cdef_pri_taps[(pri_strength >> coeff_shift) & 1];
@@ -153,34 +127,21 @@
int16_t sum = 0;
int16_t y;
int16_t x = in[i * s + j];
-#if CDEF_CAP
int max = x;
int min = x;
-#endif
-#if CDEF_FULL
- for (k = 0; k < 3; k++)
-#else
- for (k = 0; k < 2; k++)
-#endif
- {
+ for (k = 0; k < 2; k++) {
int16_t p0 = in[i * s + j + cdef_directions[dir][k]];
int16_t p1 = in[i * s + j - cdef_directions[dir][k]];
sum += pri_taps[k] * constrain(p0 - x, pri_strength, pri_damping);
sum += pri_taps[k] * constrain(p1 - x, pri_strength, pri_damping);
-#if CDEF_CAP
if (p0 != CDEF_VERY_LARGE) max = AOMMAX(p0, max);
if (p1 != CDEF_VERY_LARGE) max = AOMMAX(p1, max);
min = AOMMIN(p0, min);
min = AOMMIN(p1, min);
-#endif
-#if CDEF_FULL
- if (k == 2) continue;
-#endif
int16_t s0 = in[i * s + j + cdef_directions[(dir + 2) & 7][k]];
int16_t s1 = in[i * s + j - cdef_directions[(dir + 2) & 7][k]];
int16_t s2 = in[i * s + j + cdef_directions[(dir + 6) & 7][k]];
int16_t s3 = in[i * s + j - cdef_directions[(dir + 6) & 7][k]];
-#if CDEF_CAP
if (s0 != CDEF_VERY_LARGE) max = AOMMAX(s0, max);
if (s1 != CDEF_VERY_LARGE) max = AOMMAX(s1, max);
if (s2 != CDEF_VERY_LARGE) max = AOMMAX(s2, max);
@@ -189,17 +150,12 @@
min = AOMMIN(s1, min);
min = AOMMIN(s2, min);
min = AOMMIN(s3, min);
-#endif
sum += sec_taps[k] * constrain(s0 - x, sec_strength, sec_damping);
sum += sec_taps[k] * constrain(s1 - x, sec_strength, sec_damping);
sum += sec_taps[k] * constrain(s2 - x, sec_strength, sec_damping);
sum += sec_taps[k] * constrain(s3 - x, sec_strength, sec_damping);
}
-#if CDEF_CAP
y = clamp((int16_t)x + ((8 + sum - (sum < 0)) >> 4), min, max);
-#else
- y = clamp((int16_t)x + ((8 + sum - (sum < 0)) >> 4), 0, max);
-#endif
if (dst8)
dst8[i * dstride + j] = (uint8_t)y;
else
diff --git a/av1/common/cdef_block.h b/av1/common/cdef_block.h
index 14fea27..91c7f89 100644
--- a/av1/common/cdef_block.h
+++ b/av1/common/cdef_block.h
@@ -31,19 +31,9 @@
#define CDEF_INBUF_SIZE \
(CDEF_BSTRIDE * ((1 << MAX_SB_SIZE_LOG2) + 2 * CDEF_VBORDER))
-// Filter configuration
-#define CDEF_CAP 1 // 1 = Cap change to largest diff
-#define CDEF_FULL 0 // 1 = 7x7 filter, 0 = 5x5 filter
-
-#if CDEF_FULL
-extern const int cdef_pri_taps[2][3];
-extern const int cdef_sec_taps[2][2];
-DECLARE_ALIGNED(16, extern const int, cdef_directions[8][3]);
-#else
extern const int cdef_pri_taps[2][2];
extern const int cdef_sec_taps[2][2];
DECLARE_ALIGNED(16, extern const int, cdef_directions[8][2]);
-#endif
typedef struct {
uint8_t by;
diff --git a/av1/common/cdef_block_simd.h b/av1/common/cdef_block_simd.h
index 5ee20f3..def8aaf 100644
--- a/av1/common/cdef_block_simd.h
+++ b/av1/common/cdef_block_simd.h
@@ -235,31 +235,17 @@
sign);
}
-#if CDEF_CAP
void SIMD_FUNC(cdef_filter_block_4x4_8)(uint8_t *dst, int dstride,
const uint16_t *in, int pri_strength,
int sec_strength, int dir,
int pri_damping, int sec_damping,
AOM_UNUSED int max_unused,
- int coeff_shift)
-#else
-void SIMD_FUNC(cdef_filter_block_4x4_8)(uint8_t *dst, int dstride,
- const uint16_t *in, int pri_strength,
- int sec_strength, int dir,
- int pri_damping, int sec_damping,
- int max, int coeff_shift)
-#endif
-{
+ int coeff_shift) {
v128 p0, p1, p2, p3;
v256 sum, row, tap, res;
-#if CDEF_CAP
v256 max, min, large = v256_dup_16(CDEF_VERY_LARGE);
-#endif
int po1 = cdef_directions[dir][0];
int po2 = cdef_directions[dir][1];
-#if CDEF_FULL
- int po3 = cdef_directions[dir][2];
-#endif
int s1o1 = cdef_directions[(dir + 2) & 7][0];
int s1o2 = cdef_directions[(dir + 2) & 7][1];
int s2o1 = cdef_directions[(dir + 6) & 7][0];
@@ -278,9 +264,7 @@
v64_load_aligned(&in[1 * CDEF_BSTRIDE]),
v64_load_aligned(&in[2 * CDEF_BSTRIDE]),
v64_load_aligned(&in[3 * CDEF_BSTRIDE]));
-#if CDEF_CAP
max = min = row;
-#endif
if (pri_strength) {
// Primary near taps
@@ -288,19 +272,15 @@
v64_load_unaligned(&in[1 * CDEF_BSTRIDE + po1]),
v64_load_unaligned(&in[2 * CDEF_BSTRIDE + po1]),
v64_load_unaligned(&in[3 * CDEF_BSTRIDE + po1]));
-#if CDEF_CAP
max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large)));
min = v256_min_s16(min, tap);
-#endif
p0 = constrain(tap, row, pri_strength, pri_damping);
tap = v256_from_v64(v64_load_unaligned(&in[0 * CDEF_BSTRIDE - po1]),
v64_load_unaligned(&in[1 * CDEF_BSTRIDE - po1]),
v64_load_unaligned(&in[2 * CDEF_BSTRIDE - po1]),
v64_load_unaligned(&in[3 * CDEF_BSTRIDE - po1]));
-#if CDEF_CAP
max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large)));
min = v256_min_s16(min, tap);
-#endif
p1 = constrain(tap, row, pri_strength, pri_damping);
// sum += pri_taps[0] * (p0 + p1)
@@ -313,52 +293,21 @@
v64_load_unaligned(&in[1 * CDEF_BSTRIDE + po2]),
v64_load_unaligned(&in[2 * CDEF_BSTRIDE + po2]),
v64_load_unaligned(&in[3 * CDEF_BSTRIDE + po2]));
-#if CDEF_CAP
max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large)));
min = v256_min_s16(min, tap);
-#endif
p0 = constrain(tap, row, pri_strength, pri_damping);
tap = v256_from_v64(v64_load_unaligned(&in[0 * CDEF_BSTRIDE - po2]),
v64_load_unaligned(&in[1 * CDEF_BSTRIDE - po2]),
v64_load_unaligned(&in[2 * CDEF_BSTRIDE - po2]),
v64_load_unaligned(&in[3 * CDEF_BSTRIDE - po2]));
-#if CDEF_CAP
max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large)));
min = v256_min_s16(min, tap);
-#endif
p1 = constrain(tap, row, pri_strength, pri_damping);
// sum += pri_taps[1] * (p0 + p1)
sum = v256_add_16(sum, v256_madd_us8(v256_dup_8(pri_taps[1]),
v256_from_v128(v128_ziphi_8(p0, p1),
v128_ziplo_8(p0, p1))));
-
-#if CDEF_FULL
- // Primary extra taps
- tap = v256_from_v64(v64_load_unaligned(&in[0 * CDEF_BSTRIDE + po3]),
- v64_load_unaligned(&in[1 * CDEF_BSTRIDE + po3]),
- v64_load_unaligned(&in[2 * CDEF_BSTRIDE + po3]),
- v64_load_unaligned(&in[3 * CDEF_BSTRIDE + po3]));
-#if CDEF_CAP
- max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large)));
- min = v256_min_s16(min, tap);
-#endif
- p0 = constrain(tap, row, pri_strength, pri_damping);
- tap = v256_from_v64(v64_load_unaligned(&in[0 * CDEF_BSTRIDE - po3]),
- v64_load_unaligned(&in[1 * CDEF_BSTRIDE - po3]),
- v64_load_unaligned(&in[2 * CDEF_BSTRIDE - po3]),
- v64_load_unaligned(&in[3 * CDEF_BSTRIDE - po3]));
-#if CDEF_CAP
- max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large)));
- min = v256_min_s16(min, tap);
-#endif
- p1 = constrain(tap, row, pri_strength, pri_damping);
-
- // sum += pri_taps[2] * (p0 + p1)
- sum = v256_add_16(sum, v256_madd_us8(v256_dup_8(pri_taps[2]),
- v256_from_v128(v128_ziphi_8(p0, p1),
- v128_ziplo_8(p0, p1))));
-#endif
}
if (sec_strength) {
@@ -367,37 +316,29 @@
v64_load_unaligned(&in[1 * CDEF_BSTRIDE + s1o1]),
v64_load_unaligned(&in[2 * CDEF_BSTRIDE + s1o1]),
v64_load_unaligned(&in[3 * CDEF_BSTRIDE + s1o1]));
-#if CDEF_CAP
max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large)));
min = v256_min_s16(min, tap);
-#endif
p0 = constrain(tap, row, sec_strength, sec_damping);
tap = v256_from_v64(v64_load_unaligned(&in[0 * CDEF_BSTRIDE - s1o1]),
v64_load_unaligned(&in[1 * CDEF_BSTRIDE - s1o1]),
v64_load_unaligned(&in[2 * CDEF_BSTRIDE - s1o1]),
v64_load_unaligned(&in[3 * CDEF_BSTRIDE - s1o1]));
-#if CDEF_CAP
max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large)));
min = v256_min_s16(min, tap);
-#endif
p1 = constrain(tap, row, sec_strength, sec_damping);
tap = v256_from_v64(v64_load_unaligned(&in[0 * CDEF_BSTRIDE + s2o1]),
v64_load_unaligned(&in[1 * CDEF_BSTRIDE + s2o1]),
v64_load_unaligned(&in[2 * CDEF_BSTRIDE + s2o1]),
v64_load_unaligned(&in[3 * CDEF_BSTRIDE + s2o1]));
-#if CDEF_CAP
max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large)));
min = v256_min_s16(min, tap);
-#endif
p2 = constrain(tap, row, sec_strength, sec_damping);
tap = v256_from_v64(v64_load_unaligned(&in[0 * CDEF_BSTRIDE - s2o1]),
v64_load_unaligned(&in[1 * CDEF_BSTRIDE - s2o1]),
v64_load_unaligned(&in[2 * CDEF_BSTRIDE - s2o1]),
v64_load_unaligned(&in[3 * CDEF_BSTRIDE - s2o1]));
-#if CDEF_CAP
max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large)));
min = v256_min_s16(min, tap);
-#endif
p3 = constrain(tap, row, sec_strength, sec_damping);
// sum += sec_taps[0] * (p0 + p1 + p2 + p3)
@@ -412,37 +353,29 @@
v64_load_unaligned(&in[1 * CDEF_BSTRIDE + s1o2]),
v64_load_unaligned(&in[2 * CDEF_BSTRIDE + s1o2]),
v64_load_unaligned(&in[3 * CDEF_BSTRIDE + s1o2]));
-#if CDEF_CAP
max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large)));
min = v256_min_s16(min, tap);
-#endif
p0 = constrain(tap, row, sec_strength, sec_damping);
tap = v256_from_v64(v64_load_unaligned(&in[0 * CDEF_BSTRIDE - s1o2]),
v64_load_unaligned(&in[1 * CDEF_BSTRIDE - s1o2]),
v64_load_unaligned(&in[2 * CDEF_BSTRIDE - s1o2]),
v64_load_unaligned(&in[3 * CDEF_BSTRIDE - s1o2]));
-#if CDEF_CAP
max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large)));
min = v256_min_s16(min, tap);
-#endif
p1 = constrain(tap, row, sec_strength, sec_damping);
tap = v256_from_v64(v64_load_unaligned(&in[0 * CDEF_BSTRIDE + s2o2]),
v64_load_unaligned(&in[1 * CDEF_BSTRIDE + s2o2]),
v64_load_unaligned(&in[2 * CDEF_BSTRIDE + s2o2]),
v64_load_unaligned(&in[3 * CDEF_BSTRIDE + s2o2]));
-#if CDEF_CAP
max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large)));
min = v256_min_s16(min, tap);
-#endif
p2 = constrain(tap, row, sec_strength, sec_damping);
tap = v256_from_v64(v64_load_unaligned(&in[0 * CDEF_BSTRIDE - s2o2]),
v64_load_unaligned(&in[1 * CDEF_BSTRIDE - s2o2]),
v64_load_unaligned(&in[2 * CDEF_BSTRIDE - s2o2]),
v64_load_unaligned(&in[3 * CDEF_BSTRIDE - s2o2]));
-#if CDEF_CAP
max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large)));
min = v256_min_s16(min, tap);
-#endif
p3 = constrain(tap, row, sec_strength, sec_damping);
// sum += sec_taps[1] * (p0 + p1 + p2 + p3)
@@ -459,11 +392,7 @@
res = v256_add_16(sum, v256_dup_16(8));
res = v256_shr_n_s16(res, 4);
res = v256_add_16(row, res);
-#if CDEF_CAP
res = v256_min_s16(v256_max_s16(res, min), max);
-#else
- res = v256_min_s16(v256_max_s16(res, v256_zero()), v256_dup_16(max));
-#endif
res = v256_pack_s16_u8(res, res);
p0 = v256_low_v128(res);
@@ -473,32 +402,18 @@
u32_store_aligned(&dst[3 * dstride], v64_low_u32(v128_low_v64(p0)));
}
-#if CDEF_CAP
void SIMD_FUNC(cdef_filter_block_8x8_8)(uint8_t *dst, int dstride,
const uint16_t *in, int pri_strength,
int sec_strength, int dir,
int pri_damping, int sec_damping,
AOM_UNUSED int max_unused,
- int coeff_shift)
-#else
-void SIMD_FUNC(cdef_filter_block_8x8_8)(uint8_t *dst, int dstride,
- const uint16_t *in, int pri_strength,
- int sec_strength, int dir,
- int pri_damping, int sec_damping,
- int max, int coeff_shift)
-#endif
-{
+ int coeff_shift) {
int i;
v128 p0, p1, p2, p3;
v256 sum, row, res, tap;
-#if CDEF_CAP
v256 max, min, large = v256_dup_16(CDEF_VERY_LARGE);
-#endif
int po1 = cdef_directions[dir][0];
int po2 = cdef_directions[dir][1];
-#if CDEF_FULL
- int po3 = cdef_directions[dir][2];
-#endif
int s1o1 = cdef_directions[(dir + 2) & 7][0];
int s1o2 = cdef_directions[(dir + 2) & 7][1];
int s2o1 = cdef_directions[(dir + 6) & 7][0];
@@ -516,25 +431,19 @@
row = v256_from_v128(v128_load_aligned(&in[i * CDEF_BSTRIDE]),
v128_load_aligned(&in[(i + 1) * CDEF_BSTRIDE]));
-#if CDEF_CAP
max = min = row;
-#endif
// Primary near taps
tap =
v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE + po1]),
v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + po1]));
-#if CDEF_CAP
max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large)));
min = v256_min_s16(min, tap);
-#endif
p0 = constrain(tap, row, pri_strength, pri_damping);
tap =
v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE - po1]),
v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - po1]));
-#if CDEF_CAP
max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large)));
min = v256_min_s16(min, tap);
-#endif
p1 = constrain(tap, row, pri_strength, pri_damping);
// sum += pri_taps[0] * (p0 + p1)
@@ -546,18 +455,14 @@
tap =
v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE + po2]),
v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + po2]));
-#if CDEF_CAP
max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large)));
min = v256_min_s16(min, tap);
-#endif
p0 = constrain(tap, row, pri_strength, pri_damping);
tap =
v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE - po2]),
v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - po2]));
-#if CDEF_CAP
max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large)));
min = v256_min_s16(min, tap);
-#endif
p1 = constrain(tap, row, pri_strength, pri_damping);
// sum += pri_taps[1] * (p0 + p1)
@@ -565,63 +470,30 @@
v256_from_v128(v128_ziphi_8(p0, p1),
v128_ziplo_8(p0, p1))));
-#if CDEF_FULL
- // Primary extra taps
- tap =
- v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE + po3]),
- v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + po3]));
-#if CDEF_CAP
- max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large)));
- min = v256_min_s16(min, tap);
-#endif
- p0 = constrain(tap, row, pri_strength, pri_damping);
- tap =
- v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE - po3]),
- v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - po3]));
-#if CDEF_CAP
- max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large)));
- min = v256_min_s16(min, tap);
-#endif
- p1 = constrain(tap, row, pri_strength, pri_damping);
-
- // sum += pri_taps[2] * (p0 + p1)
- sum = v256_add_16(sum, v256_madd_us8(v256_dup_8(pri_taps[2]),
- v256_from_v128(v128_ziphi_8(p0, p1),
- v128_ziplo_8(p0, p1))));
-#endif
-
// Secondary near taps
tap =
v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE + s1o1]),
v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + s1o1]));
-#if CDEF_CAP
max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large)));
min = v256_min_s16(min, tap);
-#endif
p0 = constrain(tap, row, sec_strength, sec_damping);
tap =
v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE - s1o1]),
v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - s1o1]));
-#if CDEF_CAP
max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large)));
min = v256_min_s16(min, tap);
-#endif
p1 = constrain(tap, row, sec_strength, sec_damping);
tap =
v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE + s2o1]),
v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + s2o1]));
-#if CDEF_CAP
max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large)));
min = v256_min_s16(min, tap);
-#endif
p2 = constrain(tap, row, sec_strength, sec_damping);
tap =
v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE - s2o1]),
v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - s2o1]));
-#if CDEF_CAP
max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large)));
min = v256_min_s16(min, tap);
-#endif
p3 = constrain(tap, row, sec_strength, sec_damping);
// sum += sec_taps[0] * (p0 + p1 + p2 + p3)
@@ -635,34 +507,26 @@
tap =
v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE + s1o2]),
v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + s1o2]));
-#if CDEF_CAP
max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large)));
min = v256_min_s16(min, tap);
-#endif
p0 = constrain(tap, row, sec_strength, sec_damping);
tap =
v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE - s1o2]),
v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - s1o2]));
-#if CDEF_CAP
max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large)));
min = v256_min_s16(min, tap);
-#endif
p1 = constrain(tap, row, sec_strength, sec_damping);
tap =
v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE + s2o2]),
v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + s2o2]));
-#if CDEF_CAP
max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large)));
min = v256_min_s16(min, tap);
-#endif
p2 = constrain(tap, row, sec_strength, sec_damping);
tap =
v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE - s2o2]),
v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - s2o2]));
-#if CDEF_CAP
max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large)));
min = v256_min_s16(min, tap);
-#endif
p3 = constrain(tap, row, sec_strength, sec_damping);
// sum += sec_taps[1] * (p0 + p1 + p2 + p3)
@@ -677,11 +541,7 @@
res = v256_add_16(sum, v256_dup_16(8));
res = v256_shr_n_s16(res, 4);
res = v256_add_16(row, res);
-#if CDEF_CAP
res = v256_min_s16(v256_max_s16(res, min), max);
-#else
- res = v256_min_s16(v256_max_s16(res, v256_zero()), v256_dup_16(max));
-#endif
res = v256_pack_s16_u8(res, res);
p0 = v256_low_v128(res);
@@ -690,31 +550,17 @@
}
}
-#if CDEF_CAP
void SIMD_FUNC(cdef_filter_block_4x4_16)(uint16_t *dst, int dstride,
const uint16_t *in, int pri_strength,
int sec_strength, int dir,
int pri_damping, int sec_damping,
AOM_UNUSED int max_unused,
- int coeff_shift)
-#else
-void SIMD_FUNC(cdef_filter_block_4x4_16)(uint16_t *dst, int dstride,
- const uint16_t *in, int pri_strength,
- int sec_strength, int dir,
- int pri_damping, int sec_damping,
- int max, int coeff_shift)
-#endif
-{
+ int coeff_shift) {
int i;
v128 p0, p1, p2, p3, sum, row, res;
-#if CDEF_CAP
v128 max, min, large = v128_dup_16(CDEF_VERY_LARGE);
-#endif
int po1 = cdef_directions[dir][0];
int po2 = cdef_directions[dir][1];
-#if CDEF_FULL
- int po3 = cdef_directions[dir][2];
-#endif
int s1o1 = cdef_directions[(dir + 2) & 7][0];
int s1o2 = cdef_directions[(dir + 2) & 7][1];
int s2o1 = cdef_directions[(dir + 6) & 7][0];
@@ -731,21 +577,17 @@
sum = v128_zero();
row = v128_from_v64(v64_load_aligned(&in[i * CDEF_BSTRIDE]),
v64_load_aligned(&in[(i + 1) * CDEF_BSTRIDE]));
-#if CDEF_CAP
min = max = row;
-#endif
// Primary near taps
p0 = v128_from_v64(v64_load_unaligned(&in[i * CDEF_BSTRIDE + po1]),
v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + po1]));
p1 = v128_from_v64(v64_load_unaligned(&in[i * CDEF_BSTRIDE - po1]),
v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - po1]));
-#if CDEF_CAP
max =
v128_max_s16(v128_max_s16(max, v128_andn(p0, v128_cmpeq_16(p0, large))),
v128_andn(p1, v128_cmpeq_16(p1, large)));
min = v128_min_s16(v128_min_s16(min, p0), p1);
-#endif
p0 = constrain16(p0, row, pri_strength, pri_damping);
p1 = constrain16(p1, row, pri_strength, pri_damping);
@@ -758,12 +600,10 @@
v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + po2]));
p1 = v128_from_v64(v64_load_unaligned(&in[i * CDEF_BSTRIDE - po2]),
v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - po2]));
-#if CDEF_CAP
max =
v128_max_s16(v128_max_s16(max, v128_andn(p0, v128_cmpeq_16(p0, large))),
v128_andn(p1, v128_cmpeq_16(p1, large)));
min = v128_min_s16(v128_min_s16(min, p0), p1);
-#endif
p0 = constrain16(p0, row, pri_strength, pri_damping);
p1 = constrain16(p1, row, pri_strength, pri_damping);
@@ -771,26 +611,6 @@
sum = v128_add_16(
sum, v128_mullo_s16(v128_dup_16(pri_taps[1]), v128_add_16(p0, p1)));
-#if CDEF_FULL
- // Primary extra taps
- p0 = v128_from_v64(v64_load_unaligned(&in[i * CDEF_BSTRIDE + po3]),
- v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + po3]));
- p1 = v128_from_v64(v64_load_unaligned(&in[i * CDEF_BSTRIDE - po3]),
- v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - po3]));
-#if CDEF_CAP
- max =
- v128_max_s16(v128_max_s16(max, v128_andn(p0, v128_cmpeq_16(p0, large))),
- v128_andn(p1, v128_cmpeq_16(p1, large)));
- min = v128_min_s16(v128_min_s16(min, p0), p1);
-#endif
- p0 = constrain16(p0, row, pri_strength, pri_damping);
- p1 = constrain16(p1, row, pri_strength, pri_damping);
-
- // sum += pri_taps[2] * (p0 + p1)
- sum = v128_add_16(
- sum, v128_mullo_s16(v128_dup_16(pri_taps[2]), v128_add_16(p0, p1)));
-#endif
-
// Secondary near taps
p0 = v128_from_v64(v64_load_unaligned(&in[i * CDEF_BSTRIDE + s1o1]),
v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + s1o1]));
@@ -800,7 +620,6 @@
v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + s2o1]));
p3 = v128_from_v64(v64_load_unaligned(&in[i * CDEF_BSTRIDE - s2o1]),
v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - s2o1]));
-#if CDEF_CAP
max =
v128_max_s16(v128_max_s16(max, v128_andn(p0, v128_cmpeq_16(p0, large))),
v128_andn(p1, v128_cmpeq_16(p1, large)));
@@ -809,7 +628,6 @@
v128_andn(p3, v128_cmpeq_16(p3, large)));
min = v128_min_s16(
v128_min_s16(v128_min_s16(v128_min_s16(min, p0), p1), p2), p3);
-#endif
p0 = constrain16(p0, row, sec_strength, sec_damping);
p1 = constrain16(p1, row, sec_strength, sec_damping);
p2 = constrain16(p2, row, sec_strength, sec_damping);
@@ -829,7 +647,6 @@
v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + s2o2]));
p3 = v128_from_v64(v64_load_unaligned(&in[i * CDEF_BSTRIDE - s2o2]),
v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - s2o2]));
-#if CDEF_CAP
max =
v128_max_s16(v128_max_s16(max, v128_andn(p0, v128_cmpeq_16(p0, large))),
v128_andn(p1, v128_cmpeq_16(p1, large)));
@@ -838,7 +655,6 @@
v128_andn(p3, v128_cmpeq_16(p3, large)));
min = v128_min_s16(
v128_min_s16(v128_min_s16(v128_min_s16(min, p0), p1), p2), p3);
-#endif
p0 = constrain16(p0, row, sec_strength, sec_damping);
p1 = constrain16(p1, row, sec_strength, sec_damping);
p2 = constrain16(p2, row, sec_strength, sec_damping);
@@ -854,41 +670,23 @@
res = v128_add_16(sum, v128_dup_16(8));
res = v128_shr_n_s16(res, 4);
res = v128_add_16(row, res);
-#if CDEF_CAP
res = v128_min_s16(v128_max_s16(res, min), max);
-#else
- res = v128_min_s16(v128_max_s16(res, v128_zero()), v128_dup_16(max));
-#endif
v64_store_aligned(&dst[i * dstride], v128_high_v64(res));
v64_store_aligned(&dst[(i + 1) * dstride], v128_low_v64(res));
}
}
-#if CDEF_CAP
void SIMD_FUNC(cdef_filter_block_8x8_16)(uint16_t *dst, int dstride,
const uint16_t *in, int pri_strength,
int sec_strength, int dir,
int pri_damping, int sec_damping,
AOM_UNUSED int max_unused,
- int coeff_shift)
-#else
-void SIMD_FUNC(cdef_filter_block_8x8_16)(uint16_t *dst, int dstride,
- const uint16_t *in, int pri_strength,
- int sec_strength, int dir,
- int pri_damping, int sec_damping,
- int max, int coeff_shift)
-#endif
-{
+ int coeff_shift) {
int i;
v128 sum, p0, p1, p2, p3, row, res;
-#if CDEF_CAP
v128 max, min, large = v128_dup_16(CDEF_VERY_LARGE);
-#endif
int po1 = cdef_directions[dir][0];
int po2 = cdef_directions[dir][1];
-#if CDEF_FULL
- int po3 = cdef_directions[dir][2];
-#endif
int s1o1 = cdef_directions[(dir + 2) & 7][0];
int s1o2 = cdef_directions[(dir + 2) & 7][1];
int s2o1 = cdef_directions[(dir + 6) & 7][0];
@@ -906,18 +704,14 @@
sum = v128_zero();
row = v128_load_aligned(&in[i * CDEF_BSTRIDE]);
-#if CDEF_CAP
min = max = row;
-#endif
// Primary near taps
p0 = v128_load_unaligned(&in[i * CDEF_BSTRIDE + po1]);
p1 = v128_load_unaligned(&in[i * CDEF_BSTRIDE - po1]);
-#if CDEF_CAP
max =
v128_max_s16(v128_max_s16(max, v128_andn(p0, v128_cmpeq_16(p0, large))),
v128_andn(p1, v128_cmpeq_16(p1, large)));
min = v128_min_s16(v128_min_s16(min, p0), p1);
-#endif
p0 = constrain16(p0, row, pri_strength, pri_damping);
p1 = constrain16(p1, row, pri_strength, pri_damping);
@@ -928,12 +722,10 @@
// Primary far taps
p0 = v128_load_unaligned(&in[i * CDEF_BSTRIDE + po2]);
p1 = v128_load_unaligned(&in[i * CDEF_BSTRIDE - po2]);
-#if CDEF_CAP
max =
v128_max_s16(v128_max_s16(max, v128_andn(p0, v128_cmpeq_16(p0, large))),
v128_andn(p1, v128_cmpeq_16(p1, large)));
min = v128_min_s16(v128_min_s16(min, p0), p1);
-#endif
p0 = constrain16(p0, row, pri_strength, pri_damping);
p1 = constrain16(p1, row, pri_strength, pri_damping);
@@ -941,30 +733,11 @@
sum = v128_add_16(
sum, v128_mullo_s16(v128_dup_16(pri_taps[1]), v128_add_16(p0, p1)));
-#if CDEF_FULL
- // Primary extra taps
- p0 = v128_load_unaligned(&in[i * CDEF_BSTRIDE + po3]);
- p1 = v128_load_unaligned(&in[i * CDEF_BSTRIDE - po3]);
-#if CDEF_CAP
- max =
- v128_max_s16(v128_max_s16(max, v128_andn(p0, v128_cmpeq_16(p0, large))),
- v128_andn(p1, v128_cmpeq_16(p1, large)));
- min = v128_min_s16(v128_min_s16(min, p0), p1);
-#endif
- p0 = constrain16(p0, row, pri_strength, pri_damping);
- p1 = constrain16(p1, row, pri_strength, pri_damping);
-
- // sum += pri_taps[2] * (p0 + p1)
- sum = v128_add_16(
- sum, v128_mullo_s16(v128_dup_16(pri_taps[2]), v128_add_16(p0, p1)));
-#endif
-
// Secondary near taps
p0 = v128_load_unaligned(&in[i * CDEF_BSTRIDE + s1o1]);
p1 = v128_load_unaligned(&in[i * CDEF_BSTRIDE - s1o1]);
p2 = v128_load_unaligned(&in[i * CDEF_BSTRIDE + s2o1]);
p3 = v128_load_unaligned(&in[i * CDEF_BSTRIDE - s2o1]);
-#if CDEF_CAP
max =
v128_max_s16(v128_max_s16(max, v128_andn(p0, v128_cmpeq_16(p0, large))),
v128_andn(p1, v128_cmpeq_16(p1, large)));
@@ -973,7 +746,6 @@
v128_andn(p3, v128_cmpeq_16(p3, large)));
min = v128_min_s16(
v128_min_s16(v128_min_s16(v128_min_s16(min, p0), p1), p2), p3);
-#endif
p0 = constrain16(p0, row, sec_strength, sec_damping);
p1 = constrain16(p1, row, sec_strength, sec_damping);
p2 = constrain16(p2, row, sec_strength, sec_damping);
@@ -989,7 +761,6 @@
p1 = v128_load_unaligned(&in[i * CDEF_BSTRIDE - s1o2]);
p2 = v128_load_unaligned(&in[i * CDEF_BSTRIDE + s2o2]);
p3 = v128_load_unaligned(&in[i * CDEF_BSTRIDE - s2o2]);
-#if CDEF_CAP
max =
v128_max_s16(v128_max_s16(max, v128_andn(p0, v128_cmpeq_16(p0, large))),
v128_andn(p1, v128_cmpeq_16(p1, large)));
@@ -998,7 +769,6 @@
v128_andn(p3, v128_cmpeq_16(p3, large)));
min = v128_min_s16(
v128_min_s16(v128_min_s16(v128_min_s16(min, p0), p1), p2), p3);
-#endif
p0 = constrain16(p0, row, sec_strength, sec_damping);
p1 = constrain16(p1, row, sec_strength, sec_damping);
p2 = constrain16(p2, row, sec_strength, sec_damping);
@@ -1014,11 +784,7 @@
res = v128_add_16(sum, v128_dup_16(8));
res = v128_shr_n_s16(res, 4);
res = v128_add_16(row, res);
-#if CDEF_CAP
res = v128_min_s16(v128_max_s16(res, min), max);
-#else
- res = v128_min_s16(v128_max_s16(res, v128_zero()), v128_dup_16(max));
-#endif
v128_store_unaligned(&dst[i * dstride], res);
}
}