Improve filter_intra throughput
The prediction can be done in 2x2 or 4x4 processing units. Within
each unit there are no dependencies between the predicted pixels, so
the computation can be fully parallelized.
Also enable filter_intra for blocks smaller than 8x8, and disable it for transform blocks wider or taller than 32.
Change-Id: I4f8a3104019cbb35e88f342d97516f81b19152b0
diff --git a/av1/common/blockd.h b/av1/common/blockd.h
index 73fe6de..7d6eb71 100644
--- a/av1/common/blockd.h
+++ b/av1/common/blockd.h
@@ -198,7 +198,7 @@
DC_PRED, V_PRED, H_PRED, D117_PRED, D153_PRED, DC_PRED
};
-#define DISABLE_SUB8X8_FILTER_INTRA 1
+#define DISABLE_SUB8X8_FILTER_INTRA 0
static INLINE int av1_filter_intra_allowed_bsize(BLOCK_SIZE bs) {
(void)bs;
@@ -212,9 +212,10 @@
static INLINE int av1_filter_intra_allowed_txsize(TX_SIZE tx) {
(void)tx;
#if DISABLE_SUB8X8_FILTER_INTRA
- return tx_size_wide[tx] >= 8 && tx_size_high[tx] >= 8;
+ return tx_size_wide[tx] >= 8 && tx_size_high[tx] >= 8 &&
+ tx_size_wide[tx] <= 32 && tx_size_high[tx] <= 32;
#else
- return 1;
+ return tx_size_wide[tx] <= 32 && tx_size_high[tx] <= 32;
#endif
}
#endif // CONFIG_FILTER_INTRA
diff --git a/av1/common/reconintra.c b/av1/common/reconintra.c
index 94ce012..7d36847 100644
--- a/av1/common/reconintra.c
+++ b/av1/common/reconintra.c
@@ -1110,6 +1110,268 @@
#endif // CONFIG_EXT_INTRA
#if CONFIG_FILTER_INTRA
+#if FILTER_INTRA_PROC_UNIT_SIZE == 2
+static int filter_intra_taps_2x2procunit[FILTER_INTRA_MODES][4][5] = {
+ {
+ { -3, 5, 0, 6, 0 },
+ { -2, 1, 5, 4, 0 },
+ { -2, 3, 0, 1, 6 },
+ { -2, 1, 3, 1, 5 },
+ },
+ {
+ { -5, 8, 0, 5, 0 },
+ { -3, 0, 8, 3, 0 },
+ { -5, 8, 0, 0, 5 },
+ { -3, 0, 8, 0, 3 },
+ },
+ {
+ { -4, 4, 0, 8, 0 },
+ { -4, 0, 4, 8, 0 },
+ { -2, 2, 0, 0, 8 },
+ { -2, 0, 2, 0, 8 },
+ },
+ {
+ { -1, 6, 0, 3, 0 },
+ { 0, 1, 6, 1, 0 },
+ { -1, 5, 0, 1, 3 },
+ { 0, 2, 4, 1, 1 },
+ },
+ {
+ { -1, 4, 0, 5, 0 },
+ { -1, 2, 4, 3, 0 },
+ { -1, 2, 0, 2, 5 },
+ { -1, 2, 2, 2, 3 },
+ },
+ {
+ { -6, 7, 0, 7, 0 },
+ { -5, 0, 7, 6, 0 },
+ { -5, 6, 0, 0, 7 },
+ { -4, 0, 6, 0, 6 },
+ },
+};
+#elif FILTER_INTRA_PROC_UNIT_SIZE == 4
+static int filter_intra_taps_4x4procunit[FILTER_INTRA_MODES][16][9] = {
+#if FILTER_INTRA_SCALE_BITS == 4
+ {
+ { -6, 10, 0, 0, 0, 12, 0, 0, 0 },
+ { -5, 2, 10, 0, 0, 9, 0, 0, 0 },
+ { -3, 1, 1, 10, 0, 7, 0, 0, 0 },
+ { -3, 1, 1, 2, 10, 5, 0, 0, 0 },
+ { -4, 6, 0, 0, 0, 2, 12, 0, 0 },
+ { -3, 2, 6, 0, 0, 2, 9, 0, 0 },
+ { -3, 2, 2, 6, 0, 2, 7, 0, 0 },
+ { -2, 0, 2, 2, 6, 3, 5, 0, 0 },
+ { -2, 4, 0, 0, 0, 1, 1, 12, 0 },
+ { -3, 2, 4, 0, 0, 2, 2, 9, 0 },
+ { -2, 0, 2, 4, 0, 2, 3, 7, 0 },
+ { -2, 0, 0, 2, 4, 3, 3, 6, 0 },
+ { -1, 2, 0, 0, 0, 1, 1, 1, 12 },
+ { -2, 2, 3, 0, 0, 0, 2, 2, 9 },
+ { -1, 0, 2, 3, 0, 0, 2, 3, 7 },
+ { -1, 0, 0, 2, 3, 0, 3, 3, 6 },
+ },
+ {
+ { -10, 16, 0, 0, 0, 10, 0, 0, 0 },
+ { -6, 0, 16, 0, 0, 6, 0, 0, 0 },
+ { -4, 0, 0, 16, 0, 4, 0, 0, 0 },
+ { -2, 0, 0, 0, 16, 2, 0, 0, 0 },
+ { -10, 16, 0, 0, 0, 0, 10, 0, 0 },
+ { -6, 0, 16, 0, 0, 0, 6, 0, 0 },
+ { -4, 0, 0, 16, 0, 0, 4, 0, 0 },
+ { -2, 0, 0, 0, 16, 0, 2, 0, 0 },
+ { -10, 16, 0, 0, 0, 0, 0, 10, 0 },
+ { -6, 0, 16, 0, 0, 0, 0, 6, 0 },
+ { -4, 0, 0, 16, 0, 0, 0, 4, 0 },
+ { -2, 0, 0, 0, 16, 0, 0, 2, 0 },
+ { -10, 16, 0, 0, 0, 0, 0, 0, 10 },
+ { -6, 0, 16, 0, 0, 0, 0, 0, 6 },
+ { -4, 0, 0, 16, 0, 0, 0, 0, 4 },
+ { -2, 0, 0, 0, 16, 0, 0, 0, 2 },
+ },
+ {
+ { -8, 8, 0, 0, 0, 16, 0, 0, 0 },
+ { -8, 0, 8, 0, 0, 16, 0, 0, 0 },
+ { -8, 0, 0, 8, 0, 16, 0, 0, 0 },
+ { -8, 0, 0, 0, 8, 16, 0, 0, 0 },
+ { -4, 4, 0, 0, 0, 0, 16, 0, 0 },
+ { -4, 0, 4, 0, 0, 0, 16, 0, 0 },
+ { -4, 0, 0, 4, 0, 0, 16, 0, 0 },
+ { -4, 0, 0, 0, 4, 0, 16, 0, 0 },
+ { -2, 2, 0, 0, 0, 0, 0, 16, 0 },
+ { -2, 0, 2, 0, 0, 0, 0, 16, 0 },
+ { -2, 0, 0, 2, 0, 0, 0, 16, 0 },
+ { -2, 0, 0, 0, 2, 0, 0, 16, 0 },
+ { -1, 1, 0, 0, 0, 0, 0, 0, 16 },
+ { -1, 0, 1, 0, 0, 0, 0, 0, 16 },
+ { -1, 0, 0, 1, 0, 0, 0, 0, 16 },
+ { -1, 0, 0, 0, 1, 0, 0, 0, 16 },
+ },
+ {
+ { -2, 12, 0, 0, 0, 6, 0, 0, 0 },
+ { -1, 3, 12, 0, 0, 2, 0, 0, 0 },
+ { 0, 1, 2, 12, 0, 1, 0, 0, 0 },
+ { 0, 0, 1, 3, 12, 0, 0, 0, 0 },
+ { -2, 9, 0, 0, 0, 3, 6, 0, 0 },
+ { -1, 4, 9, 0, 0, 2, 2, 0, 0 },
+ { -1, 2, 4, 9, 0, 1, 1, 0, 0 },
+ { 0, 1, 2, 4, 9, 0, 0, 0, 0 },
+ { -1, 7, 0, 0, 0, 2, 2, 6, 0 },
+ { -1, 4, 7, 0, 0, 2, 2, 2, 0 },
+ { 0, 2, 4, 7, 0, 1, 1, 1, 0 },
+ { 0, 1, 2, 4, 7, 1, 1, 0, 0 },
+ { -1, 5, 0, 0, 0, 1, 2, 3, 6 },
+ { 0, 4, 5, 0, 0, 1, 2, 2, 2 },
+ { 0, 3, 4, 5, 0, 2, 1, 1, 0 },
+ { 0, 2, 3, 4, 5, 1, 1, 0, 0 },
+ },
+ {
+ { -2, 8, 0, 0, 0, 10, 0, 0, 0 },
+ { -1, 3, 8, 0, 0, 6, 0, 0, 0 },
+ { -1, 2, 3, 8, 0, 4, 0, 0, 0 },
+ { 0, 1, 2, 3, 8, 2, 0, 0, 0 },
+ { -1, 4, 0, 0, 0, 3, 10, 0, 0 },
+ { -1, 3, 4, 0, 0, 4, 6, 0, 0 },
+ { -1, 2, 3, 4, 0, 4, 4, 0, 0 },
+ { 0, 2, 2, 3, 4, 3, 2, 0, 0 },
+ { -1, 2, 0, 0, 0, 2, 3, 10, 0 },
+ { -1, 2, 2, 0, 0, 3, 4, 6, 0 },
+ { 0, 2, 2, 2, 0, 3, 3, 4, 0 },
+ { 0, 2, 3, 2, 0, 3, 3, 3, 0 },
+ { 0, 1, 0, 0, 0, 1, 1, 3, 10 },
+ { 0, 1, 1, 0, 0, 2, 2, 4, 6 },
+ { 0, 2, 1, 0, 0, 2, 3, 4, 4 },
+ { 0, 2, 2, 0, 0, 3, 3, 3, 3 },
+ },
+ {
+ { -12, 14, 0, 0, 0, 14, 0, 0, 0 },
+ { -10, 0, 14, 0, 0, 12, 0, 0, 0 },
+ { -9, 0, 0, 14, 0, 11, 0, 0, 0 },
+ { -8, 0, 0, 0, 14, 10, 0, 0, 0 },
+ { -10, 12, 0, 0, 0, 0, 14, 0, 0 },
+ { -9, 1, 12, 0, 0, 0, 12, 0, 0 },
+ { -8, 0, 0, 12, 0, 1, 11, 0, 0 },
+ { -7, 0, 0, 1, 12, 1, 9, 0, 0 },
+ { -9, 11, 0, 0, 0, 0, 0, 14, 0 },
+ { -8, 1, 11, 0, 0, 0, 0, 12, 0 },
+ { -8, 0, 1, 11, 0, 0, 1, 11, 0 },
+ { -7, 0, 0, 1, 11, 1, 1, 9, 0 },
+ { -8, 10, 0, 0, 0, 0, 0, 0, 14 },
+ { -7, 1, 9, 0, 0, 0, 0, 1, 12 },
+ { -7, 1, 1, 9, 0, 0, 0, 1, 11 },
+ { -6, 0, 1, 1, 10, 0, 0, 1, 9 },
+ },
+#else
+ {
+ { -3, 5, 0, 0, 0, 6, 0, 0, 0 },
+ { -2, 1, 5, 0, 0, 4, 0, 0, 0 },
+ { -2, 1, 1, 5, 0, 3, 0, 0, 0 },
+ { -1, 0, 1, 1, 5, 2, 0, 0, 0 },
+ { -2, 3, 0, 0, 0, 1, 6, 0, 0 },
+ { -2, 1, 3, 0, 0, 1, 5, 0, 0 },
+ { -1, 0, 1, 3, 0, 1, 4, 0, 0 },
+ { -1, 0, 0, 1, 3, 2, 3, 0, 0 },
+ { -1, 2, 0, 0, 0, 0, 1, 6, 0 },
+ { -1, 1, 2, 0, 0, 0, 1, 5, 0 },
+ { -1, 0, 0, 2, 0, 1, 2, 4, 0 },
+ { -1, 0, 0, 1, 3, 0, 2, 3, 0 },
+ { -1, 1, 0, 0, 0, 1, 0, 1, 6 },
+ { -1, 1, 2, 0, 0, 0, 0, 1, 5 },
+ { -1, 0, 1, 2, 0, 0, 0, 2, 4 },
+ { 0, 0, 0, 1, 2, 0, 0, 2, 3 },
+ },
+ {
+ { -5, 8, 0, 0, 0, 5, 0, 0, 0 },
+ { -3, 0, 8, 0, 0, 3, 0, 0, 0 },
+ { -2, 0, 0, 8, 0, 2, 0, 0, 0 },
+ { -1, 0, 0, 0, 8, 1, 0, 0, 0 },
+ { -5, 8, 0, 0, 0, 0, 5, 0, 0 },
+ { -3, 0, 8, 0, 0, 0, 3, 0, 0 },
+ { -2, 0, 0, 8, 0, 0, 2, 0, 0 },
+ { -1, 0, 0, 0, 8, 0, 1, 0, 0 },
+ { -5, 8, 0, 0, 0, 0, 0, 5, 0 },
+ { -3, 0, 8, 0, 0, 0, 0, 3, 0 },
+ { -2, 0, 0, 8, 0, 0, 0, 2, 0 },
+ { -1, 0, 0, 0, 8, 0, 0, 1, 0 },
+ { -5, 8, 0, 0, 0, 0, 0, 0, 5 },
+ { -3, 0, 8, 0, 0, 0, 0, 0, 3 },
+ { -2, 0, 0, 8, 0, 0, 0, 0, 2 },
+ { -1, 0, 0, 0, 8, 0, 0, 0, 1 },
+ },
+ {
+ { -4, 4, 0, 0, 0, 8, 0, 0, 0 },
+ { -4, 0, 4, 0, 0, 8, 0, 0, 0 },
+ { -4, 0, 0, 4, 0, 8, 0, 0, 0 },
+ { -4, 0, 0, 0, 4, 8, 0, 0, 0 },
+ { -2, 2, 0, 0, 0, 0, 8, 0, 0 },
+ { -2, 0, 2, 0, 0, 0, 8, 0, 0 },
+ { -2, 0, 0, 2, 0, 0, 8, 0, 0 },
+ { -2, 0, 0, 0, 2, 0, 8, 0, 0 },
+ { -1, 1, 0, 0, 0, 0, 0, 8, 0 },
+ { -1, 0, 1, 0, 0, 0, 0, 8, 0 },
+ { -1, 0, 0, 1, 0, 0, 0, 8, 0 },
+ { -1, 0, 0, 0, 1, 0, 0, 8, 0 },
+ { -1, 1, 0, 0, 0, 0, 0, 0, 8 },
+ { -1, 0, 1, 0, 0, 0, 0, 0, 8 },
+ { -1, 0, 0, 1, 0, 0, 0, 0, 8 },
+ { -1, 0, 0, 0, 1, 0, 0, 0, 8 },
+ },
+ {
+ { -1, 6, 0, 0, 0, 3, 0, 0, 0 },
+ { 0, 1, 6, 0, 0, 1, 0, 0, 0 },
+ { 0, 1, 1, 6, 0, 0, 0, 0, 0 },
+ { 0, 0, 1, 1, 6, 0, 0, 0, 0 },
+ { -1, 5, 0, 0, 0, 1, 3, 0, 0 },
+ { 0, 2, 4, 0, 0, 1, 1, 0, 0 },
+ { 0, 1, 2, 4, 0, 1, 0, 0, 0 },
+ { 0, 0, 1, 2, 5, 0, 0, 0, 0 },
+ { 0, 3, 0, 0, 0, 1, 1, 3, 0 },
+ { 0, 2, 3, 0, 0, 1, 1, 1, 0 },
+ { 0, 1, 2, 3, 0, 1, 1, 0, 0 },
+ { 0, 1, 1, 2, 4, 0, 0, 0, 0 },
+ { 0, 3, 0, 0, 0, 1, 0, 1, 3 },
+ { 0, 2, 3, 0, 0, 1, 0, 1, 1 },
+ { 0, 1, 2, 3, 0, 1, 1, 0, 0 },
+ { 0, 1, 2, 2, 3, 0, 0, 0, 0 },
+ },
+ {
+ { -1, 4, 0, 0, 0, 5, 0, 0, 0 },
+ { -1, 2, 4, 0, 0, 3, 0, 0, 0 },
+ { 0, 1, 1, 4, 0, 2, 0, 0, 0 },
+ { 0, 1, 1, 1, 4, 1, 0, 0, 0 },
+ { -1, 2, 0, 0, 0, 2, 5, 0, 0 },
+ { -1, 2, 2, 0, 0, 2, 3, 0, 0 },
+ { 0, 1, 1, 2, 0, 2, 2, 0, 0 },
+ { 0, 1, 0, 2, 2, 2, 1, 0, 0 },
+ { 0, 1, 0, 0, 0, 1, 1, 5, 0 },
+ { 0, 1, 1, 0, 0, 1, 2, 3, 0 },
+ { 0, 1, 1, 0, 0, 2, 2, 2, 0 },
+ { 0, 1, 1, 0, 0, 2, 2, 2, 0 },
+ { 0, 1, 0, 0, 0, 0, 0, 2, 5 },
+ { 0, 1, 1, 0, 0, 1, 0, 2, 3 },
+ { 0, 1, 1, 0, 0, 2, 0, 2, 2 },
+ { 0, 1, 1, 0, 0, 2, 2, 2, 0 },
+ },
+ {
+ { -6, 7, 0, 0, 0, 7, 0, 0, 0 },
+ { -5, 0, 7, 0, 0, 6, 0, 0, 0 },
+ { -4, 0, 0, 7, 0, 5, 0, 0, 0 },
+ { -4, 0, 0, 0, 7, 5, 0, 0, 0 },
+ { -5, 6, 0, 0, 0, 0, 7, 0, 0 },
+ { -4, 0, 6, 0, 0, 0, 6, 0, 0 },
+ { -4, 0, 0, 6, 0, 0, 6, 0, 0 },
+ { -4, 0, 0, 0, 6, 1, 5, 0, 0 },
+ { -4, 5, 0, 0, 0, 0, 0, 7, 0 },
+ { -4, 0, 6, 0, 0, 0, 0, 6, 0 },
+ { -4, 0, 0, 6, 0, 0, 0, 6, 0 },
+ { -3, 0, 0, 0, 6, 0, 0, 5, 0 },
+ { -4, 5, 0, 0, 0, 0, 0, 0, 7 },
+ { -4, 1, 5, 0, 0, 0, 0, 0, 6 },
+ { -3, 0, 0, 5, 0, 0, 0, 0, 6 },
+ { -3, 0, 0, 1, 5, 0, 0, 0, 5 },
+ },
+#endif
+};
+#else
static int filter_intra_taps_3[TX_SIZES_ALL][FILTER_INTRA_MODES][3] = {
{
{ 5, 7, -4 },
@@ -1252,27 +1514,79 @@
{ 7, 7, -6 },
},
};
+#endif
static void filter_intra_predictors_3tap(uint8_t *dst, ptrdiff_t stride,
TX_SIZE tx_size, const uint8_t *above,
const uint8_t *left, int mode) {
int r, c;
- int ipred;
-#if CONFIG_TX64X64
- int buffer[65][65];
-#else
int buffer[33][33];
-#endif // CONFIG_TX64X64
- const int c0 = filter_intra_taps_3[tx_size][mode][0];
- const int c1 = filter_intra_taps_3[tx_size][mode][1];
- const int c2 = filter_intra_taps_3[tx_size][mode][2];
const int bw = tx_size_wide[tx_size];
const int bh = tx_size_high[tx_size];
+ assert(bw <= 32 && bh <= 32);
+
for (r = 0; r < bh; ++r) buffer[r + 1][0] = (int)left[r];
for (c = 0; c < bw + 1; ++c) buffer[0][c] = (int)above[c - 1];
+#if FILTER_INTRA_PROC_UNIT_SIZE == 2
+ for (r = 1; r < bh + 1; r += 2)
+ for (c = 1; c < bw + 1; c += 2) {
+ const int p0 = buffer[r - 1][c - 1];
+ const int p1 = buffer[r - 1][c];
+ const int p2 = buffer[r - 1][c + 1];
+ const int p3 = buffer[r][c - 1];
+ const int p4 = buffer[r + 1][c - 1];
+ for (int k = 0; k < 4; ++k) {
+ int r_offset = k >> 1;
+ int c_offset = k & 0x01;
+ buffer[r + r_offset][c + c_offset] =
+ filter_intra_taps_2x2procunit[mode][k][0] * p0 +
+ filter_intra_taps_2x2procunit[mode][k][1] * p1 +
+ filter_intra_taps_2x2procunit[mode][k][2] * p2 +
+ filter_intra_taps_2x2procunit[mode][k][3] * p3 +
+ filter_intra_taps_2x2procunit[mode][k][4] * p4;
+ buffer[r + r_offset][c + c_offset] =
+ clip_pixel(ROUND_POWER_OF_TWO_SIGNED(
+ buffer[r + r_offset][c + c_offset], FILTER_INTRA_SCALE_BITS));
+ }
+ }
+#elif FILTER_INTRA_PROC_UNIT_SIZE == 4
+ for (r = 1; r < bh + 1; r += 4)
+ for (c = 1; c < bw + 1; c += 4) {
+ const int p0 = buffer[r - 1][c - 1];
+ const int p1 = buffer[r - 1][c];
+ const int p2 = buffer[r - 1][c + 1];
+ const int p3 = buffer[r - 1][c + 2];
+ const int p4 = buffer[r - 1][c + 3];
+ const int p5 = buffer[r][c - 1];
+ const int p6 = buffer[r + 1][c - 1];
+ const int p7 = buffer[r + 2][c - 1];
+ const int p8 = buffer[r + 3][c - 1];
+ for (int k = 0; k < 16; ++k) {
+ int r_offset = k >> 2;
+ int c_offset = k & 0x03;
+ buffer[r + r_offset][c + c_offset] =
+ filter_intra_taps_4x4procunit[mode][k][0] * p0 +
+ filter_intra_taps_4x4procunit[mode][k][1] * p1 +
+ filter_intra_taps_4x4procunit[mode][k][2] * p2 +
+ filter_intra_taps_4x4procunit[mode][k][3] * p3 +
+ filter_intra_taps_4x4procunit[mode][k][4] * p4 +
+ filter_intra_taps_4x4procunit[mode][k][5] * p5 +
+ filter_intra_taps_4x4procunit[mode][k][6] * p6 +
+ filter_intra_taps_4x4procunit[mode][k][7] * p7 +
+ filter_intra_taps_4x4procunit[mode][k][8] * p8;
+ buffer[r + r_offset][c + c_offset] =
+ clip_pixel(ROUND_POWER_OF_TWO_SIGNED(
+ buffer[r + r_offset][c + c_offset], FILTER_INTRA_SCALE_BITS));
+ }
+ }
+#else
+ int ipred;
+ const int c0 = filter_intra_taps_3[tx_size][mode][0];
+ const int c1 = filter_intra_taps_3[tx_size][mode][1];
+ const int c2 = filter_intra_taps_3[tx_size][mode][2];
for (r = 1; r < bh + 1; ++r)
for (c = 1; c < bw + 1; ++c) {
ipred = c0 * buffer[r - 1][c] + c1 * buffer[r][c - 1] +
@@ -1280,10 +1594,10 @@
buffer[r][c] = ROUND_POWER_OF_TWO_SIGNED(ipred, FILTER_INTRA_SCALE_BITS);
buffer[r][c] = clip_pixel(buffer[r][c]);
}
-
+#endif
for (r = 0; r < bh; ++r) {
for (c = 0; c < bw; ++c) {
- dst[c] = clip_pixel(buffer[r + 1][c + 1]);
+ dst[c] = buffer[r + 1][c + 1];
}
dst += stride;
}
@@ -1360,33 +1674,87 @@
const uint16_t *left, int mode,
int bd) {
int r, c;
- int ipred;
-#if CONFIG_TX64X64
- int preds[65][65];
-#else
- int preds[33][33];
-#endif // CONFIG_TX64X64
- const int c0 = filter_intra_taps_3[tx_size][mode][0];
- const int c1 = filter_intra_taps_3[tx_size][mode][1];
- const int c2 = filter_intra_taps_3[tx_size][mode][2];
+ int buffer[33][33];
const int bw = tx_size_wide[tx_size];
const int bh = tx_size_high[tx_size];
- for (r = 0; r < bh; ++r) preds[r + 1][0] = (int)left[r];
+ assert(bw <= 32 && bh <= 32);
- for (c = 0; c < bw + 1; ++c) preds[0][c] = (int)above[c - 1];
+ for (r = 0; r < bh; ++r) buffer[r + 1][0] = (int)left[r];
+ for (c = 0; c < bw + 1; ++c) buffer[0][c] = (int)above[c - 1];
+
+#if FILTER_INTRA_PROC_UNIT_SIZE == 2
+ for (r = 1; r < bh + 1; r += 2)
+ for (c = 1; c < bw + 1; c += 2) {
+ const int p0 = buffer[r - 1][c - 1];
+ const int p1 = buffer[r - 1][c];
+ const int p2 = buffer[r - 1][c + 1];
+ const int p3 = buffer[r][c - 1];
+ const int p4 = buffer[r + 1][c - 1];
+ for (int k = 0; k < 4; ++k) {
+ int r_offset = k >> 1;
+ int c_offset = k & 0x01;
+ buffer[r + r_offset][c + c_offset] =
+ filter_intra_taps_2x2procunit[mode][k][0] * p0 +
+ filter_intra_taps_2x2procunit[mode][k][1] * p1 +
+ filter_intra_taps_2x2procunit[mode][k][2] * p2 +
+ filter_intra_taps_2x2procunit[mode][k][3] * p3 +
+ filter_intra_taps_2x2procunit[mode][k][4] * p4;
+ buffer[r + r_offset][c + c_offset] = clip_pixel_highbd(
+ ROUND_POWER_OF_TWO_SIGNED(buffer[r + r_offset][c + c_offset],
+ FILTER_INTRA_SCALE_BITS),
+ bd);
+ }
+ }
+#elif FILTER_INTRA_PROC_UNIT_SIZE == 4
+ for (r = 1; r < bh + 1; r += 4)
+ for (c = 1; c < bw + 1; c += 4) {
+ const int p0 = buffer[r - 1][c - 1];
+ const int p1 = buffer[r - 1][c];
+ const int p2 = buffer[r - 1][c + 1];
+ const int p3 = buffer[r - 1][c + 2];
+ const int p4 = buffer[r - 1][c + 3];
+ const int p5 = buffer[r][c - 1];
+ const int p6 = buffer[r + 1][c - 1];
+ const int p7 = buffer[r + 2][c - 1];
+ const int p8 = buffer[r + 3][c - 1];
+ for (int k = 0; k < 16; ++k) {
+ int r_offset = k >> 2;
+ int c_offset = k & 0x03;
+ buffer[r + r_offset][c + c_offset] =
+ filter_intra_taps_4x4procunit[mode][k][0] * p0 +
+ filter_intra_taps_4x4procunit[mode][k][1] * p1 +
+ filter_intra_taps_4x4procunit[mode][k][2] * p2 +
+ filter_intra_taps_4x4procunit[mode][k][3] * p3 +
+ filter_intra_taps_4x4procunit[mode][k][4] * p4 +
+ filter_intra_taps_4x4procunit[mode][k][5] * p5 +
+ filter_intra_taps_4x4procunit[mode][k][6] * p6 +
+ filter_intra_taps_4x4procunit[mode][k][7] * p7 +
+ filter_intra_taps_4x4procunit[mode][k][8] * p8;
+ buffer[r + r_offset][c + c_offset] = clip_pixel_highbd(
+ ROUND_POWER_OF_TWO_SIGNED(buffer[r + r_offset][c + c_offset],
+ FILTER_INTRA_SCALE_BITS),
+ bd);
+ }
+ }
+#else
+ int ipred;
+ const int c0 = filter_intra_taps_3[tx_size][mode][0];
+ const int c1 = filter_intra_taps_3[tx_size][mode][1];
+ const int c2 = filter_intra_taps_3[tx_size][mode][2];
for (r = 1; r < bh + 1; ++r)
for (c = 1; c < bw + 1; ++c) {
- ipred = c0 * preds[r - 1][c] + c1 * preds[r][c - 1] +
- c2 * preds[r - 1][c - 1];
- preds[r][c] = ROUND_POWER_OF_TWO_SIGNED(ipred, FILTER_INTRA_SCALE_BITS);
- preds[r][c] = clip_pixel_highbd(preds[r][c], bd);
+ ipred = c0 * buffer[r - 1][c] + c1 * buffer[r][c - 1] +
+ c2 * buffer[r - 1][c - 1];
+ buffer[r][c] = ROUND_POWER_OF_TWO_SIGNED(ipred, FILTER_INTRA_SCALE_BITS);
+ buffer[r][c] = clip_pixel_highbd(buffer[r][c], bd);
}
+#endif
for (r = 0; r < bh; ++r) {
for (c = 0; c < bw; ++c) {
- dst[c] = clip_pixel_highbd(preds[r + 1][c + 1], bd);
+ dst[c] = buffer[r + 1][c + 1];
}
dst += stride;
}
diff --git a/av1/common/reconintra.h b/av1/common/reconintra.h
index 6fbc135..c507a42 100644
--- a/av1/common/reconintra.h
+++ b/av1/common/reconintra.h
@@ -42,7 +42,12 @@
};
#if CONFIG_FILTER_INTRA
+#define FILTER_INTRA_PROC_UNIT_SIZE 2
+#if FILTER_INTRA_PROC_UNIT_SIZE == 4
+#define FILTER_INTRA_SCALE_BITS 4
+#else
#define FILTER_INTRA_SCALE_BITS 3
+#endif
#endif // CONFIG_FILTER_INTRA
#define CONFIG_INTRA_EDGE_UPSAMPLE CONFIG_INTRA_EDGE