Improve filter_intra throughput

The prediction can be done in 2x2 or 4x4 processing units. Within each
unit there are no dependencies between the predicted pixels, so the
computation can be fully parallelized.
Also enable filter_intra for transform blocks smaller than 8x8, and
disable it for transform blocks larger than 32x32.

Change-Id: I4f8a3104019cbb35e88f342d97516f81b19152b0
diff --git a/av1/common/blockd.h b/av1/common/blockd.h
index 73fe6de..7d6eb71 100644
--- a/av1/common/blockd.h
+++ b/av1/common/blockd.h
@@ -198,7 +198,7 @@
   DC_PRED, V_PRED, H_PRED, D117_PRED, D153_PRED, DC_PRED
 };
 
-#define DISABLE_SUB8X8_FILTER_INTRA 1
+#define DISABLE_SUB8X8_FILTER_INTRA 0
 
 static INLINE int av1_filter_intra_allowed_bsize(BLOCK_SIZE bs) {
   (void)bs;
@@ -212,9 +212,10 @@
 static INLINE int av1_filter_intra_allowed_txsize(TX_SIZE tx) {
   (void)tx;
 #if DISABLE_SUB8X8_FILTER_INTRA
-  return tx_size_wide[tx] >= 8 && tx_size_high[tx] >= 8;
+  return tx_size_wide[tx] >= 8 && tx_size_high[tx] >= 8 &&
+         tx_size_wide[tx] <= 32 && tx_size_high[tx] <= 32;
 #else
-  return 1;
+  return tx_size_wide[tx] <= 32 && tx_size_high[tx] <= 32;
 #endif
 }
 #endif  // CONFIG_FILTER_INTRA
diff --git a/av1/common/reconintra.c b/av1/common/reconintra.c
index 94ce012..7d36847 100644
--- a/av1/common/reconintra.c
+++ b/av1/common/reconintra.c
@@ -1110,6 +1110,268 @@
 #endif  // CONFIG_EXT_INTRA
 
 #if CONFIG_FILTER_INTRA
+#if FILTER_INTRA_PROC_UNIT_SIZE == 2
+static int filter_intra_taps_2x2procunit[FILTER_INTRA_MODES][4][5] = {
+  {
+      { -3, 5, 0, 6, 0 },
+      { -2, 1, 5, 4, 0 },
+      { -2, 3, 0, 1, 6 },
+      { -2, 1, 3, 1, 5 },
+  },
+  {
+      { -5, 8, 0, 5, 0 },
+      { -3, 0, 8, 3, 0 },
+      { -5, 8, 0, 0, 5 },
+      { -3, 0, 8, 0, 3 },
+  },
+  {
+      { -4, 4, 0, 8, 0 },
+      { -4, 0, 4, 8, 0 },
+      { -2, 2, 0, 0, 8 },
+      { -2, 0, 2, 0, 8 },
+  },
+  {
+      { -1, 6, 0, 3, 0 },
+      { 0, 1, 6, 1, 0 },
+      { -1, 5, 0, 1, 3 },
+      { 0, 2, 4, 1, 1 },
+  },
+  {
+      { -1, 4, 0, 5, 0 },
+      { -1, 2, 4, 3, 0 },
+      { -1, 2, 0, 2, 5 },
+      { -1, 2, 2, 2, 3 },
+  },
+  {
+      { -6, 7, 0, 7, 0 },
+      { -5, 0, 7, 6, 0 },
+      { -5, 6, 0, 0, 7 },
+      { -4, 0, 6, 0, 6 },
+  },
+};
+#elif FILTER_INTRA_PROC_UNIT_SIZE == 4
+static int filter_intra_taps_4x4procunit[FILTER_INTRA_MODES][16][9] = {
+#if FILTER_INTRA_SCALE_BITS == 4
+  {
+      { -6, 10, 0, 0, 0, 12, 0, 0, 0 },
+      { -5, 2, 10, 0, 0, 9, 0, 0, 0 },
+      { -3, 1, 1, 10, 0, 7, 0, 0, 0 },
+      { -3, 1, 1, 2, 10, 5, 0, 0, 0 },
+      { -4, 6, 0, 0, 0, 2, 12, 0, 0 },
+      { -3, 2, 6, 0, 0, 2, 9, 0, 0 },
+      { -3, 2, 2, 6, 0, 2, 7, 0, 0 },
+      { -2, 0, 2, 2, 6, 3, 5, 0, 0 },
+      { -2, 4, 0, 0, 0, 1, 1, 12, 0 },
+      { -3, 2, 4, 0, 0, 2, 2, 9, 0 },
+      { -2, 0, 2, 4, 0, 2, 3, 7, 0 },
+      { -2, 0, 0, 2, 4, 3, 3, 6, 0 },
+      { -1, 2, 0, 0, 0, 1, 1, 1, 12 },
+      { -2, 2, 3, 0, 0, 0, 2, 2, 9 },
+      { -1, 0, 2, 3, 0, 0, 2, 3, 7 },
+      { -1, 0, 0, 2, 3, 0, 3, 3, 6 },
+  },
+  {
+      { -10, 16, 0, 0, 0, 10, 0, 0, 0 },
+      { -6, 0, 16, 0, 0, 6, 0, 0, 0 },
+      { -4, 0, 0, 16, 0, 4, 0, 0, 0 },
+      { -2, 0, 0, 0, 16, 2, 0, 0, 0 },
+      { -10, 16, 0, 0, 0, 0, 10, 0, 0 },
+      { -6, 0, 16, 0, 0, 0, 6, 0, 0 },
+      { -4, 0, 0, 16, 0, 0, 4, 0, 0 },
+      { -2, 0, 0, 0, 16, 0, 2, 0, 0 },
+      { -10, 16, 0, 0, 0, 0, 0, 10, 0 },
+      { -6, 0, 16, 0, 0, 0, 0, 6, 0 },
+      { -4, 0, 0, 16, 0, 0, 0, 4, 0 },
+      { -2, 0, 0, 0, 16, 0, 0, 2, 0 },
+      { -10, 16, 0, 0, 0, 0, 0, 0, 10 },
+      { -6, 0, 16, 0, 0, 0, 0, 0, 6 },
+      { -4, 0, 0, 16, 0, 0, 0, 0, 4 },
+      { -2, 0, 0, 0, 16, 0, 0, 0, 2 },
+  },
+  {
+      { -8, 8, 0, 0, 0, 16, 0, 0, 0 },
+      { -8, 0, 8, 0, 0, 16, 0, 0, 0 },
+      { -8, 0, 0, 8, 0, 16, 0, 0, 0 },
+      { -8, 0, 0, 0, 8, 16, 0, 0, 0 },
+      { -4, 4, 0, 0, 0, 0, 16, 0, 0 },
+      { -4, 0, 4, 0, 0, 0, 16, 0, 0 },
+      { -4, 0, 0, 4, 0, 0, 16, 0, 0 },
+      { -4, 0, 0, 0, 4, 0, 16, 0, 0 },
+      { -2, 2, 0, 0, 0, 0, 0, 16, 0 },
+      { -2, 0, 2, 0, 0, 0, 0, 16, 0 },
+      { -2, 0, 0, 2, 0, 0, 0, 16, 0 },
+      { -2, 0, 0, 0, 2, 0, 0, 16, 0 },
+      { -1, 1, 0, 0, 0, 0, 0, 0, 16 },
+      { -1, 0, 1, 0, 0, 0, 0, 0, 16 },
+      { -1, 0, 0, 1, 0, 0, 0, 0, 16 },
+      { -1, 0, 0, 0, 1, 0, 0, 0, 16 },
+  },
+  {
+      { -2, 12, 0, 0, 0, 6, 0, 0, 0 },
+      { -1, 3, 12, 0, 0, 2, 0, 0, 0 },
+      { 0, 1, 2, 12, 0, 1, 0, 0, 0 },
+      { 0, 0, 1, 3, 12, 0, 0, 0, 0 },
+      { -2, 9, 0, 0, 0, 3, 6, 0, 0 },
+      { -1, 4, 9, 0, 0, 2, 2, 0, 0 },
+      { -1, 2, 4, 9, 0, 1, 1, 0, 0 },
+      { 0, 1, 2, 4, 9, 0, 0, 0, 0 },
+      { -1, 7, 0, 0, 0, 2, 2, 6, 0 },
+      { -1, 4, 7, 0, 0, 2, 2, 2, 0 },
+      { 0, 2, 4, 7, 0, 1, 1, 1, 0 },
+      { 0, 1, 2, 4, 7, 1, 1, 0, 0 },
+      { -1, 5, 0, 0, 0, 1, 2, 3, 6 },
+      { 0, 4, 5, 0, 0, 1, 2, 2, 2 },
+      { 0, 3, 4, 5, 0, 2, 1, 1, 0 },
+      { 0, 2, 3, 4, 5, 1, 1, 0, 0 },
+  },
+  {
+      { -2, 8, 0, 0, 0, 10, 0, 0, 0 },
+      { -1, 3, 8, 0, 0, 6, 0, 0, 0 },
+      { -1, 2, 3, 8, 0, 4, 0, 0, 0 },
+      { 0, 1, 2, 3, 8, 2, 0, 0, 0 },
+      { -1, 4, 0, 0, 0, 3, 10, 0, 0 },
+      { -1, 3, 4, 0, 0, 4, 6, 0, 0 },
+      { -1, 2, 3, 4, 0, 4, 4, 0, 0 },
+      { 0, 2, 2, 3, 4, 3, 2, 0, 0 },
+      { -1, 2, 0, 0, 0, 2, 3, 10, 0 },
+      { -1, 2, 2, 0, 0, 3, 4, 6, 0 },
+      { 0, 2, 2, 2, 0, 3, 3, 4, 0 },
+      { 0, 2, 3, 2, 0, 3, 3, 3, 0 },
+      { 0, 1, 0, 0, 0, 1, 1, 3, 10 },
+      { 0, 1, 1, 0, 0, 2, 2, 4, 6 },
+      { 0, 2, 1, 0, 0, 2, 3, 4, 4 },
+      { 0, 2, 2, 0, 0, 3, 3, 3, 3 },
+  },
+  {
+      { -12, 14, 0, 0, 0, 14, 0, 0, 0 },
+      { -10, 0, 14, 0, 0, 12, 0, 0, 0 },
+      { -9, 0, 0, 14, 0, 11, 0, 0, 0 },
+      { -8, 0, 0, 0, 14, 10, 0, 0, 0 },
+      { -10, 12, 0, 0, 0, 0, 14, 0, 0 },
+      { -9, 1, 12, 0, 0, 0, 12, 0, 0 },
+      { -8, 0, 0, 12, 0, 1, 11, 0, 0 },
+      { -7, 0, 0, 1, 12, 1, 9, 0, 0 },
+      { -9, 11, 0, 0, 0, 0, 0, 14, 0 },
+      { -8, 1, 11, 0, 0, 0, 0, 12, 0 },
+      { -8, 0, 1, 11, 0, 0, 1, 11, 0 },
+      { -7, 0, 0, 1, 11, 1, 1, 9, 0 },
+      { -8, 10, 0, 0, 0, 0, 0, 0, 14 },
+      { -7, 1, 9, 0, 0, 0, 0, 1, 12 },
+      { -7, 1, 1, 9, 0, 0, 0, 1, 11 },
+      { -6, 0, 1, 1, 10, 0, 0, 1, 9 },
+  },
+#else
+  {
+      { -3, 5, 0, 0, 0, 6, 0, 0, 0 },
+      { -2, 1, 5, 0, 0, 4, 0, 0, 0 },
+      { -2, 1, 1, 5, 0, 3, 0, 0, 0 },
+      { -1, 0, 1, 1, 5, 2, 0, 0, 0 },
+      { -2, 3, 0, 0, 0, 1, 6, 0, 0 },
+      { -2, 1, 3, 0, 0, 1, 5, 0, 0 },
+      { -1, 0, 1, 3, 0, 1, 4, 0, 0 },
+      { -1, 0, 0, 1, 3, 2, 3, 0, 0 },
+      { -1, 2, 0, 0, 0, 0, 1, 6, 0 },
+      { -1, 1, 2, 0, 0, 0, 1, 5, 0 },
+      { -1, 0, 0, 2, 0, 1, 2, 4, 0 },
+      { -1, 0, 0, 1, 3, 0, 2, 3, 0 },
+      { -1, 1, 0, 0, 0, 1, 0, 1, 6 },
+      { -1, 1, 2, 0, 0, 0, 0, 1, 5 },
+      { -1, 0, 1, 2, 0, 0, 0, 2, 4 },
+      { 0, 0, 0, 1, 2, 0, 0, 2, 3 },
+  },
+  {
+      { -5, 8, 0, 0, 0, 5, 0, 0, 0 },
+      { -3, 0, 8, 0, 0, 3, 0, 0, 0 },
+      { -2, 0, 0, 8, 0, 2, 0, 0, 0 },
+      { -1, 0, 0, 0, 8, 1, 0, 0, 0 },
+      { -5, 8, 0, 0, 0, 0, 5, 0, 0 },
+      { -3, 0, 8, 0, 0, 0, 3, 0, 0 },
+      { -2, 0, 0, 8, 0, 0, 2, 0, 0 },
+      { -1, 0, 0, 0, 8, 0, 1, 0, 0 },
+      { -5, 8, 0, 0, 0, 0, 0, 5, 0 },
+      { -3, 0, 8, 0, 0, 0, 0, 3, 0 },
+      { -2, 0, 0, 8, 0, 0, 0, 2, 0 },
+      { -1, 0, 0, 0, 8, 0, 0, 1, 0 },
+      { -5, 8, 0, 0, 0, 0, 0, 0, 5 },
+      { -3, 0, 8, 0, 0, 0, 0, 0, 3 },
+      { -2, 0, 0, 8, 0, 0, 0, 0, 2 },
+      { -1, 0, 0, 0, 8, 0, 0, 0, 1 },
+  },
+  {
+      { -4, 4, 0, 0, 0, 8, 0, 0, 0 },
+      { -4, 0, 4, 0, 0, 8, 0, 0, 0 },
+      { -4, 0, 0, 4, 0, 8, 0, 0, 0 },
+      { -4, 0, 0, 0, 4, 8, 0, 0, 0 },
+      { -2, 2, 0, 0, 0, 0, 8, 0, 0 },
+      { -2, 0, 2, 0, 0, 0, 8, 0, 0 },
+      { -2, 0, 0, 2, 0, 0, 8, 0, 0 },
+      { -2, 0, 0, 0, 2, 0, 8, 0, 0 },
+      { -1, 1, 0, 0, 0, 0, 0, 8, 0 },
+      { -1, 0, 1, 0, 0, 0, 0, 8, 0 },
+      { -1, 0, 0, 1, 0, 0, 0, 8, 0 },
+      { -1, 0, 0, 0, 1, 0, 0, 8, 0 },
+      { -1, 1, 0, 0, 0, 0, 0, 0, 8 },
+      { -1, 0, 1, 0, 0, 0, 0, 0, 8 },
+      { -1, 0, 0, 1, 0, 0, 0, 0, 8 },
+      { -1, 0, 0, 0, 1, 0, 0, 0, 8 },
+  },
+  {
+      { -1, 6, 0, 0, 0, 3, 0, 0, 0 },
+      { 0, 1, 6, 0, 0, 1, 0, 0, 0 },
+      { 0, 1, 1, 6, 0, 0, 0, 0, 0 },
+      { 0, 0, 1, 1, 6, 0, 0, 0, 0 },
+      { -1, 5, 0, 0, 0, 1, 3, 0, 0 },
+      { 0, 2, 4, 0, 0, 1, 1, 0, 0 },
+      { 0, 1, 2, 4, 0, 1, 0, 0, 0 },
+      { 0, 0, 1, 2, 5, 0, 0, 0, 0 },
+      { 0, 3, 0, 0, 0, 1, 1, 3, 0 },
+      { 0, 2, 3, 0, 0, 1, 1, 1, 0 },
+      { 0, 1, 2, 3, 0, 1, 1, 0, 0 },
+      { 0, 1, 1, 2, 4, 0, 0, 0, 0 },
+      { 0, 3, 0, 0, 0, 1, 0, 1, 3 },
+      { 0, 2, 3, 0, 0, 1, 0, 1, 1 },
+      { 0, 1, 2, 3, 0, 1, 1, 0, 0 },
+      { 0, 1, 2, 2, 3, 0, 0, 0, 0 },
+  },
+  {
+      { -1, 4, 0, 0, 0, 5, 0, 0, 0 },
+      { -1, 2, 4, 0, 0, 3, 0, 0, 0 },
+      { 0, 1, 1, 4, 0, 2, 0, 0, 0 },
+      { 0, 1, 1, 1, 4, 1, 0, 0, 0 },
+      { -1, 2, 0, 0, 0, 2, 5, 0, 0 },
+      { -1, 2, 2, 0, 0, 2, 3, 0, 0 },
+      { 0, 1, 1, 2, 0, 2, 2, 0, 0 },
+      { 0, 1, 0, 2, 2, 2, 1, 0, 0 },
+      { 0, 1, 0, 0, 0, 1, 1, 5, 0 },
+      { 0, 1, 1, 0, 0, 1, 2, 3, 0 },
+      { 0, 1, 1, 0, 0, 2, 2, 2, 0 },
+      { 0, 1, 1, 0, 0, 2, 2, 2, 0 },
+      { 0, 1, 0, 0, 0, 0, 0, 2, 5 },
+      { 0, 1, 1, 0, 0, 1, 0, 2, 3 },
+      { 0, 1, 1, 0, 0, 2, 0, 2, 2 },
+      { 0, 1, 1, 0, 0, 2, 2, 2, 0 },
+  },
+  {
+      { -6, 7, 0, 0, 0, 7, 0, 0, 0 },
+      { -5, 0, 7, 0, 0, 6, 0, 0, 0 },
+      { -4, 0, 0, 7, 0, 5, 0, 0, 0 },
+      { -4, 0, 0, 0, 7, 5, 0, 0, 0 },
+      { -5, 6, 0, 0, 0, 0, 7, 0, 0 },
+      { -4, 0, 6, 0, 0, 0, 6, 0, 0 },
+      { -4, 0, 0, 6, 0, 0, 6, 0, 0 },
+      { -4, 0, 0, 0, 6, 1, 5, 0, 0 },
+      { -4, 5, 0, 0, 0, 0, 0, 7, 0 },
+      { -4, 0, 6, 0, 0, 0, 0, 6, 0 },
+      { -4, 0, 0, 6, 0, 0, 0, 6, 0 },
+      { -3, 0, 0, 0, 6, 0, 0, 5, 0 },
+      { -4, 5, 0, 0, 0, 0, 0, 0, 7 },
+      { -4, 1, 5, 0, 0, 0, 0, 0, 6 },
+      { -3, 0, 0, 5, 0, 0, 0, 0, 6 },
+      { -3, 0, 0, 1, 5, 0, 0, 0, 5 },
+  },
+#endif
+};
+#else
 static int filter_intra_taps_3[TX_SIZES_ALL][FILTER_INTRA_MODES][3] = {
   {
       { 5, 7, -4 },
@@ -1252,27 +1514,79 @@
       { 7, 7, -6 },
   },
 };
+#endif
 
 static void filter_intra_predictors_3tap(uint8_t *dst, ptrdiff_t stride,
                                          TX_SIZE tx_size, const uint8_t *above,
                                          const uint8_t *left, int mode) {
   int r, c;
-  int ipred;
-#if CONFIG_TX64X64
-  int buffer[65][65];
-#else
   int buffer[33][33];
-#endif  // CONFIG_TX64X64
-  const int c0 = filter_intra_taps_3[tx_size][mode][0];
-  const int c1 = filter_intra_taps_3[tx_size][mode][1];
-  const int c2 = filter_intra_taps_3[tx_size][mode][2];
   const int bw = tx_size_wide[tx_size];
   const int bh = tx_size_high[tx_size];
 
+  assert(bw <= 32 && bh <= 32);
+
   for (r = 0; r < bh; ++r) buffer[r + 1][0] = (int)left[r];
 
   for (c = 0; c < bw + 1; ++c) buffer[0][c] = (int)above[c - 1];
 
+#if FILTER_INTRA_PROC_UNIT_SIZE == 2
+  for (r = 1; r < bh + 1; r += 2)
+    for (c = 1; c < bw + 1; c += 2) {
+      const int p0 = buffer[r - 1][c - 1];
+      const int p1 = buffer[r - 1][c];
+      const int p2 = buffer[r - 1][c + 1];
+      const int p3 = buffer[r][c - 1];
+      const int p4 = buffer[r + 1][c - 1];
+      for (int k = 0; k < 4; ++k) {
+        int r_offset = k >> 1;
+        int c_offset = k & 0x01;
+        buffer[r + r_offset][c + c_offset] =
+            filter_intra_taps_2x2procunit[mode][k][0] * p0 +
+            filter_intra_taps_2x2procunit[mode][k][1] * p1 +
+            filter_intra_taps_2x2procunit[mode][k][2] * p2 +
+            filter_intra_taps_2x2procunit[mode][k][3] * p3 +
+            filter_intra_taps_2x2procunit[mode][k][4] * p4;
+        buffer[r + r_offset][c + c_offset] =
+            clip_pixel(ROUND_POWER_OF_TWO_SIGNED(
+                buffer[r + r_offset][c + c_offset], FILTER_INTRA_SCALE_BITS));
+      }
+    }
+#elif FILTER_INTRA_PROC_UNIT_SIZE == 4
+  for (r = 1; r < bh + 1; r += 4)
+    for (c = 1; c < bw + 1; c += 4) {
+      const int p0 = buffer[r - 1][c - 1];
+      const int p1 = buffer[r - 1][c];
+      const int p2 = buffer[r - 1][c + 1];
+      const int p3 = buffer[r - 1][c + 2];
+      const int p4 = buffer[r - 1][c + 3];
+      const int p5 = buffer[r][c - 1];
+      const int p6 = buffer[r + 1][c - 1];
+      const int p7 = buffer[r + 2][c - 1];
+      const int p8 = buffer[r + 3][c - 1];
+      for (int k = 0; k < 16; ++k) {
+        int r_offset = k >> 2;
+        int c_offset = k & 0x03;
+        buffer[r + r_offset][c + c_offset] =
+            filter_intra_taps_4x4procunit[mode][k][0] * p0 +
+            filter_intra_taps_4x4procunit[mode][k][1] * p1 +
+            filter_intra_taps_4x4procunit[mode][k][2] * p2 +
+            filter_intra_taps_4x4procunit[mode][k][3] * p3 +
+            filter_intra_taps_4x4procunit[mode][k][4] * p4 +
+            filter_intra_taps_4x4procunit[mode][k][5] * p5 +
+            filter_intra_taps_4x4procunit[mode][k][6] * p6 +
+            filter_intra_taps_4x4procunit[mode][k][7] * p7 +
+            filter_intra_taps_4x4procunit[mode][k][8] * p8;
+        buffer[r + r_offset][c + c_offset] =
+            clip_pixel(ROUND_POWER_OF_TWO_SIGNED(
+                buffer[r + r_offset][c + c_offset], FILTER_INTRA_SCALE_BITS));
+      }
+    }
+#else
+  int ipred;
+  const int c0 = filter_intra_taps_3[tx_size][mode][0];
+  const int c1 = filter_intra_taps_3[tx_size][mode][1];
+  const int c2 = filter_intra_taps_3[tx_size][mode][2];
   for (r = 1; r < bh + 1; ++r)
     for (c = 1; c < bw + 1; ++c) {
       ipred = c0 * buffer[r - 1][c] + c1 * buffer[r][c - 1] +
@@ -1280,10 +1594,10 @@
       buffer[r][c] = ROUND_POWER_OF_TWO_SIGNED(ipred, FILTER_INTRA_SCALE_BITS);
       buffer[r][c] = clip_pixel(buffer[r][c]);
     }
-
+#endif
   for (r = 0; r < bh; ++r) {
     for (c = 0; c < bw; ++c) {
-      dst[c] = clip_pixel(buffer[r + 1][c + 1]);
+      dst[c] = buffer[r + 1][c + 1];
     }
     dst += stride;
   }
@@ -1360,33 +1674,87 @@
                                                 const uint16_t *left, int mode,
                                                 int bd) {
   int r, c;
-  int ipred;
-#if CONFIG_TX64X64
-  int preds[65][65];
-#else
-  int preds[33][33];
-#endif  // CONFIG_TX64X64
-  const int c0 = filter_intra_taps_3[tx_size][mode][0];
-  const int c1 = filter_intra_taps_3[tx_size][mode][1];
-  const int c2 = filter_intra_taps_3[tx_size][mode][2];
+  int buffer[33][33];
   const int bw = tx_size_wide[tx_size];
   const int bh = tx_size_high[tx_size];
 
-  for (r = 0; r < bh; ++r) preds[r + 1][0] = (int)left[r];
+  assert(bw <= 32 && bh <= 32);
 
-  for (c = 0; c < bw + 1; ++c) preds[0][c] = (int)above[c - 1];
+  for (r = 0; r < bh; ++r) buffer[r + 1][0] = (int)left[r];
 
+  for (c = 0; c < bw + 1; ++c) buffer[0][c] = (int)above[c - 1];
+
+#if FILTER_INTRA_PROC_UNIT_SIZE == 2
+  for (r = 1; r < bh + 1; r += 2)
+    for (c = 1; c < bw + 1; c += 2) {
+      const int p0 = buffer[r - 1][c - 1];
+      const int p1 = buffer[r - 1][c];
+      const int p2 = buffer[r - 1][c + 1];
+      const int p3 = buffer[r][c - 1];
+      const int p4 = buffer[r + 1][c - 1];
+      for (int k = 0; k < 4; ++k) {
+        int r_offset = k >> 1;
+        int c_offset = k & 0x01;
+        buffer[r + r_offset][c + c_offset] =
+            filter_intra_taps_2x2procunit[mode][k][0] * p0 +
+            filter_intra_taps_2x2procunit[mode][k][1] * p1 +
+            filter_intra_taps_2x2procunit[mode][k][2] * p2 +
+            filter_intra_taps_2x2procunit[mode][k][3] * p3 +
+            filter_intra_taps_2x2procunit[mode][k][4] * p4;
+        buffer[r + r_offset][c + c_offset] = clip_pixel_highbd(
+            ROUND_POWER_OF_TWO_SIGNED(buffer[r + r_offset][c + c_offset],
+                                      FILTER_INTRA_SCALE_BITS),
+            bd);
+      }
+    }
+#elif FILTER_INTRA_PROC_UNIT_SIZE == 4
+  for (r = 1; r < bh + 1; r += 4)
+    for (c = 1; c < bw + 1; c += 4) {
+      const int p0 = buffer[r - 1][c - 1];
+      const int p1 = buffer[r - 1][c];
+      const int p2 = buffer[r - 1][c + 1];
+      const int p3 = buffer[r - 1][c + 2];
+      const int p4 = buffer[r - 1][c + 3];
+      const int p5 = buffer[r][c - 1];
+      const int p6 = buffer[r + 1][c - 1];
+      const int p7 = buffer[r + 2][c - 1];
+      const int p8 = buffer[r + 3][c - 1];
+      for (int k = 0; k < 16; ++k) {
+        int r_offset = k >> 2;
+        int c_offset = k & 0x03;
+        buffer[r + r_offset][c + c_offset] =
+            filter_intra_taps_4x4procunit[mode][k][0] * p0 +
+            filter_intra_taps_4x4procunit[mode][k][1] * p1 +
+            filter_intra_taps_4x4procunit[mode][k][2] * p2 +
+            filter_intra_taps_4x4procunit[mode][k][3] * p3 +
+            filter_intra_taps_4x4procunit[mode][k][4] * p4 +
+            filter_intra_taps_4x4procunit[mode][k][5] * p5 +
+            filter_intra_taps_4x4procunit[mode][k][6] * p6 +
+            filter_intra_taps_4x4procunit[mode][k][7] * p7 +
+            filter_intra_taps_4x4procunit[mode][k][8] * p8;
+        buffer[r + r_offset][c + c_offset] = clip_pixel_highbd(
+            ROUND_POWER_OF_TWO_SIGNED(buffer[r + r_offset][c + c_offset],
+                                      FILTER_INTRA_SCALE_BITS),
+            bd);
+      }
+    }
+#else
+  int ipred;
+  const int c0 = filter_intra_taps_3[tx_size][mode][0];
+  const int c1 = filter_intra_taps_3[tx_size][mode][1];
+  const int c2 = filter_intra_taps_3[tx_size][mode][2];
   for (r = 1; r < bh + 1; ++r)
     for (c = 1; c < bw + 1; ++c) {
-      ipred = c0 * preds[r - 1][c] + c1 * preds[r][c - 1] +
-              c2 * preds[r - 1][c - 1];
-      preds[r][c] = ROUND_POWER_OF_TWO_SIGNED(ipred, FILTER_INTRA_SCALE_BITS);
-      preds[r][c] = clip_pixel_highbd(preds[r][c], bd);
+      ipred = c0 * buffer[r - 1][c] + c1 * buffer[r][c - 1] +
+              c2 * buffer[r - 1][c - 1];
+      buffer[r][c] = ROUND_POWER_OF_TWO_SIGNED(ipred, FILTER_INTRA_SCALE_BITS);
+      buffer[r][c] = clip_pixel_highbd(buffer[r][c], bd);
     }
+#endif
 
   for (r = 0; r < bh; ++r) {
     for (c = 0; c < bw; ++c) {
-      dst[c] = clip_pixel_highbd(preds[r + 1][c + 1], bd);
+      dst[c] = buffer[r + 1][c + 1];
     }
     dst += stride;
   }
diff --git a/av1/common/reconintra.h b/av1/common/reconintra.h
index 6fbc135..c507a42 100644
--- a/av1/common/reconintra.h
+++ b/av1/common/reconintra.h
@@ -42,7 +42,12 @@
 };
 
 #if CONFIG_FILTER_INTRA
+#define FILTER_INTRA_PROC_UNIT_SIZE 2
+#if FILTER_INTRA_PROC_UNIT_SIZE == 4
+#define FILTER_INTRA_SCALE_BITS 4
+#else
 #define FILTER_INTRA_SCALE_BITS 3
+#endif
 #endif  // CONFIG_FILTER_INTRA
 
 #define CONFIG_INTRA_EDGE_UPSAMPLE CONFIG_INTRA_EDGE