Reuse previous best prediction mode in partition AB search

This commit adds a speed feature to reuse the previous best
prediction mode found in PARTITION_SPLIT and PARTITION_RECT for
PARTITION_AB search. This speed feature is turned on for speed 3 and
above.

Performance:
 SPEED_SET | AVG_PSNR | OVR_PSNR |   SSIM  |  SPD
     3     |  +0.019% |  +0.023% | +0.044% | +1.4%
     4     |  +0.012% |  +0.015% | +0.005% | +0.6%
     5     |  +0.011% |  +0.018% | -0.029% | +0.2%

STATS_CHANGED

Change-Id: I788fdbde2d420dc2c9e89dc861d17489ac46bbe7
diff --git a/av1/common/enums.h b/av1/common/enums.h
index 77d2b6a..9c2976b 100644
--- a/av1/common/enums.h
+++ b/av1/common/enums.h
@@ -412,6 +412,7 @@
   GLOBAL_GLOBALMV,
   NEW_NEWMV,
   MB_MODE_COUNT,
+  PRED_MODE_INVALID = MB_MODE_COUNT,
   INTRA_MODE_START = DC_PRED,
   INTRA_MODE_END = NEARESTMV,
   DIR_MODE_START = V_PRED,
diff --git a/av1/encoder/block.h b/av1/encoder/block.h
index dc8bb0f..f79c911 100644
--- a/av1/encoder/block.h
+++ b/av1/encoder/block.h
@@ -944,7 +944,7 @@
   /**@}*/
 
   /*****************************************************************************
-   * \name Reference Frame Searc
+   * \name Reference Frame Search
    ****************************************************************************/
   /**@{*/
   /*! \brief Sum absolute distortion of the predicted mv for each ref frame.
@@ -1080,6 +1080,11 @@
    * Contains the hash table, hash function, and buffer used for intrabc.
    */
   IntraBCHashInfo intrabc_hash_info;
+
+  /*! \brief Whether to reuse the mode stored in intermode_cache. */
+  int use_intermode_cache;
+  /*! \brief The mode to reuse during \ref av1_rd_pick_inter_mode_sb. */
+  PREDICTION_MODE intermode_cache;
   /**@}*/
 
   /*****************************************************************************
diff --git a/av1/encoder/context_tree.c b/av1/encoder/context_tree.c
index 6d07ef2..6554be6 100644
--- a/av1/encoder/context_tree.c
+++ b/av1/encoder/context_tree.c
@@ -11,6 +11,7 @@
 
 #include "av1/encoder/context_tree.h"
 #include "av1/encoder/encoder.h"
+#include "av1/encoder/rd.h"
 
 static const BLOCK_SIZE square[MAX_SB_SIZE_LOG2 - 1] = {
   BLOCK_4X4, BLOCK_8X8, BLOCK_16X16, BLOCK_32X32, BLOCK_64X64, BLOCK_128X128,
@@ -101,6 +102,8 @@
     }
   }
 
+  av1_invalid_rd_stats(&ctx->rd_stats);
+
   return ctx;
 }
 
diff --git a/av1/encoder/partition_search.c b/av1/encoder/partition_search.c
index 5d5af2e..54471d7 100644
--- a/av1/encoder/partition_search.c
+++ b/av1/encoder/partition_search.c
@@ -2249,8 +2249,9 @@
                                int mi_row, int mi_col, BLOCK_SIZE bsize,
                                PARTITION_TYPE partition,
                                const BLOCK_SIZE ab_subsize[SUB_PARTITIONS_AB],
-                               const int ab_mi_pos[SUB_PARTITIONS_AB][2]) {
-  const MACROBLOCK *const x = &td->mb;
+                               const int ab_mi_pos[SUB_PARTITIONS_AB][2],
+                               const PREDICTION_MODE *mode_cache) {
+  MACROBLOCK *const x = &td->mb;
   const MACROBLOCKD *const xd = &x->e_mbd;
   const int pl = partition_plane_context(xd, mi_row, mi_col, bsize);
   RD_STATS sum_rdc;
@@ -2259,10 +2260,19 @@
   sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, 0);
   // Loop over sub-partitions in AB partition type.
   for (int i = 0; i < SUB_PARTITIONS_AB; i++) {
-    if (!rd_try_subblock(cpi, td, tile_data, tp, i == SUB_PARTITIONS_AB - 1,
-                         ab_mi_pos[i][0], ab_mi_pos[i][1], ab_subsize[i],
-                         *best_rdc, &sum_rdc, partition, ctxs[i]))
+    if (mode_cache && mode_cache[i] != PRED_MODE_INVALID) {
+      x->use_intermode_cache = 1;
+      x->intermode_cache = mode_cache[i];
+    }
+    const int mode_search_success =
+        rd_try_subblock(cpi, td, tile_data, tp, i == SUB_PARTITIONS_AB - 1,
+                        ab_mi_pos[i][0], ab_mi_pos[i][1], ab_subsize[i],
+                        *best_rdc, &sum_rdc, partition, ctxs[i]);
+    x->use_intermode_cache = 0;
+    x->intermode_cache = PRED_MODE_INVALID;
+    if (!mode_search_success) {
       return false;
+    }
   }
 
   av1_rd_cost_update(x->rdmult, &sum_rdc);
@@ -2616,7 +2626,8 @@
     PC_TREE *pc_tree, PICK_MODE_CONTEXT *dst_ctxs[SUB_PARTITIONS_AB],
     PartitionSearchState *part_search_state, RD_STATS *best_rdc,
     const BLOCK_SIZE ab_subsize[SUB_PARTITIONS_AB],
-    const int ab_mi_pos[SUB_PARTITIONS_AB][2], const PARTITION_TYPE part_type) {
+    const int ab_mi_pos[SUB_PARTITIONS_AB][2], const PARTITION_TYPE part_type,
+    const PREDICTION_MODE *mode_cache) {
   const AV1_COMMON *const cm = &cpi->common;
   PartitionBlkParams blk_params = part_search_state->part_blk_params;
   const int mi_row = blk_params.mi_row;
@@ -2641,7 +2652,7 @@
   // Test this partition and update the best partition.
   part_search_state->found_best_partition |= rd_test_partition3(
       cpi, td, tile_data, tp, pc_tree, best_rdc, dst_ctxs, mi_row, mi_col,
-      bsize, part_type, ab_subsize, ab_mi_pos);
+      bsize, part_type, ab_subsize, ab_mi_pos, mode_cache);
 
 #if CONFIG_COLLECT_PARTITION_STATS
   if (partition_timer_on) {
@@ -2683,6 +2694,56 @@
     mode_srch_ctx[HORZ_A][1] = &pc_tree->split[1]->none;
 }
 
+static AOM_INLINE void copy_partition_mode_from_mode_context(
+    PREDICTION_MODE *dst_mode, const PICK_MODE_CONTEXT *ctx) {
+  if (ctx && ctx->rd_stats.rate < INT_MAX) {
+    *dst_mode = ctx->mic.mode;
+  } else {
+    *dst_mode = PRED_MODE_INVALID;
+  }
+}
+
+static AOM_INLINE void copy_partition_mode_from_pc_tree(
+    PREDICTION_MODE *dst_mode, const PC_TREE *pc_tree) {
+  if (pc_tree) {
+    copy_partition_mode_from_mode_context(dst_mode, pc_tree->none);
+  } else {
+    *dst_mode = PRED_MODE_INVALID;
+  }
+}
+
+static AOM_INLINE void set_mode_cache_for_partition_ab(
+    PREDICTION_MODE *mode_cache, const PC_TREE *pc_tree,
+    AB_PART_TYPE ab_part_type) {
+  switch (ab_part_type) {
+    case HORZ_A:
+      copy_partition_mode_from_pc_tree(&mode_cache[0], pc_tree->split[0]);
+      copy_partition_mode_from_pc_tree(&mode_cache[1], pc_tree->split[1]);
+      copy_partition_mode_from_mode_context(&mode_cache[2],
+                                            pc_tree->horizontal[1]);
+      break;
+    case HORZ_B:
+      copy_partition_mode_from_mode_context(&mode_cache[0],
+                                            pc_tree->horizontal[0]);
+      copy_partition_mode_from_pc_tree(&mode_cache[1], pc_tree->split[2]);
+      copy_partition_mode_from_pc_tree(&mode_cache[2], pc_tree->split[3]);
+      break;
+    case VERT_A:
+      copy_partition_mode_from_pc_tree(&mode_cache[0], pc_tree->split[0]);
+      copy_partition_mode_from_pc_tree(&mode_cache[1], pc_tree->split[2]);
+      copy_partition_mode_from_mode_context(&mode_cache[2],
+                                            pc_tree->vertical[1]);
+      break;
+    case VERT_B:
+      copy_partition_mode_from_mode_context(&mode_cache[0],
+                                            pc_tree->vertical[0]);
+      copy_partition_mode_from_pc_tree(&mode_cache[1], pc_tree->split[1]);
+      copy_partition_mode_from_pc_tree(&mode_cache[2], pc_tree->split[3]);
+      break;
+    default: assert(0 && "Invalid ab partition type!\n");
+  }
+}
+
 // AB Partitions type search.
 static void ab_partitions_search(
     AV1_COMP *const cpi, ThreadData *td, TileDataEnc *tile_data,
@@ -2775,7 +2836,8 @@
       cur_part_ctxs[ab_part_type][i]->rd_mode_is_ready = 0;
     }
 
-    // Copy of mode search results if the ctx is ready.
+    // We can directly copy the mode search results if we have already searched
+    // the current block and the contexts match.
     if (is_ctx_ready[ab_part_type][0]) {
       av1_copy_tree_context(cur_part_ctxs[ab_part_type][0],
                             mode_srch_ctx[ab_part_type][0][0]);
@@ -2789,11 +2851,19 @@
       }
     }
 
+    // Even if the contexts don't match, we can still speed up by reusing the
+    // previous prediction mode.
+    PREDICTION_MODE mode_cache[3] = { PRED_MODE_INVALID, PRED_MODE_INVALID,
+                                      PRED_MODE_INVALID };
+    if (cpi->sf.inter_sf.reuse_best_prediction_for_part_ab) {
+      set_mode_cache_for_partition_ab(mode_cache, pc_tree, ab_part_type);
+    }
+
     // Evaluation of AB partition type.
     rd_pick_ab_part(cpi, td, tile_data, tp, x, x_ctx, pc_tree,
                     cur_part_ctxs[ab_part_type], part_search_state, best_rdc,
                     ab_subsize[ab_part_type], ab_mi_pos[ab_part_type],
-                    part_type);
+                    part_type, mode_cache);
   }
 }
 
@@ -3169,6 +3239,7 @@
   // PARTITION_NONE evaluation and cost update.
   pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, this_rdc, PARTITION_NONE,
                 bsize, pc_tree->none, best_remain_rdcost);
+
   av1_rd_cost_update(x->rdmult, this_rdc);
 
 #if CONFIG_COLLECT_PARTITION_STATS
diff --git a/av1/encoder/rdopt.c b/av1/encoder/rdopt.c
index 0a4bce7..8edf0bd 100644
--- a/av1/encoder/rdopt.c
+++ b/av1/encoder/rdopt.c
@@ -5095,6 +5095,10 @@
     num_single_modes_processed += is_single_pred;
     set_ref_ptrs(cm, xd, ref_frame, second_ref_frame);
 
+    if (x->use_intermode_cache && this_mode != x->intermode_cache) {
+      continue;
+    }
+
     // Apply speed features to decide if this inter mode can be skipped
     if (skip_inter_mode(cpi, x, bsize, ref_frame_rd, midx, &sf_args)) continue;
 
diff --git a/av1/encoder/speed_features.c b/av1/encoder/speed_features.c
index 8d58cac..b34f5cb 100644
--- a/av1/encoder/speed_features.c
+++ b/av1/encoder/speed_features.c
@@ -536,6 +536,7 @@
     sf->inter_sf.selective_ref_frame = 4;
     sf->inter_sf.skip_repeated_ref_mv = 1;
     sf->inter_sf.skip_repeated_full_newmv = 1;
+    sf->inter_sf.reuse_best_prediction_for_part_ab = 1;
     sf->inter_sf.reuse_compound_type_decision = 1;
     sf->inter_sf.txfm_rd_gate_level =
         boosted ? 0 : (is_boosted_arf2_bwd_type ? 1 : 2);
@@ -1130,6 +1131,7 @@
   inter_sf->txfm_rd_gate_level = 0;
   inter_sf->prune_inter_modes_if_skippable = 0;
   inter_sf->disable_masked_comp = 0;
+  inter_sf->reuse_best_prediction_for_part_ab = 0;
 }
 
 static AOM_INLINE void init_interp_sf(INTERP_FILTER_SPEED_FEATURES *interp_sf) {
diff --git a/av1/encoder/speed_features.h b/av1/encoder/speed_features.h
index a28200a..42330af 100644
--- a/av1/encoder/speed_features.h
+++ b/av1/encoder/speed_features.h
@@ -736,6 +736,10 @@
 
   // Enable/disable masked compound.
   int disable_masked_comp;
+
+  // Reuse the best prediction modes found in PARTITION_SPLIT and PARTITION_RECT
+  // when encoding PARTITION_AB.
+  int reuse_best_prediction_for_part_ab;
 } INTER_MODE_SPEED_FEATURES;
 
 typedef struct INTERP_FILTER_SPEED_FEATURES {