Allintra: Introduce sf prune_intra_mode_using_best_sad_so_far

This CL introduces a speed feature,
prune_intra_mode_using_best_sad_so_far, that prunes intra modes
in the nonrd path based on the best SAD so far. Pruning is applied
only when the transform block size is the same as the current
block size. This speed feature is enabled for allintra, speed 9.

For AVIF still-image encode,

             Encode Time     BD-Rate Loss(%)
cpu-used     Reduction(%)    psnr       ssim
   9           1.869         0.1608     0.0878

STATS_CHANGED

Change-Id: Ib7543ef05fe065c5158d7e0340b7b02a33a756c2
diff --git a/av1/encoder/nonrd_opt.h b/av1/encoder/nonrd_opt.h
index d4d910c..edd3f5c 100644
--- a/av1/encoder/nonrd_opt.h
+++ b/av1/encoder/nonrd_opt.h
@@ -70,6 +70,8 @@
   PREDICTION_MODE mode;
   int skippable;
   RD_STATS *rdc;
+  unsigned int best_sad;
+  bool prune_mode_based_on_sad;
 };
 /*!\endcond */
 
diff --git a/av1/encoder/nonrd_pickmode.c b/av1/encoder/nonrd_pickmode.c
index 497ac52..0b30aa8 100644
--- a/av1/encoder/nonrd_pickmode.c
+++ b/av1/encoder/nonrd_pickmode.c
@@ -1364,6 +1364,17 @@
   this_rdc->rate += (eob_cost << AV1_PROB_COST_SHIFT);
 }
 
+static INLINE void init_estimate_block_intra_args(
+    struct estimate_block_intra_args *args, AV1_COMP *cpi, MACROBLOCK *x) {
+  args->cpi = cpi;
+  args->x = x;
+  args->mode = DC_PRED;
+  args->skippable = 1;
+  args->rdc = 0;
+  args->best_sad = UINT_MAX;
+  args->prune_mode_based_on_sad = false;
+}
+
 static INLINE void init_mbmi(MB_MODE_INFO *mbmi, PREDICTION_MODE pred_mode,
                              MV_REFERENCE_FRAME ref_frame0,
                              MV_REFERENCE_FRAME ref_frame1,
@@ -1631,12 +1642,33 @@
   uint8_t *const dst_buf_base = pd->dst.buf;
   const int64_t src_stride = p->src.stride;
   const int64_t dst_stride = pd->dst.stride;
-  RD_STATS this_rdc;
 
   (void)block;
-  (void)plane_bsize;
 
   av1_predict_intra_block_facade(cm, xd, plane, col, row, tx_size);
+
+  if (args->prune_mode_based_on_sad) {
+    unsigned int this_sad = cpi->ppi->fn_ptr[plane_bsize].sdf(
+        p->src.buf, p->src.stride, pd->dst.buf, pd->dst.stride);
+    const unsigned int sad_threshold =
+        args->best_sad != UINT_MAX ? args->best_sad + (args->best_sad >> 4)
+                                   : UINT_MAX;
+    // Skip the evaluation of current mode if its SAD is more than a threshold.
+    if (this_sad > sad_threshold) {
+      // For the current mode, set rate and distortion to maximum possible
+      // values and return.
+      // Note: args->rdc->rate is checked in av1_nonrd_pick_intra_mode() to skip
+      // the evaluation of the current mode.
+      args->rdc->rate = INT_MAX;
+      args->rdc->dist = INT64_MAX;
+      return;
+    }
+    if (this_sad < args->best_sad) {
+      args->best_sad = this_sad;
+    }
+  }
+
+  RD_STATS this_rdc;
   av1_invalid_rd_stats(&this_rdc);
 
   p->src.buf = &src_buf_base[4 * (row * src_stride + col)];
@@ -1651,6 +1683,7 @@
 
   p->src.buf = src_buf_base;
   pd->dst.buf = dst_buf_base;
+  assert(args->rdc->rate != INT_MAX && args->rdc->dist != INT64_MAX);
   args->rdc->rate += this_rdc.rate;
   args->rdc->dist += this_rdc.dist;
 }
@@ -2238,11 +2271,19 @@
   MACROBLOCKD *const xd = &x->e_mbd;
   MB_MODE_INFO *const mi = xd->mi[0];
   RD_STATS this_rdc, best_rdc;
-  struct estimate_block_intra_args args = { cpi, x, DC_PRED, 1, 0 };
+  struct estimate_block_intra_args args;
+  init_estimate_block_intra_args(&args, cpi, x);
   const TxfmSearchParams *txfm_params = &x->txfm_search_params;
-  const TX_SIZE intra_tx_size =
+  mi->tx_size =
       AOMMIN(max_txsize_lookup[bsize],
              tx_mode_to_biggest_tx_size[txfm_params->tx_mode_search_type]);
+  const BLOCK_SIZE tx_bsize = txsize_to_bsize[mi->tx_size];
+
+  // If the current block size is the same as the transform block size, enable
+  // mode pruning based on the best SAD so far.
+  if (cpi->sf.rt_sf.prune_intra_mode_using_best_sad_so_far && bsize == tx_bsize)
+    args.prune_mode_based_on_sad = true;
+
   int *bmode_costs;
   PREDICTION_MODE best_mode = DC_PRED;
   const MB_MODE_INFO *above_mi = xd->above_mbmi;
@@ -2293,10 +2334,12 @@
     args.mode = this_mode;
     args.skippable = 1;
     args.rdc = &this_rdc;
-    mi->tx_size = intra_tx_size;
     mi->mode = this_mode;
     av1_foreach_transformed_block_in_plane(xd, bsize, AOM_PLANE_Y,
                                            estimate_block_intra, &args);
+
+    if (this_rdc.rate == INT_MAX) continue;
+
     const int skip_ctx = av1_get_skip_txfm_context(xd);
     if (args.skippable) {
       this_rdc.rate = x->mode_costs.skip_txfm_cost[skip_ctx][1];
@@ -2616,7 +2659,8 @@
                                : inter_mode_thresh;
   if (known_rd > best_rdc->rdcost) return;
 
-  struct estimate_block_intra_args args = { cpi, x, DC_PRED, 1, 0 };
+  struct estimate_block_intra_args args;
+  init_estimate_block_intra_args(&args, cpi, x);
   TX_SIZE intra_tx_size = AOMMIN(
       AOMMIN(max_txsize_lookup[bsize],
              tx_mode_to_biggest_tx_size[txfm_params->tx_mode_search_type]),
diff --git a/av1/encoder/speed_features.c b/av1/encoder/speed_features.c
index 438fd0f..bb94caf 100644
--- a/av1/encoder/speed_features.c
+++ b/av1/encoder/speed_features.c
@@ -562,6 +562,7 @@
     sf->rt_sf.vbp_prune_16x16_split_using_min_max_sub_blk_var = true;
     sf->rt_sf.prune_h_pred_using_best_mode_so_far = true;
     sf->rt_sf.enable_intra_mode_pruning_using_neighbors = true;
+    sf->rt_sf.prune_intra_mode_using_best_sad_so_far = true;
   }
 
   // As the speed feature prune_chroma_modes_using_luma_winner already
@@ -2136,6 +2137,7 @@
   rt_sf->frame_level_mode_cost_update = false;
   rt_sf->prune_h_pred_using_best_mode_so_far = false;
   rt_sf->enable_intra_mode_pruning_using_neighbors = false;
+  rt_sf->prune_intra_mode_using_best_sad_so_far = false;
   rt_sf->check_only_zero_zeromv_on_large_blocks = false;
   rt_sf->disable_cdf_update_non_reference_frame = false;
   rt_sf->prune_compoundmode_with_singlemode_var = false;
diff --git a/av1/encoder/speed_features.h b/av1/encoder/speed_features.h
index e2acf52..83405ab 100644
--- a/av1/encoder/speed_features.h
+++ b/av1/encoder/speed_features.h
@@ -1672,6 +1672,15 @@
   // -0.06%.
   bool enable_intra_mode_pruning_using_neighbors;
 
+  // Prune intra mode evaluations in nonrd path based on best sad so far.
+  //
+  // For allintra encode, this speed feature reduces instruction count by 3.05%
+  // for speed 9 with coding performance change less than 0.24%.
+  // For AVIF image encode, this speed feature reduces encode time by 1.87% for
+  // speed 9 on a typical image dataset with coding performance change of about
+  // 0.16%.
+  bool prune_intra_mode_using_best_sad_so_far;
+
   // If compound is enabled, and the current block size is \geq BLOCK_16X16,
   // limit the compound modes to GLOBAL_GLOBALMV. This does not apply to the
   // base layer of svc.