Enable greedy version of av1_optimize_b().

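This has been found to be better than the original version in two ways:
(1) Better compression: lowres -0.229, midres -0.147.
(2) Faster: 2.7% to 10.5% faster in my quick test over 5 different
    clips with 30 frames.

The original version (optimize_b_org) and the USE_GREEDY_OPTIMIZE_B
compile-time switch are removed, so optimize_b_greedy() is always used
when CONFIG_LV_MAP is disabled.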

Change-Id: I4d46e0915d6e4b8e7bfc03d0c8b88cbe3351ca20
diff --git a/av1/encoder/encodemb.c b/av1/encoder/encodemb.c
index b0945ed..246b656 100644
--- a/av1/encoder/encodemb.c
+++ b/av1/encoder/encodemb.c
@@ -133,9 +133,6 @@
 }
 
 #if !CONFIG_LV_MAP
-#define USE_GREEDY_OPTIMIZE_B 0
-
-#if USE_GREEDY_OPTIMIZE_B
 
 typedef struct av1_token_state_greedy {
   int16_t token;
@@ -454,354 +451,6 @@
   return final_eob;
 }
 
-#else  // USE_GREEDY_OPTIMIZE_B
-
-typedef struct av1_token_state_org {
-  int64_t error;
-  int rate;
-  int16_t next;
-  int16_t token;
-  tran_low_t qc;
-  tran_low_t dqc;
-  uint8_t best_index;
-} av1_token_state_org;
-
-static int optimize_b_org(const AV1_COMMON *cm, MACROBLOCK *mb, int plane,
-                          int block, TX_SIZE tx_size, int ctx) {
-  MACROBLOCKD *const xd = &mb->e_mbd;
-  struct macroblock_plane *const p = &mb->plane[plane];
-  struct macroblockd_plane *const pd = &xd->plane[plane];
-  const int ref = is_inter_block(&xd->mi[0]->mbmi);
-  av1_token_state_org tokens[MAX_TX_SQUARE + 1][2];
-  uint8_t token_cache[MAX_TX_SQUARE];
-  const tran_low_t *const coeff = BLOCK_OFFSET(p->coeff, block);
-  tran_low_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block);
-  tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
-  const int eob = p->eobs[block];
-  const PLANE_TYPE plane_type = pd->plane_type;
-  const int default_eob = tx_size_2d[tx_size];
-  const int16_t *const dequant_ptr = pd->dequant;
-  const uint8_t *const band_translate = get_band_translate(tx_size);
-  TX_TYPE tx_type = get_tx_type(plane_type, xd, block, tx_size);
-  const SCAN_ORDER *const scan_order =
-      get_scan(cm, tx_size, tx_type, is_inter_block(&xd->mi[0]->mbmi));
-  const int16_t *const scan = scan_order->scan;
-  const int16_t *const nb = scan_order->neighbors;
-  int dqv;
-  const int shift = av1_get_tx_scale(tx_size);
-#if CONFIG_AOM_QM
-  int seg_id = xd->mi[0]->mbmi.segment_id;
-  const qm_val_t *iqmatrix = pd->seg_iqmatrix[seg_id][!ref][tx_size];
-#endif
-#if CONFIG_NEW_QUANT
-  int dq = get_dq_profile_from_ctx(mb->qindex, ctx, ref, plane_type);
-  const dequant_val_type_nuq *dequant_val = pd->dequant_val_nuq[dq];
-#endif  // CONFIG_NEW_QUANT
-  int next = eob, sz = 0;
-  const int64_t rdmult = (mb->rdmult * plane_rd_mult[ref][plane_type]) >> 1;
-  const int64_t rddiv = mb->rddiv;
-  int64_t rd_cost0, rd_cost1;
-  int rate0, rate1;
-  int64_t error0, error1;
-  int16_t t0, t1;
-  int best, band = (eob < default_eob) ? band_translate[eob]
-                                       : band_translate[eob - 1];
-  int pt, i, final_eob;
-  const int cat6_bits = av1_get_cat6_extrabits_size(tx_size, xd->bd);
-  unsigned int(*token_costs)[2][COEFF_CONTEXTS][ENTROPY_TOKENS] =
-      mb->token_costs[txsize_sqr_map[tx_size]][plane_type][ref];
-  const uint16_t *band_counts = &band_count_table[tx_size][band];
-  uint16_t band_left = eob - band_cum_count_table[tx_size][band] + 1;
-  int shortcut = 0;
-  int next_shortcut = 0;
-
-#if CONFIG_EXT_DELTA_Q
-  const int qindex = cm->seg.enabled
-                         ? av1_get_qindex(&cm->seg, xd->mi[0]->mbmi.segment_id,
-                                          cm->base_qindex)
-                         : cm->base_qindex;
-  assert(qindex > 0);
-  (void)qindex;
-#else
-  assert(mb->qindex > 0);
-#endif
-
-  token_costs += band;
-
-  assert((!plane_type && !plane) || (plane_type && plane));
-  assert(eob <= default_eob);
-
-  /* Now set up a Viterbi trellis to evaluate alternative roundings. */
-  /* Initialize the sentinel node of the trellis. */
-  tokens[eob][0].rate = 0;
-  tokens[eob][0].error = 0;
-  tokens[eob][0].next = default_eob;
-  tokens[eob][0].token = EOB_TOKEN;
-  tokens[eob][0].qc = 0;
-  tokens[eob][1] = tokens[eob][0];
-
-  for (i = 0; i < eob; i++) {
-    const int rc = scan[i];
-    tokens[i][0].rate = av1_get_token_cost(qcoeff[rc], &t0, cat6_bits);
-    tokens[i][0].token = t0;
-    token_cache[rc] = av1_pt_energy_class[t0];
-  }
-
-  for (i = eob; i-- > 0;) {
-    int base_bits, dx;
-    int64_t d2;
-    const int rc = scan[i];
-    int x = qcoeff[rc];
-#if CONFIG_AOM_QM
-    int iwt = iqmatrix[rc];
-    dqv = dequant_ptr[rc != 0];
-    dqv = ((iwt * (int)dqv) + (1 << (AOM_QM_BITS - 1))) >> AOM_QM_BITS;
-#else
-    dqv = dequant_ptr[rc != 0];
-#endif
-    next_shortcut = shortcut;
-
-    /* Only add a trellis state for non-zero coefficients. */
-    if (UNLIKELY(x)) {
-      error0 = tokens[next][0].error;
-      error1 = tokens[next][1].error;
-      /* Evaluate the first possibility for this state. */
-      rate0 = tokens[next][0].rate;
-      rate1 = tokens[next][1].rate;
-
-      if (next_shortcut) {
-        /* Consider both possible successor states. */
-        if (next < default_eob) {
-          pt = get_coef_context(nb, token_cache, i + 1);
-          rate0 +=
-              get_token_bit_costs(*token_costs, 0, pt, tokens[next][0].token);
-          rate1 +=
-              get_token_bit_costs(*token_costs, 0, pt, tokens[next][1].token);
-        }
-        UPDATE_RD_COST();
-        /* And pick the best. */
-        best = rd_cost1 < rd_cost0;
-      } else {
-        if (next < default_eob) {
-          pt = get_coef_context(nb, token_cache, i + 1);
-          rate0 +=
-              get_token_bit_costs(*token_costs, 0, pt, tokens[next][0].token);
-        }
-        best = 0;
-      }
-
-      dx = (dqcoeff[rc] - coeff[rc]) * (1 << shift);
-#if CONFIG_HIGHBITDEPTH
-      if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
-        dx >>= xd->bd - 8;
-      }
-#endif  // CONFIG_HIGHBITDEPTH
-      d2 = (int64_t)dx * dx;
-      tokens[i][0].rate += (best ? rate1 : rate0);
-      tokens[i][0].error = d2 + (best ? error1 : error0);
-      tokens[i][0].next = next;
-      tokens[i][0].qc = x;
-      tokens[i][0].dqc = dqcoeff[rc];
-      tokens[i][0].best_index = best;
-
-      /* Evaluate the second possibility for this state. */
-      rate0 = tokens[next][0].rate;
-      rate1 = tokens[next][1].rate;
-
-      // The threshold of 3 is empirically obtained.
-      if (UNLIKELY(abs(x) > 3)) {
-        shortcut = 0;
-      } else {
-#if CONFIG_NEW_QUANT
-        shortcut = ((av1_dequant_abscoeff_nuq(abs(x), dqv,
-                                              dequant_val[band_translate[i]]) >
-                     (abs(coeff[rc]) << shift)) &&
-                    (av1_dequant_abscoeff_nuq(abs(x) - 1, dqv,
-                                              dequant_val[band_translate[i]]) <
-                     (abs(coeff[rc]) << shift)));
-#else  // CONFIG_NEW_QUANT
-#if CONFIG_AOM_QM
-        if ((abs(x) * dequant_ptr[rc != 0] * iwt >
-             ((abs(coeff[rc]) << shift) << AOM_QM_BITS)) &&
-            (abs(x) * dequant_ptr[rc != 0] * iwt <
-             (((abs(coeff[rc]) << shift) + dequant_ptr[rc != 0])
-              << AOM_QM_BITS)))
-#else
-        if ((abs(x) * dequant_ptr[rc != 0] > (abs(coeff[rc]) << shift)) &&
-            (abs(x) * dequant_ptr[rc != 0] <
-             (abs(coeff[rc]) << shift) + dequant_ptr[rc != 0]))
-#endif  // CONFIG_AOM_QM
-          shortcut = 1;
-        else
-          shortcut = 0;
-#endif  // CONFIG_NEW_QUANT
-      }
-
-      if (shortcut) {
-        sz = -(x < 0);
-        x -= 2 * sz + 1;
-      } else {
-        tokens[i][1] = tokens[i][0];
-        next = i;
-
-        if (UNLIKELY(!(--band_left))) {
-          --band_counts;
-          band_left = *band_counts;
-          --token_costs;
-        }
-        continue;
-      }
-
-      /* Consider both possible successor states. */
-      if (!x) {
-        /* If we reduced this coefficient to zero, check to see if
-         *  we need to move the EOB back here.
-         */
-        t0 = tokens[next][0].token == EOB_TOKEN ? EOB_TOKEN : ZERO_TOKEN;
-        t1 = tokens[next][1].token == EOB_TOKEN ? EOB_TOKEN : ZERO_TOKEN;
-        base_bits = 0;
-      } else {
-        base_bits = av1_get_token_cost(x, &t0, cat6_bits);
-        t1 = t0;
-      }
-
-      if (next_shortcut) {
-        if (LIKELY(next < default_eob)) {
-          if (t0 != EOB_TOKEN) {
-            token_cache[rc] = av1_pt_energy_class[t0];
-            pt = get_coef_context(nb, token_cache, i + 1);
-            rate0 += get_token_bit_costs(*token_costs, !x, pt,
-                                         tokens[next][0].token);
-          }
-          if (t1 != EOB_TOKEN) {
-            token_cache[rc] = av1_pt_energy_class[t1];
-            pt = get_coef_context(nb, token_cache, i + 1);
-            rate1 += get_token_bit_costs(*token_costs, !x, pt,
-                                         tokens[next][1].token);
-          }
-        }
-
-        UPDATE_RD_COST();
-        /* And pick the best. */
-        best = rd_cost1 < rd_cost0;
-      } else {
-        // The two states in next stage are identical.
-        if (next < default_eob && t0 != EOB_TOKEN) {
-          token_cache[rc] = av1_pt_energy_class[t0];
-          pt = get_coef_context(nb, token_cache, i + 1);
-          rate0 +=
-              get_token_bit_costs(*token_costs, !x, pt, tokens[next][0].token);
-        }
-        best = 0;
-      }
-
-#if CONFIG_NEW_QUANT
-      dx = av1_dequant_coeff_nuq(x, dqv, dequant_val[band_translate[i]]) -
-           (coeff[rc] << shift);
-#if CONFIG_HIGHBITDEPTH
-      if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
-        dx >>= xd->bd - 8;
-      }
-#endif  // CONFIG_HIGHBITDEPTH
-#else   // CONFIG_NEW_QUANT
-#if CONFIG_HIGHBITDEPTH
-      if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
-        dx -= ((dqv >> (xd->bd - 8)) + sz) ^ sz;
-      } else {
-        dx -= (dqv + sz) ^ sz;
-      }
-#else
-      dx -= (dqv + sz) ^ sz;
-#endif  // CONFIG_HIGHBITDEPTH
-#endif  // CONFIG_NEW_QUANT
-      d2 = (int64_t)dx * dx;
-
-      tokens[i][1].rate = base_bits + (best ? rate1 : rate0);
-      tokens[i][1].error = d2 + (best ? error1 : error0);
-      tokens[i][1].next = next;
-      tokens[i][1].token = best ? t1 : t0;
-      tokens[i][1].qc = x;
-
-      if (x) {
-#if CONFIG_NEW_QUANT
-        tokens[i][1].dqc = av1_dequant_abscoeff_nuq(
-            abs(x), dqv, dequant_val[band_translate[i]]);
-        tokens[i][1].dqc = shift ? ROUND_POWER_OF_TWO(tokens[i][1].dqc, shift)
-                                 : tokens[i][1].dqc;
-        if (sz) tokens[i][1].dqc = -tokens[i][1].dqc;
-#else
-        if (x < 0)
-          tokens[i][1].dqc = -((-x * dqv) >> shift);
-        else
-          tokens[i][1].dqc = (x * dqv) >> shift;
-#endif  // CONFIG_NEW_QUANT
-      } else {
-        tokens[i][1].dqc = 0;
-      }
-
-      tokens[i][1].best_index = best;
-      /* Finally, make this the new head of the trellis. */
-      next = i;
-    } else {
-      /* There's no choice to make for a zero coefficient, so we don't
-       *  add a new trellis node, but we do need to update the costs.
-       */
-      t0 = tokens[next][0].token;
-      t1 = tokens[next][1].token;
-      pt = get_coef_context(nb, token_cache, i + 1);
-      /* Update the cost of each path if we're past the EOB token. */
-      if (t0 != EOB_TOKEN) {
-        tokens[next][0].rate += get_token_bit_costs(*token_costs, 1, pt, t0);
-        tokens[next][0].token = ZERO_TOKEN;
-      }
-      if (t1 != EOB_TOKEN) {
-        tokens[next][1].rate += get_token_bit_costs(*token_costs, 1, pt, t1);
-        tokens[next][1].token = ZERO_TOKEN;
-      }
-      tokens[i][0].best_index = tokens[i][1].best_index = 0;
-      shortcut = (tokens[next][0].rate != tokens[next][1].rate);
-      /* Don't update next, because we didn't add a new node. */
-    }
-
-    if (UNLIKELY(!(--band_left))) {
-      --band_counts;
-      band_left = *band_counts;
-      --token_costs;
-    }
-  }
-
-  /* Now pick the best path through the whole trellis. */
-  rate0 = tokens[next][0].rate;
-  rate1 = tokens[next][1].rate;
-  error0 = tokens[next][0].error;
-  error1 = tokens[next][1].error;
-  t0 = tokens[next][0].token;
-  t1 = tokens[next][1].token;
-  rate0 += get_token_bit_costs(*token_costs, 0, ctx, t0);
-  rate1 += get_token_bit_costs(*token_costs, 0, ctx, t1);
-  UPDATE_RD_COST();
-  best = rd_cost1 < rd_cost0;
-
-  final_eob = -1;
-
-  for (i = next; i < eob; i = next) {
-    const int x = tokens[i][best].qc;
-    const int rc = scan[i];
-    if (x) final_eob = i;
-    qcoeff[rc] = x;
-    dqcoeff[rc] = tokens[i][best].dqc;
-
-    next = tokens[i][best].next;
-    best = tokens[i][best].best_index;
-  }
-  final_eob++;
-
-  mb->plane[plane].eobs[block] = final_eob;
-  assert(final_eob <= default_eob);
-  return final_eob;
-}
-
-#endif  // USE_GREEDY_OPTIMIZE_B
 #endif  // !CONFIG_LV_MAP
 
 int av1_optimize_b(const AV1_COMMON *cm, MACROBLOCK *mb, int plane, int block,
@@ -827,13 +476,8 @@
   int ctx = get_entropy_context(tx_size, a, l);
 #else
   int ctx = combine_entropy_contexts(*a, *l);
-#endif
-
-#if USE_GREEDY_OPTIMIZE_B
+#endif  // CONFIG_VAR_TX
   return optimize_b_greedy(cm, mb, plane, block, tx_size, ctx);
-#else   // USE_GREEDY_OPTIMIZE_B
-  return optimize_b_org(cm, mb, plane, block, tx_size, ctx);
-#endif  // USE_GREEDY_OPTIMIZE_B
 #else   // !CONFIG_LV_MAP
   TXB_CTX txb_ctx;
   get_txb_ctx(plane_bsize, tx_size, plane, a, l, &txb_ctx);