Refactor recursive transform block scheme

This commit re-designs the recursive transform block partition
rate-distortion optimization framework. It allows the encoder to
improve speed by 10%.

Change-Id: I6dd3a7dd428a530d8012e5c6ddc40e650c8b392b
diff --git a/vp10/common/pred_common.h b/vp10/common/pred_common.h
index 571783e..314bf1e 100644
--- a/vp10/common/pred_common.h
+++ b/vp10/common/pred_common.h
@@ -189,6 +189,7 @@
 
   if (tx_size == plane_tx_size) {
     ++get_tx_counts(max_tx_size, ctx, tx_counts)[tx_size];
+    mbmi->tx_size = tx_size;
   } else {
     int bsl = b_width_log2_lookup[bsize];
     int i;
diff --git a/vp10/encoder/rdopt.c b/vp10/encoder/rdopt.c
index bc142e9..ffd84e8 100644
--- a/vp10/encoder/rdopt.c
+++ b/vp10/encoder/rdopt.c
@@ -1286,7 +1286,6 @@
 static void select_tx_block(const VP10_COMP *cpi, MACROBLOCK *x,
                             int blk_row, int blk_col, int plane, int block,
                             TX_SIZE tx_size, BLOCK_SIZE plane_bsize,
-                            BLOCK_SIZE txb_bsize,
                             ENTROPY_CONTEXT *ta, ENTROPY_CONTEXT *tl,
                             int *rate, int64_t *dist,
                             int64_t *bsse, int *skip) {
@@ -1299,34 +1298,38 @@
   int max_blocks_high = num_4x4_blocks_high_lookup[plane_bsize];
   int max_blocks_wide = num_4x4_blocks_wide_lookup[plane_bsize];
   int64_t this_rd = INT64_MAX;
-  ENTROPY_CONTEXT ctxa[16], ctxl[16];
   ENTROPY_CONTEXT *pta = ta + blk_col;
   ENTROPY_CONTEXT *ptl = tl + blk_row;
+  ENTROPY_CONTEXT stxa = 0, stxl = 0;
   int coeff_ctx, i;
-
-  memcpy(ctxa, ta, sizeof(ENTROPY_CONTEXT) * max_blocks_wide);
-  memcpy(ctxl, tl, sizeof(ENTROPY_CONTEXT) * max_blocks_high);
+  int64_t sum_dist = 0, sum_bsse = 0;
+  int64_t sum_rd = INT64_MAX;
+  int sum_rate = vp10_cost_bit(128, 1);
+  int all_skip = 1;
+  TX_SIZE swap_tx_size = TX_SIZES;
 
   switch (tx_size) {
     case TX_4X4:
+      stxa = pta[0];
+      stxl = ptl[0];
       break;
     case TX_8X8:
-      pta[0] = !!*(const uint16_t *)&pta[0];
-      ptl[0] = !!*(const uint16_t *)&ptl[0];
+      stxa = !!*(const uint16_t *)&pta[0];
+      stxl = !!*(const uint16_t *)&ptl[0];
       break;
     case TX_16X16:
-      pta[0] = !!*(const uint32_t *)&pta[0];
-      ptl[0] = !!*(const uint32_t *)&ptl[0];
+      stxa = !!*(const uint32_t *)&pta[0];
+      stxl = !!*(const uint32_t *)&ptl[0];
       break;
     case TX_32X32:
-      pta[0] = !!*(const uint64_t *)&pta[0];
-      ptl[0] = !!*(const uint64_t *)&ptl[0];
+      stxa = !!*(const uint64_t *)&pta[0];
+      stxl = !!*(const uint64_t *)&ptl[0];
       break;
     default:
       assert(0 && "Invalid transform size.");
       break;
   }
-  coeff_ctx = combine_entropy_contexts(pta[0], ptl[0]);
+  coeff_ctx = combine_entropy_contexts(stxa, stxl);
 
   if (xd->mb_to_bottom_edge < 0)
     max_blocks_high += xd->mb_to_bottom_edge >> (5 + pd->subsampling_y);
@@ -1341,38 +1344,23 @@
   if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide)
     return;
 
-  mbmi->inter_tx_size[tx_idx] = tx_size;
-  mbmi->tx_size = tx_size;
-
-  if (cpi->common.tx_mode == TX_MODE_SELECT || tx_size == TX_4X4) {
-    tx_block_rd_b(x, tx_size, blk_row, blk_col, plane, block,
-                  plane_bsize, coeff_ctx, rate, dist, bsse, skip);
-    if (tx_size > TX_4X4)
-      *rate += vp10_cost_bit(128, 0);
-    this_rd = RDCOST(x->rdmult, x->rddiv, *rate, *dist);
-    for (i = 0; i < (1 << tx_size); ++i) {
-      pta[i] = !(p->eobs[block] == 0);
-      ptl[i] = !(p->eobs[block] == 0);
-    }
-  }
-
   if (tx_size > TX_4X4) {
     BLOCK_SIZE bsize = txsize_to_bsize[tx_size];
-    int bh = num_4x4_blocks_high_lookup[bsize];
+    int bsl = b_height_log2_lookup[bsize];
     int sub_step = 1 << (2 * (tx_size - 1));
     int i;
-    int this_rate, sum_rate = vp10_cost_bit(128, 1);
-    int64_t this_dist, sum_dist = 0;
-    int64_t this_bsse, sum_bsse = 0;
-    int this_skip, all_skip = 1;
-    int64_t sum_rd;
+    int this_rate;
+    int64_t this_dist;
+    int64_t this_bsse;
+    int this_skip;
+
+    --bsl;
     for (i = 0; i < 4; ++i) {
-      int offsetr = (i >> 1) * bh / 2;
-      int offsetc = (i & 0x01) * bh / 2;
+      int offsetr = (i >> 1) << bsl;
+      int offsetc = (i & 0x01) << bsl;
       select_tx_block(cpi, x, blk_row + offsetr, blk_col + offsetc,
                       plane, block + i * sub_step, tx_size - 1,
-                      plane_bsize, txsize_to_bsize[tx_size - 1],
-                      ctxa, ctxl, &this_rate, &this_dist,
+                      plane_bsize, ta, tl, &this_rate, &this_dist,
                       &this_bsse, &this_skip);
       sum_rate += this_rate;
       sum_dist += this_dist;
@@ -1380,24 +1368,29 @@
       all_skip &= this_skip;
     }
     sum_rd = RDCOST(x->rdmult, x->rddiv, sum_rate, sum_dist);
+  }
 
-    if (this_rd < sum_rd) {
-      int idx, idy;
-      for (idy = blk_row; idy < blk_row + bh; idy += 2)
-        for (idx = blk_col; idx < blk_col + bh; idx += 2)
-          mbmi->inter_tx_size[(idy / 2) * 8 + (idx / 2)] = tx_size;
-      mbmi->tx_size = tx_size;
-    } else {
-      *rate = sum_rate;
-      *dist = sum_dist;
-      *bsse = sum_bsse;
-      *skip = all_skip;
+  if (cpi->common.tx_mode == TX_MODE_SELECT || tx_size == TX_4X4) {
+    swap_tx_size = mbmi->inter_tx_size[tx_idx];
+    mbmi->inter_tx_size[tx_idx] = tx_size;
 
-      memcpy(pta, ctxa + (blk_col >> pd->subsampling_x),
-          sizeof(ENTROPY_CONTEXT) * num_4x4_blocks_wide_lookup[txb_bsize]);
-      memcpy(ptl, ctxl + (blk_row >> pd->subsampling_y),
-          sizeof(ENTROPY_CONTEXT) * num_4x4_blocks_high_lookup[txb_bsize]);
-    }
+    tx_block_rd_b(x, tx_size, blk_row, blk_col, plane, block,
+                  plane_bsize, coeff_ctx, rate, dist, bsse, skip);
+    if (tx_size > TX_4X4)
+      *rate += vp10_cost_bit(128, 0);
+    this_rd = RDCOST(x->rdmult, x->rddiv, *rate, *dist);
+  }
+
+  if (this_rd < sum_rd) {
+    for (i = 0; i < (1 << tx_size); ++i)
+      pta[i] = ptl[i] = !(p->eobs[block] == 0);
+    mbmi->tx_size = tx_size;
+  } else {
+    *rate = sum_rate;
+    *dist = sum_dist;
+    *bsse = sum_bsse;
+    *skip = all_skip;
+    mbmi->inter_tx_size[tx_idx] = swap_tx_size;
   }
 }
 
@@ -1437,7 +1430,7 @@
     for (idy = 0; idy < mi_height; idy += bh) {
       for (idx = 0; idx < mi_width; idx += bh) {
         select_tx_block(cpi, x, idy, idx, 0, block,
-                        max_txsize_lookup[plane_bsize], plane_bsize, txb_size,
+                        max_txsize_lookup[plane_bsize], plane_bsize,
                         ctxa, ctxl, &pnrate, &pndist, &pnsse, &pnskip);
         *rate += pnrate;
         *distortion += pndist;