Implement tx_select for superblock encoding.

Also split the superblock handling code out of decode_macroblock()
into a new function, decode_superblock(), to improve readability.

Derf +0.05%, HD +0.2%, STDHD +0.1%. We can likely get further gains
by allowing mb_skip_coeff to be signalled for a subset of the complete
SB, or something along those lines. Although this change allows coding
smaller transforms for bigger predictors, it increases the overhead of
coding EOBs to skip the parts where the residual is near-zero, so the
overall gain is not as high as we'd expect.
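
For reference, a minimal standalone sketch of the per-size RD
comparison that tx_select performs once the signalling bits are folded
into each size's rate. This is not the libvpx code itself: rd_cost()
and pick_tx_size() are made-up stand-ins for the encoder's RDCOST
macro and choose_txfm_size_from_rd() in rdopt.c.

  #include <stdint.h>

  enum { TX_4X4 = 0, TX_8X8, TX_16X16, TX_SIZE_MAX };

  /* Stand-in for the encoder's RDCOST macro (made up for this sketch). */
  static int64_t rd_cost(int64_t rdmult, int rate, int64_t dist) {
    return (int64_t)rate * rdmult + (dist << 7);
  }

  /* Pick the transform size with the lowest RD cost after adding the
   * bits needed to signal that size (tx_bits[]) to each size's rate. */
  static int pick_tx_size(const int rate[TX_SIZE_MAX],
                          const int64_t dist[TX_SIZE_MAX],
                          const int tx_bits[TX_SIZE_MAX],
                          int64_t rdmult) {
    int n, best = TX_4X4;
    int64_t best_rd = INT64_MAX;
    for (n = TX_4X4; n <= TX_16X16; n++) {
      int64_t rd = rd_cost(rdmult, rate[n] + tx_bits[n], dist[n]);
      if (rd < best_rd) {
        best_rd = rd;
        best = n;
      }
    }
    return best;
  }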

Change-Id: I552ce1286487267f504e3090b683e15515791efa
diff --git a/vp9/decoder/decodemv.c b/vp9/decoder/decodemv.c
index 5013462..a22df8f 100644
--- a/vp9/decoder/decodemv.c
+++ b/vp9/decoder/decodemv.c
@@ -186,11 +186,6 @@
   m->mbmi.second_uv_mode = (MB_PREDICTION_MODE)(DC_PRED - 1);
 #endif
 
-#if CONFIG_SUPERBLOCKS
-  if (m->mbmi.encoded_as_sb)
-    m->mbmi.txfm_size = TX_8X8;
-  else
-#endif
   if (cm->txfm_mode == TX_MODE_SELECT && m->mbmi.mb_skip_coeff == 0 &&
       m->mbmi.mode <= I8X8_PRED) {
     // FIXME(rbultje) code ternary symbol once all experiments are merged
@@ -1132,11 +1127,6 @@
 #endif
   }
 
-#if CONFIG_SUPERBLOCKS
-  if (mbmi->encoded_as_sb)
-    mbmi->txfm_size = TX_8X8;
-  else
-#endif
   if (cm->txfm_mode == TX_MODE_SELECT && mbmi->mb_skip_coeff == 0 &&
       ((mbmi->ref_frame == INTRA_FRAME && mbmi->mode <= I8X8_PRED) ||
        (mbmi->ref_frame != INTRA_FRAME && !(mbmi->mode == SPLITMV &&
diff --git a/vp9/decoder/decodframe.c b/vp9/decoder/decodframe.c
index 60bf7ba..562b55e 100644
--- a/vp9/decoder/decodframe.c
+++ b/vp9/decoder/decodframe.c
@@ -205,6 +205,146 @@
   }
 }
 
+#if CONFIG_SUPERBLOCKS
+static void decode_superblock(VP9D_COMP *pbi, MACROBLOCKD *xd,
+                              int mb_row, unsigned int mb_col,
+                              BOOL_DECODER* const bc) {
+  int i, n, eobtotal;
+  TX_SIZE tx_size = xd->mode_info_context->mbmi.txfm_size;
+  VP9_COMMON *const pc = &pbi->common;
+  MODE_INFO *orig_mi = xd->mode_info_context;
+
+  assert(xd->mode_info_context->mbmi.encoded_as_sb);
+
+  // re-initialize macroblock dequantizer before detokenization
+  if (xd->segmentation_enabled)
+    mb_init_dequantizer(pbi, xd);
+
+  if (pbi->common.frame_type != KEY_FRAME)
+    vp9_setup_interp_filters(xd, xd->mode_info_context->mbmi.interp_filter, pc);
+
+  if (xd->mode_info_context->mbmi.mb_skip_coeff) {
+    vp9_reset_mb_tokens_context(xd);
+    if (mb_col < pc->mb_cols - 1)
+      xd->above_context++;
+    if (mb_row < pc->mb_rows - 1)
+      xd->left_context++;
+    vp9_reset_mb_tokens_context(xd);
+    if (mb_col < pc->mb_cols - 1)
+      xd->above_context--;
+    if (mb_row < pc->mb_rows - 1)
+      xd->left_context--;
+
+    /* Special case:  Force the loopfilter to skip when eobtotal and
+     * mb_skip_coeff are zero.
+     */
+    skip_recon_mb(pbi, xd);
+    return;
+  }
+
+  /* do prediction */
+  if (xd->mode_info_context->mbmi.ref_frame == INTRA_FRAME) {
+    vp9_build_intra_predictors_sby_s(xd);
+    vp9_build_intra_predictors_sbuv_s(xd);
+  } else {
+    vp9_build_inter32x32_predictors_sb(xd, xd->dst.y_buffer,
+                                       xd->dst.u_buffer, xd->dst.v_buffer,
+                                       xd->dst.y_stride, xd->dst.uv_stride);
+  }
+
+  /* dequantization and idct */
+  for (n = 0; n < 4; n++) {
+    BLOCKD *b = &xd->block[24];
+    int x_idx = n & 1, y_idx = n >> 1;
+
+    if (mb_col + x_idx >= pc->mb_cols || mb_row + y_idx >= pc->mb_rows)
+      continue;
+
+    xd->above_context = pc->above_context + mb_col + x_idx;
+    xd->left_context = pc->left_context + y_idx;
+    xd->mode_info_context = orig_mi + x_idx + y_idx * pc->mode_info_stride;
+    for (i = 0; i < 25; i++) {
+      xd->block[i].eob = 0;
+      xd->eobs[i] = 0;
+    }
+
+    if (tx_size == TX_16X16) {
+      eobtotal = vp9_decode_mb_tokens_16x16(pbi, xd, bc);
+    } else if (tx_size == TX_8X8) {
+      eobtotal = vp9_decode_mb_tokens_8x8(pbi, xd, bc);
+    } else {
+      eobtotal = vp9_decode_mb_tokens(pbi, xd, bc);
+    }
+    if (eobtotal == 0) {  // skip loopfilter
+      xd->mode_info_context->mbmi.mb_skip_coeff = 1;
+      continue;
+    }
+
+    if (tx_size == TX_16X16) {
+      vp9_dequant_idct_add_16x16(xd->qcoeff, xd->block[0].dequant,
+          xd->dst.y_buffer + y_idx * 16 * xd->dst.y_stride + x_idx * 16,
+          xd->dst.y_buffer + y_idx * 16 * xd->dst.y_stride + x_idx * 16,
+          xd->dst.y_stride, xd->dst.y_stride);
+      vp9_dequant_idct_add_uv_block_8x8_inplace_c(xd->qcoeff + 16 * 16,
+          xd->block[16].dequant,
+          xd->dst.u_buffer + y_idx * 8 * xd->dst.uv_stride + x_idx * 8,
+          xd->dst.v_buffer + y_idx * 8 * xd->dst.uv_stride + x_idx * 8,
+          xd->dst.uv_stride, xd->eobs + 16, xd);
+    } else if (tx_size == TX_8X8) {
+      vp9_dequantize_b_2x2(b);
+      IDCT_INVOKE(RTCD_VTABLE(idct), ihaar2)(&b->dqcoeff[0], b->diff, 8);
+      ((int *)b->qcoeff)[0] = 0;  // 2nd order block are set to 0 after idct
+      ((int *)b->qcoeff)[1] = 0;
+      ((int *)b->qcoeff)[2] = 0;
+      ((int *)b->qcoeff)[3] = 0;
+      ((int *)b->qcoeff)[4] = 0;
+      ((int *)b->qcoeff)[5] = 0;
+      ((int *)b->qcoeff)[6] = 0;
+      ((int *)b->qcoeff)[7] = 0;
+      vp9_dequant_dc_idct_add_y_block_8x8_inplace_c(xd->qcoeff,
+          xd->block[0].dequant,
+          xd->dst.y_buffer + y_idx * 16 * xd->dst.y_stride + x_idx * 16,
+          xd->dst.y_stride, xd->eobs, xd->block[24].diff, xd);
+      vp9_dequant_idct_add_uv_block_8x8_inplace_c(xd->qcoeff + 16 * 16,
+          xd->block[16].dequant,
+          xd->dst.u_buffer + y_idx * 8 * xd->dst.uv_stride + x_idx * 8,
+          xd->dst.v_buffer + y_idx * 8 * xd->dst.uv_stride + x_idx * 8,
+          xd->dst.uv_stride, xd->eobs + 16, xd);
+    } else {
+      vp9_dequantize_b(b);
+      if (xd->eobs[24] > 1) {
+        IDCT_INVOKE(RTCD_VTABLE(idct), iwalsh16)(&b->dqcoeff[0], b->diff);
+        ((int *)b->qcoeff)[0] = 0;
+        ((int *)b->qcoeff)[1] = 0;
+        ((int *)b->qcoeff)[2] = 0;
+        ((int *)b->qcoeff)[3] = 0;
+        ((int *)b->qcoeff)[4] = 0;
+        ((int *)b->qcoeff)[5] = 0;
+        ((int *)b->qcoeff)[6] = 0;
+        ((int *)b->qcoeff)[7] = 0;
+      } else {
+        IDCT_INVOKE(RTCD_VTABLE(idct), iwalsh1)(&b->dqcoeff[0], b->diff);
+        ((int *)b->qcoeff)[0] = 0;
+      }
+
+      vp9_dequant_dc_idct_add_y_block_4x4_inplace_c(xd->qcoeff,
+          xd->block[0].dequant,
+          xd->dst.y_buffer + y_idx * 16 * xd->dst.y_stride + x_idx * 16,
+          xd->dst.y_stride, xd->eobs, xd->block[24].diff, xd);
+      vp9_dequant_idct_add_uv_block_4x4_inplace_c(xd->qcoeff + 16 * 16,
+          xd->block[16].dequant,
+          xd->dst.u_buffer + y_idx * 8 * xd->dst.uv_stride + x_idx * 8,
+          xd->dst.v_buffer + y_idx * 8 * xd->dst.uv_stride + x_idx * 8,
+          xd->dst.uv_stride, xd->eobs + 16, xd);
+    }
+  }
+
+  xd->above_context = pc->above_context + mb_col;
+  xd->left_context = pc->left_context;
+  xd->mode_info_context = orig_mi;
+}
+#endif
+
 static void decode_macroblock(VP9D_COMP *pbi, MACROBLOCKD *xd,
                               int mb_row, unsigned int mb_col,
                               BOOL_DECODER* const bc) {
@@ -213,9 +353,9 @@
   int i;
   int tx_size;
   TX_TYPE tx_type;
-  VP9_COMMON *pc = &pbi->common;
+
 #if CONFIG_SUPERBLOCKS
-  int orig_skip_flag = xd->mode_info_context->mbmi.mb_skip_coeff;
+  assert(!xd->mode_info_context->mbmi.encoded_as_sb);
 #endif
 
   // re-initialize macroblock dequantizer before detokenization
@@ -227,20 +367,6 @@
 
   if (xd->mode_info_context->mbmi.mb_skip_coeff) {
     vp9_reset_mb_tokens_context(xd);
-#if CONFIG_SUPERBLOCKS
-    if (xd->mode_info_context->mbmi.encoded_as_sb &&
-        (mb_col < pc->mb_cols - 1 || mb_row < pc->mb_rows - 1)) {
-      if (mb_col < pc->mb_cols - 1)
-        xd->above_context++;
-      if (mb_row < pc->mb_rows - 1)
-        xd->left_context++;
-      vp9_reset_mb_tokens_context(xd);
-      if (mb_col < pc->mb_cols - 1)
-        xd->above_context--;
-      if (mb_row < pc->mb_rows - 1)
-        xd->left_context--;
-    }
-#endif
   } else if (!bool_error(bc)) {
     for (i = 0; i < 25; i++) {
       xd->block[i].eob = 0;
@@ -267,14 +393,8 @@
      * mb_skip_coeff are zero.
      * */
     xd->mode_info_context->mbmi.mb_skip_coeff = 1;
-
-#if CONFIG_SUPERBLOCKS
-    if (!xd->mode_info_context->mbmi.encoded_as_sb || orig_skip_flag)
-#endif
-    {
-      skip_recon_mb(pbi, xd);
-      return;
-    }
+    skip_recon_mb(pbi, xd);
+    return;
   }
 
   // moved to be performed before detokenization
@@ -283,12 +403,6 @@
 
   /* do prediction */
   if (xd->mode_info_context->mbmi.ref_frame == INTRA_FRAME) {
-#if CONFIG_SUPERBLOCKS
-    if (xd->mode_info_context->mbmi.encoded_as_sb) {
-      vp9_build_intra_predictors_sby_s(xd);
-      vp9_build_intra_predictors_sbuv_s(xd);
-    } else
-#endif
     if (mode != I8X8_PRED) {
       vp9_build_intra_predictors_mbuv(xd);
       if (mode != B_PRED) {
@@ -296,13 +410,6 @@
       }
     }
   } else {
-#if CONFIG_SUPERBLOCKS
-    if (xd->mode_info_context->mbmi.encoded_as_sb) {
-      vp9_build_inter32x32_predictors_sb(xd, xd->dst.y_buffer,
-                                         xd->dst.u_buffer, xd->dst.v_buffer,
-                                         xd->dst.y_stride, xd->dst.uv_stride);
-    } else
-#endif
     vp9_build_inter_predictors_mb(xd);
   }
 
@@ -404,40 +511,9 @@
                                      16, xd->dst.y_stride);
       }
     } else if (tx_size == TX_8X8) {
-#if CONFIG_SUPERBLOCKS
-      void *orig = xd->mode_info_context;
-      int n, num = xd->mode_info_context->mbmi.encoded_as_sb ? 4 : 1;
-      for (n = 0; n < num; n++) {
-        int x_idx = n & 1, y_idx = n >> 1;
-        if (num == 4 && (mb_col + x_idx >= pc->mb_cols ||
-                         mb_row + y_idx >= pc->mb_rows))
-          continue;
-
-        if (n != 0) {
-          for (i = 0; i < 25; i++) {
-            xd->block[i].eob = 0;
-            xd->eobs[i] = 0;
-          }
-          xd->above_context = pc->above_context + mb_col + (n & 1);
-          xd->left_context = pc->left_context + (n >> 1);
-          xd->mode_info_context = orig;
-          xd->mode_info_context += (n & 1);
-          xd->mode_info_context += (n >> 1) * pc->mode_info_stride;
-          if (!orig_skip_flag) {
-            eobtotal = vp9_decode_mb_tokens_8x8(pbi, xd, bc);
-            if (eobtotal == 0) // skip loopfilter
-              xd->mode_info_context->mbmi.mb_skip_coeff = 1;
-          } else {
-            vp9_reset_mb_tokens_context(xd);
-          }
-        }
-
-        if (xd->mode_info_context->mbmi.mb_skip_coeff)
-          continue; // only happens for SBs, which are already in dest buffer
-#endif
       vp9_dequantize_b_2x2(b);
       IDCT_INVOKE(RTCD_VTABLE(idct), ihaar2)(&b->dqcoeff[0], b->diff, 8);
-      ((int *)b->qcoeff)[0] = 0;// 2nd order block are set to 0 after inverse transform
+      ((int *)b->qcoeff)[0] = 0;  // 2nd order block are set to 0 after idct
       ((int *)b->qcoeff)[1] = 0;
       ((int *)b->qcoeff)[2] = 0;
       ((int *)b->qcoeff)[3] = 0;
@@ -445,27 +521,9 @@
       ((int *)b->qcoeff)[5] = 0;
       ((int *)b->qcoeff)[6] = 0;
       ((int *)b->qcoeff)[7] = 0;
-#if CONFIG_SUPERBLOCKS
-      if (xd->mode_info_context->mbmi.encoded_as_sb) {
-        vp9_dequant_dc_idct_add_y_block_8x8_inplace_c(xd->qcoeff,
-          xd->block[0].dequant,
-          xd->dst.y_buffer + (n >> 1) * 16 * xd->dst.y_stride + (n & 1) * 16,
-          xd->dst.y_stride, xd->eobs, xd->block[24].diff, xd);
-        // do UV inline also
-        vp9_dequant_idct_add_uv_block_8x8_inplace_c(xd->qcoeff + 16 * 16,
-          xd->block[16].dequant,
-          xd->dst.u_buffer + (n >> 1) * 8 * xd->dst.uv_stride + (n & 1) * 8,
-          xd->dst.v_buffer + (n >> 1) * 8 * xd->dst.uv_stride + (n & 1) * 8,
-          xd->dst.uv_stride, xd->eobs + 16, xd);
-      } else
-#endif
         vp9_dequant_dc_idct_add_y_block_8x8(xd->qcoeff,
           xd->block[0].dequant, xd->predictor, xd->dst.y_buffer,
           xd->dst.y_stride, xd->eobs, xd->block[24].diff, xd);
-#if CONFIG_SUPERBLOCKS
-      }
-      xd->mode_info_context = orig;
-#endif
     } else {
       vp9_dequantize_b(b);
       if (xd->eobs[24] > 1) {
@@ -489,25 +547,19 @@
     }
   }
 
-#if CONFIG_SUPERBLOCKS
-  if (!xd->mode_info_context->mbmi.encoded_as_sb) {
-#endif
-    if ((tx_size == TX_8X8 &&
-         xd->mode_info_context->mbmi.mode != I8X8_PRED &&
-         xd->mode_info_context->mbmi.mode != SPLITMV)
-        || tx_size == TX_16X16
-       )
-      vp9_dequant_idct_add_uv_block_8x8
-          (xd->qcoeff + 16 * 16, xd->block[16].dequant,
-           xd->predictor + 16 * 16, xd->dst.u_buffer, xd->dst.v_buffer,
-           xd->dst.uv_stride, xd->eobs + 16, xd); //
-    else if (xd->mode_info_context->mbmi.mode != I8X8_PRED)
-      pbi->idct_add_uv_block(xd->qcoeff + 16 * 16, xd->block[16].dequant,
-           xd->predictor + 16 * 16, xd->dst.u_buffer, xd->dst.v_buffer,
-           xd->dst.uv_stride, xd->eobs + 16);
-#if CONFIG_SUPERBLOCKS
-  }
-#endif
+  if ((tx_size == TX_8X8 &&
+       xd->mode_info_context->mbmi.mode != I8X8_PRED &&
+       xd->mode_info_context->mbmi.mode != SPLITMV)
+      || tx_size == TX_16X16
+     )
+    vp9_dequant_idct_add_uv_block_8x8
+        (xd->qcoeff + 16 * 16, xd->block[16].dequant,
+         xd->predictor + 16 * 16, xd->dst.u_buffer, xd->dst.v_buffer,
+         xd->dst.uv_stride, xd->eobs + 16, xd);
+  else if (xd->mode_info_context->mbmi.mode != I8X8_PRED)
+    pbi->idct_add_uv_block(xd->qcoeff + 16 * 16, xd->block[16].dequant,
+         xd->predictor + 16 * 16, xd->dst.u_buffer, xd->dst.v_buffer,
+         xd->dst.uv_stride, xd->eobs + 16);
 }
 
 
@@ -661,9 +713,15 @@
             mi[pc->mode_info_stride + 1] = mi[0];
         }
       }
+      if (xd->mode_info_context->mbmi.encoded_as_sb) {
+        decode_superblock(pbi, xd, mb_row, mb_col, bc);
+      } else {
 #endif
-      vp9_intra_prediction_down_copy(xd);
-      decode_macroblock(pbi, xd, mb_row, mb_col, bc);
+        vp9_intra_prediction_down_copy(xd);
+        decode_macroblock(pbi, xd, mb_row, mb_col, bc);
+#if CONFIG_SUPERBLOCKS
+      }
+#endif
 
       /* check if the boolean decoder has suffered an error */
       xd->corrupted |= bool_error(bc);
diff --git a/vp9/decoder/dequantize.h b/vp9/decoder/dequantize.h
index 560c4a4..026bd2a 100644
--- a/vp9/decoder/dequantize.h
+++ b/vp9/decoder/dequantize.h
@@ -73,12 +73,24 @@
                                                    int stride,
                                                    unsigned short *eobs,
                                                    short *dc, MACROBLOCKD *xd);
+
+void vp9_dequant_dc_idct_add_y_block_4x4_inplace_c(short *q, short *dq,
+                                                   unsigned char *dst,
+                                                   int stride, char *eobs,
+                                                   short *dc, MACROBLOCKD *xd);
+
 void vp9_dequant_idct_add_uv_block_8x8_inplace_c(short *q, short *dq,
                                                  unsigned char *dstu,
                                                  unsigned char *dstv,
                                                  int stride,
                                                  unsigned short *eobs,
                                                  MACROBLOCKD *xd);
+
+void vp9_dequant_idct_add_uv_block_4x4_inplace_c(short *q, short *dq,
+                                                 unsigned char *dstu,
+                                                 unsigned char *dstv,
+                                                 int stride, char *eobs,
+                                                 MACROBLOCKD *xd);
 #endif
 
 #endif
diff --git a/vp9/decoder/idct_blk.c b/vp9/decoder/idct_blk.c
index d9fbf97..efe451e 100644
--- a/vp9/decoder/idct_blk.c
+++ b/vp9/decoder/idct_blk.c
@@ -36,6 +36,30 @@
   }
 }
 
+#if CONFIG_SUPERBLOCKS
+void vp9_dequant_dc_idct_add_y_block_4x4_inplace_c(short *q, short *dq,
+                                                   unsigned char *dst,
+                                                   int stride, char *eobs,
+                                                   short *dc, MACROBLOCKD *xd) {
+  int i, j;
+
+  for (i = 0; i < 4; i++) {
+    for (j = 0; j < 4; j++) {
+      if (*eobs++ > 1)
+        vp9_dequant_dc_idct_add_c(q, dq, dst, dst, stride, stride, dc[0]);
+      else
+        vp9_dc_only_idct_add_c(dc[0], dst, dst, stride, stride);
+
+      q   += 16;
+      dst += 4;
+      dc++;
+    }
+
+    dst += 4 * stride - 16;
+  }
+}
+#endif
+
 void vp9_dequant_idct_add_y_block_c(short *q, short *dq,
                                     unsigned char *pre,
                                     unsigned char *dst,
@@ -103,6 +127,47 @@
   }
 }
 
+#if CONFIG_SUPERBLOCKS
+void vp9_dequant_idct_add_uv_block_4x4_inplace_c(short *q, short *dq,
+                                                 unsigned char *dstu,
+                                                 unsigned char *dstv,
+                                                 int stride, char *eobs,
+                                                 MACROBLOCKD *xd) {
+  int i, j;
+
+  for (i = 0; i < 2; i++) {
+    for (j = 0; j < 2; j++) {
+      if (*eobs++ > 1) {
+        vp9_dequant_idct_add_c(q, dq, dstu, dstu, stride, stride);
+      } else {
+        vp9_dc_only_idct_add_c(q[0]*dq[0], dstu, dstu, stride, stride);
+        ((int *)q)[0] = 0;
+      }
+
+      q    += 16;
+      dstu += 4;
+    }
+
+    dstu += 4 * stride - 8;
+  }
+
+  for (i = 0; i < 2; i++) {
+    for (j = 0; j < 2; j++) {
+      if (*eobs++ > 1) {
+        vp9_dequant_idct_add_c(q, dq, dstv, dstv, stride, stride);
+      } else {
+        vp9_dc_only_idct_add_c(q[0]*dq[0], dstv, dstv, stride, stride);
+        ((int *)q)[0] = 0;
+      }
+
+      q    += 16;
+      dstv += 4;
+    }
+
+    dstv += 4 * stride - 8;
+  }
+}
+#endif
 
 void vp9_dequant_dc_idct_add_y_block_8x8_c(short *q, short *dq,
                                            unsigned char *pre,
diff --git a/vp9/encoder/bitstream.c b/vp9/encoder/bitstream.c
index b7bc99c..a25783a 100644
--- a/vp9/encoder/bitstream.c
+++ b/vp9/encoder/bitstream.c
@@ -919,7 +919,7 @@
         MB_MODE_INFO *mi;
         MV_REFERENCE_FRAME rf;
         MB_PREDICTION_MODE mode;
-        int segment_id;
+        int segment_id, skip_coeff;
 
         int dy = row_delta[i];
         int dx = col_delta[i];
@@ -973,10 +973,11 @@
           }
         }
 
+        skip_coeff = 1;
         if (pc->mb_no_coeff_skip &&
             (!vp9_segfeature_active(xd, segment_id, SEG_LVL_EOB) ||
              (vp9_get_segdata(xd, segment_id, SEG_LVL_EOB) != 0))) {
-          int skip_coeff = mi->mb_skip_coeff;
+          skip_coeff = mi->mb_skip_coeff;
 #if CONFIG_SUPERBLOCKS
           if (mi->encoded_as_sb) {
             skip_coeff &= m[1].mbmi.mb_skip_coeff;
@@ -1107,6 +1108,7 @@
                       cpi->common.mcomp_filter_type);
             }
           }
+
           if (mi->second_ref_frame &&
               (mode == NEWMV || mode == SPLITMV)) {
             int_mv n1, n2;
@@ -1244,15 +1246,11 @@
           }
         }
 
-        if (
-#if CONFIG_SUPERBLOCKS
-            !mi->encoded_as_sb &&
-#endif
-            ((rf == INTRA_FRAME && mode <= I8X8_PRED) ||
+        if (((rf == INTRA_FRAME && mode <= I8X8_PRED) ||
              (rf != INTRA_FRAME && !(mode == SPLITMV &&
                                      mi->partitioning == PARTITIONING_4X4))) &&
             pc->txfm_mode == TX_MODE_SELECT &&
-            !((pc->mb_no_coeff_skip && mi->mb_skip_coeff) ||
+            !((pc->mb_no_coeff_skip && skip_coeff) ||
               (vp9_segfeature_active(xd, segment_id, SEG_LVL_EOB) &&
                vp9_get_segdata(xd, segment_id, SEG_LVL_EOB) == 0))) {
           TX_SIZE sz = mi->txfm_size;
@@ -1389,11 +1387,7 @@
   } else
     write_uv_mode(bc, m->mbmi.uv_mode, c->kf_uv_mode_prob[ym]);
 
-  if (
-#if CONFIG_SUPERBLOCKS
-      !m->mbmi.encoded_as_sb &&
-#endif
-      ym <= I8X8_PRED && c->txfm_mode == TX_MODE_SELECT &&
+  if (ym <= I8X8_PRED && c->txfm_mode == TX_MODE_SELECT &&
       !((c->mb_no_coeff_skip && m->mbmi.mb_skip_coeff) ||
         (vp9_segfeature_active(xd, segment_id, SEG_LVL_EOB) &&
          vp9_get_segdata(xd, segment_id, SEG_LVL_EOB) == 0))) {
diff --git a/vp9/encoder/encodeframe.c b/vp9/encoder/encodeframe.c
index 703b49e..21def26 100644
--- a/vp9/encoder/encodeframe.c
+++ b/vp9/encoder/encodeframe.c
@@ -55,7 +55,8 @@
 
 static void encode_inter_macroblock(VP9_COMP *cpi, MACROBLOCK *x,
                                     TOKENEXTRA **t, int recon_yoffset,
-                                    int recon_uvoffset, int output_enabled);
+                                    int recon_uvoffset, int output_enabled,
+                                    int mb_col, int mb_row);
 
 static void encode_inter_superblock(VP9_COMP *cpi, MACROBLOCK *x,
                                     TOKENEXTRA **t, int recon_yoffset,
@@ -65,7 +66,7 @@
                                      TOKENEXTRA **t, int output_enabled);
 
 static void encode_intra_super_block(VP9_COMP *cpi, MACROBLOCK *x,
-                                     TOKENEXTRA **t, int mb_col);
+                                     TOKENEXTRA **t, int mb_col, int mb_row);
 
 static void adjust_act_zbin(VP9_COMP *cpi, MACROBLOCK *x);
 
@@ -466,9 +467,9 @@
     cpi->prediction_error += ctx->distortion;
     cpi->intra_error += ctx->intra_error;
 
-    cpi->rd_comp_pred_diff[0] += ctx->single_pred_diff;
-    cpi->rd_comp_pred_diff[1] += ctx->comp_pred_diff;
-    cpi->rd_comp_pred_diff[2] += ctx->hybrid_pred_diff;
+    cpi->rd_comp_pred_diff[SINGLE_PREDICTION_ONLY] += ctx->single_pred_diff;
+    cpi->rd_comp_pred_diff[COMP_PREDICTION_ONLY]   += ctx->comp_pred_diff;
+    cpi->rd_comp_pred_diff[HYBRID_PREDICTION]      += ctx->hybrid_pred_diff;
   }
 }
 
@@ -645,7 +646,7 @@
 
       // Dummy encode, do not do the tokenization
       encode_inter_macroblock(cpi, x, tp,
-                              recon_yoffset, recon_uvoffset, 0);
+                              recon_yoffset, recon_uvoffset, 0, mb_col, mb_row);
 
       seg_id = mbmi->segment_id;
       if (cpi->mb.e_mbd.segmentation_enabled && seg_id == 0) {
@@ -975,7 +976,7 @@
     if (cm->frame_type == KEY_FRAME) {
 #if CONFIG_SUPERBLOCKS
       if (xd->mode_info_context->mbmi.encoded_as_sb)
-        encode_intra_super_block(cpi, x, tp, mb_col);
+        encode_intra_super_block(cpi, x, tp, mb_col, mb_row);
       else
 #endif
         encode_intra_macro_block(cpi, x, tp, 1);
@@ -1005,8 +1006,8 @@
                                 mb_col, mb_row);
       else
 #endif
-        encode_inter_macroblock(cpi, x, tp,
-                                recon_yoffset, recon_uvoffset, 1);
+        encode_inter_macroblock(cpi, x, tp, recon_yoffset, recon_uvoffset, 1,
+                                mb_col, mb_row);
         // Note the encoder may have changed the segment_id
 
 #ifdef MODE_STATS
@@ -1431,7 +1432,7 @@
 
 static void reset_skip_txfm_size(VP9_COMP *cpi, TX_SIZE txfm_max) {
   VP9_COMMON *cm = &cpi->common;
-  int mb_row, mb_col, mis = cm->mode_info_stride, segment_id;
+  int mb_row, mb_col, mis = cm->mode_info_stride, segment_id, skip;
   MODE_INFO *mi, *mi_ptr = cm->mi;
 #if CONFIG_SUPERBLOCKS
   MODE_INFO *sb_mi_ptr = cm->mi, *sb_mi;
@@ -1451,17 +1452,45 @@
 #if CONFIG_SUPERBLOCKS
       sb_mbmi = &sb_mi->mbmi;
 #endif
-      if (
+      if (mbmi->txfm_size > txfm_max) {
 #if CONFIG_SUPERBLOCKS
-          !sb_mbmi->encoded_as_sb &&
+        if (sb_mbmi->encoded_as_sb) {
+          if (!((mb_col & 1) || (mb_row & 1))) {
+            segment_id = mbmi->segment_id;
+            skip = mbmi->mb_skip_coeff;
+            if (mb_col < cm->mb_cols - 1) {
+              segment_id = segment_id && mi[1].mbmi.segment_id;
+              skip = skip && mi[1].mbmi.mb_skip_coeff;
+            }
+            if (mb_row < cm->mb_rows - 1) {
+              segment_id = segment_id &&
+                           mi[cm->mode_info_stride].mbmi.segment_id;
+              skip = skip && mi[cm->mode_info_stride].mbmi.mb_skip_coeff;
+              if (mb_col < cm->mb_cols - 1) {
+                segment_id = segment_id &&
+                             mi[cm->mode_info_stride + 1].mbmi.segment_id;
+                skip = skip && mi[cm->mode_info_stride + 1].mbmi.mb_skip_coeff;
+              }
+            }
+            xd->mode_info_context = mi;
+            assert((vp9_segfeature_active(xd, segment_id, SEG_LVL_EOB) &&
+                    vp9_get_segdata(xd, segment_id, SEG_LVL_EOB) == 0) ||
+                   (cm->mb_no_coeff_skip && skip));
+            mbmi->txfm_size = txfm_max;
+          } else {
+            mbmi->txfm_size = sb_mbmi->txfm_size;
+          }
+        } else {
 #endif
-          mbmi->txfm_size > txfm_max) {
-        segment_id = mbmi->segment_id;
-        xd->mode_info_context = mi;
-        assert((vp9_segfeature_active(xd, segment_id, SEG_LVL_EOB) &&
-                vp9_get_segdata(xd, segment_id, SEG_LVL_EOB) == 0) ||
-               (cm->mb_no_coeff_skip && mbmi->mb_skip_coeff));
-        mbmi->txfm_size = txfm_max;
+          segment_id = mbmi->segment_id;
+          xd->mode_info_context = mi;
+          assert((vp9_segfeature_active(xd, segment_id, SEG_LVL_EOB) &&
+                  vp9_get_segdata(xd, segment_id, SEG_LVL_EOB) == 0) ||
+                 (cm->mb_no_coeff_skip && mbmi->mb_skip_coeff));
+          mbmi->txfm_size = txfm_max;
+#if CONFIG_SUPERBLOCKS
+        }
+#endif
       }
 #if CONFIG_SUPERBLOCKS
       if (mb_col & 1)
@@ -1835,7 +1864,7 @@
 }
 
 static void encode_intra_super_block(VP9_COMP *cpi, MACROBLOCK *x,
-                                     TOKENEXTRA **t, int mb_col) {
+                                     TOKENEXTRA **t, int mb_col, int mb_row) {
   const int output_enabled = 1;
   int n;
   MACROBLOCKD *xd = &x->e_mbd;
@@ -1851,7 +1880,7 @@
   const VP9_ENCODER_RTCD *rtcd = IF_RTCD(&cpi->rtcd);
   TOKENEXTRA *tp[4];
   int skip[4];
-  MODE_INFO *mi = x->e_mbd.mode_info_context;
+  MODE_INFO *mi = xd->mode_info_context;
   ENTROPY_CONTEXT_PLANES ta[4], tl[4];
 
   if ((cpi->oxcf.tuning == VP8_TUNE_SSIM) && output_enabled) {
@@ -1862,7 +1891,6 @@
   vp9_build_intra_predictors_sby_s(&x->e_mbd);
   vp9_build_intra_predictors_sbuv_s(&x->e_mbd);
 
-  assert(x->e_mbd.mode_info_context->mbmi.txfm_size == TX_8X8);
   for (n = 0; n < 4; n++) {
     int x_idx = n & 1, y_idx = n >> 1;
 
@@ -1881,15 +1909,9 @@
                           udst + x_idx * 8 + y_idx * 8 * dst_uv_stride,
                           vdst + x_idx * 8 + y_idx * 8 * dst_uv_stride,
                           dst_uv_stride);
-    vp9_transform_mb_8x8(x);
-    vp9_quantize_mb_8x8(x);
-    if (x->optimize) {
-      vp9_optimize_mby_8x8(x, rtcd);
-      vp9_optimize_mbuv_8x8(x, rtcd);
-    }
-    vp9_inverse_transform_mb_8x8(IF_RTCD(&rtcd->common->idct), &x->e_mbd);
-    vp9_recon_mby_s_c(&x->e_mbd, dst + x_idx * 16 + y_idx * 16 * dst_y_stride);
-    vp9_recon_mbuv_s_c(&x->e_mbd,
+    vp9_fidct_mb(x, rtcd);
+    vp9_recon_mby_s_c(xd, dst + x_idx * 16 + y_idx * 16 * dst_y_stride);
+    vp9_recon_mbuv_s_c(xd,
                        udst + x_idx * 8 + y_idx * 8 * dst_uv_stride,
                        vdst + x_idx * 8 + y_idx * 8 * dst_uv_stride);
 
@@ -1898,16 +1920,35 @@
       memcpy(&tl[n], xd->left_context, sizeof(tl[n]));
       tp[n] = *t;
       xd->mode_info_context = mi + x_idx + y_idx * cm->mode_info_stride;
-      vp9_tokenize_mb(cpi, &x->e_mbd, t, 0);
+      vp9_tokenize_mb(cpi, xd, t, 0);
       skip[n] = xd->mode_info_context->mbmi.mb_skip_coeff;
     }
   }
 
   if (output_enabled) {
+    int segment_id;
+
     // Tokenize
     xd->mode_info_context = mi;
+    segment_id = mi->mbmi.segment_id;
     sum_intra_stats(cpi, x);
     update_sb_skip_coeff_state(cpi, x, ta, tl, tp, t, skip);
+    if (cm->txfm_mode == TX_MODE_SELECT &&
+        !((cm->mb_no_coeff_skip && skip[0] && skip[1] && skip[2] && skip[3]) ||
+          (vp9_segfeature_active(xd, segment_id, SEG_LVL_EOB) &&
+           vp9_get_segdata(xd, segment_id, SEG_LVL_EOB) == 0))) {
+      cpi->txfm_count[mi->mbmi.txfm_size]++;
+    } else {
+      TX_SIZE sz = (cm->txfm_mode == TX_MODE_SELECT) ? TX_16X16 : cm->txfm_mode;
+      mi->mbmi.txfm_size = sz;
+      if (mb_col < cm->mb_cols - 1)
+        mi[1].mbmi.txfm_size = sz;
+      if (mb_row < cm->mb_rows - 1) {
+        mi[cm->mode_info_stride].mbmi.txfm_size = sz;
+        if (mb_col < cm->mb_cols - 1)
+          mi[cm->mode_info_stride + 1].mbmi.txfm_size = sz;
+      }
+    }
   }
 }
 #endif /* CONFIG_SUPERBLOCKS */
@@ -1962,7 +2003,8 @@
 }
 static void encode_inter_macroblock(VP9_COMP *cpi, MACROBLOCK *x,
                                     TOKENEXTRA **t, int recon_yoffset,
-                                    int recon_uvoffset, int output_enabled) {
+                                    int recon_uvoffset, int output_enabled,
+                                    int mb_col, int mb_row) {
   VP9_COMMON *cm = &cpi->common;
   MACROBLOCKD *const xd = &x->e_mbd;
   MB_MODE_INFO * mbmi = &xd->mode_info_context->mbmi;
@@ -2151,8 +2193,8 @@
                                     int recon_uvoffset,
                                     int mb_col, int mb_row) {
   const int output_enabled = 1;
-  VP9_COMMON *cm = &cpi->common;
-  MACROBLOCKD *xd = &x->e_mbd;
+  VP9_COMMON *const cm = &cpi->common;
+  MACROBLOCKD *const xd = &x->e_mbd;
   const uint8_t *src = x->src.y_buffer;
   uint8_t *dst = xd->dst.y_buffer;
   const uint8_t *usrc = x->src.u_buffer;
@@ -2162,13 +2204,13 @@
   int src_y_stride = x->src.y_stride, dst_y_stride = xd->dst.y_stride;
   int src_uv_stride = x->src.uv_stride, dst_uv_stride = xd->dst.uv_stride;
   const VP9_ENCODER_RTCD *rtcd = IF_RTCD(&cpi->rtcd);
-  unsigned int segment_id = xd->mode_info_context->mbmi.segment_id;
   int seg_ref_active;
   unsigned char ref_pred_flag;
   int n;
   TOKENEXTRA *tp[4];
   int skip[4];
   MODE_INFO *mi = x->e_mbd.mode_info_context;
+  unsigned int segment_id = mi->mbmi.segment_id;
   ENTROPY_CONTEXT_PLANES ta[4], tl[4];
 
   x->skip = 0;
@@ -2248,7 +2290,6 @@
                                        xd->dst.y_stride, xd->dst.uv_stride);
   }
 
-  assert(x->e_mbd.mode_info_context->mbmi.txfm_size == TX_8X8);
   for (n = 0; n < 4; n++) {
     int x_idx = n & 1, y_idx = n >> 1;
 
@@ -2264,13 +2305,7 @@
                           udst + x_idx * 8 + y_idx * 8 * dst_uv_stride,
                           vdst + x_idx * 8 + y_idx * 8 * dst_uv_stride,
                           dst_uv_stride);
-    vp9_transform_mb_8x8(x);
-    vp9_quantize_mb_8x8(x);
-    if (x->optimize) {
-      vp9_optimize_mby_8x8(x, rtcd);
-      vp9_optimize_mbuv_8x8(x, rtcd);
-    }
-    vp9_inverse_transform_mb_8x8(IF_RTCD(&rtcd->common->idct), &x->e_mbd);
+    vp9_fidct_mb(x, rtcd);
     vp9_recon_mby_s_c(&x->e_mbd,
                       dst + x_idx * 16 + y_idx * 16 * dst_y_stride);
     vp9_recon_mbuv_s_c(&x->e_mbd,
@@ -2313,5 +2348,21 @@
 
   xd->mode_info_context = mi;
   update_sb_skip_coeff_state(cpi, x, ta, tl, tp, t, skip);
+  if (cm->txfm_mode == TX_MODE_SELECT &&
+      !((cm->mb_no_coeff_skip && skip[0] && skip[1] && skip[2] && skip[3]) ||
+        (vp9_segfeature_active(xd, segment_id, SEG_LVL_EOB) &&
+         vp9_get_segdata(xd, segment_id, SEG_LVL_EOB) == 0))) {
+    cpi->txfm_count[mi->mbmi.txfm_size]++;
+  } else {
+    TX_SIZE sz = (cm->txfm_mode == TX_MODE_SELECT) ? TX_16X16 : cm->txfm_mode;
+    mi->mbmi.txfm_size = sz;
+    if (mb_col < cm->mb_cols - 1)
+      mi[1].mbmi.txfm_size = sz;
+    if (mb_row < cm->mb_rows - 1) {
+      mi[cm->mode_info_stride].mbmi.txfm_size = sz;
+      if (mb_col < cm->mb_cols - 1)
+        mi[cm->mode_info_stride + 1].mbmi.txfm_size = sz;
+    }
+  }
 }
 #endif
diff --git a/vp9/encoder/encodemb.c b/vp9/encoder/encodemb.c
index d828c51..71e81ed 100644
--- a/vp9/encoder/encodemb.c
+++ b/vp9/encoder/encodemb.c
@@ -884,13 +884,10 @@
   vp9_optimize_mbuv_8x8(x, rtcd);
 }
 
-void vp9_encode_inter16x16(const VP9_ENCODER_RTCD *rtcd, MACROBLOCK *x) {
-  MACROBLOCKD *xd = &x->e_mbd;
+void vp9_fidct_mb(MACROBLOCK *x, const VP9_ENCODER_RTCD *rtcd) {
+  MACROBLOCKD *const xd = &x->e_mbd;
   TX_SIZE tx_size = xd->mode_info_context->mbmi.txfm_size;
 
-  vp9_build_inter_predictors_mb(xd);
-  subtract_mb(rtcd, x);
-
   if (tx_size == TX_16X16) {
     vp9_transform_mb_16x16(x);
     vp9_quantize_mb_16x16(x);
@@ -924,7 +921,14 @@
       optimize_mb_4x4(x, rtcd);
     vp9_inverse_transform_mb_4x4(IF_RTCD(&rtcd->common->idct), xd);
   }
+}
 
+void vp9_encode_inter16x16(const VP9_ENCODER_RTCD *rtcd, MACROBLOCK *x) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+
+  vp9_build_inter_predictors_mb(xd);
+  subtract_mb(rtcd, x);
+  vp9_fidct_mb(x, rtcd);
   vp9_recon_mb(xd);
 }
 
diff --git a/vp9/encoder/encodemb.h b/vp9/encoder/encodemb.h
index 8a3d38f..e59ed8a 100644
--- a/vp9/encoder/encodemb.h
+++ b/vp9/encoder/encodemb.h
@@ -55,6 +55,8 @@
 void vp9_transform_mby_16x16(MACROBLOCK *x);
 void vp9_optimize_mby_16x16(MACROBLOCK *x, const struct VP9_ENCODER_RTCD *rtcd);
 
+void vp9_fidct_mb(MACROBLOCK *x, const struct VP9_ENCODER_RTCD *rtcd);
+
 void vp9_subtract_4b_c(BLOCK *be, BLOCKD *bd, int pitch);
 
 #if CONFIG_SUPERBLOCKS
diff --git a/vp9/encoder/rdopt.c b/vp9/encoder/rdopt.c
index 19b96af..ef92b62 100644
--- a/vp9/encoder/rdopt.c
+++ b/vp9/encoder/rdopt.c
@@ -610,7 +610,7 @@
   return cost;
 }
 
-static int rdcost_mby_4x4(MACROBLOCK *mb) {
+static int rdcost_mby_4x4(MACROBLOCK *mb, int backup) {
   int cost = 0;
   int b;
   MACROBLOCKD *xd = &mb->e_mbd;
@@ -618,11 +618,16 @@
   ENTROPY_CONTEXT *ta;
   ENTROPY_CONTEXT *tl;
 
-  vpx_memcpy(&t_above, xd->above_context, sizeof(ENTROPY_CONTEXT_PLANES));
-  vpx_memcpy(&t_left, xd->left_context, sizeof(ENTROPY_CONTEXT_PLANES));
+  if (backup) {
+    vpx_memcpy(&t_above, xd->above_context, sizeof(ENTROPY_CONTEXT_PLANES));
+    vpx_memcpy(&t_left, xd->left_context, sizeof(ENTROPY_CONTEXT_PLANES));
 
-  ta = (ENTROPY_CONTEXT *)&t_above;
-  tl = (ENTROPY_CONTEXT *)&t_left;
+    ta = (ENTROPY_CONTEXT *)&t_above;
+    tl = (ENTROPY_CONTEXT *)&t_left;
+  } else {
+    ta = (ENTROPY_CONTEXT *)xd->above_context;
+    tl = (ENTROPY_CONTEXT *)xd->left_context;
+  }
 
   for (b = 0; b < 16; b++)
     cost += cost_coeffs(mb, xd->block + b, PLANE_TYPE_Y_NO_DC,
@@ -640,7 +645,7 @@
                                 int *Rate,
                                 int *Distortion,
                                 const VP9_ENCODER_RTCD *rtcd,
-                                int *skippable) {
+                                int *skippable, int backup) {
   int b;
   MACROBLOCKD *const xd = &mb->e_mbd;
   BLOCK   *const mb_y2 = mb->block + 24;
@@ -674,7 +679,7 @@
 
   *Distortion = (d >> 2);
   // rate
-  *Rate = rdcost_mby_4x4(mb);
+  *Rate = rdcost_mby_4x4(mb, backup);
   *skippable = vp9_mby_is_skippable_4x4(&mb->e_mbd, 1);
 }
 
@@ -711,7 +716,7 @@
                                 int *Rate,
                                 int *Distortion,
                                 const VP9_ENCODER_RTCD *rtcd,
-                                int *skippable) {
+                                int *skippable, int backup) {
   MACROBLOCKD *const xd = &mb->e_mbd;
   BLOCK   *const mb_y2 = mb->block + 24;
   BLOCKD *const x_y2  = xd->block + 24;
@@ -735,28 +740,34 @@
 
   *Distortion = (d >> 2);
   // rate
-  *Rate = rdcost_mby_8x8(mb, 1);
+  *Rate = rdcost_mby_8x8(mb, backup);
   *skippable = vp9_mby_is_skippable_8x8(&mb->e_mbd, 1);
 }
 
-static int rdcost_mby_16x16(MACROBLOCK *mb) {
+static int rdcost_mby_16x16(MACROBLOCK *mb, int backup) {
   int cost;
   MACROBLOCKD *xd = &mb->e_mbd;
   ENTROPY_CONTEXT_PLANES t_above, t_left;
   ENTROPY_CONTEXT *ta, *tl;
 
-  vpx_memcpy(&t_above, xd->above_context, sizeof(ENTROPY_CONTEXT_PLANES));
-  vpx_memcpy(&t_left, xd->left_context, sizeof(ENTROPY_CONTEXT_PLANES));
+  if (backup) {
+    vpx_memcpy(&t_above, xd->above_context, sizeof(ENTROPY_CONTEXT_PLANES));
+    vpx_memcpy(&t_left, xd->left_context, sizeof(ENTROPY_CONTEXT_PLANES));
 
-  ta = (ENTROPY_CONTEXT *)&t_above;
-  tl = (ENTROPY_CONTEXT *)&t_left;
+    ta = (ENTROPY_CONTEXT *)&t_above;
+    tl = (ENTROPY_CONTEXT *)&t_left;
+  } else {
+    ta = (ENTROPY_CONTEXT *)xd->above_context;
+    tl = (ENTROPY_CONTEXT *)xd->left_context;
+  }
 
   cost = cost_coeffs(mb, xd->block, PLANE_TYPE_Y_WITH_DC, ta, tl, TX_16X16);
   return cost;
 }
 
 static void macro_block_yrd_16x16(MACROBLOCK *mb, int *Rate, int *Distortion,
-                                  const VP9_ENCODER_RTCD *rtcd, int *skippable) {
+                                  const VP9_ENCODER_RTCD *rtcd, int *skippable,
+                                  int backup) {
   int d;
   MACROBLOCKD *xd = &mb->e_mbd;
   BLOCKD *b  = &mb->e_mbd.block[0];
@@ -780,125 +791,97 @@
 
   *Distortion = (d >> 2);
   // rate
-  *Rate = rdcost_mby_16x16(mb);
+  *Rate = rdcost_mby_16x16(mb, backup);
   *skippable = vp9_mby_is_skippable_16x16(&mb->e_mbd);
 }
 
+static void choose_txfm_size_from_rd(VP9_COMP *cpi, MACROBLOCK *x,
+                                     int r[2][TX_SIZE_MAX], int *rate,
+                                     int d[TX_SIZE_MAX], int *distortion,
+                                     int s[TX_SIZE_MAX], int *skip,
+                                     int64_t txfm_cache[NB_TXFM_MODES]) {
+  VP9_COMMON *const cm = &cpi->common;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *const mbmi = &xd->mode_info_context->mbmi;
+  vp9_prob skip_prob = cm->mb_no_coeff_skip ?
+                       vp9_get_pred_prob(cm, xd, PRED_MBSKIP) : 128;
+  int64_t rd[2][TX_SIZE_MAX];
+  int n;
+
+  r[1][TX_16X16] = r[0][TX_16X16] + vp9_cost_one(cm->prob_tx[0]) +
+                   vp9_cost_one(cm->prob_tx[1]);
+  r[1][TX_8X8]   = r[0][TX_8X8] + vp9_cost_one(cm->prob_tx[0]) +
+                   vp9_cost_zero(cm->prob_tx[1]);
+  r[1][TX_4X4]   = r[0][TX_4X4] + vp9_cost_zero(cm->prob_tx[0]);
+
+  if (cm->mb_no_coeff_skip) {
+    int s0, s1;
+
+    assert(skip_prob > 0);
+    s0 = vp9_cost_bit(skip_prob, 0);
+    s1 = vp9_cost_bit(skip_prob, 1);
+
+    for (n = TX_4X4; n <= TX_16X16; n++) {
+      if (s[n]) {
+        rd[0][n] = rd[1][n] = RDCOST(x->rdmult, x->rddiv, s1, d[n]);
+      } else {
+        rd[0][n] = RDCOST(x->rdmult, x->rddiv, r[0][n] + s0, d[n]);
+        rd[1][n] = RDCOST(x->rdmult, x->rddiv, r[1][n] + s0, d[n]);
+      }
+    }
+  } else {
+    for (n = TX_4X4; n <= TX_16X16; n++) {
+      rd[0][n] = RDCOST(x->rdmult, x->rddiv, r[0][n], d[n]);
+      rd[1][n] = RDCOST(x->rdmult, x->rddiv, r[1][n], d[n]);
+    }
+  }
+
+  if ( cm->txfm_mode == ALLOW_16X16 ||
+      (cm->txfm_mode == TX_MODE_SELECT &&
+       rd[1][TX_16X16] < rd[1][TX_8X8] && rd[1][TX_16X16] < rd[1][TX_4X4])) {
+    mbmi->txfm_size = TX_16X16;
+  } else if (cm->txfm_mode == ALLOW_8X8 ||
+           (cm->txfm_mode == TX_MODE_SELECT && rd[1][TX_8X8] < rd[1][TX_4X4])) {
+    mbmi->txfm_size = TX_8X8;
+  } else {
+    assert(cm->txfm_mode == ONLY_4X4 ||
+          (cm->txfm_mode == TX_MODE_SELECT && rd[1][TX_4X4] <= rd[1][TX_8X8]));
+    mbmi->txfm_size = TX_4X4;
+  }
+
+  *distortion = d[mbmi->txfm_size];
+  *rate       = r[cm->txfm_mode == TX_MODE_SELECT][mbmi->txfm_size];
+  *skip       = s[mbmi->txfm_size];
+
+  txfm_cache[ONLY_4X4] = rd[0][TX_4X4];
+  txfm_cache[ALLOW_8X8] = rd[0][TX_8X8];
+  txfm_cache[ALLOW_16X16] = rd[0][TX_16X16];
+  if (rd[1][TX_16X16] < rd[1][TX_8X8] && rd[1][TX_16X16] < rd[1][TX_4X4])
+    txfm_cache[TX_MODE_SELECT] = rd[1][TX_16X16];
+  else
+    txfm_cache[TX_MODE_SELECT] = rd[1][TX_4X4] < rd[1][TX_8X8] ?
+                                 rd[1][TX_4X4] : rd[1][TX_8X8];
+}
+
 static void macro_block_yrd(VP9_COMP *cpi, MACROBLOCK *x, int *rate,
                             int *distortion, int *skippable,
                             int64_t txfm_cache[NB_TXFM_MODES]) {
   VP9_COMMON *cm = &cpi->common;
-  MB_MODE_INFO *mbmi = &x->e_mbd.mode_info_context->mbmi;
-
-  MACROBLOCKD *xd = &x->e_mbd;
-  int can_skip = cm->mb_no_coeff_skip;
-  vp9_prob skip_prob = can_skip ? vp9_get_pred_prob(cm, xd, PRED_MBSKIP) : 128;
-  int s0, s1;
-  int r4x4, r4x4s, r8x8, r8x8s, d4x4, d8x8, s4x4, s8x8;
-  int64_t rd4x4, rd8x8, rd4x4s, rd8x8s;
-  int d16x16, r16x16, r16x16s, s16x16;
-  int64_t rd16x16, rd16x16s;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  int r[2][TX_SIZE_MAX], d[TX_SIZE_MAX], s[TX_SIZE_MAX];
 
   vp9_subtract_mby(x->src_diff, *(x->block[0].base_src), xd->predictor,
                    x->block[0].src_stride);
 
-  if (skip_prob == 0)
-    skip_prob = 1;
-  s0 = vp9_cost_bit(skip_prob, 0);
-  s1 = vp9_cost_bit(skip_prob, 1);
-  macro_block_yrd_16x16(x, &r16x16, &d16x16, IF_RTCD(&cpi->rtcd), &s16x16);
-  if (can_skip) {
-    if (s16x16) {
-      rd16x16 = RDCOST(x->rdmult, x->rddiv, s1, d16x16);
-    } else {
-      rd16x16 = RDCOST(x->rdmult, x->rddiv, r16x16 + s0, d16x16);
-    }
-  } else {
-    rd16x16 = RDCOST(x->rdmult, x->rddiv, r16x16, d16x16);
-  }
-  r16x16s = r16x16 + vp9_cost_one(cm->prob_tx[0]) + vp9_cost_one(cm->prob_tx[1]);
-  if (can_skip) {
-    if (s16x16) {
-      rd16x16s = RDCOST(x->rdmult, x->rddiv, s1, d16x16);
-    } else {
-      rd16x16s = RDCOST(x->rdmult, x->rddiv, r16x16s + s0, d16x16);
-    }
-  } else {
-    rd16x16s = RDCOST(x->rdmult, x->rddiv, r16x16s, d16x16);
-  }
-  macro_block_yrd_8x8(x, &r8x8, &d8x8, IF_RTCD(&cpi->rtcd), &s8x8);
-  if (can_skip) {
-    if (s8x8) {
-      rd8x8 = RDCOST(x->rdmult, x->rddiv, s1, d8x8);
-    } else {
-      rd8x8 = RDCOST(x->rdmult, x->rddiv, r8x8 + s0, d8x8);
-    }
-  } else {
-    rd8x8 = RDCOST(x->rdmult, x->rddiv, r8x8, d8x8);
-  }
-  r8x8s = r8x8 + vp9_cost_one(cm->prob_tx[0]);
-  r8x8s += vp9_cost_zero(cm->prob_tx[1]);
-  if (can_skip) {
-    if (s8x8) {
-      rd8x8s = RDCOST(x->rdmult, x->rddiv, s1, d8x8);
-    } else {
-      rd8x8s = RDCOST(x->rdmult, x->rddiv, r8x8s + s0, d8x8);
-    }
-  } else {
-    rd8x8s = RDCOST(x->rdmult, x->rddiv, r8x8s, d8x8);
-  }
-  macro_block_yrd_4x4(x, &r4x4, &d4x4, IF_RTCD(&cpi->rtcd), &s4x4);
-  if (can_skip) {
-    if (s4x4) {
-      rd4x4 = RDCOST(x->rdmult, x->rddiv, s1, d4x4);
-    } else {
-      rd4x4 = RDCOST(x->rdmult, x->rddiv, r4x4 + s0, d4x4);
-    }
-  } else {
-    rd4x4 = RDCOST(x->rdmult, x->rddiv, r4x4, d4x4);
-  }
-  r4x4s = r4x4 + vp9_cost_zero(cm->prob_tx[0]);
-  if (can_skip) {
-    if (s4x4) {
-      rd4x4s = RDCOST(x->rdmult, x->rddiv, s1, d4x4);
-    } else {
-      rd4x4s = RDCOST(x->rdmult, x->rddiv, r4x4s + s0, d4x4);
-    }
-  } else {
-    rd4x4s = RDCOST(x->rdmult, x->rddiv, r4x4s, d4x4);
-  }
+  macro_block_yrd_16x16(x, &r[0][TX_16X16], &d[TX_16X16],
+                        IF_RTCD(&cpi->rtcd), &s[TX_16X16], 1);
+  macro_block_yrd_8x8(x, &r[0][TX_8X8], &d[TX_8X8],
+                      IF_RTCD(&cpi->rtcd), &s[TX_8X8], 1);
+  macro_block_yrd_4x4(x, &r[0][TX_4X4], &d[TX_4X4],
+                      IF_RTCD(&cpi->rtcd), &s[TX_4X4], 1);
 
-  if ( cpi->common.txfm_mode == ALLOW_16X16 ||
-      (cpi->common.txfm_mode == TX_MODE_SELECT &&
-       rd16x16s < rd8x8s && rd16x16s < rd4x4s)) {
-    mbmi->txfm_size = TX_16X16;
-    *skippable = s16x16;
-    *distortion = d16x16;
-    *rate = (cpi->common.txfm_mode == ALLOW_16X16) ? r16x16 : r16x16s;
-  } else
-  if ( cpi->common.txfm_mode == ALLOW_8X8 ||
-      (cpi->common.txfm_mode == TX_MODE_SELECT && rd8x8s < rd4x4s)) {
-    mbmi->txfm_size = TX_8X8;
-    *skippable = s8x8;
-    *distortion = d8x8;
-    *rate = (cpi->common.txfm_mode == ALLOW_8X8) ? r8x8 : r8x8s;
-  } else {
-    assert(cpi->common.txfm_mode == ONLY_4X4 ||
-           (cpi->common.txfm_mode == TX_MODE_SELECT && rd4x4s <= rd8x8s));
-    mbmi->txfm_size = TX_4X4;
-    *skippable = s4x4;
-    *distortion = d4x4;
-    *rate = (cpi->common.txfm_mode == ONLY_4X4) ? r4x4 : r4x4s;
-  }
-
-  txfm_cache[ONLY_4X4] = rd4x4;
-  txfm_cache[ALLOW_8X8] = rd8x8;
-  txfm_cache[ALLOW_16X16] = rd16x16;
-  if (rd16x16s < rd8x8s && rd16x16s < rd4x4s)
-    txfm_cache[TX_MODE_SELECT] = rd16x16s;
-  else
-    txfm_cache[TX_MODE_SELECT] = rd4x4s < rd8x8s ? rd4x4s : rd8x8s;
-
+  choose_txfm_size_from_rd(cpi, x, r, rate, d, distortion, s, skippable,
+                           txfm_cache);
 }
 
 static void copy_predictor(unsigned char *dst, const unsigned char *predictor) {
@@ -911,62 +894,61 @@
 }
 
 #if CONFIG_SUPERBLOCKS
-static void super_block_yrd_8x8(MACROBLOCK *x,
-                                int *rate,
-                                int *distortion,
-                                const VP9_ENCODER_RTCD *rtcd, int *skip)
-{
+static void super_block_yrd(VP9_COMP *cpi,
+                            MACROBLOCK *x, int *rate, int *distortion,
+                            const VP9_ENCODER_RTCD *rtcd, int *skip,
+                            int64_t txfm_cache[NB_TXFM_MODES]) {
   MACROBLOCKD *const xd = &x->e_mbd;
-  BLOCK *const by2 = x->block + 24;
-  BLOCKD *const bdy2  = xd->block + 24;
-  int d = 0, r = 0, n;
+  int r[2][TX_SIZE_MAX], d[TX_SIZE_MAX], s[TX_SIZE_MAX], n;
   const uint8_t *src = x->src.y_buffer, *dst = xd->dst.y_buffer;
   int src_y_stride = x->src.y_stride, dst_y_stride = xd->dst.y_stride;
-  ENTROPY_CONTEXT_PLANES *ta = xd->above_context;
-  ENTROPY_CONTEXT_PLANES *tl = xd->left_context;
-  ENTROPY_CONTEXT_PLANES t_above[2];
-  ENTROPY_CONTEXT_PLANES t_left[2];
-  int skippable = 1;
+  ENTROPY_CONTEXT_PLANES t_above[3][2], *orig_above = xd->above_context;
+  ENTROPY_CONTEXT_PLANES t_left[3][2], *orig_left = xd->left_context;
 
-  vpx_memcpy(t_above, xd->above_context, sizeof(t_above));
-  vpx_memcpy(t_left, xd->left_context, sizeof(t_left));
+  for (n = TX_4X4; n <= TX_16X16; n++) {
+    vpx_memcpy(t_above[n], xd->above_context, sizeof(t_above[n]));
+    vpx_memcpy(t_left[n], xd->left_context, sizeof(t_left[n]));
+    r[0][n] = 0;
+    d[n] = 0;
+    s[n] = 1;
+  }
 
   for (n = 0; n < 4; n++) {
     int x_idx = n & 1, y_idx = n >> 1;
+    int r_tmp, d_tmp, s_tmp;
 
     vp9_subtract_mby_s_c(x->src_diff,
                          src + x_idx * 16 + y_idx * 16 * src_y_stride,
                          src_y_stride,
                          dst + x_idx * 16 + y_idx * 16 * dst_y_stride,
                          dst_y_stride);
-    vp9_transform_mby_8x8(x);
-    vp9_quantize_mby_8x8(x);
 
-    /* remove 1st order dc to properly combine 1st/2nd order distortion */
-    x->coeff[  0] = 0;
-    x->coeff[ 64] = 0;
-    x->coeff[128] = 0;
-    x->coeff[192] = 0;
-    xd->dqcoeff[  0] = 0;
-    xd->dqcoeff[ 64] = 0;
-    xd->dqcoeff[128] = 0;
-    xd->dqcoeff[192] = 0;
+    xd->above_context = &t_above[TX_16X16][x_idx];
+    xd->left_context = &t_left[TX_16X16][y_idx];
+    macro_block_yrd_16x16(x, &r_tmp, &d_tmp, IF_RTCD(&cpi->rtcd), &s_tmp, 0);
+    d[TX_16X16] += d_tmp;
+    r[0][TX_16X16] += r_tmp;
+    s[TX_16X16] = s[TX_16X16] && s_tmp;
 
-    d += vp9_mbblock_error(x, 0);
-    d += vp9_block_error(by2->coeff, bdy2->dqcoeff, 16);
-    xd->above_context = ta + x_idx;
-    xd->left_context = tl + y_idx;
-    r += rdcost_mby_8x8(x, 0);
-    skippable = skippable && vp9_mby_is_skippable_8x8(xd, 1);
+    xd->above_context = &t_above[TX_4X4][x_idx];
+    xd->left_context = &t_left[TX_4X4][y_idx];
+    macro_block_yrd_4x4(x, &r_tmp, &d_tmp, IF_RTCD(&cpi->rtcd), &s_tmp, 0);
+    d[TX_4X4] += d_tmp;
+    r[0][TX_4X4] += r_tmp;
+    s[TX_4X4] = s[TX_4X4] && s_tmp;
+
+    xd->above_context = &t_above[TX_8X8][x_idx];
+    xd->left_context = &t_left[TX_8X8][y_idx];
+    macro_block_yrd_8x8(x, &r_tmp, &d_tmp, IF_RTCD(&cpi->rtcd), &s_tmp, 0);
+    d[TX_8X8] += d_tmp;
+    r[0][TX_8X8] += r_tmp;
+    s[TX_8X8] = s[TX_8X8] && s_tmp;
   }
 
-  *distortion = (d >> 2);
-  *rate       = r;
-  if (skip) *skip = skippable;
-  xd->above_context = ta;
-  xd->left_context = tl;
-  vpx_memcpy(xd->above_context, &t_above, sizeof(t_above));
-  vpx_memcpy(xd->left_context, &t_left, sizeof(t_left));
+  choose_txfm_size_from_rd(cpi, x, r, rate, d, distortion, s, skip, txfm_cache);
+
+  xd->above_context = orig_above;
+  xd->left_context = orig_left;
 }
 #endif
 
@@ -1190,7 +1172,8 @@
                                       int *rate,
                                       int *rate_tokenonly,
                                       int *distortion,
-                                      int *skippable) {
+                                      int *skippable,
+                                      int64_t txfm_cache[NB_TXFM_MODES]) {
   MB_PREDICTION_MODE mode;
   MB_PREDICTION_MODE UNINITIALIZED_IS_SAFE(mode_selected);
   int this_rate, this_rate_tokenonly;
@@ -1202,8 +1185,8 @@
     x->e_mbd.mode_info_context->mbmi.mode = mode;
     vp9_build_intra_predictors_sby_s(&x->e_mbd);
 
-    super_block_yrd_8x8(x, &this_rate_tokenonly,
-                        &this_distortion, IF_RTCD(&cpi->rtcd), &s);
+    super_block_yrd(cpi, x, &this_rate_tokenonly,
+                    &this_distortion, IF_RTCD(&cpi->rtcd), &s, txfm_cache);
     this_rate = this_rate_tokenonly +
                 x->mbmode_cost[x->e_mbd.frame_type]
                               [x->e_mbd.mode_info_context->mbmi.mode];
@@ -1239,12 +1222,12 @@
   MB_PREDICTION_MODE mode2;
   MB_PREDICTION_MODE UNINITIALIZED_IS_SAFE(mode2_selected);
 #endif
-  MB_MODE_INFO * mbmi = &x->e_mbd.mode_info_context->mbmi;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *const mbmi = &xd->mode_info_context->mbmi;
   int rate, ratey;
   int distortion, skip;
   int64_t best_rd = INT64_MAX;
   int64_t this_rd;
-  MACROBLOCKD *xd = &x->e_mbd;
 
   int i;
   for (i = 0; i < NB_TXFM_MODES; i++)
@@ -1261,11 +1244,11 @@
       mbmi->second_mode = mode2;
       if (mode2 == (MB_PREDICTION_MODE)(DC_PRED - 1)) {
 #endif
-        vp9_build_intra_predictors_mby(&x->e_mbd);
+        vp9_build_intra_predictors_mby(xd);
 #if CONFIG_COMP_INTRA_PRED
       } else {
         continue; // i.e. disable for now
-        vp9_build_comp_intra_predictors_mby(&x->e_mbd);
+        vp9_build_comp_intra_predictors_mby(xd);
       }
 #endif
 
@@ -1273,7 +1256,7 @@
 
       // FIXME add compoundmode cost
       // FIXME add rate for mode2
-      rate = ratey + x->mbmode_cost[x->e_mbd.frame_type][mbmi->mode];
+      rate = ratey + x->mbmode_cost[xd->frame_type][mbmi->mode];
 
       this_rd = RDCOST(x->rdmult, x->rddiv, rate, distortion);
 
@@ -1519,18 +1502,23 @@
   return RDCOST(mb->rdmult, mb->rddiv, cost, distortion);
 }
 
-static int rd_cost_mbuv(MACROBLOCK *mb) {
+static int rd_cost_mbuv_4x4(MACROBLOCK *mb, int backup) {
   int b;
   int cost = 0;
   MACROBLOCKD *xd = &mb->e_mbd;
   ENTROPY_CONTEXT_PLANES t_above, t_left;
   ENTROPY_CONTEXT *ta, *tl;
 
-  vpx_memcpy(&t_above, xd->above_context, sizeof(ENTROPY_CONTEXT_PLANES));
-  vpx_memcpy(&t_left, xd->left_context, sizeof(ENTROPY_CONTEXT_PLANES));
+  if (backup) {
+    vpx_memcpy(&t_above, xd->above_context, sizeof(ENTROPY_CONTEXT_PLANES));
+    vpx_memcpy(&t_left, xd->left_context, sizeof(ENTROPY_CONTEXT_PLANES));
 
-  ta = (ENTROPY_CONTEXT *)&t_above;
-  tl = (ENTROPY_CONTEXT *)&t_left;
+    ta = (ENTROPY_CONTEXT *)&t_above;
+    tl = (ENTROPY_CONTEXT *)&t_left;
+  } else {
+    ta = (ENTROPY_CONTEXT *)xd->above_context;
+    tl = (ENTROPY_CONTEXT *)xd->left_context;
+  }
 
   for (b = 16; b < 24; b++)
     cost += cost_coeffs(mb, xd->block + b, PLANE_TYPE_UV,
@@ -1541,15 +1529,13 @@
 }
 
 
-static int64_t rd_inter16x16_uv(VP9_COMP *cpi, MACROBLOCK *x, int *rate,
-                                int *distortion, int fullpixel, int *skip) {
-  vp9_subtract_mbuv(x->src_diff, x->src.u_buffer, x->src.v_buffer,
-                    x->e_mbd.predictor, x->src.uv_stride);
-
+static int64_t rd_inter16x16_uv_4x4(VP9_COMP *cpi, MACROBLOCK *x, int *rate,
+                                    int *distortion, int fullpixel, int *skip,
+                                    int do_ctx_backup) {
   vp9_transform_mbuv_4x4(x);
   vp9_quantize_mbuv_4x4(x);
 
-  *rate       = rd_cost_mbuv(x);
+  *rate       = rd_cost_mbuv_4x4(x, do_ctx_backup);
   *distortion = vp9_mbuverror(x) / 4;
   *skip       = vp9_mbuv_is_skippable_4x4(&x->e_mbd);
 
@@ -1582,10 +1568,24 @@
   return cost;
 }
 
+static int64_t rd_inter16x16_uv_8x8(VP9_COMP *cpi, MACROBLOCK *x, int *rate,
+                                    int *distortion, int fullpixel, int *skip,
+                                    int do_ctx_backup) {
+  vp9_transform_mbuv_8x8(x);
+  vp9_quantize_mbuv_8x8(x);
+
+  *rate       = rd_cost_mbuv_8x8(x, do_ctx_backup);
+  *distortion = vp9_mbuverror(x) / 4;
+  *skip       = vp9_mbuv_is_skippable_8x8(&x->e_mbd);
+
+  return RDCOST(x->rdmult, x->rddiv, *rate, *distortion);
+}
+
 #if CONFIG_SUPERBLOCKS
-static int64_t rd_inter32x32_uv_8x8(VP9_COMP *cpi, MACROBLOCK *x, int *rate,
+static int64_t rd_inter32x32_uv(VP9_COMP *cpi, MACROBLOCK *x, int *rate,
                                 int *distortion, int fullpixel, int *skip) {
   MACROBLOCKD *xd = &x->e_mbd;
+  MB_MODE_INFO *const mbmi = &xd->mode_info_context->mbmi;
   int n, r = 0, d = 0;
   const uint8_t *usrc = x->src.u_buffer, *udst = xd->dst.u_buffer;
   const uint8_t *vsrc = x->src.v_buffer, *vdst = xd->dst.v_buffer;
@@ -1600,7 +1600,10 @@
 
   for (n = 0; n < 4; n++) {
     int x_idx = n & 1, y_idx = n >> 1;
+    int d_tmp, s_tmp, r_tmp;
 
+    xd->above_context = ta + x_idx;
+    xd->left_context = tl + y_idx;
     vp9_subtract_mbuv_s_c(x->src_diff,
                           usrc + x_idx * 8 + y_idx * 8 * src_uv_stride,
                           vsrc + x_idx * 8 + y_idx * 8 * src_uv_stride,
@@ -1609,58 +1612,35 @@
                           vdst + x_idx * 8 + y_idx * 8 * dst_uv_stride,
                           dst_uv_stride);
 
-    vp9_transform_mbuv_8x8(x);
-    vp9_quantize_mbuv_8x8(x);
+    if (mbmi->txfm_size == TX_4X4) {
+      rd_inter16x16_uv_4x4(cpi, x, &r_tmp, &d_tmp, fullpixel, &s_tmp, 0);
+    } else {
+      rd_inter16x16_uv_8x8(cpi, x, &r_tmp, &d_tmp, fullpixel, &s_tmp, 0);
+    }
 
-    xd->above_context = ta + x_idx;
-    xd->left_context = tl + y_idx;
-    r += rd_cost_mbuv_8x8(x, 0);
-    d += vp9_mbuverror(x) / 4;
-    skippable = skippable && vp9_mbuv_is_skippable_8x8(xd);
+    r += r_tmp;
+    d += d_tmp;
+    skippable = skippable && s_tmp;
   }
 
   *rate = r;
   *distortion = d;
-  if (skip) *skip = skippable;
+  *skip = skippable;
   xd->left_context = tl;
   xd->above_context = ta;
   memcpy(xd->above_context, t_above, sizeof(t_above));
   memcpy(xd->left_context, t_left, sizeof(t_left));
 
-  return RDCOST(x->rdmult, x->rddiv, *rate, *distortion);
+  return RDCOST(x->rdmult, x->rddiv, r, d);
 }
 #endif
 
-static int64_t rd_inter16x16_uv_8x8(VP9_COMP *cpi, MACROBLOCK *x, int *rate,
-                                    int *distortion, int fullpixel, int *skip) {
-  vp9_subtract_mbuv(x->src_diff, x->src.u_buffer, x->src.v_buffer,
-                    x->e_mbd.predictor, x->src.uv_stride);
-
-  vp9_transform_mbuv_8x8(x);
-  vp9_quantize_mbuv_8x8(x);
-
-  *rate       = rd_cost_mbuv_8x8(x, 1);
-  *distortion = vp9_mbuverror(x) / 4;
-  *skip       = vp9_mbuv_is_skippable_8x8(&x->e_mbd);
-
-  return RDCOST(x->rdmult, x->rddiv, *rate, *distortion);
-}
-
-
 static int64_t rd_inter4x4_uv(VP9_COMP *cpi, MACROBLOCK *x, int *rate,
-                              int *distortion, int *skippable, int fullpixel) {
+                              int *distortion, int *skip, int fullpixel) {
   vp9_build_inter4x4_predictors_mbuv(&x->e_mbd);
   vp9_subtract_mbuv(x->src_diff, x->src.u_buffer, x->src.v_buffer,
                     x->e_mbd.predictor, x->src.uv_stride);
-
-  vp9_transform_mbuv_4x4(x);
-  vp9_quantize_mbuv_4x4(x);
-
-  *rate       = rd_cost_mbuv(x);
-  *distortion = vp9_mbuverror(x) / 4;
-  *skippable  = vp9_mbuv_is_skippable_4x4(&x->e_mbd);
-
-  return RDCOST(x->rdmult, x->rddiv, *rate, *distortion);
+  return rd_inter16x16_uv_4x4(cpi, x, rate, distortion, fullpixel, skip, 1);
 }
 
 static void rd_pick_intra_mbuv_mode(VP9_COMP *cpi,
@@ -1707,7 +1687,7 @@
       vp9_transform_mbuv_4x4(x);
       vp9_quantize_mbuv_4x4(x);
 
-      rate_to = rd_cost_mbuv(x);
+      rate_to = rd_cost_mbuv_4x4(x, 1);
       rate = rate_to
              + x->intra_uv_mode_cost[x->e_mbd.frame_type][mbmi->uv_mode];
 
@@ -2434,8 +2414,6 @@
 
     // store everything needed to come back to this!!
     for (i = 0; i < 16; i++) {
-      BLOCKD *bd = &x->e_mbd.block[i];
-
       bsi->mvs[i].as_mv = x->partition_info->bmi[i].mv.as_mv;
       if (mbmi->second_ref_frame)
         bsi->second_mvs[i].as_mv = x->partition_info->bmi[i].second_mv.as_mv;
@@ -3114,12 +3092,9 @@
                                  PARTITION_INFO *partition,
                                  int_mv *ref_mv,
                                  int_mv *second_ref_mv,
-                                 int single_pred_diff,
-                                 int comp_pred_diff,
-                                 int hybrid_pred_diff,
+                                 int64_t comp_pred_diff[NB_PREDICTION_TYPES],
                                  int64_t txfm_size_diff[NB_TXFM_MODES]) {
-  MACROBLOCKD *xd = &x->e_mbd;
-  MB_MODE_INFO *mbmi = &xd->mode_info_context->mbmi;
+  MACROBLOCKD *const xd = &x->e_mbd;
 
   // Take a snapshot of the coding context so it can be
   // restored if we decide to encode this way
@@ -3135,15 +3110,11 @@
   // ctx[mb_index].rddiv = x->rddiv;
   // ctx[mb_index].rdmult = x->rdmult;
 
-  ctx->single_pred_diff = single_pred_diff;
-  ctx->comp_pred_diff   = comp_pred_diff;
-  ctx->hybrid_pred_diff = hybrid_pred_diff;
+  ctx->single_pred_diff = comp_pred_diff[SINGLE_PREDICTION_ONLY];
+  ctx->comp_pred_diff   = comp_pred_diff[COMP_PREDICTION_ONLY];
+  ctx->hybrid_pred_diff = comp_pred_diff[HYBRID_PREDICTION];
 
-  if (txfm_size_diff) {
-    memcpy(ctx->txfm_rd_diff, txfm_size_diff, sizeof(ctx->txfm_rd_diff));
-  } else {
-    memset(ctx->txfm_rd_diff, 0, sizeof(ctx->txfm_rd_diff));
-  }
+  memcpy(ctx->txfm_rd_diff, txfm_size_diff, sizeof(ctx->txfm_rd_diff));
 }
 
 static void inter_mode_cost(VP9_COMP *cpi, MACROBLOCK *x, int this_mode,
@@ -3159,12 +3130,15 @@
   *distortion2 += *distortion;
 
   // UV cost and distortion
+  vp9_subtract_mbuv(x->src_diff, x->src.u_buffer, x->src.v_buffer,
+                    x->e_mbd.predictor, x->src.uv_stride);
   if (x->e_mbd.mode_info_context->mbmi.txfm_size != TX_4X4)
     rd_inter16x16_uv_8x8(cpi, x, rate_uv, distortion_uv,
-                         cpi->common.full_pixel, &uv_skippable);
+                         cpi->common.full_pixel, &uv_skippable, 1);
   else
-    rd_inter16x16_uv(cpi, x, rate_uv, distortion_uv, cpi->common.full_pixel,
-                     &uv_skippable);
+    rd_inter16x16_uv_4x4(cpi, x, rate_uv, distortion_uv,
+                         cpi->common.full_pixel, &uv_skippable, 1);
+
   *rate2 += *rate_uv;
   *distortion2 += *distortion_uv;
   *skippable = y_skippable && uv_skippable;
@@ -3183,8 +3157,8 @@
                                unsigned char *u_buffer[4],
                                unsigned char *v_buffer[4]) {
   YV12_BUFFER_CONFIG *yv12 = &cpi->common.yv12_fb[idx];
-  MACROBLOCKD *xd = &x->e_mbd;
-  MB_MODE_INFO * mbmi = &xd->mode_info_context->mbmi;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *const mbmi = &xd->mode_info_context->mbmi;
 
   vp9_find_near_mvs(xd, xd->mode_info_context,
                     xd->prev_mode_info_context,
@@ -3435,14 +3409,14 @@
 #if CONFIG_SUPERBLOCKS
       int skippable_y, skippable_uv;
 
-      // Y cost and distortion - FIXME support other transform sizes
-      super_block_yrd_8x8(x, rate_y, distortion_y,
-                          IF_RTCD(&cpi->rtcd), &skippable_y);
+      // Y cost and distortion
+      super_block_yrd(cpi, x, rate_y, distortion_y,
+                      IF_RTCD(&cpi->rtcd), &skippable_y, txfm_cache);
       *rate2 += *rate_y;
       *distortion += *distortion_y;
 
-      rd_inter32x32_uv_8x8(cpi, x, rate_uv, distortion_uv,
-                           cm->full_pixel, &skippable_uv);
+      rd_inter32x32_uv(cpi, x, rate_uv, distortion_uv,
+                       cm->full_pixel, &skippable_uv);
 
       *rate2 += *rate_uv;
       *distortion += *distortion_uv;
@@ -4053,8 +4027,7 @@
       }
 
       /* keep record of best compound/single-only prediction */
-      if (!disable_skip &&
-          mbmi->ref_frame != INTRA_FRAME) {
+      if (!disable_skip && mbmi->ref_frame != INTRA_FRAME) {
         int64_t single_rd, hybrid_rd;
         int single_rate, hybrid_rate;
 
@@ -4202,12 +4175,10 @@
   }
 
 end:
-  store_coding_context(x, &x->mb_context[xd->mb_index],
-    best_mode_index, &best_partition,
-    &frame_best_ref_mv[xd->mode_info_context->mbmi.ref_frame],
-    &frame_best_ref_mv[xd->mode_info_context->mbmi.second_ref_frame],
-    (int)best_pred_diff[0], (int)best_pred_diff[1], (int)best_pred_diff[2],
-    best_txfm_diff);
+  store_coding_context(x, &x->mb_context[xd->mb_index], best_mode_index,
+                       &best_partition, &frame_best_ref_mv[mbmi->ref_frame],
+                       &frame_best_ref_mv[mbmi->second_ref_frame],
+                       best_pred_diff, best_txfm_diff);
 }
 
 #if CONFIG_SUPERBLOCKS
@@ -4221,13 +4192,14 @@
   int error_y, error_uv;
   int dist_y, dist_uv;
   int y_skip, uv_skip;
+  int64_t txfm_cache[NB_TXFM_MODES];
 
   xd->mode_info_context->mbmi.txfm_size = TX_8X8;
 
+  error_y = rd_pick_intra_sby_mode(cpi, x, &rate_y, &rate_y_tokenonly,
+                                   &dist_y, &y_skip, txfm_cache);
   error_uv = rd_pick_intra_sbuv_mode(cpi, x, &rate_uv, &rate_uv_tokenonly,
                                      &dist_uv, &uv_skip);
-  error_y = rd_pick_intra_sby_mode(cpi, x, &rate_y, &rate_y_tokenonly,
-                                   &dist_y, &y_skip);
 
   if (cpi->common.mb_no_coeff_skip && y_skip && uv_skip) {
     *returnrate = rate_y + rate_uv - rate_y_tokenonly - rate_uv_tokenonly +
@@ -4408,7 +4380,7 @@
   MB_PREDICTION_MODE this_mode;
   MV_REFERENCE_FRAME ref_frame;
   unsigned char segment_id = xd->mode_info_context->mbmi.segment_id;
-  int comp_pred;
+  int comp_pred, i;
   int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES];
   int_mv frame_best_ref_mv[4];
   int frame_mdcounts[4][4];
@@ -4423,10 +4395,11 @@
   int near_sadidx[8] = { 0, 1, 2, 3, 4, 5, 6, 7 };
   int saddone = 0;
   int64_t best_rd = INT64_MAX;
-  int64_t best_comp_rd = INT64_MAX;
-  int64_t best_single_rd = INT64_MAX;
-  int64_t best_hybrid_rd = INT64_MAX;
   int64_t best_yrd = INT64_MAX;
+  int64_t best_txfm_rd[NB_TXFM_MODES];
+  int64_t best_txfm_diff[NB_TXFM_MODES];
+  int64_t best_pred_diff[NB_PREDICTION_TYPES];
+  int64_t best_pred_rd[NB_PREDICTION_TYPES];
   MB_MODE_INFO best_mbmode;
   int mode_index, best_mode_index;
   unsigned int ref_costs[MAX_REF_FRAMES];
@@ -4436,6 +4409,11 @@
   estimate_ref_frame_costs(cpi, segment_id, ref_costs);
   vpx_memset(&best_mbmode, 0, sizeof(best_mbmode));
 
+  for (i = 0; i < NB_PREDICTION_TYPES; ++i)
+    best_pred_rd[i] = INT64_MAX;
+  for (i = 0; i < NB_TXFM_MODES; i++)
+    best_txfm_rd[i] = INT64_MAX;
+
   for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ref_frame++) {
     if (cpi->ref_frame_flags & flag_list[ref_frame]) {
       setup_buffer_inter(cpi, x, idx_list[ref_frame], ref_frame,
@@ -4606,14 +4584,9 @@
     }
 #endif
 
-    if (!disable_skip && xd->mode_info_context->mbmi.ref_frame == INTRA_FRAME) {
-      if (this_rd < best_comp_rd)
-        best_comp_rd = this_rd;
-      if (this_rd < best_single_rd)
-        best_single_rd = this_rd;
-      if (this_rd < best_hybrid_rd)
-        best_hybrid_rd = this_rd;
-    }
+    if (!disable_skip && mbmi->ref_frame == INTRA_FRAME)
+      for (i = 0; i < NB_PREDICTION_TYPES; ++i)
+        best_pred_rd[i] = MIN(best_pred_rd[i], this_rd);
 
     // Did this mode help.. i.e. is it the new best mode
     if (this_rd < best_rd || x->skip) {
@@ -4673,14 +4646,28 @@
       single_rd = RDCOST(x->rdmult, x->rddiv, single_rate, distortion2);
       hybrid_rd = RDCOST(x->rdmult, x->rddiv, hybrid_rate, distortion2);
 
-      if (mbmi->second_ref_frame == INTRA_FRAME && single_rd < best_single_rd) {
-        best_single_rd = single_rd;
+      if (mbmi->second_ref_frame == INTRA_FRAME &&
+          single_rd < best_pred_rd[SINGLE_PREDICTION_ONLY]) {
+        best_pred_rd[SINGLE_PREDICTION_ONLY] = single_rd;
       } else if (mbmi->second_ref_frame != INTRA_FRAME &&
-                 single_rd < best_comp_rd) {
-        best_comp_rd = single_rd;
+                 single_rd < best_pred_rd[COMP_PREDICTION_ONLY]) {
+        best_pred_rd[COMP_PREDICTION_ONLY] = single_rd;
       }
-      if (hybrid_rd < best_hybrid_rd) {
-        best_hybrid_rd = hybrid_rd;
+      if (hybrid_rd < best_pred_rd[HYBRID_PREDICTION])
+        best_pred_rd[HYBRID_PREDICTION] = hybrid_rd;
+    }
+
+    /* keep a record of the best rd for each transform mode */
+    if (!mode_excluded && this_rd != INT64_MAX) {
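+      /* this_rd was measured with cm->txfm_mode; estimate the rd under each
+       * candidate transform mode by swapping in that mode's cached luma rd.
+       * B_PRED always codes 4x4, so its rd is left unchanged. */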
+      for (i = 0; i < NB_TXFM_MODES; i++) {
+        int64_t adj_rd;
+        if (this_mode != B_PRED) {
+          adj_rd = this_rd + txfm_cache[i] - txfm_cache[cm->txfm_mode];
+        } else {
+          adj_rd = this_rd;
+        }
+        if (adj_rd < best_txfm_rd[i])
+          best_txfm_rd[i] = adj_rd;
       }
     }
 
@@ -4719,31 +4706,40 @@
     mbmi->uv_mode = DC_PRED;
     mbmi->mb_skip_coeff = (cpi->common.mb_no_coeff_skip) ? 1 : 0;
     mbmi->partitioning = 0;
-    mbmi->txfm_size = TX_8X8;
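+    /* with per-MB transform selection enabled, a fully skipped block
+     * defaults to TX_16X16; otherwise use the frame-wide transform mode */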
+    mbmi->txfm_size = cm->txfm_mode == TX_MODE_SELECT ?
+                      TX_16X16 : cm->txfm_mode;
 
-    if (best_rd != INT64_MAX)
-      store_coding_context(x, &x->sb_context[0], best_mode_index, NULL,
-                           &frame_best_ref_mv[mbmi->ref_frame],
-                           &frame_best_ref_mv[mbmi->second_ref_frame],
-                           0, 0, 0, NULL);
-    return best_rd;
+    vpx_memset(best_txfm_diff, 0, sizeof(best_txfm_diff));
+    vpx_memset(best_pred_diff, 0, sizeof(best_pred_diff));
+    goto end;
   }
 
   // macroblock modes
   vpx_memcpy(mbmi, &best_mbmode, sizeof(MB_MODE_INFO));
-  mbmi->txfm_size = TX_8X8;
 
-  if (best_rd != INT64_MAX)
-    store_coding_context(x, &x->sb_context[0], best_mode_index, NULL,
-                         &frame_best_ref_mv[mbmi->ref_frame],
-                         &frame_best_ref_mv[mbmi->second_ref_frame],
-                         (best_single_rd == INT64_MAX) ? INT_MIN :
-                                        (best_rd - best_single_rd),
-                         (best_comp_rd   == INT64_MAX) ? INT_MIN :
-                                        (best_rd - best_comp_rd),
-                         (best_hybrid_rd == INT64_MAX) ? INT_MIN :
-                                        (best_rd - best_hybrid_rd),
-                         NULL);
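+  /* convert the per-prediction-type best rds into deltas against the
+   * overall best rd; INT_MIN marks a type for which no rd was measured */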
+  for (i = 0; i < NB_PREDICTION_TYPES; ++i) {
+    if (best_pred_rd[i] == INT64_MAX)
+      best_pred_diff[i] = INT_MIN;
+    else
+      best_pred_diff[i] = best_rd - best_pred_rd[i];
+  }
+
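+  /* likewise for the transform modes; if the skip path was taken, the
+   * deltas are simply zeroed */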
+  if (!x->skip) {
+    for (i = 0; i < NB_TXFM_MODES; i++) {
+      if (best_txfm_rd[i] == INT64_MAX)
+        best_txfm_diff[i] = INT_MIN;
+      else
+        best_txfm_diff[i] = best_rd - best_txfm_rd[i];
+    }
+  } else {
+    vpx_memset(best_txfm_diff, 0, sizeof(best_txfm_diff));
+  }
+
+ end:
+  store_coding_context(x, &x->sb_context[0], best_mode_index, NULL,
+                       &frame_best_ref_mv[mbmi->ref_frame],
+                       &frame_best_ref_mv[mbmi->second_ref_frame],
+                       best_pred_diff, best_txfm_diff);
 
   return best_rd;
 }