Adds hybrid transform

Adds ADST/DCT hybrid transform coding for Intra4x4 mode.
The ADST is applied to directions in which the boundary
pixels are used for prediction, while DCT applied to
directions without corresponding boundary prediction.

Adds enum TX_TYPE in b_mode_infor to indicate the transform
type used.

Make coding style consistent with google style.
Fixed the commented issues.

Experimental results in terms of bit-rate reduction:
derf:   0.731%
yt:     0.982%
std-hd: 0.459%
hd:     0.725%

Will be looking at 8x8 transforms next.

Change-Id: I46dbd7b80dbb3e8856e9c34fbc58cb3764a12fcf
diff --git a/configure b/configure
index 202778d..a24a1e1 100755
--- a/configure
+++ b/configure
@@ -228,6 +228,7 @@
     adaptive_entropy
     pred_filter
     lossless
+    hybridtransform
 "
 CONFIG_LIST="
     external_build
diff --git a/vp8/common/blockd.h b/vp8/common/blockd.h
index 3c60b10..758977d 100644
--- a/vp8/common/blockd.h
+++ b/vp8/common/blockd.h
@@ -124,12 +124,25 @@
 
 } TX_SIZE;
 
+#if CONFIG_HYBRIDTRANSFORM
+typedef enum {
+  DCT_DCT   = 0,                      // DCT  in both horizontal and vertical
+  ADST_DCT  = 1,                      // ADST in horizontal, DCT in vertical
+  DCT_ADST  = 2,                      // DCT  in horizontal, ADST in vertical
+  ADST_ADST = 3                       // ADST in both directions
+} TX_TYPE;
+#endif
+
 #define VP8_YMODES  (B_PRED + 1)
 #define VP8_UV_MODES (TM_PRED + 1)
 #define VP8_I8X8_MODES (TM_PRED + 1)
 
 #define VP8_MVREFS (1 + SPLITMV - NEARESTMV)
 
+#if CONFIG_HYBRIDTRANSFORM
+#define ACTIVE_HT 110                // quantization stepsize threshold
+#endif
+
 typedef enum {
   B_DC_PRED,          /* average of above and left pixels */
   B_TM_PRED,
@@ -163,6 +176,11 @@
 union b_mode_info {
   struct {
     B_PREDICTION_MODE first;
+#if CONFIG_HYBRIDTRANSFORM
+    B_PREDICTION_MODE test;
+    TX_TYPE           tx_type;
+#endif
+
 #if CONFIG_COMP_INTRA_PRED
     B_PREDICTION_MODE second;
 #endif
@@ -183,6 +201,10 @@
 
 typedef struct {
   MB_PREDICTION_MODE mode, uv_mode;
+#if CONFIG_HYBRIDTRANSFORM
+  MB_PREDICTION_MODE mode_rdopt;
+#endif
+
 #if CONFIG_COMP_INTRA_PRED
   MB_PREDICTION_MODE second_mode, second_uv_mode;
 #endif
@@ -345,6 +367,10 @@
 
   int mb_index;   // Index of the MB in the SB (0..3)
 
+#if CONFIG_HYBRIDTRANSFORM
+  int q_index;
+#endif
+
 } MACROBLOCKD;
 
 
diff --git a/vp8/common/entropy.c b/vp8/common/entropy.c
index 4c7caef..397e4ae 100644
--- a/vp8/common/entropy.c
+++ b/vp8/common/entropy.c
@@ -65,6 +65,24 @@
   9, 12, 13, 10,
   7, 11, 14, 15,
 };
+
+
+#if CONFIG_HYBRIDTRANSFORM
+DECLARE_ALIGNED(16, const int, vp8_col_scan[16]) = {
+  0, 4,  8, 12,
+  1, 5,  9, 13,
+  2, 6, 10, 14,
+  3, 7, 11, 15
+};
+DECLARE_ALIGNED(16, const int, vp8_row_scan[16]) = {
+  0,   1,  2,  3,
+  4,   5,  6,  7,
+  8,   9, 10, 11,
+  12, 13, 14, 15
+};
+#endif
+
+
 DECLARE_ALIGNED(64, cuchar, vp8_coef_bands_8x8[64]) = { 0, 1, 2, 3, 5, 4, 4, 5,
                                                         5, 3, 6, 3, 5, 4, 6, 6,
                                                         6, 5, 5, 6, 6, 6, 6, 6,
diff --git a/vp8/common/entropy.h b/vp8/common/entropy.h
index bc6f16c..aa1faca 100644
--- a/vp8/common/entropy.h
+++ b/vp8/common/entropy.h
@@ -107,6 +107,12 @@
 struct VP8Common;
 void vp8_default_coef_probs(struct VP8Common *);
 extern DECLARE_ALIGNED(16, const int, vp8_default_zig_zag1d[16]);
+
+#if CONFIG_HYBRIDTRANSFORM
+extern DECLARE_ALIGNED(16, const int, vp8_col_scan[16]);
+extern DECLARE_ALIGNED(16, const int, vp8_row_scan[16]);
+#endif
+
 extern short vp8_default_zig_zag_mask[16];
 extern DECLARE_ALIGNED(64, const int, vp8_default_zig_zag1d_8x8[64]);
 extern short vp8_default_zig_zag_mask_8x8[64];// int64_t
diff --git a/vp8/common/idct.h b/vp8/common/idct.h
index 433f577..7582a1c 100644
--- a/vp8/common/idct.h
+++ b/vp8/common/idct.h
@@ -97,6 +97,12 @@
 extern prototype_second_order(vp8_short_inv_walsh4x4_1_lossless_c);
 #endif
 
+#if CONFIG_HYBRIDTRANSFORM
+#include "vp8/common/blockd.h"
+void vp8_iht4x4llm_c(short *input, short *output, int pitch, TX_TYPE tx_type);
+#endif
+
+
 typedef prototype_idct((*vp8_idct_fn_t));
 typedef prototype_idct_scalar_add((*vp8_idct_scalar_add_fn_t));
 typedef prototype_second_order((*vp8_second_order_fn_t));
diff --git a/vp8/common/idctllm.c b/vp8/common/idctllm.c
index 684868c..dbf0fda 100644
--- a/vp8/common/idctllm.c
+++ b/vp8/common/idctllm.c
@@ -25,6 +25,9 @@
 #include "vpx_ports/config.h"
 #include "vp8/common/idct.h"
 
+#if CONFIG_HYBRIDTRANSFORM
+#include "vp8/common/blockd.h"
+#endif
 
 #include <math.h>
 
@@ -32,6 +35,130 @@
 static const int sinpi8sqrt2      = 35468;
 static const int rounding = 0;
 
+#if CONFIG_HYBRIDTRANSFORM
+float idct_4[16] = {
+  0.500000000000000,   0.653281482438188,   0.500000000000000,   0.270598050073099,
+  0.500000000000000,   0.270598050073099,  -0.500000000000000,  -0.653281482438188,
+  0.500000000000000,  -0.270598050073099,  -0.500000000000000,   0.653281482438188,
+  0.500000000000000,  -0.653281482438188,   0.500000000000000,  -0.270598050073099
+};
+
+float iadst_4[16] = {
+  0.228013428883779,   0.577350269189626,   0.656538502008139,   0.428525073124360,
+  0.428525073124360,   0.577350269189626,  -0.228013428883779,  -0.656538502008139,
+  0.577350269189626,                   0,  -0.577350269189626,   0.577350269189626,
+  0.656538502008139,  -0.577350269189626,   0.428525073124359,  -0.228013428883779
+};
+#endif
+
+#if CONFIG_HYBRIDTRANSFORM
+void vp8_iht4x4llm_c(short *input, short *output, int pitch, TX_TYPE tx_type) {
+  int i, j, k;
+  float bufa[16], bufb[16]; // buffers are for floating-point test purpose
+                            // the implementation could be simplified in conjunction with integer transform
+  short *ip = input;
+  short *op = output;
+  int shortpitch = pitch >> 1;
+
+  float *pfa = &bufa[0];
+  float *pfb = &bufb[0];
+
+  // pointers to vertical and horizontal transforms
+  float *ptv, *pth;
+
+  // load and convert residual array into floating-point
+  for(j = 0; j < 4; j++) {
+    for(i = 0; i < 4; i++) {
+      pfa[i] = (float)ip[i];
+    }
+    pfa += 4;
+    ip  += 4;
+  }
+
+  // vertical transformation
+  pfa = &bufa[0];
+  pfb = &bufb[0];
+
+  switch(tx_type) {
+    case ADST_ADST :
+    case ADST_DCT  :
+      ptv = &iadst_4[0];
+      break;
+
+    default :
+      ptv = &idct_4[0];
+      break;
+  }
+
+  for(j = 0; j < 4; j++) {
+    for(i = 0; i < 4; i++) {
+      pfb[i] = 0 ;
+      for(k = 0; k < 4; k++) {
+        pfb[i] += ptv[k] * pfa[(k<<2)];
+      }
+      pfa += 1;
+    }
+
+    pfb += 4;
+    ptv += 4;
+    pfa = &bufa[0];
+  }
+
+  // horizontal transformation
+  pfa = &bufa[0];
+  pfb = &bufb[0];
+
+  switch(tx_type) {
+    case ADST_ADST :
+    case  DCT_ADST :
+      pth = &iadst_4[0];
+      break;
+
+    default :
+      pth = &idct_4[0];
+      break;
+  }
+
+  for(j = 0; j < 4; j++) {
+    for(i = 0; i < 4; i++) {
+      pfa[i] = 0;
+      for(k = 0; k < 4; k++) {
+        pfa[i] += pfb[k] * pth[k];
+      }
+      pth += 4;
+     }
+
+    pfa += 4;
+    pfb += 4;
+
+    switch(tx_type) {
+      case ADST_ADST :
+      case  DCT_ADST :
+        pth = &iadst_4[0];
+        break;
+
+      default :
+        pth = &idct_4[0];
+        break;
+    }
+  }
+
+  // convert to short integer format and load BLOCKD buffer
+  op  = output;
+  pfa = &bufa[0];
+
+  for(j = 0; j < 4; j++) {
+    for(i = 0; i < 4; i++) {
+      op[i] = (pfa[i] > 0 ) ? (short)( pfa[i] / 8 + 0.49) :
+                             -(short)( - pfa[i] / 8 + 0.49);
+    }
+    op  += shortpitch;
+    pfa += 4;
+  }
+}
+#endif
+
+
 void vp8_short_idct4x4llm_c(short *input, short *output, int pitch) {
   int i;
   int a1, b1, c1, d1;
diff --git a/vp8/common/invtrans.c b/vp8/common/invtrans.c
index aea4536..d350716 100644
--- a/vp8/common/invtrans.c
+++ b/vp8/common/invtrans.c
@@ -31,6 +31,11 @@
 
 }
 
+#if CONFIG_HYBRIDTRANSFORM
+void vp8_inverse_htransform_b(const vp8_idct_rtcd_vtable_t *rtcd, BLOCKD *b, int pitch) {
+  vp8_iht4x4llm_c(b->dqcoeff, b->diff, pitch, b->bmi.as_mode.tx_type);
+}
+#endif
 
 void vp8_inverse_transform_b(const vp8_idct_rtcd_vtable_t *rtcd, BLOCKD *b, int pitch) {
   if (b->eob <= 1)
diff --git a/vp8/common/invtrans.h b/vp8/common/invtrans.h
index 4c4f0d3..1eda173 100644
--- a/vp8/common/invtrans.h
+++ b/vp8/common/invtrans.h
@@ -15,6 +15,11 @@
 #include "vpx_ports/config.h"
 #include "idct.h"
 #include "blockd.h"
+
+#if CONFIG_HYBRIDTRANSFORM
+extern void vp8_inverse_htransform_b(const vp8_idct_rtcd_vtable_t *rtcd, BLOCKD *b, int pitch);
+#endif
+
 extern void vp8_inverse_transform_b(const vp8_idct_rtcd_vtable_t *rtcd, BLOCKD *b, int pitch);
 extern void vp8_inverse_transform_mb(const vp8_idct_rtcd_vtable_t *rtcd, MACROBLOCKD *x);
 extern void vp8_inverse_transform_mby(const vp8_idct_rtcd_vtable_t *rtcd, MACROBLOCKD *x);
diff --git a/vp8/decoder/decodframe.c b/vp8/decoder/decodframe.c
index 9eb42cc..d0e43ca 100644
--- a/vp8/decoder/decodframe.c
+++ b/vp8/decoder/decodframe.c
@@ -128,6 +128,11 @@
     xd->block[i].dequant = pc->Y1dequant[QIndex];
   }
 
+#if CONFIG_HYBRIDTRANSFORM
+  xd->q_index = QIndex;
+#endif
+
+
 #if CONFIG_LOSSLESS
   if (!QIndex) {
     pbi->common.rtcd.idct.idct1        = vp8_short_inv_walsh4x4_1_x8_c;
@@ -208,6 +213,11 @@
   int i;
   int tx_type;
 
+#if CONFIG_HYBRIDTRANSFORM
+  int QIndex = xd->q_index;
+  int active_ht = (QIndex < ACTIVE_HT);
+#endif
+
   if (pbi->common.frame_type == KEY_FRAME) {
     if (pbi->common.txfm_mode == ALLOW_8X8 &&
         xd->mode_info_context->mbmi.mode != I8X8_PRED &&
@@ -281,6 +291,39 @@
   if (xd->segmentation_enabled)
     mb_init_dequantizer(pbi, xd);
 
+#if CONFIG_HYBRIDTRANSFORM
+  // parse transform types for intra 4x4 mode
+  if (mode == B_PRED) {
+    for (i = 0; i < 16; i++) {
+      BLOCKD *b = &xd->block[i];
+      int b_mode = xd->mode_info_context->bmi[i].as_mode.first;
+      if(active_ht) {
+        switch(b_mode) {
+          case B_TM_PRED :
+          case B_RD_PRED :
+            b->bmi.as_mode.tx_type = ADST_ADST;
+            break;
+
+          case B_VE_PRED :
+          case B_VR_PRED :
+            b->bmi.as_mode.tx_type = ADST_DCT;
+            break ;
+
+          case B_HE_PRED :
+          case B_HD_PRED :
+          case B_HU_PRED :
+            b->bmi.as_mode.tx_type = DCT_ADST;
+            break;
+
+          default :
+            b->bmi.as_mode.tx_type = DCT_DCT;
+            break;
+        }
+      }
+    } // loop over 4x4 blocks
+  }
+#endif
+
   /* do prediction */
   if (xd->mode_info_context->mbmi.ref_frame == INTRA_FRAME) {
     if (mode != I8X8_PRED) {
@@ -360,16 +403,29 @@
       }
 #endif
 
-      if (xd->eobs[i] > 1) {
-        DEQUANT_INVOKE(&pbi->dequant, idct_add)
-        (b->qcoeff, b->dequant,  b->predictor,
-         *(b->base_dst) + b->dst, 16, b->dst_stride);
-      } else {
-        IDCT_INVOKE(RTCD_VTABLE(idct), idct1_scalar_add)
-        (b->qcoeff[0] * b->dequant[0], b->predictor,
-         *(b->base_dst) + b->dst, 16, b->dst_stride);
-        ((int *)b->qcoeff)[0] = 0;
+#if CONFIG_HYBRIDTRANSFORM
+      if(active_ht)
+        vp8_ht_dequant_idct_add_c( (TX_TYPE)b->bmi.as_mode.tx_type, b->qcoeff,
+                                   b->dequant, b->predictor,
+                                   *(b->base_dst) + b->dst, 16, b->dst_stride);
+      else
+        vp8_dequant_idct_add_c(b->qcoeff, b->dequant, b->predictor,
+                               *(b->base_dst) + b->dst, 16, b->dst_stride);
+#else
+      if (xd->eobs[i] > 1)
+      {
+          DEQUANT_INVOKE(&pbi->dequant, idct_add)
+              (b->qcoeff, b->dequant,  b->predictor,
+              *(b->base_dst) + b->dst, 16, b->dst_stride);
       }
+      else
+      {
+          IDCT_INVOKE(RTCD_VTABLE(idct), idct1_scalar_add)
+              (b->qcoeff[0] * b->dequant[0], b->predictor,
+              *(b->base_dst) + b->dst, 16, b->dst_stride);
+          ((int *)b->qcoeff)[0] = 0;
+      }
+#endif
     }
   } else if (mode == SPLITMV) {
     DEQUANT_INVOKE(&pbi->dequant, idct_add_y_block)
@@ -378,8 +434,6 @@
      xd->dst.y_stride, xd->eobs);
   } else {
     BLOCKD *b = &xd->block[24];
-
-
     if (tx_type == TX_8X8) {
       DEQUANT_INVOKE(&pbi->dequant, block_2x2)(b);
 #ifdef DEC_DEBUG
diff --git a/vp8/decoder/dequantize.c b/vp8/decoder/dequantize.c
index 73859b0..3669cc2 100644
--- a/vp8/decoder/dequantize.c
+++ b/vp8/decoder/dequantize.c
@@ -41,6 +41,44 @@
   }
 }
 
+
+#if CONFIG_HYBRIDTRANSFORM
+void vp8_ht_dequant_idct_add_c(TX_TYPE tx_type, short *input, short *dq,
+                               unsigned char *pred, unsigned char *dest,
+                               int pitch, int stride) {
+  short output[16];
+  short *diff_ptr = output;
+  int r, c;
+  int i;
+
+  for (i = 0; i < 16; i++) {
+    input[i] = dq[i] * input[i];
+  }
+
+  vp8_iht4x4llm_c( input, output, 4 << 1, tx_type );
+
+  vpx_memset(input, 0, 32);
+
+  for (r = 0; r < 4; r++) {
+      for (c = 0; c < 4; c++) {
+        int a = diff_ptr[c] + pred[c];
+
+        if (a < 0)
+            a = 0;
+
+        if (a > 255)
+            a = 255;
+
+        dest[c] = (unsigned char) a;
+    }
+
+      dest += stride;
+      diff_ptr += 4;
+      pred += pitch;
+  }
+}
+#endif
+
 void vp8_dequant_idct_add_c(short *input, short *dq, unsigned char *pred,
                             unsigned char *dest, int pitch, int stride) {
   short output[16];
diff --git a/vp8/decoder/dequantize.h b/vp8/decoder/dequantize.h
index 2582d86..76418ff 100644
--- a/vp8/decoder/dequantize.h
+++ b/vp8/decoder/dequantize.h
@@ -76,6 +76,17 @@
 #endif
 extern prototype_dequant_idct_add(vp8_dequant_idct_add);
 
+#if CONFIG_HYBRIDTRANSFORM
+// declare dequantization and inverse transform module of hybrid transform decoder
+#ifndef vp8_ht_dequant_idct_add
+#define vp8_ht_dequant_idct_add vp8_ht_dequant_idct_add_c
+#endif
+extern void vp8_ht_dequant_idct_add(TX_TYPE tx_type, short *input, short *dq,
+                                    unsigned char *pred, unsigned char *dest,
+                                    int pitch, int stride);
+
+#endif
+
 #ifndef vp8_dequant_dc_idct_add
 #define vp8_dequant_dc_idct_add vp8_dequant_dc_idct_add_c
 #endif
diff --git a/vp8/decoder/detokenize.c b/vp8/decoder/detokenize.c
index 7f3469e..a113087 100644
--- a/vp8/decoder/detokenize.c
+++ b/vp8/decoder/detokenize.c
@@ -119,6 +119,53 @@
   else return DCT_VAL_CATEGORY6;
 }
 
+#if CONFIG_HYBRIDTRANSFORM
+void static count_tokens_adaptive_scan(const MACROBLOCKD *xd, INT16 *qcoeff_ptr,
+                                       int block, int type, ENTROPY_CONTEXT *a,
+                                       ENTROPY_CONTEXT *l, int eob, int seg_eob,
+                                       FRAME_CONTEXT *fc) {
+  int c, pt, token, band;
+  const int *scan;
+
+  int QIndex = xd->q_index;
+  int active_ht = (QIndex < ACTIVE_HT) &&
+                  (xd->mode_info_context->mbmi.mode == B_PRED);
+
+  if(active_ht) {
+    switch(xd->block[block].bmi.as_mode.tx_type) {
+      case ADST_DCT :
+        scan = vp8_row_scan;
+        break;
+
+      case DCT_ADST :
+        scan = vp8_col_scan;
+        break;
+
+      default :
+        scan = vp8_default_zig_zag1d;
+        break;
+    }
+  } else {
+    scan = vp8_default_zig_zag1d;
+  }
+
+  VP8_COMBINEENTROPYCONTEXTS(pt, *a, *l);
+  for (c = !type; c < eob; ++c) {
+    int rc = scan[c];
+    int v = qcoeff_ptr[rc];
+    band = vp8_coef_bands[c];
+    token = get_token(v);
+    fc->coef_counts[type][band][pt][token]++;
+    pt = vp8_prev_token_class[token];
+  }
+
+  if (eob < seg_eob) {
+    band = vp8_coef_bands[c];
+    fc->coef_counts[type][band][pt][DCT_EOB_TOKEN]++;
+  }
+}
+#endif
+
 void static count_tokens(INT16 *qcoeff_ptr, int block, int type,
                          ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l,
                          int eob, int seg_eob, FRAME_CONTEXT *const fc) {
@@ -289,8 +336,14 @@
     WRITE_COEF_CONTINUE(val);
   }
 #if CONFIG_ADAPTIVE_ENTROPY
+
   if (block_type == TX_4X4)
+#if CONFIG_HYBRIDTRANSFORM
+    count_tokens_adaptive_scan(xd, qcoeff_ptr, i, type, a, l, c, seg_eob, fc);
+#else
     count_tokens(qcoeff_ptr, i, type, a, l, c, seg_eob, fc);
+#endif
+
   else
     count_tokens_8x8(qcoeff_ptr, i, type, a, l, c, seg_eob, fc);
 #endif
@@ -351,12 +404,21 @@
   return eobtotal;
 }
 
+
 int vp8_decode_mb_tokens(VP8D_COMP *dx, MACROBLOCKD *xd) {
   ENTROPY_CONTEXT *const A = (ENTROPY_CONTEXT *)xd->above_context;
   ENTROPY_CONTEXT *const L = (ENTROPY_CONTEXT *)xd->left_context;
 
   char *const eobs = xd->eobs;
+#if CONFIG_HYBRIDTRANSFORM
+  const int *scan = vp8_default_zig_zag1d;
+  int QIndex = xd->q_index;
+  int active_ht = (QIndex < ACTIVE_HT) &&
+                  (xd->mode_info_context->mbmi.mode == B_PRED);
+#else
   const int *const scan = vp8_default_zig_zag1d;
+#endif
+
   int c, i, type, eobtotal = 0, seg_eob = 16;
   INT16 *qcoeff_ptr = &xd->qcoeff[0];
 
@@ -388,6 +450,41 @@
     if (i == 16)
       type = PLANE_TYPE_UV;
 
+#if CONFIG_HYBRIDTRANSFORM
+    if (type == PLANE_TYPE_Y_WITH_DC &&
+        xd->mode_info_context->mbmi.mode == B_PRED &&
+        active_ht) {
+      BLOCKD *b = &xd->block[i];
+      switch(b->bmi.as_mode.first) {
+        case B_TM_PRED :
+        case B_RD_PRED :
+          b->bmi.as_mode.tx_type = ADST_ADST;
+          scan = vp8_default_zig_zag1d;
+          break;
+
+        case B_VE_PRED :
+        case B_VR_PRED :
+          b->bmi.as_mode.tx_type = ADST_DCT;
+          scan = vp8_row_scan;
+          break ;
+
+        case B_HE_PRED :
+        case B_HD_PRED :
+        case B_HU_PRED :
+          b->bmi.as_mode.tx_type = DCT_ADST;
+          scan = vp8_col_scan;
+          break;
+
+        default :
+          b->bmi.as_mode.tx_type = DCT_DCT;
+          scan = vp8_default_zig_zag1d;
+          break;
+      }
+    }
+    if (type == PLANE_TYPE_UV) {
+      scan = vp8_default_zig_zag1d;
+    }
+#endif
     c = vp8_decode_coefs(dx, xd, a, l, type, seg_eob, qcoeff_ptr,
                          i, scan, TX_4X4, coef_bands_x);
     a[0] = l[0] = ((eobs[i] = c) != !type);
diff --git a/vp8/encoder/dct.c b/vp8/encoder/dct.c
index fbe22eb..c3a938b 100644
--- a/vp8/encoder/dct.c
+++ b/vp8/encoder/dct.c
@@ -13,6 +13,28 @@
 #include "vpx_ports/config.h"
 #include "vp8/common/idct.h"
 
+#if CONFIG_HYBRIDTRANSFORM
+
+#include "vp8/common/blockd.h"
+
+float dct_4[16] = {
+  0.500000000000000,  0.500000000000000,  0.500000000000000,  0.500000000000000,
+  0.653281482438188,  0.270598050073099, -0.270598050073099, -0.653281482438188,
+  0.500000000000000, -0.500000000000000, -0.500000000000000,  0.500000000000000,
+  0.270598050073099, -0.653281482438188,  0.653281482438188, -0.270598050073099
+};
+
+float adst_4[16] = {
+  0.228013428883779,  0.428525073124360,  0.577350269189626,  0.656538502008139,
+  0.577350269189626,  0.577350269189626,  0.000000000000000, -0.577350269189626,
+  0.656538502008139, -0.228013428883779, -0.577350269189626,  0.428525073124359,
+  0.428525073124360, -0.656538502008139,  0.577350269189626, -0.228013428883779
+};
+#endif
+
+
+#if CONFIG_INT_8X8FDCT
+
 static const int xC1S7 = 16069;
 static const int xC2S6 = 15137;
 static const int xC3S5 = 13623;
@@ -268,6 +290,112 @@
 
 }
 
+#if CONFIG_HYBRIDTRANSFORM
+void vp8_fht4x4_c(short *input, short *output, int pitch, TX_TYPE tx_type) {
+  int i, j, k;
+  float bufa[16], bufb[16]; // buffers are for floating-point test purpose
+                             // the implementation could be simplified in
+                             // conjunction with integer transform
+  short *ip = input;
+  short *op = output;
+
+  float *pfa = &bufa[0];
+  float *pfb = &bufb[0];
+
+  // pointers to vertical and horizontal transforms
+  float *ptv, *pth;
+
+  // load and convert residual array into floating-point
+  for(j = 0; j < 4; j++) {
+    for(i = 0; i < 4; i++) {
+      pfa[i] = (float)ip[i];
+    }
+    pfa += 4;
+    ip  += pitch / 2;
+  }
+
+  // vertical transformation
+  pfa = &bufa[0];
+  pfb = &bufb[0];
+
+  switch(tx_type) {
+    case ADST_ADST :
+    case ADST_DCT  :
+      ptv = &adst_4[0];
+      break;
+
+    default :
+      ptv = &dct_4[0];
+      break;
+  }
+
+  for(j = 0; j < 4; j++) {
+    for(i = 0; i < 4; i++) {
+      pfb[i] = 0;
+      for(k = 0; k < 4; k++) {
+        pfb[i] += ptv[k] * pfa[(k<<2)];
+      }
+      pfa += 1;
+    }
+    pfb += 4;
+    ptv += 4;
+    pfa = &bufa[0];
+  }
+
+  // horizontal transformation
+  pfa = &bufa[0];
+  pfb = &bufb[0];
+
+  switch(tx_type) {
+    case ADST_ADST :
+    case  DCT_ADST :
+      pth = &adst_4[0];
+      break;
+
+    default :
+      pth = &dct_4[0];
+      break;
+  }
+
+  for(j = 0; j < 4; j++) {
+    for(i = 0; i < 4; i++) {
+      pfa[i] = 0;
+      for(k = 0; k < 4; k++) {
+        pfa[i] += pfb[k] * pth[k];
+      }
+      pth += 4;
+     }
+
+    pfa += 4;
+    pfb += 4;
+
+    switch(tx_type) {
+      case ADST_ADST :
+      case  DCT_ADST :
+        pth = &adst_4[0];
+        break;
+
+      default :
+        pth = &dct_4[0];
+        break;
+    }
+  }
+
+  // convert to short integer format and load BLOCKD buffer
+  op  = output ;
+  pfa = &bufa[0] ;
+
+  for(j = 0; j < 4; j++) {
+    for(i = 0; i < 4; i++) {
+      op[i] = (pfa[i] > 0 ) ? (short)( 8 * pfa[i] + 0.49) :
+                                   -(short)(- 8 * pfa[i] + 0.49);
+    }
+    op  += 4;
+    pfa += 4;
+  }
+}
+#endif
+
 void vp8_short_fdct4x4_c(short *input, short *output, int pitch) {
   int i;
   int a1, b1, c1, d1;
@@ -309,9 +437,18 @@
   }
 }
 
-void vp8_short_fdct8x4_c(short *input, short *output, int pitch) {
-  vp8_short_fdct4x4_c(input,   output,    pitch);
-  vp8_short_fdct4x4_c(input + 4, output + 16, pitch);
+#if CONFIG_HYBRIDTRANSFORM
+void vp8_fht8x4_c(short *input, short *output, int pitch,
+                  TX_TYPE tx_type) {
+  vp8_fht4x4_c(input,     output,      pitch, tx_type);
+  vp8_fht4x4_c(input + 4, output + 16, pitch, tx_type);
+}
+#endif
+
+void vp8_short_fdct8x4_c(short *input, short *output, int pitch)
+{
+    vp8_short_fdct4x4_c(input,   output,    pitch);
+    vp8_short_fdct4x4_c(input + 4, output + 16, pitch);
 }
 
 void vp8_short_walsh4x4_c(short *input, short *output, int pitch) {
diff --git a/vp8/encoder/dct.h b/vp8/encoder/dct.h
index 6d2b736..ac7769d 100644
--- a/vp8/encoder/dct.h
+++ b/vp8/encoder/dct.h
@@ -23,6 +23,10 @@
 #endif
 
 
+#if CONFIG_HYBRIDTRANSFORM
+void vp8_fht4x4_c(short *input, short *output, int pitch, TX_TYPE tx_type);
+void vp8_fht8x4_c(short *input, short *output, int pitch, TX_TYPE tx_type);
+#endif
 
 #ifndef vp8_fdct_short8x8
 #define vp8_fdct_short8x8  vp8_short_fdct8x8_c
diff --git a/vp8/encoder/encodeintra.c b/vp8/encoder/encodeintra.c
index 7d14532..46b352e 100644
--- a/vp8/encoder/encodeintra.c
+++ b/vp8/encoder/encodeintra.c
@@ -32,8 +32,11 @@
 #define IF_RTCD(x) NULL
 #endif
 
-int vp8_encode_intra(VP8_COMP *cpi, MACROBLOCK *x, int use_16x16_pred) {
+#if CONFIG_HYBRIDTRANSFORM
+extern void vp8_ht_quantize_b(BLOCK *b, BLOCKD *d);
+#endif
 
+int vp8_encode_intra(VP8_COMP *cpi, MACROBLOCK *x, int use_16x16_pred) {
   int i;
   int intra_pred_var = 0;
   (void) cpi;
@@ -64,6 +67,12 @@
   BLOCKD *b = &x->e_mbd.block[ib];
   BLOCK *be = &x->block[ib];
 
+#if CONFIG_HYBRIDTRANSFORM
+    int QIndex = x->q_index;
+    int active_ht = (QIndex < ACTIVE_HT);
+#endif
+
+
 #if CONFIG_COMP_INTRA_PRED
   if (b->bmi.as_mode.second == (B_PREDICTION_MODE)(B_DC_PRED - 1)) {
 #endif
@@ -78,11 +87,45 @@
 
   ENCODEMB_INVOKE(&rtcd->encodemb, subb)(be, b, 16);
 
-  x->vp8_short_fdct4x4(be->src_diff, be->coeff, 32);
+#if CONFIG_HYBRIDTRANSFORM
+    if(active_ht) {
+      b->bmi.as_mode.test = b->bmi.as_mode.first;
+      switch(b->bmi.as_mode.first) {
+        // case B_DC_PRED :
+        case B_TM_PRED :
+        case B_RD_PRED :
+          b->bmi.as_mode.tx_type = ADST_ADST;
+          break;
 
-  x->quantize_b(be, b);
+        case B_VE_PRED :
+        case B_VR_PRED :
+          b->bmi.as_mode.tx_type = ADST_DCT;
+          break;
 
-  vp8_inverse_transform_b(IF_RTCD(&rtcd->common->idct), b, 32);
+        case B_HE_PRED :
+        case B_HD_PRED :
+        case B_HU_PRED :
+          b->bmi.as_mode.tx_type = DCT_ADST;
+          break;
+
+        default :
+          b->bmi.as_mode.tx_type = DCT_DCT;
+          break;
+      }
+
+      vp8_fht4x4_c(be->src_diff, be->coeff, 32, b->bmi.as_mode.tx_type);
+      vp8_ht_quantize_b(be, b);
+      vp8_inverse_htransform_b(IF_RTCD(&rtcd->common->idct), b, 32) ;
+    } else {
+      x->vp8_short_fdct4x4(be->src_diff, be->coeff, 32) ;
+      x->quantize_b(be, b) ;
+      vp8_inverse_transform_b(IF_RTCD(&rtcd->common->idct), b, 32) ;
+    }
+#else
+    x->vp8_short_fdct4x4(be->src_diff, be->coeff, 32);
+    x->quantize_b(be, b);
+    vp8_inverse_transform_b(IF_RTCD(&rtcd->common->idct), b, 32);
+#endif
 
   RECON_INVOKE(&rtcd->common->recon, recon)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
 }
@@ -273,7 +316,6 @@
     RECON_INVOKE(&rtcd->common->recon, recon)(b->predictor,
                                               b->diff, *(b->base_dst) + b->dst, b->dst_stride);
   }
-
 }
 
 extern const int vp8_i8x8_block[4];
diff --git a/vp8/encoder/quantize.c b/vp8/encoder/quantize.c
index 821507c..78892fc 100644
--- a/vp8/encoder/quantize.c
+++ b/vp8/encoder/quantize.c
@@ -22,6 +22,72 @@
 extern int enc_debug;
 #endif
 
+#if CONFIG_HYBRIDTRANSFORM
+void vp8_ht_quantize_b(BLOCK *b, BLOCKD *d) {
+  int i, rc, eob;
+  int zbin;
+  int x, y, z, sz;
+  short *zbin_boost_ptr  = b->zrun_zbin_boost;
+  short *coeff_ptr       = b->coeff;
+  short *zbin_ptr        = b->zbin;
+  short *round_ptr       = b->round;
+  short *quant_ptr       = b->quant;
+  unsigned char *quant_shift_ptr = b->quant_shift;
+  short *qcoeff_ptr      = d->qcoeff;
+  short *dqcoeff_ptr     = d->dqcoeff;
+  short *dequant_ptr     = d->dequant;
+  short zbin_oq_value    = b->zbin_extra;
+
+  int const *pt_scan ;
+
+  switch(d->bmi.as_mode.tx_type) {
+    case ADST_DCT :
+      pt_scan = vp8_row_scan;
+      break;
+
+    case DCT_ADST :
+      pt_scan = vp8_col_scan;
+      break;
+
+    default :
+      pt_scan = vp8_default_zig_zag1d;
+      break;
+  }
+
+  vpx_memset(qcoeff_ptr, 0, 32);
+  vpx_memset(dqcoeff_ptr, 0, 32);
+
+  eob = -1;
+
+  for (i = 0; i < b->eob_max_offset; i++) {
+    rc   = pt_scan[i];
+    z    = coeff_ptr[rc];
+
+    zbin = zbin_ptr[rc] + *zbin_boost_ptr + zbin_oq_value;
+    zbin_boost_ptr ++;
+
+    sz = (z >> 31);                                 // sign of z
+    x  = (z ^ sz) - sz;                             // x = abs(z)
+
+    if (x >= zbin) {
+      x += round_ptr[rc];
+      y  = (((x * quant_ptr[rc]) >> 16) + x)
+           >> quant_shift_ptr[rc];                // quantize (x)
+      x  = (y ^ sz) - sz;                         // get the sign back
+      qcoeff_ptr[rc]  = x;                        // write to destination
+      dqcoeff_ptr[rc] = x * dequant_ptr[rc];      // dequantized value
+
+      if (y) {
+        eob = i;                                // last nonzero coeffs
+        zbin_boost_ptr = b->zrun_zbin_boost;    // reset zero runlength
+      }
+    }
+  }
+
+  d->eob = eob + 1;
+}
+#endif
+
 void vp8_regular_quantize_b(BLOCK *b, BLOCKD *d) {
   int i, rc, eob;
   int zbin;
@@ -47,13 +113,14 @@
     z    = coeff_ptr[rc];
 
     zbin = zbin_ptr[rc] + *zbin_boost_ptr + zbin_oq_value;
-    zbin_boost_ptr++;
+    zbin_boost_ptr ++;
 
     sz = (z >> 31);                                 // sign of z
     x  = (z ^ sz) - sz;                             // x = abs(z)
 
     if (x >= zbin) {
       x += round_ptr[rc];
+
       y  = (((x * quant_ptr[rc]) >> 16) + x)
            >> quant_shift_ptr[rc];                // quantize (x)
       x  = (y ^ sz) - sz;                         // get the sign back
diff --git a/vp8/encoder/rdopt.c b/vp8/encoder/rdopt.c
index 2c04b54..7616636 100644
--- a/vp8/encoder/rdopt.c
+++ b/vp8/encoder/rdopt.c
@@ -50,6 +50,10 @@
 extern void vp8cx_mb_init_quantizer(VP8_COMP *cpi, MACROBLOCK *x);
 extern void vp8_update_zbin_extra(VP8_COMP *cpi, MACROBLOCK *x);
 
+#if CONFIG_HYBRIDTRANSFORM
+extern void vp8_ht_quantize_b(BLOCK *b, BLOCKD *d);
+#endif
+
 #if CONFIG_HIGH_PRECISION_MV
 #define XMVCOST (x->e_mbd.allow_high_precision_mv?x->mvcost_hp:x->mvcost)
 #else
@@ -545,9 +549,38 @@
   int cost = 0;
   short *qcoeff_ptr = b->qcoeff;
 
-  VP8_COMBINEENTROPYCONTEXTS(pt, *a, *l);
+#if CONFIG_HYBRIDTRANSFORM
+  int QIndex = mb->q_index;
+  int active_ht = (QIndex < ACTIVE_HT) &&
+                (mb->e_mbd.mode_info_context->mbmi.mode_rdopt == B_PRED);
 
-# define QC( I)  ( qcoeff_ptr [vp8_default_zig_zag1d[I]] )
+  int const *pt_scan;
+
+  if((type == PLANE_TYPE_Y_WITH_DC) && active_ht) {
+    switch (b->bmi.as_mode.tx_type) {
+      case ADST_DCT :
+        pt_scan = vp8_row_scan;
+        break;
+
+      case DCT_ADST :
+        pt_scan = vp8_col_scan;
+        break;
+
+      default :
+        pt_scan = vp8_default_zig_zag1d;
+        break;
+    }
+
+  } else {
+    pt_scan = vp8_default_zig_zag1d;
+  }
+
+#define  QC(I)  ( qcoeff_ptr [pt_scan[I]] )
+#else
+#define QC(I)  ( qcoeff_ptr [vp8_default_zig_zag1d[I]] )
+#endif
+
+    VP8_COMBINEENTROPYCONTEXTS(pt, *a, *l);
 
   for (; c < eob; c++) {
     int v = QC(c);
@@ -804,11 +837,17 @@
   int *bmode_costs,
   ENTROPY_CONTEXT *a,
   ENTROPY_CONTEXT *l,
-
   int *bestrate,
   int *bestratey,
   int *bestdistortion) {
   B_PREDICTION_MODE mode;
+
+#if CONFIG_HYBRIDTRANSFORM
+  int QIndex = x->q_index;
+  int active_ht = (QIndex < ACTIVE_HT);
+  TX_TYPE best_tx_type;
+#endif
+
 #if CONFIG_COMP_INTRA_PRED
   B_PREDICTION_MODE mode2;
 #endif
@@ -828,7 +867,8 @@
 
   for (mode = B_DC_PRED; mode <= B_HU_PRED; mode++) {
 #if CONFIG_COMP_INTRA_PRED
-    for (mode2 = (allow_comp ? 0 : (B_DC_PRED - 1)); mode2 != (allow_comp ? (mode + 1) : 0); mode2++) {
+    for (mode2 = (allow_comp ? 0 : (B_DC_PRED - 1));
+                   mode2 != (allow_comp ? (mode + 1) : 0); mode2++) {
 #endif
       int this_rd;
       int ratey;
@@ -853,43 +893,95 @@
       }
 #endif
       ENCODEMB_INVOKE(IF_RTCD(&cpi->rtcd.encodemb), subb)(be, b, 16);
-      x->vp8_short_fdct4x4(be->src_diff, be->coeff, 32);
-      x->quantize_b(be, b);
 
-      tempa = ta;
-      templ = tl;
+#if CONFIG_HYBRIDTRANSFORM
+      if(active_ht) {
+        b->bmi.as_mode.test = mode;
+        switch(mode) {
+          // case B_DC_PRED :
+          case B_TM_PRED :
+          case B_RD_PRED :
+            b->bmi.as_mode.tx_type = ADST_ADST;
+            break;
 
-      ratey = cost_coeffs(x, b, PLANE_TYPE_Y_WITH_DC, &tempa, &templ);
-      rate += ratey;
-      distortion = ENCODEMB_INVOKE(IF_RTCD(&cpi->rtcd.encodemb), berr)(
-                     be->coeff, b->dqcoeff) >> 2;
+          case B_VE_PRED :
+          case B_VR_PRED :
+            b->bmi.as_mode.tx_type = ADST_DCT;
+            break;
 
-      this_rd = RDCOST(x->rdmult, x->rddiv, rate, distortion);
+          case B_HE_PRED :
+          case B_HD_PRED :
+          case B_HU_PRED :
+            b->bmi.as_mode.tx_type = DCT_ADST;
+            break;
 
-      if (this_rd < best_rd) {
-        *bestrate = rate;
-        *bestratey = ratey;
-        *bestdistortion = distortion;
-        best_rd = this_rd;
-        *best_mode = mode;
-#if CONFIG_COMP_INTRA_PRED
-        *best_second_mode = mode2;
-#endif
-        *a = tempa;
-        *l = templ;
-        copy_predictor(best_predictor, b->predictor);
-        vpx_memcpy(best_dqcoeff, b->dqcoeff, 32);
-#if CONFIG_COMP_INTRA_PRED
+          default :
+            b->bmi.as_mode.tx_type = DCT_DCT;
+            break;
+        }
+
+        vp8_fht4x4_c(be->src_diff, be->coeff, 32, b->bmi.as_mode.tx_type);
+        vp8_ht_quantize_b(be, b);
+      } else {
+        x->vp8_short_fdct4x4(be->src_diff, be->coeff, 32);
+        x->quantize_b(be, b);
       }
+#else
+        x->vp8_short_fdct4x4(be->src_diff, be->coeff, 32);
+        x->quantize_b(be, b);
 #endif
+
+        tempa = ta;
+        templ = tl;
+
+        ratey = cost_coeffs(x, b, PLANE_TYPE_Y_WITH_DC, &tempa, &templ);
+        rate += ratey;
+        distortion = ENCODEMB_INVOKE(IF_RTCD(&cpi->rtcd.encodemb), berr)(
+            be->coeff, b->dqcoeff) >> 2;
+
+        this_rd = RDCOST(x->rdmult, x->rddiv, rate, distortion);
+
+        if (this_rd < best_rd) {
+          *bestrate = rate;
+          *bestratey = ratey;
+          *bestdistortion = distortion;
+          best_rd = this_rd;
+          *best_mode = mode;
+#if CONFIG_HYBRIDTRANSFORM
+          best_tx_type = b->bmi.as_mode.tx_type ;
+#endif
+
+#if CONFIG_COMP_INTRA_PRED
+          *best_second_mode = mode2;
+#endif
+          *a = tempa;
+          *l = templ;
+          copy_predictor(best_predictor, b->predictor);
+          vpx_memcpy(best_dqcoeff, b->dqcoeff, 32);
+        }
+#if CONFIG_COMP_INTRA_PRED
     }
+#endif
   }
   b->bmi.as_mode.first = (B_PREDICTION_MODE)(*best_mode);
 #if CONFIG_COMP_INTRA_PRED
   b->bmi.as_mode.second = (B_PREDICTION_MODE)(*best_second_mode);
 #endif
 
+#if CONFIG_HYBRIDTRANSFORM
+  b->bmi.as_mode.tx_type = best_tx_type;
+
+  // inverse transform
+  if(active_ht) {
+    vp8_iht4x4llm_c(best_dqcoeff, b->diff, 32, b->bmi.as_mode.tx_type );
+  } else {
+    IDCT_INVOKE(IF_RTCD(&cpi->rtcd.common->idct), idct16)(best_dqcoeff,
+                                                                b->diff, 32);
+  }
+#else
   IDCT_INVOKE(IF_RTCD(&cpi->rtcd.common->idct), idct16)(best_dqcoeff, b->diff, 32);
+#endif
+
   RECON_INVOKE(IF_RTCD(&cpi->rtcd.common->recon), recon)(best_predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
 
   return best_rd;
@@ -1043,6 +1135,8 @@
 #endif
   return best_rd;
 }
+
+
 static int rd_pick_intra8x8block(
   VP8_COMP *cpi,
   MACROBLOCK *x,
@@ -2739,6 +2833,12 @@
     xd->mode_info_context->mbmi.pred_filter_enabled = 0;
 #endif
 
+    // current coding mode under rate-distortion optimization test loop
+#if CONFIG_HYBRIDTRANSFORM
+    xd->mode_info_context->mbmi.mode_rdopt = this_mode;
+#endif
+
+
 #if CONFIG_COMP_INTRA_PRED
     xd->mode_info_context->mbmi.second_mode = (MB_PREDICTION_MODE)(DC_PRED - 1);
     xd->mode_info_context->mbmi.second_uv_mode = (MB_PREDICTION_MODE)(DC_PRED - 1);
@@ -3646,36 +3746,45 @@
   MACROBLOCKD *xd = &x->e_mbd;
   int error4x4, error16x16;
 #if CONFIG_COMP_INTRA_PRED
-  int error4x4d, rate4x4d, dist4x4d;
+    int error4x4d, rate4x4d, dist4x4d;
 #endif
-  int rate4x4, rate16x16 = 0, rateuv;
-  int dist4x4, dist16x16, distuv;
-  int rate;
-  int rate4x4_tokenonly = 0;
-  int rate16x16_tokenonly = 0;
-  int rateuv_tokenonly = 0;
-  int error8x8, rate8x8_tokenonly = 0;
-  int rate8x8, dist8x8;
-  int mode16x16;
-  int mode8x8[2][4];
+    int rate4x4, rate16x16 = 0, rateuv;
+    int dist4x4, dist16x16, distuv;
+    int rate;
+    int rate4x4_tokenonly = 0;
+    int rate16x16_tokenonly = 0;
+    int rateuv_tokenonly = 0;
+    int error8x8, rate8x8_tokenonly=0;
+    int rate8x8, dist8x8;
+    int mode16x16;
+    int mode8x8[2][4];
 
-  xd->mode_info_context->mbmi.ref_frame = INTRA_FRAME;
+    xd->mode_info_context->mbmi.ref_frame = INTRA_FRAME;
 
-  rd_pick_intra_mbuv_mode(cpi, x, &rateuv, &rateuv_tokenonly, &distuv);
-  rate = rateuv;
+    rd_pick_intra_mbuv_mode(cpi, x, &rateuv, &rateuv_tokenonly, &distuv);
+    rate = rateuv;
 
-  error16x16 = rd_pick_intra16x16mby_mode(cpi, x,
-                                          &rate16x16, &rate16x16_tokenonly,
-                                          &dist16x16);
-  mode16x16 = xd->mode_info_context->mbmi.mode;
+    // current macroblock under rate-distortion optimization test loop
+#if CONFIG_HYBRIDTRANSFORM
+    xd->mode_info_context->mbmi.mode_rdopt = DC_PRED;
+#endif
 
-  error8x8 = rd_pick_intra8x8mby_modes(cpi, x,
-                                       &rate8x8, &rate8x8_tokenonly,
-                                       &dist8x8, error16x16);
-  mode8x8[0][0] = xd->mode_info_context->bmi[0].as_mode.first;
-  mode8x8[0][1] = xd->mode_info_context->bmi[2].as_mode.first;
-  mode8x8[0][2] = xd->mode_info_context->bmi[8].as_mode.first;
-  mode8x8[0][3] = xd->mode_info_context->bmi[10].as_mode.first;
+    error16x16 = rd_pick_intra16x16mby_mode(cpi, x,
+                                            &rate16x16, &rate16x16_tokenonly,
+                                            &dist16x16);
+    mode16x16 = xd->mode_info_context->mbmi.mode;
+
+#if CONFIG_HYBRIDTRANSFORM
+    xd->mode_info_context->mbmi.mode_rdopt = I8X8_PRED;
+#endif
+
+    error8x8 = rd_pick_intra8x8mby_modes(cpi, x,
+                &rate8x8, &rate8x8_tokenonly,
+                &dist8x8, error16x16);
+    mode8x8[0][0]= xd->mode_info_context->bmi[0].as_mode.first;
+    mode8x8[0][1]= xd->mode_info_context->bmi[2].as_mode.first;
+    mode8x8[0][2]= xd->mode_info_context->bmi[8].as_mode.first;
+    mode8x8[0][3]= xd->mode_info_context->bmi[10].as_mode.first;
 #if CONFIG_COMP_INTRA_PRED
   mode8x8[1][0] = xd->mode_info_context->bmi[0].as_mode.second;
   mode8x8[1][1] = xd->mode_info_context->bmi[2].as_mode.second;
@@ -3683,9 +3792,13 @@
   mode8x8[1][3] = xd->mode_info_context->bmi[10].as_mode.second;
 #endif
 
-  error4x4 = rd_pick_intra4x4mby_modes(cpi, x,
-                                       &rate4x4, &rate4x4_tokenonly,
-                                       &dist4x4, error16x16,
+#if CONFIG_HYBRIDTRANSFORM
+    xd->mode_info_context->mbmi.mode_rdopt = B_PRED;
+#endif
+
+    error4x4 = rd_pick_intra4x4mby_modes(cpi, x,
+                                         &rate4x4, &rate4x4_tokenonly,
+                                         &dist4x4, error16x16,
 #if CONFIG_COMP_INTRA_PRED
                                        0,
 #endif
diff --git a/vp8/encoder/tokenize.c b/vp8/encoder/tokenize.c
index 2d58669..b00c5b2 100644
--- a/vp8/encoder/tokenize.c
+++ b/vp8/encoder/tokenize.c
@@ -298,6 +298,164 @@
   *a = *l = pt;
 }
 
+#if CONFIG_HYBRIDTRANSFORM
+static void tokenize1st_order_ht(   MACROBLOCKD *xd,
+                                    TOKENEXTRA **tp,
+                                    int type,
+                                    VP8_COMP    *cpi) {
+  unsigned int block;
+  const BLOCKD *b;
+  int pt;             /* near block/prev token context index */
+  int c;
+  int token;
+  TOKENEXTRA *t = *tp;/* store tokens starting here */
+  const short *qcoeff_ptr;
+  ENTROPY_CONTEXT * a;
+  ENTROPY_CONTEXT * l;
+  int band, rc, v;
+  int tmp1, tmp2;
+
+  int const *pt_scan ;
+
+  int seg_eob = 16;
+  int segment_id = xd->mode_info_context->mbmi.segment_id;
+
+  if ( segfeature_active( xd, segment_id, SEG_LVL_EOB ) ) {
+    seg_eob = get_segdata( xd, segment_id, SEG_LVL_EOB );
+  }
+
+  b = xd->block;
+
+  /* Luma */
+  for (block = 0; block < 16; block++, b++) {
+    B_PREDICTION_MODE b_mode;
+
+    if( xd->mode_info_context->mbmi.mode == B_PRED ) {
+      b_mode = b->bmi.as_mode.first;
+    }
+
+    // assign scanning order for luma components coded in intra4x4 mode
+    if( ( ( xd->mode_info_context->mbmi.mode == B_PRED ) ||
+          ( xd->mode_info_context->mbmi.mode == I8X8_PRED ) ) &&
+        ( type == PLANE_TYPE_Y_WITH_DC) ) {
+      switch(b_mode) {
+        case B_VE_PRED :
+        case B_VR_PRED :
+          pt_scan = vp8_row_scan;
+          break;
+
+        case B_HE_PRED :
+        case B_HD_PRED :
+        case B_HU_PRED :
+          pt_scan = vp8_col_scan;
+          break;
+
+        default :
+          pt_scan = vp8_default_zig_zag1d;
+          break;
+      }
+    } else {
+      pt_scan = vp8_default_zig_zag1d;
+    }
+
+    tmp1 = vp8_block2above[block];
+    tmp2 = vp8_block2left[block];
+    qcoeff_ptr = b->qcoeff;
+    a = (ENTROPY_CONTEXT *)xd->above_context + tmp1;
+    l = (ENTROPY_CONTEXT *)xd->left_context + tmp2;
+    VP8_COMBINEENTROPYCONTEXTS(pt, *a, *l);
+
+    c = type ? 0 : 1;
+
+    for (; c < b->eob; c++) {
+      rc = pt_scan[c];
+      band = vp8_coef_bands[c];
+      v = qcoeff_ptr[rc];
+
+      t->Extra = vp8_dct_value_tokens_ptr[v].Extra;
+      token    = vp8_dct_value_tokens_ptr[v].Token;
+
+      t->Token = token;
+      t->context_tree = cpi->common.fc.coef_probs [type] [band] [pt];
+
+      t->skip_eob_node = pt == 0 &&
+          ((band > 0 && type > 0) || (band > 1 && type == 0));
+
+      ++cpi->coef_counts       [type] [band] [pt] [token];
+
+      pt = vp8_prev_token_class[token];
+      t++;
+    }
+
+    if (c < seg_eob) {
+      band = vp8_coef_bands[c];
+      t->Token = DCT_EOB_TOKEN;
+      t->context_tree = cpi->common.fc.coef_probs [type] [band] [pt];
+
+      t->skip_eob_node = pt == 0 &&
+          ((band > 0 && type > 0) || (band > 1 && type == 0));
+
+      ++cpi->coef_counts       [type] [band] [pt] [DCT_EOB_TOKEN];
+
+      t++;
+    }
+
+    *tp = t;
+    pt = (c != !type); /* 0 <-> all coeff data is zero */
+    *a = *l = pt;
+  }
+
+  // reset scanning order for chroma components
+  pt_scan = vp8_default_zig_zag1d ;
+
+  /* Chroma */
+  for (block = 16; block < 24; block++, b++) {
+    tmp1 = vp8_block2above[block];
+    tmp2 = vp8_block2left[block];
+    qcoeff_ptr = b->qcoeff;
+    a = (ENTROPY_CONTEXT *)xd->above_context + tmp1;
+    l = (ENTROPY_CONTEXT *)xd->left_context + tmp2;
+
+    VP8_COMBINEENTROPYCONTEXTS(pt, *a, *l);
+
+    for (c = 0; c < b->eob; c++) {
+      rc = pt_scan[c];
+      band = vp8_coef_bands[c];
+      v = qcoeff_ptr[rc];
+
+      t->Extra = vp8_dct_value_tokens_ptr[v].Extra;
+      token    = vp8_dct_value_tokens_ptr[v].Token;
+
+      t->Token = token;
+      t->context_tree = cpi->common.fc.coef_probs [2] [band] [pt];
+
+      t->skip_eob_node = ((pt == 0) && (band > 0));
+
+      ++cpi->coef_counts       [2] [band] [pt] [token];
+
+      pt = vp8_prev_token_class[token];
+      t++;
+  }
+
+    if (c < seg_eob) {
+      band = vp8_coef_bands[c];
+      t->Token = DCT_EOB_TOKEN;
+      t->context_tree = cpi->common.fc.coef_probs [2] [band] [pt];
+
+      t->skip_eob_node = ((pt == 0) && (band > 0));
+
+      ++cpi->coef_counts       [2] [band] [pt] [DCT_EOB_TOKEN];
+
+      t++;
+    }
+
+    *tp = t;
+    pt = (c != 0); /* 0 <-> all coeff data is zero */
+    *a = *l = pt;
+  }
+}
+#endif
+
 static void tokenize1st_order_b
 (
   MACROBLOCKD *xd,
@@ -483,6 +641,11 @@
   int skip_inc;
   int segment_id = x->mode_info_context->mbmi.segment_id;
 
+#if CONFIG_HYBRIDTRANSFORM
+    int QIndex = cpi->mb.q_index;
+    int active_ht = (QIndex < ACTIVE_HT);
+#endif
+
   if (!segfeature_active(x, segment_id, SEG_LVL_EOB) ||
       (get_segdata(x, segment_id, SEG_LVL_EOB) != 0)) {
     skip_inc = 1;
@@ -560,9 +723,17 @@
       *(A + vp8_block2above_8x8[b] + 1) = *(A + vp8_block2above_8x8[b]);
       *(L + vp8_block2left_8x8[b] + 1) = *(L + vp8_block2left_8x8[b]);
     }
-  } else
-
+  } else {
+#if CONFIG_HYBRIDTRANSFORM
+    if(active_ht) {
+      tokenize1st_order_ht(x, t, plane_type, cpi);
+    } else {
+      tokenize1st_order_b(x, t, plane_type, cpi);
+    }
+#else
     tokenize1st_order_b(x, t, plane_type, cpi);
+#endif
+  }
 }