Adds 64x64 hybrid dct/dwt transform

This is to add to the 64x64 transform experiment as an alternative to
a 64x64 DCT.
Two levels of wavelet decomposition is used on a 64x64 block, followed
by 16x16 DCT on the four lowest subbands. The highest three subbands
are left untransformed after the first level DWT.

Change-Id: I3d48d5800468d655191933894df6b46e15adca56
diff --git a/configure b/configure
index 57a6145..396cee7 100755
--- a/configure
+++ b/configure
@@ -249,7 +249,8 @@
     newbintramodes
     comp_interintra_pred
     tx32x32
-    dwt32x32hybrid
+    tx64x64
+    dwtdcthybrid
     cnvcontext
     newcoefcontext
 "
diff --git a/vp9/common/vp9_entropy.c b/vp9/common/vp9_entropy.c
index cdc8bc1..bc87384 100644
--- a/vp9/common/vp9_entropy.c
+++ b/vp9/common/vp9_entropy.c
@@ -143,7 +143,7 @@
 };
 
 #if CONFIG_TX32X32
-#if CONFIG_DWT32X32HYBRID
+#if CONFIG_DWTDCTHYBRID
 DECLARE_ALIGNED(16, const int, vp9_coef_bands_32x32[1024]) = {
   0, 1, 2, 3, 5, 4, 4, 5, 5, 3, 6, 3, 5, 4, 6,
   6, 6, 6,
@@ -458,7 +458,7 @@
   951,  920,  889,  858,  827,  796,  765,  734,  703,  735,  766,  797,  828,  859,  890,  921,  952,  983, 1014, 1015,  984,  953,  922,  891,  860,  829,  798,  767,  799,  830,  861,  892,
   923,  954,  985, 1016, 1017,  986,  955,  924,  893,  862,  831,  863,  894,  925,  956,  987, 1018, 1019,  988,  957,  926,  895,  927,  958,  989, 1020, 1021,  990,  959,  991, 1022, 1023,
 };
-#endif  // CONFIG_DWT32X32HYBRID
+#endif  // CONFIG_DWTDCTHYBRID
 #endif
 
 /* Array indices are identical to previously-existing CONTEXT_NODE indices */
diff --git a/vp9/common/vp9_idctllm.c b/vp9/common/vp9_idctllm.c
index 4dd540e..baa2245 100644
--- a/vp9/common/vp9_idctllm.c
+++ b/vp9/common/vp9_idctllm.c
@@ -1534,7 +1534,7 @@
 #endif
 
 #if CONFIG_TX32X32
-#if !CONFIG_DWT32X32HYBRID
+#if !CONFIG_DWTDCTHYBRID
 #define DownshiftMultiplyBy2(x) x * 2
 #define DownshiftMultiply(x) x
 static void idct16(double *input, double *output, int stride) {
@@ -1879,7 +1879,7 @@
   vp9_clear_system_state();  // Make it simd safe : __asm emms;
 }
 
-#else  // CONFIG_DWT32X32HYBRID
+#else  // CONFIG_DWTDCTHYBRID
 
 #define DWT_MAX_LENGTH   32
 #define DWT_TYPE         26    // 26/53/97
@@ -1940,8 +1940,8 @@
   *x++ = ((*b) << 1) + *a;
 }
 
-void dyadic_synthesize_53(int levels, int width, int height, int16_t *c,
-                          int pitch_c, int16_t *x, int pitch_x) {
+static void dyadic_synthesize_53(int levels, int width, int height, int16_t *c,
+                                 int pitch_c, int16_t *x, int pitch_x) {
   int th[16], tw[16], lv, i, j, nh, nw, hh = height, hw = width;
   short buffer[2 * DWT_MAX_LENGTH];
 
@@ -2031,8 +2031,8 @@
   }
 }
 
-void dyadic_synthesize_26(int levels, int width, int height, int16_t *c,
-                          int pitch_c, int16_t *x, int pitch_x) {
+static void dyadic_synthesize_26(int levels, int width, int height, int16_t *c,
+                                 int pitch_c, int16_t *x, int pitch_x) {
   int th[16], tw[16], lv, i, j, nh, nw, hh = height, hw = width;
   int16_t buffer[2 * DWT_MAX_LENGTH];
 
@@ -2111,8 +2111,8 @@
   x[length - 1] -= 2 * a_predict1 * x[length - 2];
 }
 
-void dyadic_synthesize_97(int levels, int width, int height, int16_t *c,
-                          int pitch_c, int16_t *x, int pitch_x) {
+static void dyadic_synthesize_97(int levels, int width, int height, int16_t *c,
+                                 int pitch_c, int16_t *x, int pitch_x) {
   int th[16], tw[16], lv, i, j, nh, nw, hh = height, hw = width;
   double buffer[2 * DWT_MAX_LENGTH];
   double y[DWT_MAX_LENGTH * DWT_MAX_LENGTH];
@@ -2358,7 +2358,8 @@
   vp9_clear_system_state();  // Make it simd safe : __asm emms;
 }
 
-void vp9_short_idct16x16_c_f(int16_t *input, int16_t *output, int pitch) {
+static void vp9_short_idct16x16_c_f(int16_t *input, int16_t *output, int pitch,
+                                    int scale) {
   vp9_clear_system_state();  // Make it simd safe : __asm emms;
   {
     double out[16*16], out2[16*16];
@@ -2383,13 +2384,13 @@
         out2[j*16 + i] = temp_out[j];
     }
     for (i = 0; i < 16*16; ++i)
-      output[i] = round(out2[i] / (64 >> DWT_PRECISION_BITS));
+      output[i] = round(out2[i] / (128 >> scale));
   }
   vp9_clear_system_state();  // Make it simd safe : __asm emms;
 }
 
 void vp9_short_idct32x32_c(int16_t *input, int16_t *output, int pitch) {
-  // assume out is a 32x32 buffer
+  // assume output is a 32x32 buffer
   // Temporary buffer to hold a 16x16 block for 16x16 inverse dct
   int16_t buffer[16 * 16];
   // Temporary buffer to hold a 32x32 block for inverse 32x32 dwt
@@ -2400,20 +2401,24 @@
 
   // TODO(debargha): Implement more efficiently by adding output pitch
   // argument to the idct16x16 function
-  vp9_short_idct16x16_c_f(input, buffer, pitch);
+  vp9_short_idct16x16_c_f(input, buffer, pitch,
+                          1 + DWT_PRECISION_BITS);
   for (i = 0; i < 16; ++i) {
     vpx_memcpy(buffer2 + i * 32, buffer + i * 16, sizeof(*buffer2) * 16);
   }
-  vp9_short_idct16x16_c_f(input + 16, buffer, pitch);
+  vp9_short_idct16x16_c_f(input + 16, buffer, pitch,
+                          1 + DWT_PRECISION_BITS);
   for (i = 0; i < 16; ++i) {
     vpx_memcpy(buffer2 + i * 32 + 16, buffer + i * 16, sizeof(*buffer2) * 16);
   }
-  vp9_short_idct16x16_c_f(input + 16 * short_pitch, buffer, pitch);
+  vp9_short_idct16x16_c_f(input + 16 * short_pitch, buffer, pitch,
+                          1 + DWT_PRECISION_BITS);
   for (i = 0; i < 16; ++i) {
     vpx_memcpy(buffer2 + i * 32 + 16 * 32, buffer + i * 16,
                sizeof(*buffer2) * 16);
   }
-  vp9_short_idct16x16_c_f(input + 16 * short_pitch + 16, buffer, pitch);
+  vp9_short_idct16x16_c_f(input + 16 * short_pitch + 16, buffer, pitch,
+                          1 + DWT_PRECISION_BITS);
   for (i = 0; i < 16; ++i) {
     vpx_memcpy(buffer2 + i * 32 + 16 * 33, buffer + i * 16,
                sizeof(*buffer2) * 16);
@@ -2426,5 +2431,78 @@
   dyadic_synthesize_53(1, 32, 32, buffer2, 32, output, 32);
 #endif
 }
-#endif  // CONFIG_DWT32X32HYBRID
+
+void vp9_short_idct64x64_c(int16_t *input, int16_t *output, int pitch) {
+  // assume output is a 64x64 buffer
+  // Temporary buffer to hold a 16x16 block for 16x16 inverse dct
+  int16_t buffer[16 * 16];
+  // Temporary buffer to hold a 32x32 block for inverse 32x32 dwt
+  int16_t buffer2[64 * 64];
+  // Note: pitch is in bytes, short_pitch is in short units
+  const int short_pitch = pitch >> 1;
+  int i, j;
+
+  // TODO(debargha): Implement more efficiently by adding output pitch
+  // argument to the idct16x16 function
+  vp9_short_idct16x16_c_f(input, buffer, pitch,
+                          2 + DWT_PRECISION_BITS);
+  for (i = 0; i < 16; ++i) {
+    vpx_memcpy(buffer2 + i * 64, buffer + i * 16, sizeof(*buffer2) * 16);
+  }
+  vp9_short_idct16x16_c_f(input + 16, buffer, pitch,
+                          2 + DWT_PRECISION_BITS);
+  for (i = 0; i < 16; ++i) {
+    vpx_memcpy(buffer2 + i * 64 + 16, buffer + i * 16, sizeof(*buffer2) * 16);
+  }
+  vp9_short_idct16x16_c_f(input + 16 * short_pitch, buffer, pitch,
+                          2 + DWT_PRECISION_BITS);
+  for (i = 0; i < 16; ++i) {
+    vpx_memcpy(buffer2 + i * 64 + 16 * 64, buffer + i * 16,
+               sizeof(*buffer2) * 16);
+  }
+  vp9_short_idct16x16_c_f(input + 16 * short_pitch + 16, buffer, pitch,
+                          2 + DWT_PRECISION_BITS);
+  for (i = 0; i < 16; ++i) {
+    vpx_memcpy(buffer2 + i * 64 + 16 * 65, buffer + i * 16,
+               sizeof(*buffer2) * 16);
+  }
+
+  // Copying and scaling highest bands into buffer2
+#if DWT_PRECISION_BITS < 1
+  for (i = 0; i < 32; ++i) {
+    for (j = 0; j < 32; ++j) {
+      buffer2[i * 64 + 32 + j] =
+          input[i * short_pitch + 32 + j] >> (1 - DWT_PRECISION_BITS);
+    }
+  }
+  for (i = 0; i < 32; ++i) {
+    for (j = 0; j < 64; ++j) {
+      buffer2[i * 64 + j] =
+          input[(i + 32) * short_pitch + j] >> (1 - DWT_PRECISION_BITS);
+    }
+  }
+#else
+  for (i = 0; i < 32; ++i) {
+    for (j = 0; j < 32; ++j) {
+      buffer2[i * 64 + 32 + j] =
+          input[i * short_pitch + 32 + j] << (DWT_PRECISION_BITS - 1);
+    }
+  }
+  for (i = 0; i < 32; ++i) {
+    for (j = 0; j < 64; ++j) {
+      buffer2[i * 64 + j] =
+          input[(i + 32) * short_pitch + j] << (DWT_PRECISION_BITS - 1);
+    }
+  }
+#endif
+
+#if DWT_TYPE == 26
+  dyadic_synthesize_26(2, 64, 64, buffer2, 64, output, 64);
+#elif DWT_TYPE == 97
+  dyadic_synthesize_97(2, 64, 64, buffer2, 64, output, 64);
+#elif DWT_TYPE == 53
+  dyadic_synthesize_53(2, 64, 64, buffer2, 64, output, 64);
+#endif
+}
+#endif  // CONFIG_DWTDCTHYBRID
 #endif  // CONFIG_TX32X32
diff --git a/vp9/encoder/vp9_dct.c b/vp9/encoder/vp9_dct.c
index e14421d..0de6393 100644
--- a/vp9/encoder/vp9_dct.c
+++ b/vp9/encoder/vp9_dct.c
@@ -1332,8 +1332,9 @@
 #undef ROUNDING
 #endif
 
+#if CONFIG_TX32X32 || CONFIG_TX64X64
+#if !CONFIG_DWTDCTHYBRID
 #if CONFIG_TX32X32
-#if !CONFIG_DWT32X32HYBRID
 static void dct32_1d(double *input, double *output, int stride) {
   static const double C1 = 0.998795456205;  // cos(pi * 1 / 64)
   static const double C2 = 0.995184726672;  // cos(pi * 2 / 64)
@@ -1684,8 +1685,9 @@
 
   vp9_clear_system_state();  // Make it simd safe : __asm emms;
 }
+#endif  // CONFIG_TX32X32
 
-#else  // CONFIG_DWT32X32HYBRID
+#else  // CONFIG_DWTDCTHYBRID
 
 #define DWT_MAX_LENGTH   64
 #define DWT_TYPE         26    // 26/53/97
@@ -2108,7 +2110,8 @@
   vp9_clear_system_state();  // Make it simd safe : __asm emms;
 }
 
-void vp9_short_fdct16x16_c_f(short *input, short *out, int pitch) {
+static void vp9_short_fdct16x16_c_f(short *input, short *out, int pitch,
+                                    int scale) {
   vp9_clear_system_state();  // Make it simd safe : __asm emms;
   {
     int shortpitch = pitch >> 1;
@@ -2134,11 +2137,12 @@
     }
     // Scale by some magic number
     for (i = 0; i < 256; i++)
-        out[i] = (short)round(output[i] / (4 << DWT_PRECISION_BITS));
+        out[i] = (short)round(output[i] / (2 << scale));
   }
   vp9_clear_system_state();  // Make it simd safe : __asm emms;
 }
 
+#if CONFIG_TX32X32
 void vp9_short_fdct32x32_c(short *input, short *out, int pitch) {
   // assume out is a 32x32 buffer
   short buffer[16 * 16];
@@ -2153,21 +2157,82 @@
 #endif
   // TODO(debargha): Implement more efficiently by adding output pitch
   // argument to the dct16x16 function
-  vp9_short_fdct16x16_c_f(out, buffer, 64);
+  vp9_short_fdct16x16_c_f(out, buffer, 64, 1 + DWT_PRECISION_BITS);
   for (i = 0; i < 16; ++i)
     vpx_memcpy(out + i * 32, buffer + i * 16, sizeof(short) * 16);
 
-  vp9_short_fdct16x16_c_f(out + 16, buffer, 64);
+  vp9_short_fdct16x16_c_f(out + 16, buffer, 64, 1 + DWT_PRECISION_BITS);
   for (i = 0; i < 16; ++i)
     vpx_memcpy(out + i * 32 + 16, buffer + i * 16, sizeof(short) * 16);
 
-  vp9_short_fdct16x16_c_f(out + 32 * 16, buffer, 64);
+  vp9_short_fdct16x16_c_f(out + 32 * 16, buffer, 64, 1 + DWT_PRECISION_BITS);
   for (i = 0; i < 16; ++i)
     vpx_memcpy(out + i * 32 + 32 * 16, buffer + i * 16, sizeof(short) * 16);
 
-  vp9_short_fdct16x16_c_f(out + 33 * 16, buffer, 64);
+  vp9_short_fdct16x16_c_f(out + 33 * 16, buffer, 64, 1 + DWT_PRECISION_BITS);
   for (i = 0; i < 16; ++i)
     vpx_memcpy(out + i * 32 + 33 * 16, buffer + i * 16, sizeof(short) * 16);
 }
-#endif  // CONFIG_DWT32X32HYBRID
 #endif  // CONFIG_TX32X32
+
+#if CONFIG_TX64X64
+void vp9_short_fdct64x64_c(short *input, short *out, int pitch) {
+  // assume out is a 64x64 buffer
+  short buffer[16 * 16];
+  int i, j;
+  const int short_pitch = pitch >> 1;
+#if DWT_TYPE == 26
+  dyadic_analyze_26(2, 64, 64, input, short_pitch, out, 64);
+#elif DWT_TYPE == 97
+  dyadic_analyze_97(2, 64, 64, input, short_pitch, out, 64);
+#elif DWT_TYPE == 53
+  dyadic_analyze_53(2, 64, 64, input, short_pitch, out, 64);
+#endif
+  // TODO(debargha): Implement more efficiently by adding output pitch
+  // argument to the dct16x16 function
+  vp9_short_fdct16x16_c_f(out, buffer, 128, 2 + DWT_PRECISION_BITS);
+  for (i = 0; i < 16; ++i)
+    vpx_memcpy(out + i * 64, buffer + i * 16, sizeof(short) * 16);
+
+  vp9_short_fdct16x16_c_f(out + 16, buffer, 128, 2 + DWT_PRECISION_BITS);
+  for (i = 0; i < 16; ++i)
+    vpx_memcpy(out + i * 64 + 16, buffer + i * 16, sizeof(short) * 16);
+
+  vp9_short_fdct16x16_c_f(out + 64 * 16, buffer, 128, 2 + DWT_PRECISION_BITS);
+  for (i = 0; i < 16; ++i)
+    vpx_memcpy(out + i * 64 + 64 * 16, buffer + i * 16, sizeof(short) * 16);
+
+  vp9_short_fdct16x16_c_f(out + 65 * 16, buffer, 128, 2 + DWT_PRECISION_BITS);
+  for (i = 0; i < 16; ++i)
+    vpx_memcpy(out + i * 64 + 65 * 16, buffer + i * 16, sizeof(short) * 16);
+
+  // There is no dct used on the highest bands for now.
+  // Need to scale these coeffs by a factor of 2/2^DWT_PRECISION_BITS
+  // TODO(debargha): experiment with turning these coeffs to 0
+#if DWT_PRECISION_BITS < 1
+  for (i = 0; i < 32; ++i) {
+    for (j = 0; j < 32; ++j) {
+      out[i * 64 + 32 + j] <<= (1 - DWT_PRECISION_BITS);
+    }
+  }
+  for (i = 0; i < 32; ++i) {
+    for (j = 0; j < 64; ++j) {
+      out[i * 64 + j] <<= (1 - DWT_PRECISION_BITS);
+    }
+  }
+#else
+  for (i = 0; i < 32; ++i) {
+    for (j = 0; j < 32; ++j) {
+      out[i * 64 + 32 + j] >>= (DWT_PRECISION_BITS - 1);
+    }
+  }
+  for (i = 0; i < 32; ++i) {
+    for (j = 0; j < 64; ++j) {
+      out[i * 64 + j] >>= (DWT_PRECISION_BITS - 1);
+    }
+  }
+#endif
+}
+#endif  // CONFIG_TX64X64
+#endif  // CONFIG_DWTDCTHYBRID
+#endif  // CONFIG_TX32X32 || CONFIG_TX64X64
diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c
index c695c04..956d8f9 100644
--- a/vp9/encoder/vp9_rdopt.c
+++ b/vp9/encoder/vp9_rdopt.c
@@ -965,17 +965,17 @@
   SUPERBLOCK  * const x_sb = &x->sb_coeff_data;
   MACROBLOCKD * const xd = &x->e_mbd;
   SUPERBLOCKD * const xd_sb = &xd->sb_coeff_data;
-#if DEBUG_ERROR || CONFIG_DWT32X32HYBRID
+#if DEBUG_ERROR || CONFIG_DWTDCTHYBRID
   int16_t out[1024];
 #endif
 
   vp9_transform_sby_32x32(x);
   vp9_quantize_sby_32x32(x);
-#if DEBUG_ERROR || CONFIG_DWT32X32HYBRID
+#if DEBUG_ERROR || CONFIG_DWTDCTHYBRID
   vp9_short_idct32x32(xd_sb->dqcoeff, out, 64);
 #endif
 
-#if !CONFIG_DWT32X32HYBRID
+#if !CONFIG_DWTDCTHYBRID
   *distortion = vp9_sb_block_error_c(x_sb->coeff, xd_sb->dqcoeff, 1024);
 #else
   *distortion = vp9_block_error_c(x_sb->src_diff, out, 1024) << 4;