libvpx: Pull from upstream

Current HEAD: 687c56e8026978f8ccfd5b3fbfee4dbd1de466e1

git log from upstream:
7045aec SAD32xh and SAD64xh for AVX2
7c4992c Remove the dependency in token storing locations
f0c3da9 Alter adjustment of two pass GF/ARF boost with Q.
73ae6e4 Add highbitdepth function for vp9_avg_8x8
e1111fb Remove unused VAR_BASED_FIXED_PARTITION flag
5e766cc Use rate/distortion thresholds to control non-RD partition search
6f77bff Updates to aggressive denoising mode.
e3bf55d Correct the logic of ready_for_new_data.
6356d21 vp9_denoiser_sse2.c: solve windows build error.
e2612fb Add init and reset functions for RD_COST struct
94ecfa3 Reset rate cost value in rd mode search
8514d03 vp9_denoiser_sse2.c: eliminate gcc warnings
50c59cd Adds a set of end-to-end encode tests
ed100c0 Fix an ioc issue in super_block_uvrd
716ae78 Change initialization of static_scene_max_gf_interval.
68b550f [spatial svc]Another workaround to avoid using prev_mi
d5130af Revert "Move input frame scaling into the recode loop"
4680329 Revert "[spatial svc]Another workaround to avoid using prev_mi"
23fc1f7 Fix in bit-shift operation for highbitdepth decode
91657ab fix CONFIG_SPATIAL_SVC warning
09ea74f Some updates for Speed 6/VAR_BASED_PARTITION.
<...>

TBR=tomfinegan@chromium.org

Review URL: https://codereview.chromium.org/668403002

git-svn-id: http://src.chromium.org/svn/trunk/deps/third_party/libvpx@292609 4ff67af0-8c30-449e-8e8b-ad334ec8d88c
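
Note: the vp9_avg_8x8 RTCD hook added throughout the rtcd headers below returns the rounded average of an 8x8 pixel block (used by the new VAR_BASED_PARTITION speed-6 logic). The patch adds vp9/encoder/vp9_avg.c to every build file but does not include the file's contents; the following is a minimal C sketch of what the reference implementation computes, not code copied from the patch (avg_8x8_sketch is a hypothetical name; p is the row stride in bytes):

#include <stdint.h>

/* Rounded mean of a stride-p 8x8 block: sum the 64 pixels,
 * add 32 for rounding, then shift right by 6 (divide by 64). */
static unsigned int avg_8x8_sketch(const uint8_t *s, int p) {
  int sum = 0;
  for (int r = 0; r < 8; ++r, s += p) {
    for (int c = 0; c < 8; ++c)
      sum += s[c];
  }
  return (unsigned int)((sum + 32) >> 6);
}

The SSE2 version registered in the ia32/x64 headers (vp9_avg_8x8_sse2) is selected at runtime via the HAS_SSE2 flag in setup_rtcd_internal, as shown in the hunks below.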
diff --git a/README.chromium b/README.chromium
index f4874b8..ef84c90 100644
--- a/README.chromium
+++ b/README.chromium
@@ -5,9 +5,9 @@
 License File: source/libvpx/LICENSE
 Security Critical: yes
 
-Date: Monday September 22 2014
+Date: Monday October 21 2014
 Branch: master
-Commit: 38b6aed8fdf50e701c070e708ff7933cc3a61117
+Commit: 687c56e8026978f8ccfd5b3fbfee4dbd1de466e1
 
 Description:
 Contains the sources used to compile libvpx binaries used by Google Chrome and
diff --git a/libvpx_srcs.gni b/libvpx_srcs.gni
index 6c1a1b0..eff2cfd 100644
--- a/libvpx_srcs.gni
+++ b/libvpx_srcs.gni
@@ -206,6 +206,7 @@
   "//third_party/libvpx/source/libvpx/vp9/encoder/vp9_aq_cyclicrefresh.h",
   "//third_party/libvpx/source/libvpx/vp9/encoder/vp9_aq_variance.c",
   "//third_party/libvpx/source/libvpx/vp9/encoder/vp9_aq_variance.h",
+  "//third_party/libvpx/source/libvpx/vp9/encoder/vp9_avg.c",
   "//third_party/libvpx/source/libvpx/vp9/encoder/vp9_bitstream.c",
   "//third_party/libvpx/source/libvpx/vp9/encoder/vp9_bitstream.h",
   "//third_party/libvpx/source/libvpx/vp9/encoder/vp9_block.h",
@@ -371,8 +372,10 @@
   "//third_party/libvpx/source/libvpx/vp8/encoder/x86/vp8_enc_stubs_sse2.c",
   "//third_party/libvpx/source/libvpx/vp9/common/x86/vp9_idct_intrin_sse2.c",
   "//third_party/libvpx/source/libvpx/vp9/common/x86/vp9_loopfilter_intrin_sse2.c",
+  "//third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_avg_intrin_sse2.c",
   "//third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_dct32x32_sse2.c",
   "//third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_dct_sse2.c",
+  "//third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_quantize_sse2.c",
   "//third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_variance_sse2.c",
 ]
 libvpx_srcs_x86_sse3 = [
@@ -593,6 +596,7 @@
   "//third_party/libvpx/source/libvpx/vp9/encoder/vp9_aq_cyclicrefresh.h",
   "//third_party/libvpx/source/libvpx/vp9/encoder/vp9_aq_variance.c",
   "//third_party/libvpx/source/libvpx/vp9/encoder/vp9_aq_variance.h",
+  "//third_party/libvpx/source/libvpx/vp9/encoder/vp9_avg.c",
   "//third_party/libvpx/source/libvpx/vp9/encoder/vp9_bitstream.c",
   "//third_party/libvpx/source/libvpx/vp9/encoder/vp9_bitstream.h",
   "//third_party/libvpx/source/libvpx/vp9/encoder/vp9_block.h",
@@ -764,8 +768,10 @@
   "//third_party/libvpx/source/libvpx/vp8/encoder/x86/vp8_enc_stubs_sse2.c",
   "//third_party/libvpx/source/libvpx/vp9/common/x86/vp9_idct_intrin_sse2.c",
   "//third_party/libvpx/source/libvpx/vp9/common/x86/vp9_loopfilter_intrin_sse2.c",
+  "//third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_avg_intrin_sse2.c",
   "//third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_dct32x32_sse2.c",
   "//third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_dct_sse2.c",
+  "//third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_quantize_sse2.c",
   "//third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_variance_sse2.c",
 ]
 libvpx_srcs_x86_64_sse3 = [
@@ -989,6 +995,7 @@
   "//third_party/libvpx/source/libvpx/vp9/encoder/vp9_aq_cyclicrefresh.h",
   "//third_party/libvpx/source/libvpx/vp9/encoder/vp9_aq_variance.c",
   "//third_party/libvpx/source/libvpx/vp9/encoder/vp9_aq_variance.h",
+  "//third_party/libvpx/source/libvpx/vp9/encoder/vp9_avg.c",
   "//third_party/libvpx/source/libvpx/vp9/encoder/vp9_bitstream.c",
   "//third_party/libvpx/source/libvpx/vp9/encoder/vp9_bitstream.h",
   "//third_party/libvpx/source/libvpx/vp9/encoder/vp9_block.h",
@@ -1354,6 +1361,7 @@
   "//third_party/libvpx/source/libvpx/vp9/encoder/vp9_aq_cyclicrefresh.h",
   "//third_party/libvpx/source/libvpx/vp9/encoder/vp9_aq_variance.c",
   "//third_party/libvpx/source/libvpx/vp9/encoder/vp9_aq_variance.h",
+  "//third_party/libvpx/source/libvpx/vp9/encoder/vp9_avg.c",
   "//third_party/libvpx/source/libvpx/vp9/encoder/vp9_bitstream.c",
   "//third_party/libvpx/source/libvpx/vp9/encoder/vp9_bitstream.h",
   "//third_party/libvpx/source/libvpx/vp9/encoder/vp9_block.h",
@@ -1708,6 +1716,7 @@
   "//third_party/libvpx/source/libvpx/vp9/encoder/vp9_aq_cyclicrefresh.h",
   "//third_party/libvpx/source/libvpx/vp9/encoder/vp9_aq_variance.c",
   "//third_party/libvpx/source/libvpx/vp9/encoder/vp9_aq_variance.h",
+  "//third_party/libvpx/source/libvpx/vp9/encoder/vp9_avg.c",
   "//third_party/libvpx/source/libvpx/vp9/encoder/vp9_bitstream.c",
   "//third_party/libvpx/source/libvpx/vp9/encoder/vp9_bitstream.h",
   "//third_party/libvpx/source/libvpx/vp9/encoder/vp9_block.h",
@@ -2122,6 +2131,7 @@
   "//third_party/libvpx/source/libvpx/vp9/encoder/vp9_aq_cyclicrefresh.h",
   "//third_party/libvpx/source/libvpx/vp9/encoder/vp9_aq_variance.c",
   "//third_party/libvpx/source/libvpx/vp9/encoder/vp9_aq_variance.h",
+  "//third_party/libvpx/source/libvpx/vp9/encoder/vp9_avg.c",
   "//third_party/libvpx/source/libvpx/vp9/encoder/vp9_bitstream.c",
   "//third_party/libvpx/source/libvpx/vp9/encoder/vp9_bitstream.h",
   "//third_party/libvpx/source/libvpx/vp9/encoder/vp9_block.h",
@@ -2423,6 +2433,7 @@
   "//third_party/libvpx/source/libvpx/vp9/encoder/vp9_aq_cyclicrefresh.h",
   "//third_party/libvpx/source/libvpx/vp9/encoder/vp9_aq_variance.c",
   "//third_party/libvpx/source/libvpx/vp9/encoder/vp9_aq_variance.h",
+  "//third_party/libvpx/source/libvpx/vp9/encoder/vp9_avg.c",
   "//third_party/libvpx/source/libvpx/vp9/encoder/vp9_bitstream.c",
   "//third_party/libvpx/source/libvpx/vp9/encoder/vp9_bitstream.h",
   "//third_party/libvpx/source/libvpx/vp9/encoder/vp9_block.h",
@@ -2720,6 +2731,7 @@
   "//third_party/libvpx/source/libvpx/vp9/encoder/vp9_aq_cyclicrefresh.h",
   "//third_party/libvpx/source/libvpx/vp9/encoder/vp9_aq_variance.c",
   "//third_party/libvpx/source/libvpx/vp9/encoder/vp9_aq_variance.h",
+  "//third_party/libvpx/source/libvpx/vp9/encoder/vp9_avg.c",
   "//third_party/libvpx/source/libvpx/vp9/encoder/vp9_bitstream.c",
   "//third_party/libvpx/source/libvpx/vp9/encoder/vp9_bitstream.h",
   "//third_party/libvpx/source/libvpx/vp9/encoder/vp9_block.h",
@@ -3017,6 +3029,7 @@
   "//third_party/libvpx/source/libvpx/vp9/encoder/vp9_aq_cyclicrefresh.h",
   "//third_party/libvpx/source/libvpx/vp9/encoder/vp9_aq_variance.c",
   "//third_party/libvpx/source/libvpx/vp9/encoder/vp9_aq_variance.h",
+  "//third_party/libvpx/source/libvpx/vp9/encoder/vp9_avg.c",
   "//third_party/libvpx/source/libvpx/vp9/encoder/vp9_bitstream.c",
   "//third_party/libvpx/source/libvpx/vp9/encoder/vp9_bitstream.h",
   "//third_party/libvpx/source/libvpx/vp9/encoder/vp9_block.h",
diff --git a/libvpx_srcs_arm.gypi b/libvpx_srcs_arm.gypi
index 56b0428..3292fb0 100644
--- a/libvpx_srcs_arm.gypi
+++ b/libvpx_srcs_arm.gypi
@@ -235,6 +235,7 @@
     '<(libvpx_source)/vp9/encoder/vp9_aq_cyclicrefresh.h',
     '<(libvpx_source)/vp9/encoder/vp9_aq_variance.c',
     '<(libvpx_source)/vp9/encoder/vp9_aq_variance.h',
+    '<(libvpx_source)/vp9/encoder/vp9_avg.c',
     '<(libvpx_source)/vp9/encoder/vp9_bitstream.c',
     '<(libvpx_source)/vp9/encoder/vp9_bitstream.h',
     '<(libvpx_source)/vp9/encoder/vp9_block.h',
diff --git a/libvpx_srcs_arm64.gypi b/libvpx_srcs_arm64.gypi
index c72f537..cb4c8ee 100644
--- a/libvpx_srcs_arm64.gypi
+++ b/libvpx_srcs_arm64.gypi
@@ -236,6 +236,7 @@
     '<(libvpx_source)/vp9/encoder/vp9_aq_cyclicrefresh.h',
     '<(libvpx_source)/vp9/encoder/vp9_aq_variance.c',
     '<(libvpx_source)/vp9/encoder/vp9_aq_variance.h',
+    '<(libvpx_source)/vp9/encoder/vp9_avg.c',
     '<(libvpx_source)/vp9/encoder/vp9_bitstream.c',
     '<(libvpx_source)/vp9/encoder/vp9_bitstream.h',
     '<(libvpx_source)/vp9/encoder/vp9_block.h',
diff --git a/libvpx_srcs_arm_neon.gypi b/libvpx_srcs_arm_neon.gypi
index 1f8c17a..a14b108 100644
--- a/libvpx_srcs_arm_neon.gypi
+++ b/libvpx_srcs_arm_neon.gypi
@@ -288,6 +288,7 @@
     '<(libvpx_source)/vp9/encoder/vp9_aq_cyclicrefresh.h',
     '<(libvpx_source)/vp9/encoder/vp9_aq_variance.c',
     '<(libvpx_source)/vp9/encoder/vp9_aq_variance.h',
+    '<(libvpx_source)/vp9/encoder/vp9_avg.c',
     '<(libvpx_source)/vp9/encoder/vp9_bitstream.c',
     '<(libvpx_source)/vp9/encoder/vp9_bitstream.h',
     '<(libvpx_source)/vp9/encoder/vp9_block.h',
diff --git a/libvpx_srcs_arm_neon_cpu_detect.gypi b/libvpx_srcs_arm_neon_cpu_detect.gypi
index 9bd4eda..8f1d3de 100644
--- a/libvpx_srcs_arm_neon_cpu_detect.gypi
+++ b/libvpx_srcs_arm_neon_cpu_detect.gypi
@@ -256,6 +256,7 @@
     '<(libvpx_source)/vp9/encoder/vp9_aq_cyclicrefresh.h',
     '<(libvpx_source)/vp9/encoder/vp9_aq_variance.c',
     '<(libvpx_source)/vp9/encoder/vp9_aq_variance.h',
+    '<(libvpx_source)/vp9/encoder/vp9_avg.c',
     '<(libvpx_source)/vp9/encoder/vp9_bitstream.c',
     '<(libvpx_source)/vp9/encoder/vp9_bitstream.h',
     '<(libvpx_source)/vp9/encoder/vp9_block.h',
diff --git a/libvpx_srcs_generic.gypi b/libvpx_srcs_generic.gypi
index 4f940cb..078e369 100644
--- a/libvpx_srcs_generic.gypi
+++ b/libvpx_srcs_generic.gypi
@@ -201,6 +201,7 @@
     '<(libvpx_source)/vp9/encoder/vp9_aq_cyclicrefresh.h',
     '<(libvpx_source)/vp9/encoder/vp9_aq_variance.c',
     '<(libvpx_source)/vp9/encoder/vp9_aq_variance.h',
+    '<(libvpx_source)/vp9/encoder/vp9_avg.c',
     '<(libvpx_source)/vp9/encoder/vp9_bitstream.c',
     '<(libvpx_source)/vp9/encoder/vp9_bitstream.h',
     '<(libvpx_source)/vp9/encoder/vp9_block.h',
diff --git a/libvpx_srcs_mips.gypi b/libvpx_srcs_mips.gypi
index 228baaa..4323ed8 100644
--- a/libvpx_srcs_mips.gypi
+++ b/libvpx_srcs_mips.gypi
@@ -203,6 +203,7 @@
     '<(libvpx_source)/vp9/encoder/vp9_aq_cyclicrefresh.h',
     '<(libvpx_source)/vp9/encoder/vp9_aq_variance.c',
     '<(libvpx_source)/vp9/encoder/vp9_aq_variance.h',
+    '<(libvpx_source)/vp9/encoder/vp9_avg.c',
     '<(libvpx_source)/vp9/encoder/vp9_bitstream.c',
     '<(libvpx_source)/vp9/encoder/vp9_bitstream.h',
     '<(libvpx_source)/vp9/encoder/vp9_block.h',
diff --git a/libvpx_srcs_nacl.gypi b/libvpx_srcs_nacl.gypi
index 4f940cb..078e369 100644
--- a/libvpx_srcs_nacl.gypi
+++ b/libvpx_srcs_nacl.gypi
@@ -201,6 +201,7 @@
     '<(libvpx_source)/vp9/encoder/vp9_aq_cyclicrefresh.h',
     '<(libvpx_source)/vp9/encoder/vp9_aq_variance.c',
     '<(libvpx_source)/vp9/encoder/vp9_aq_variance.h',
+    '<(libvpx_source)/vp9/encoder/vp9_avg.c',
     '<(libvpx_source)/vp9/encoder/vp9_bitstream.c',
     '<(libvpx_source)/vp9/encoder/vp9_bitstream.h',
     '<(libvpx_source)/vp9/encoder/vp9_block.h',
diff --git a/libvpx_srcs_x86.gypi b/libvpx_srcs_x86.gypi
index f5b3995..f27b786 100644
--- a/libvpx_srcs_x86.gypi
+++ b/libvpx_srcs_x86.gypi
@@ -245,6 +245,7 @@
     '<(libvpx_source)/vp9/encoder/vp9_aq_cyclicrefresh.h',
     '<(libvpx_source)/vp9/encoder/vp9_aq_variance.c',
     '<(libvpx_source)/vp9/encoder/vp9_aq_variance.h',
+    '<(libvpx_source)/vp9/encoder/vp9_avg.c',
     '<(libvpx_source)/vp9/encoder/vp9_bitstream.c',
     '<(libvpx_source)/vp9/encoder/vp9_bitstream.h',
     '<(libvpx_source)/vp9/encoder/vp9_block.h',
diff --git a/libvpx_srcs_x86_64.gypi b/libvpx_srcs_x86_64.gypi
index f29c6c8..3596ec2 100644
--- a/libvpx_srcs_x86_64.gypi
+++ b/libvpx_srcs_x86_64.gypi
@@ -248,6 +248,7 @@
     '<(libvpx_source)/vp9/encoder/vp9_aq_cyclicrefresh.h',
     '<(libvpx_source)/vp9/encoder/vp9_aq_variance.c',
     '<(libvpx_source)/vp9/encoder/vp9_aq_variance.h',
+    '<(libvpx_source)/vp9/encoder/vp9_avg.c',
     '<(libvpx_source)/vp9/encoder/vp9_bitstream.c',
     '<(libvpx_source)/vp9/encoder/vp9_bitstream.h',
     '<(libvpx_source)/vp9/encoder/vp9_block.h',
diff --git a/libvpx_srcs_x86_64_intrinsics.gypi b/libvpx_srcs_x86_64_intrinsics.gypi
index bb1b203..28d4bc4 100644
--- a/libvpx_srcs_x86_64_intrinsics.gypi
+++ b/libvpx_srcs_x86_64_intrinsics.gypi
@@ -36,8 +36,10 @@
         '<(libvpx_source)/vp8/encoder/x86/vp8_enc_stubs_sse2.c',
         '<(libvpx_source)/vp9/common/x86/vp9_idct_intrin_sse2.c',
         '<(libvpx_source)/vp9/common/x86/vp9_loopfilter_intrin_sse2.c',
+        '<(libvpx_source)/vp9/encoder/x86/vp9_avg_intrin_sse2.c',
         '<(libvpx_source)/vp9/encoder/x86/vp9_dct32x32_sse2.c',
         '<(libvpx_source)/vp9/encoder/x86/vp9_dct_sse2.c',
+        '<(libvpx_source)/vp9/encoder/x86/vp9_quantize_sse2.c',
         '<(libvpx_source)/vp9/encoder/x86/vp9_variance_sse2.c',
       ],
       'cflags': [ '-msse2', ],
diff --git a/libvpx_srcs_x86_intrinsics.gypi b/libvpx_srcs_x86_intrinsics.gypi
index bb1b203..28d4bc4 100644
--- a/libvpx_srcs_x86_intrinsics.gypi
+++ b/libvpx_srcs_x86_intrinsics.gypi
@@ -36,8 +36,10 @@
         '<(libvpx_source)/vp8/encoder/x86/vp8_enc_stubs_sse2.c',
         '<(libvpx_source)/vp9/common/x86/vp9_idct_intrin_sse2.c',
         '<(libvpx_source)/vp9/common/x86/vp9_loopfilter_intrin_sse2.c',
+        '<(libvpx_source)/vp9/encoder/x86/vp9_avg_intrin_sse2.c',
         '<(libvpx_source)/vp9/encoder/x86/vp9_dct32x32_sse2.c',
         '<(libvpx_source)/vp9/encoder/x86/vp9_dct_sse2.c',
+        '<(libvpx_source)/vp9/encoder/x86/vp9_quantize_sse2.c',
         '<(libvpx_source)/vp9/encoder/x86/vp9_variance_sse2.c',
       ],
       'cflags': [ '-msse2', ],
diff --git a/source/config/linux/arm-neon-cpu-detect/vp9_rtcd.h b/source/config/linux/arm-neon-cpu-detect/vp9_rtcd.h
index 26cf5e2..13f5411 100644
--- a/source/config/linux/arm-neon-cpu-detect/vp9_rtcd.h
+++ b/source/config/linux/arm-neon-cpu-detect/vp9_rtcd.h
@@ -12,8 +12,8 @@
  */
 
 #include "vpx/vpx_integer.h"
+#include "vp9/common/vp9_common.h"
 #include "vp9/common/vp9_enums.h"
-#include "vp9/common/vp9_idct.h"
 
 struct macroblockd;
 
@@ -29,6 +29,9 @@
 extern "C" {
 #endif
 
+unsigned int vp9_avg_8x8_c(const uint8_t *, int p);
+#define vp9_avg_8x8 vp9_avg_8x8_c
+
 int64_t vp9_block_error_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz);
 #define vp9_block_error vp9_block_error_c
 
diff --git a/source/config/linux/arm-neon-cpu-detect/vpx_config.asm b/source/config/linux/arm-neon-cpu-detect/vpx_config.asm
index 7a9b810..c3e7b00 100644
--- a/source/config/linux/arm-neon-cpu-detect/vpx_config.asm
+++ b/source/config/linux/arm-neon-cpu-detect/vpx_config.asm
@@ -88,5 +88,5 @@
 .equ CONFIG_SPATIAL_SVC ,  0
 .equ CONFIG_VP9_TEMPORAL_DENOISING ,  0
 .equ CONFIG_FP_MB_STATS ,  0
-.equ CONFIG_EMULATE_HARDWARE_HIGHBITDEPTH ,  0
+.equ CONFIG_EMULATE_HARDWARE ,  0
 	.section	.note.GNU-stack,"",%progbits
diff --git a/source/config/linux/arm-neon-cpu-detect/vpx_config.h b/source/config/linux/arm-neon-cpu-detect/vpx_config.h
index 04fa0d8..78ffd73 100644
--- a/source/config/linux/arm-neon-cpu-detect/vpx_config.h
+++ b/source/config/linux/arm-neon-cpu-detect/vpx_config.h
@@ -97,5 +97,5 @@
 #define CONFIG_SPATIAL_SVC 0
 #define CONFIG_VP9_TEMPORAL_DENOISING 0
 #define CONFIG_FP_MB_STATS 0
-#define CONFIG_EMULATE_HARDWARE_HIGHBITDEPTH 0
+#define CONFIG_EMULATE_HARDWARE 0
 #endif /* VPX_CONFIG_H */
diff --git a/source/config/linux/arm-neon/vp9_rtcd.h b/source/config/linux/arm-neon/vp9_rtcd.h
index 8a31aed..376e295 100644
--- a/source/config/linux/arm-neon/vp9_rtcd.h
+++ b/source/config/linux/arm-neon/vp9_rtcd.h
@@ -12,8 +12,8 @@
  */
 
 #include "vpx/vpx_integer.h"
+#include "vp9/common/vp9_common.h"
 #include "vp9/common/vp9_enums.h"
-#include "vp9/common/vp9_idct.h"
 
 struct macroblockd;
 
@@ -29,6 +29,9 @@
 extern "C" {
 #endif
 
+unsigned int vp9_avg_8x8_c(const uint8_t *, int p);
+#define vp9_avg_8x8 vp9_avg_8x8_c
+
 int64_t vp9_block_error_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz);
 #define vp9_block_error vp9_block_error_c
 
diff --git a/source/config/linux/arm-neon/vpx_config.asm b/source/config/linux/arm-neon/vpx_config.asm
index e47e4a7..f4abc3b 100644
--- a/source/config/linux/arm-neon/vpx_config.asm
+++ b/source/config/linux/arm-neon/vpx_config.asm
@@ -88,5 +88,5 @@
 .equ CONFIG_SPATIAL_SVC ,  0
 .equ CONFIG_VP9_TEMPORAL_DENOISING ,  0
 .equ CONFIG_FP_MB_STATS ,  0
-.equ CONFIG_EMULATE_HARDWARE_HIGHBITDEPTH ,  0
+.equ CONFIG_EMULATE_HARDWARE ,  0
 	.section	.note.GNU-stack,"",%progbits
diff --git a/source/config/linux/arm-neon/vpx_config.h b/source/config/linux/arm-neon/vpx_config.h
index 6c0ea64..32ffc77 100644
--- a/source/config/linux/arm-neon/vpx_config.h
+++ b/source/config/linux/arm-neon/vpx_config.h
@@ -97,5 +97,5 @@
 #define CONFIG_SPATIAL_SVC 0
 #define CONFIG_VP9_TEMPORAL_DENOISING 0
 #define CONFIG_FP_MB_STATS 0
-#define CONFIG_EMULATE_HARDWARE_HIGHBITDEPTH 0
+#define CONFIG_EMULATE_HARDWARE 0
 #endif /* VPX_CONFIG_H */
diff --git a/source/config/linux/arm/vp9_rtcd.h b/source/config/linux/arm/vp9_rtcd.h
index ad509a7..ec0624b 100644
--- a/source/config/linux/arm/vp9_rtcd.h
+++ b/source/config/linux/arm/vp9_rtcd.h
@@ -12,8 +12,8 @@
  */
 
 #include "vpx/vpx_integer.h"
+#include "vp9/common/vp9_common.h"
 #include "vp9/common/vp9_enums.h"
-#include "vp9/common/vp9_idct.h"
 
 struct macroblockd;
 
@@ -29,6 +29,9 @@
 extern "C" {
 #endif
 
+unsigned int vp9_avg_8x8_c(const uint8_t *, int p);
+#define vp9_avg_8x8 vp9_avg_8x8_c
+
 int64_t vp9_block_error_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz);
 #define vp9_block_error vp9_block_error_c
 
diff --git a/source/config/linux/arm/vpx_config.asm b/source/config/linux/arm/vpx_config.asm
index d6ea277..8f7487f 100644
--- a/source/config/linux/arm/vpx_config.asm
+++ b/source/config/linux/arm/vpx_config.asm
@@ -88,5 +88,5 @@
 .equ CONFIG_SPATIAL_SVC ,  0
 .equ CONFIG_VP9_TEMPORAL_DENOISING ,  0
 .equ CONFIG_FP_MB_STATS ,  0
-.equ CONFIG_EMULATE_HARDWARE_HIGHBITDEPTH ,  0
+.equ CONFIG_EMULATE_HARDWARE ,  0
 	.section	.note.GNU-stack,"",%progbits
diff --git a/source/config/linux/arm/vpx_config.h b/source/config/linux/arm/vpx_config.h
index e2dff1d..dbce81d 100644
--- a/source/config/linux/arm/vpx_config.h
+++ b/source/config/linux/arm/vpx_config.h
@@ -97,5 +97,5 @@
 #define CONFIG_SPATIAL_SVC 0
 #define CONFIG_VP9_TEMPORAL_DENOISING 0
 #define CONFIG_FP_MB_STATS 0
-#define CONFIG_EMULATE_HARDWARE_HIGHBITDEPTH 0
+#define CONFIG_EMULATE_HARDWARE 0
 #endif /* VPX_CONFIG_H */
diff --git a/source/config/linux/arm64/vp9_rtcd.h b/source/config/linux/arm64/vp9_rtcd.h
index d1034cd..57cbf96 100644
--- a/source/config/linux/arm64/vp9_rtcd.h
+++ b/source/config/linux/arm64/vp9_rtcd.h
@@ -12,8 +12,8 @@
  */
 
 #include "vpx/vpx_integer.h"
+#include "vp9/common/vp9_common.h"
 #include "vp9/common/vp9_enums.h"
-#include "vp9/common/vp9_idct.h"
 
 struct macroblockd;
 
@@ -29,6 +29,9 @@
 extern "C" {
 #endif
 
+unsigned int vp9_avg_8x8_c(const uint8_t *, int p);
+#define vp9_avg_8x8 vp9_avg_8x8_c
+
 int64_t vp9_block_error_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz);
 #define vp9_block_error vp9_block_error_c
 
diff --git a/source/config/linux/arm64/vpx_config.asm b/source/config/linux/arm64/vpx_config.asm
index 7240985..9423d65 100644
--- a/source/config/linux/arm64/vpx_config.asm
+++ b/source/config/linux/arm64/vpx_config.asm
@@ -88,5 +88,5 @@
 .equ CONFIG_SPATIAL_SVC ,  0
 .equ CONFIG_VP9_TEMPORAL_DENOISING ,  0
 .equ CONFIG_FP_MB_STATS ,  0
-.equ CONFIG_EMULATE_HARDWARE_HIGHBITDEPTH ,  0
+.equ CONFIG_EMULATE_HARDWARE ,  0
 	.section	.note.GNU-stack,"",%progbits
diff --git a/source/config/linux/arm64/vpx_config.h b/source/config/linux/arm64/vpx_config.h
index 4f0107f..fc49515 100644
--- a/source/config/linux/arm64/vpx_config.h
+++ b/source/config/linux/arm64/vpx_config.h
@@ -97,5 +97,5 @@
 #define CONFIG_SPATIAL_SVC 0
 #define CONFIG_VP9_TEMPORAL_DENOISING 0
 #define CONFIG_FP_MB_STATS 0
-#define CONFIG_EMULATE_HARDWARE_HIGHBITDEPTH 0
+#define CONFIG_EMULATE_HARDWARE 0
 #endif /* VPX_CONFIG_H */
diff --git a/source/config/linux/generic/vp9_rtcd.h b/source/config/linux/generic/vp9_rtcd.h
index b60c290..0483682 100644
--- a/source/config/linux/generic/vp9_rtcd.h
+++ b/source/config/linux/generic/vp9_rtcd.h
@@ -12,8 +12,8 @@
  */
 
 #include "vpx/vpx_integer.h"
+#include "vp9/common/vp9_common.h"
 #include "vp9/common/vp9_enums.h"
-#include "vp9/common/vp9_idct.h"
 
 struct macroblockd;
 
@@ -29,6 +29,9 @@
 extern "C" {
 #endif
 
+unsigned int vp9_avg_8x8_c(const uint8_t *, int p);
+#define vp9_avg_8x8 vp9_avg_8x8_c
+
 int64_t vp9_block_error_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz);
 #define vp9_block_error vp9_block_error_c
 
diff --git a/source/config/linux/generic/vpx_config.asm b/source/config/linux/generic/vpx_config.asm
index 776ff66..71aee39 100644
--- a/source/config/linux/generic/vpx_config.asm
+++ b/source/config/linux/generic/vpx_config.asm
@@ -88,5 +88,5 @@
 .equ CONFIG_SPATIAL_SVC ,  0
 .equ CONFIG_VP9_TEMPORAL_DENOISING ,  0
 .equ CONFIG_FP_MB_STATS ,  0
-.equ CONFIG_EMULATE_HARDWARE_HIGHBITDEPTH ,  0
+.equ CONFIG_EMULATE_HARDWARE ,  0
 	.section	.note.GNU-stack,"",%progbits
diff --git a/source/config/linux/generic/vpx_config.h b/source/config/linux/generic/vpx_config.h
index d02c859..a24d041 100644
--- a/source/config/linux/generic/vpx_config.h
+++ b/source/config/linux/generic/vpx_config.h
@@ -97,5 +97,5 @@
 #define CONFIG_SPATIAL_SVC 0
 #define CONFIG_VP9_TEMPORAL_DENOISING 0
 #define CONFIG_FP_MB_STATS 0
-#define CONFIG_EMULATE_HARDWARE_HIGHBITDEPTH 0
+#define CONFIG_EMULATE_HARDWARE 0
 #endif /* VPX_CONFIG_H */
diff --git a/source/config/linux/ia32/vp9_rtcd.h b/source/config/linux/ia32/vp9_rtcd.h
index 660b652..2d6c530 100644
--- a/source/config/linux/ia32/vp9_rtcd.h
+++ b/source/config/linux/ia32/vp9_rtcd.h
@@ -12,8 +12,8 @@
  */
 
 #include "vpx/vpx_integer.h"
+#include "vp9/common/vp9_common.h"
 #include "vp9/common/vp9_enums.h"
-#include "vp9/common/vp9_idct.h"
 
 struct macroblockd;
 
@@ -29,6 +29,10 @@
 extern "C" {
 #endif
 
+unsigned int vp9_avg_8x8_c(const uint8_t *, int p);
+unsigned int vp9_avg_8x8_sse2(const uint8_t *, int p);
+RTCD_EXTERN unsigned int (*vp9_avg_8x8)(const uint8_t *, int p);
+
 int64_t vp9_block_error_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz);
 int64_t vp9_block_error_sse2(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz);
 RTCD_EXTERN int64_t (*vp9_block_error)(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz);
@@ -426,7 +430,8 @@
 RTCD_EXTERN unsigned int (*vp9_mse8x8)(const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse);
 
 void vp9_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
-#define vp9_quantize_b vp9_quantize_b_c
+void vp9_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+RTCD_EXTERN void (*vp9_quantize_b)(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
 
 void vp9_quantize_b_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
 #define vp9_quantize_b_32x32 vp9_quantize_b_32x32_c
@@ -882,6 +887,8 @@
 
     (void)flags;
 
+    vp9_avg_8x8 = vp9_avg_8x8_c;
+    if (flags & HAS_SSE2) vp9_avg_8x8 = vp9_avg_8x8_sse2;
     vp9_block_error = vp9_block_error_c;
     if (flags & HAS_SSE2) vp9_block_error = vp9_block_error_sse2;
     vp9_convolve8 = vp9_convolve8_c;
@@ -1047,6 +1054,8 @@
     if (flags & HAS_SSE2) vp9_mse8x16 = vp9_mse8x16_sse2;
     vp9_mse8x8 = vp9_mse8x8_c;
     if (flags & HAS_SSE2) vp9_mse8x8 = vp9_mse8x8_sse2;
+    vp9_quantize_b = vp9_quantize_b_c;
+    if (flags & HAS_SSE2) vp9_quantize_b = vp9_quantize_b_sse2;
     vp9_sad16x16 = vp9_sad16x16_c;
     if (flags & HAS_SSE2) vp9_sad16x16 = vp9_sad16x16_sse2;
     vp9_sad16x16_avg = vp9_sad16x16_avg_c;
diff --git a/source/config/linux/ia32/vpx_config.asm b/source/config/linux/ia32/vpx_config.asm
index b47dd1f..4923184 100644
--- a/source/config/linux/ia32/vpx_config.asm
+++ b/source/config/linux/ia32/vpx_config.asm
@@ -85,4 +85,4 @@
 %define CONFIG_SPATIAL_SVC 0
 %define CONFIG_VP9_TEMPORAL_DENOISING 0
 %define CONFIG_FP_MB_STATS 0
-%define CONFIG_EMULATE_HARDWARE_HIGHBITDEPTH 0
+%define CONFIG_EMULATE_HARDWARE 0
diff --git a/source/config/linux/ia32/vpx_config.h b/source/config/linux/ia32/vpx_config.h
index 58bda73..093cfa0 100644
--- a/source/config/linux/ia32/vpx_config.h
+++ b/source/config/linux/ia32/vpx_config.h
@@ -97,5 +97,5 @@
 #define CONFIG_SPATIAL_SVC 0
 #define CONFIG_VP9_TEMPORAL_DENOISING 0
 #define CONFIG_FP_MB_STATS 0
-#define CONFIG_EMULATE_HARDWARE_HIGHBITDEPTH 0
+#define CONFIG_EMULATE_HARDWARE 0
 #endif /* VPX_CONFIG_H */
diff --git a/source/config/linux/mips64el/vp9_rtcd.h b/source/config/linux/mips64el/vp9_rtcd.h
index b60c290..0483682 100644
--- a/source/config/linux/mips64el/vp9_rtcd.h
+++ b/source/config/linux/mips64el/vp9_rtcd.h
@@ -12,8 +12,8 @@
  */
 
 #include "vpx/vpx_integer.h"
+#include "vp9/common/vp9_common.h"
 #include "vp9/common/vp9_enums.h"
-#include "vp9/common/vp9_idct.h"
 
 struct macroblockd;
 
@@ -29,6 +29,9 @@
 extern "C" {
 #endif
 
+unsigned int vp9_avg_8x8_c(const uint8_t *, int p);
+#define vp9_avg_8x8 vp9_avg_8x8_c
+
 int64_t vp9_block_error_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz);
 #define vp9_block_error vp9_block_error_c
 
diff --git a/source/config/linux/mips64el/vpx_config.h b/source/config/linux/mips64el/vpx_config.h
index e3a5448..c49041e 100644
--- a/source/config/linux/mips64el/vpx_config.h
+++ b/source/config/linux/mips64el/vpx_config.h
@@ -97,5 +97,5 @@
 #define CONFIG_SPATIAL_SVC 0
 #define CONFIG_VP9_TEMPORAL_DENOISING 0
 #define CONFIG_FP_MB_STATS 0
-#define CONFIG_EMULATE_HARDWARE_HIGHBITDEPTH 0
+#define CONFIG_EMULATE_HARDWARE 0
 #endif /* VPX_CONFIG_H */
diff --git a/source/config/linux/mipsel/vp9_rtcd.h b/source/config/linux/mipsel/vp9_rtcd.h
index b60c290..0483682 100644
--- a/source/config/linux/mipsel/vp9_rtcd.h
+++ b/source/config/linux/mipsel/vp9_rtcd.h
@@ -12,8 +12,8 @@
  */
 
 #include "vpx/vpx_integer.h"
+#include "vp9/common/vp9_common.h"
 #include "vp9/common/vp9_enums.h"
-#include "vp9/common/vp9_idct.h"
 
 struct macroblockd;
 
@@ -29,6 +29,9 @@
 extern "C" {
 #endif
 
+unsigned int vp9_avg_8x8_c(const uint8_t *, int p);
+#define vp9_avg_8x8 vp9_avg_8x8_c
+
 int64_t vp9_block_error_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz);
 #define vp9_block_error vp9_block_error_c
 
diff --git a/source/config/linux/mipsel/vpx_config.h b/source/config/linux/mipsel/vpx_config.h
index 8c0ba89..d74f9b2 100644
--- a/source/config/linux/mipsel/vpx_config.h
+++ b/source/config/linux/mipsel/vpx_config.h
@@ -97,5 +97,5 @@
 #define CONFIG_SPATIAL_SVC 0
 #define CONFIG_VP9_TEMPORAL_DENOISING 0
 #define CONFIG_FP_MB_STATS 0
-#define CONFIG_EMULATE_HARDWARE_HIGHBITDEPTH 0
+#define CONFIG_EMULATE_HARDWARE 0
 #endif /* VPX_CONFIG_H */
diff --git a/source/config/linux/x64/vp9_rtcd.h b/source/config/linux/x64/vp9_rtcd.h
index 253f565..f1417de 100644
--- a/source/config/linux/x64/vp9_rtcd.h
+++ b/source/config/linux/x64/vp9_rtcd.h
@@ -12,8 +12,8 @@
  */
 
 #include "vpx/vpx_integer.h"
+#include "vp9/common/vp9_common.h"
 #include "vp9/common/vp9_enums.h"
-#include "vp9/common/vp9_idct.h"
 
 struct macroblockd;
 
@@ -29,6 +29,10 @@
 extern "C" {
 #endif
 
+unsigned int vp9_avg_8x8_c(const uint8_t *, int p);
+unsigned int vp9_avg_8x8_sse2(const uint8_t *, int p);
+#define vp9_avg_8x8 vp9_avg_8x8_sse2
+
 int64_t vp9_block_error_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz);
 int64_t vp9_block_error_sse2(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz);
 #define vp9_block_error vp9_block_error_sse2
@@ -429,6 +433,7 @@
 #define vp9_mse8x8 vp9_mse8x8_sse2
 
 void vp9_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+void vp9_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
 void vp9_quantize_b_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
 RTCD_EXTERN void (*vp9_quantize_b)(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
 
@@ -953,7 +958,7 @@
     if (flags & HAS_SSSE3) vp9_idct8x8_12_add = vp9_idct8x8_12_add_ssse3;
     vp9_idct8x8_64_add = vp9_idct8x8_64_add_sse2;
     if (flags & HAS_SSSE3) vp9_idct8x8_64_add = vp9_idct8x8_64_add_ssse3;
-    vp9_quantize_b = vp9_quantize_b_c;
+    vp9_quantize_b = vp9_quantize_b_sse2;
     if (flags & HAS_SSSE3) vp9_quantize_b = vp9_quantize_b_ssse3;
     vp9_quantize_b_32x32 = vp9_quantize_b_32x32_c;
     if (flags & HAS_SSSE3) vp9_quantize_b_32x32 = vp9_quantize_b_32x32_ssse3;
diff --git a/source/config/linux/x64/vpx_config.asm b/source/config/linux/x64/vpx_config.asm
index e66e8b7..159294e 100644
--- a/source/config/linux/x64/vpx_config.asm
+++ b/source/config/linux/x64/vpx_config.asm
@@ -85,4 +85,4 @@
 %define CONFIG_SPATIAL_SVC 0
 %define CONFIG_VP9_TEMPORAL_DENOISING 0
 %define CONFIG_FP_MB_STATS 0
-%define CONFIG_EMULATE_HARDWARE_HIGHBITDEPTH 0
+%define CONFIG_EMULATE_HARDWARE 0
diff --git a/source/config/linux/x64/vpx_config.h b/source/config/linux/x64/vpx_config.h
index 61dd009..220ad67 100644
--- a/source/config/linux/x64/vpx_config.h
+++ b/source/config/linux/x64/vpx_config.h
@@ -97,5 +97,5 @@
 #define CONFIG_SPATIAL_SVC 0
 #define CONFIG_VP9_TEMPORAL_DENOISING 0
 #define CONFIG_FP_MB_STATS 0
-#define CONFIG_EMULATE_HARDWARE_HIGHBITDEPTH 0
+#define CONFIG_EMULATE_HARDWARE 0
 #endif /* VPX_CONFIG_H */
diff --git a/source/config/mac/ia32/vp9_rtcd.h b/source/config/mac/ia32/vp9_rtcd.h
index fa60726..35449ac 100644
--- a/source/config/mac/ia32/vp9_rtcd.h
+++ b/source/config/mac/ia32/vp9_rtcd.h
@@ -12,8 +12,8 @@
  */
 
 #include "vpx/vpx_integer.h"
+#include "vp9/common/vp9_common.h"
 #include "vp9/common/vp9_enums.h"
-#include "vp9/common/vp9_idct.h"
 
 struct macroblockd;
 
@@ -29,6 +29,10 @@
 extern "C" {
 #endif
 
+unsigned int vp9_avg_8x8_c(const uint8_t *, int p);
+unsigned int vp9_avg_8x8_sse2(const uint8_t *, int p);
+RTCD_EXTERN unsigned int (*vp9_avg_8x8)(const uint8_t *, int p);
+
 int64_t vp9_block_error_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz);
 #define vp9_block_error vp9_block_error_c
 
@@ -392,7 +396,8 @@
 #define vp9_mse8x8 vp9_mse8x8_c
 
 void vp9_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
-#define vp9_quantize_b vp9_quantize_b_c
+void vp9_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+RTCD_EXTERN void (*vp9_quantize_b)(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
 
 void vp9_quantize_b_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
 #define vp9_quantize_b_32x32 vp9_quantize_b_32x32_c
@@ -749,6 +754,8 @@
 
     (void)flags;
 
+    vp9_avg_8x8 = vp9_avg_8x8_c;
+    if (flags & HAS_SSE2) vp9_avg_8x8 = vp9_avg_8x8_sse2;
     vp9_convolve8 = vp9_convolve8_c;
     if (flags & HAS_SSE2) vp9_convolve8 = vp9_convolve8_sse2;
     if (flags & HAS_SSSE3) vp9_convolve8 = vp9_convolve8_ssse3;
@@ -846,6 +853,8 @@
     if (flags & HAS_SSE2) vp9_lpf_vertical_8 = vp9_lpf_vertical_8_sse2;
     vp9_lpf_vertical_8_dual = vp9_lpf_vertical_8_dual_c;
     if (flags & HAS_SSE2) vp9_lpf_vertical_8_dual = vp9_lpf_vertical_8_dual_sse2;
+    vp9_quantize_b = vp9_quantize_b_c;
+    if (flags & HAS_SSE2) vp9_quantize_b = vp9_quantize_b_sse2;
     vp9_sad16x16x3 = vp9_sad16x16x3_c;
     if (flags & HAS_SSE3) vp9_sad16x16x3 = vp9_sad16x16x3_sse3;
     if (flags & HAS_SSSE3) vp9_sad16x16x3 = vp9_sad16x16x3_ssse3;
diff --git a/source/config/mac/ia32/vpx_config.asm b/source/config/mac/ia32/vpx_config.asm
index 6caaebf..e65c0d9 100644
--- a/source/config/mac/ia32/vpx_config.asm
+++ b/source/config/mac/ia32/vpx_config.asm
@@ -85,4 +85,4 @@
 %define CONFIG_SPATIAL_SVC 0
 %define CONFIG_VP9_TEMPORAL_DENOISING 0
 %define CONFIG_FP_MB_STATS 0
-%define CONFIG_EMULATE_HARDWARE_HIGHBITDEPTH 0
+%define CONFIG_EMULATE_HARDWARE 0
diff --git a/source/config/mac/ia32/vpx_config.h b/source/config/mac/ia32/vpx_config.h
index 3e5d038..c7b5890 100644
--- a/source/config/mac/ia32/vpx_config.h
+++ b/source/config/mac/ia32/vpx_config.h
@@ -97,5 +97,5 @@
 #define CONFIG_SPATIAL_SVC 0
 #define CONFIG_VP9_TEMPORAL_DENOISING 0
 #define CONFIG_FP_MB_STATS 0
-#define CONFIG_EMULATE_HARDWARE_HIGHBITDEPTH 0
+#define CONFIG_EMULATE_HARDWARE 0
 #endif /* VPX_CONFIG_H */
diff --git a/source/config/mac/x64/vp9_rtcd.h b/source/config/mac/x64/vp9_rtcd.h
index 253f565..f1417de 100644
--- a/source/config/mac/x64/vp9_rtcd.h
+++ b/source/config/mac/x64/vp9_rtcd.h
@@ -12,8 +12,8 @@
  */
 
 #include "vpx/vpx_integer.h"
+#include "vp9/common/vp9_common.h"
 #include "vp9/common/vp9_enums.h"
-#include "vp9/common/vp9_idct.h"
 
 struct macroblockd;
 
@@ -29,6 +29,10 @@
 extern "C" {
 #endif
 
+unsigned int vp9_avg_8x8_c(const uint8_t *, int p);
+unsigned int vp9_avg_8x8_sse2(const uint8_t *, int p);
+#define vp9_avg_8x8 vp9_avg_8x8_sse2
+
 int64_t vp9_block_error_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz);
 int64_t vp9_block_error_sse2(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz);
 #define vp9_block_error vp9_block_error_sse2
@@ -429,6 +433,7 @@
 #define vp9_mse8x8 vp9_mse8x8_sse2
 
 void vp9_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+void vp9_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
 void vp9_quantize_b_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
 RTCD_EXTERN void (*vp9_quantize_b)(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
 
@@ -953,7 +958,7 @@
     if (flags & HAS_SSSE3) vp9_idct8x8_12_add = vp9_idct8x8_12_add_ssse3;
     vp9_idct8x8_64_add = vp9_idct8x8_64_add_sse2;
     if (flags & HAS_SSSE3) vp9_idct8x8_64_add = vp9_idct8x8_64_add_ssse3;
-    vp9_quantize_b = vp9_quantize_b_c;
+    vp9_quantize_b = vp9_quantize_b_sse2;
     if (flags & HAS_SSSE3) vp9_quantize_b = vp9_quantize_b_ssse3;
     vp9_quantize_b_32x32 = vp9_quantize_b_32x32_c;
     if (flags & HAS_SSSE3) vp9_quantize_b_32x32 = vp9_quantize_b_32x32_ssse3;
diff --git a/source/config/mac/x64/vpx_config.asm b/source/config/mac/x64/vpx_config.asm
index e66e8b7..159294e 100644
--- a/source/config/mac/x64/vpx_config.asm
+++ b/source/config/mac/x64/vpx_config.asm
@@ -85,4 +85,4 @@
 %define CONFIG_SPATIAL_SVC 0
 %define CONFIG_VP9_TEMPORAL_DENOISING 0
 %define CONFIG_FP_MB_STATS 0
-%define CONFIG_EMULATE_HARDWARE_HIGHBITDEPTH 0
+%define CONFIG_EMULATE_HARDWARE 0
diff --git a/source/config/mac/x64/vpx_config.h b/source/config/mac/x64/vpx_config.h
index 61dd009..220ad67 100644
--- a/source/config/mac/x64/vpx_config.h
+++ b/source/config/mac/x64/vpx_config.h
@@ -97,5 +97,5 @@
 #define CONFIG_SPATIAL_SVC 0
 #define CONFIG_VP9_TEMPORAL_DENOISING 0
 #define CONFIG_FP_MB_STATS 0
-#define CONFIG_EMULATE_HARDWARE_HIGHBITDEPTH 0
+#define CONFIG_EMULATE_HARDWARE 0
 #endif /* VPX_CONFIG_H */
diff --git a/source/config/nacl/vp9_rtcd.h b/source/config/nacl/vp9_rtcd.h
index b60c290..0483682 100644
--- a/source/config/nacl/vp9_rtcd.h
+++ b/source/config/nacl/vp9_rtcd.h
@@ -12,8 +12,8 @@
  */
 
 #include "vpx/vpx_integer.h"
+#include "vp9/common/vp9_common.h"
 #include "vp9/common/vp9_enums.h"
-#include "vp9/common/vp9_idct.h"
 
 struct macroblockd;
 
@@ -29,6 +29,9 @@
 extern "C" {
 #endif
 
+unsigned int vp9_avg_8x8_c(const uint8_t *, int p);
+#define vp9_avg_8x8 vp9_avg_8x8_c
+
 int64_t vp9_block_error_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz);
 #define vp9_block_error vp9_block_error_c
 
diff --git a/source/config/nacl/vpx_config.asm b/source/config/nacl/vpx_config.asm
index 776ff66..71aee39 100644
--- a/source/config/nacl/vpx_config.asm
+++ b/source/config/nacl/vpx_config.asm
@@ -88,5 +88,5 @@
 .equ CONFIG_SPATIAL_SVC ,  0
 .equ CONFIG_VP9_TEMPORAL_DENOISING ,  0
 .equ CONFIG_FP_MB_STATS ,  0
-.equ CONFIG_EMULATE_HARDWARE_HIGHBITDEPTH ,  0
+.equ CONFIG_EMULATE_HARDWARE ,  0
 	.section	.note.GNU-stack,"",%progbits
diff --git a/source/config/nacl/vpx_config.h b/source/config/nacl/vpx_config.h
index d02c859..a24d041 100644
--- a/source/config/nacl/vpx_config.h
+++ b/source/config/nacl/vpx_config.h
@@ -97,5 +97,5 @@
 #define CONFIG_SPATIAL_SVC 0
 #define CONFIG_VP9_TEMPORAL_DENOISING 0
 #define CONFIG_FP_MB_STATS 0
-#define CONFIG_EMULATE_HARDWARE_HIGHBITDEPTH 0
+#define CONFIG_EMULATE_HARDWARE 0
 #endif /* VPX_CONFIG_H */
diff --git a/source/config/win/ia32/vp9_rtcd.h b/source/config/win/ia32/vp9_rtcd.h
index 660b652..2d6c530 100644
--- a/source/config/win/ia32/vp9_rtcd.h
+++ b/source/config/win/ia32/vp9_rtcd.h
@@ -12,8 +12,8 @@
  */
 
 #include "vpx/vpx_integer.h"
+#include "vp9/common/vp9_common.h"
 #include "vp9/common/vp9_enums.h"
-#include "vp9/common/vp9_idct.h"
 
 struct macroblockd;
 
@@ -29,6 +29,10 @@
 extern "C" {
 #endif
 
+unsigned int vp9_avg_8x8_c(const uint8_t *, int p);
+unsigned int vp9_avg_8x8_sse2(const uint8_t *, int p);
+RTCD_EXTERN unsigned int (*vp9_avg_8x8)(const uint8_t *, int p);
+
 int64_t vp9_block_error_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz);
 int64_t vp9_block_error_sse2(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz);
 RTCD_EXTERN int64_t (*vp9_block_error)(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz);
@@ -426,7 +430,8 @@
 RTCD_EXTERN unsigned int (*vp9_mse8x8)(const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse);
 
 void vp9_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
-#define vp9_quantize_b vp9_quantize_b_c
+void vp9_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+RTCD_EXTERN void (*vp9_quantize_b)(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
 
 void vp9_quantize_b_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
 #define vp9_quantize_b_32x32 vp9_quantize_b_32x32_c
@@ -882,6 +887,8 @@
 
     (void)flags;
 
+    vp9_avg_8x8 = vp9_avg_8x8_c;
+    if (flags & HAS_SSE2) vp9_avg_8x8 = vp9_avg_8x8_sse2;
     vp9_block_error = vp9_block_error_c;
     if (flags & HAS_SSE2) vp9_block_error = vp9_block_error_sse2;
     vp9_convolve8 = vp9_convolve8_c;
@@ -1047,6 +1054,8 @@
     if (flags & HAS_SSE2) vp9_mse8x16 = vp9_mse8x16_sse2;
     vp9_mse8x8 = vp9_mse8x8_c;
     if (flags & HAS_SSE2) vp9_mse8x8 = vp9_mse8x8_sse2;
+    vp9_quantize_b = vp9_quantize_b_c;
+    if (flags & HAS_SSE2) vp9_quantize_b = vp9_quantize_b_sse2;
     vp9_sad16x16 = vp9_sad16x16_c;
     if (flags & HAS_SSE2) vp9_sad16x16 = vp9_sad16x16_sse2;
     vp9_sad16x16_avg = vp9_sad16x16_avg_c;
diff --git a/source/config/win/ia32/vpx_config.asm b/source/config/win/ia32/vpx_config.asm
index a223733..da535f4 100644
--- a/source/config/win/ia32/vpx_config.asm
+++ b/source/config/win/ia32/vpx_config.asm
@@ -85,4 +85,4 @@
 %define CONFIG_SPATIAL_SVC 0
 %define CONFIG_VP9_TEMPORAL_DENOISING 0
 %define CONFIG_FP_MB_STATS 0
-%define CONFIG_EMULATE_HARDWARE_HIGHBITDEPTH 0
+%define CONFIG_EMULATE_HARDWARE 0
diff --git a/source/config/win/ia32/vpx_config.h b/source/config/win/ia32/vpx_config.h
index d579e01..ddd1db5 100644
--- a/source/config/win/ia32/vpx_config.h
+++ b/source/config/win/ia32/vpx_config.h
@@ -97,5 +97,5 @@
 #define CONFIG_SPATIAL_SVC 0
 #define CONFIG_VP9_TEMPORAL_DENOISING 0
 #define CONFIG_FP_MB_STATS 0
-#define CONFIG_EMULATE_HARDWARE_HIGHBITDEPTH 0
+#define CONFIG_EMULATE_HARDWARE 0
 #endif /* VPX_CONFIG_H */
diff --git a/source/config/win/x64/vp9_rtcd.h b/source/config/win/x64/vp9_rtcd.h
index 253f565..f1417de 100644
--- a/source/config/win/x64/vp9_rtcd.h
+++ b/source/config/win/x64/vp9_rtcd.h
@@ -12,8 +12,8 @@
  */
 
 #include "vpx/vpx_integer.h"
+#include "vp9/common/vp9_common.h"
 #include "vp9/common/vp9_enums.h"
-#include "vp9/common/vp9_idct.h"
 
 struct macroblockd;
 
@@ -29,6 +29,10 @@
 extern "C" {
 #endif
 
+unsigned int vp9_avg_8x8_c(const uint8_t *, int p);
+unsigned int vp9_avg_8x8_sse2(const uint8_t *, int p);
+#define vp9_avg_8x8 vp9_avg_8x8_sse2
+
 int64_t vp9_block_error_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz);
 int64_t vp9_block_error_sse2(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz);
 #define vp9_block_error vp9_block_error_sse2
@@ -429,6 +433,7 @@
 #define vp9_mse8x8 vp9_mse8x8_sse2
 
 void vp9_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+void vp9_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
 void vp9_quantize_b_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
 RTCD_EXTERN void (*vp9_quantize_b)(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
 
@@ -953,7 +958,7 @@
     if (flags & HAS_SSSE3) vp9_idct8x8_12_add = vp9_idct8x8_12_add_ssse3;
     vp9_idct8x8_64_add = vp9_idct8x8_64_add_sse2;
     if (flags & HAS_SSSE3) vp9_idct8x8_64_add = vp9_idct8x8_64_add_ssse3;
-    vp9_quantize_b = vp9_quantize_b_c;
+    vp9_quantize_b = vp9_quantize_b_sse2;
     if (flags & HAS_SSSE3) vp9_quantize_b = vp9_quantize_b_ssse3;
     vp9_quantize_b_32x32 = vp9_quantize_b_32x32_c;
     if (flags & HAS_SSSE3) vp9_quantize_b_32x32 = vp9_quantize_b_32x32_ssse3;
diff --git a/source/config/win/x64/vpx_config.asm b/source/config/win/x64/vpx_config.asm
index 49033d7..93f24d1 100644
--- a/source/config/win/x64/vpx_config.asm
+++ b/source/config/win/x64/vpx_config.asm
@@ -85,4 +85,4 @@
 %define CONFIG_SPATIAL_SVC 0
 %define CONFIG_VP9_TEMPORAL_DENOISING 0
 %define CONFIG_FP_MB_STATS 0
-%define CONFIG_EMULATE_HARDWARE_HIGHBITDEPTH 0
+%define CONFIG_EMULATE_HARDWARE 0
diff --git a/source/config/win/x64/vpx_config.h b/source/config/win/x64/vpx_config.h
index 936eb36..2d02d26 100644
--- a/source/config/win/x64/vpx_config.h
+++ b/source/config/win/x64/vpx_config.h
@@ -97,5 +97,5 @@
 #define CONFIG_SPATIAL_SVC 0
 #define CONFIG_VP9_TEMPORAL_DENOISING 0
 #define CONFIG_FP_MB_STATS 0
-#define CONFIG_EMULATE_HARDWARE_HIGHBITDEPTH 0
+#define CONFIG_EMULATE_HARDWARE 0
 #endif /* VPX_CONFIG_H */
diff --git a/source/libvpx/args.h b/source/libvpx/args.h
index 04e0acd..1f37151 100644
--- a/source/libvpx/args.h
+++ b/source/libvpx/args.h
@@ -51,6 +51,7 @@
 unsigned int arg_parse_uint(const struct arg *arg);
 int arg_parse_int(const struct arg *arg);
 struct vpx_rational arg_parse_rational(const struct arg *arg);
+int arg_parse_enum(const struct arg *arg);
 int arg_parse_enum_or_int(const struct arg *arg);
 #ifdef __cplusplus
 }  // extern "C"
diff --git a/source/libvpx/build/make/configure.sh b/source/libvpx/build/make/configure.sh
index ab6687f..56e9f44 100755
--- a/source/libvpx/build/make/configure.sh
+++ b/source/libvpx/build/make/configure.sh
@@ -736,6 +736,9 @@
     # PIC is probably what we want when building shared libs
     enabled shared && soft_enable pic
 
+    # Minimum iOS version for all target platforms (darwin and iphonesimulator).
+    IOS_VERSION_MIN="6.0"
+
     # Handle darwin variants. Newer SDKs allow targeting older
     # platforms, so find the newest SDK available.
     case ${toolchain} in
@@ -788,8 +791,8 @@
             add_ldflags "-mmacosx-version-min=10.9"
             ;;
         *-iphonesimulator-*)
-            add_cflags  "-miphoneos-version-min=5.0"
-            add_ldflags "-miphoneos-version-min=5.0"
+            add_cflags  "-miphoneos-version-min=${IOS_VERSION_MIN}"
+            add_ldflags "-miphoneos-version-min=${IOS_VERSION_MIN}"
             osx_sdk_dir="$(xcrun --sdk iphonesimulator --show-sdk-path)"
             add_cflags  "-isysroot ${osx_sdk_dir}"
             add_ldflags "-isysroot ${osx_sdk_dir}"
@@ -970,18 +973,28 @@
           ;;
 
         darwin*)
-
             XCRUN_FIND="xcrun --sdk iphoneos -find"
             CXX="$(${XCRUN_FIND} clang++)"
             CC="$(${XCRUN_FIND} clang)"
             AR="$(${XCRUN_FIND} ar)"
-            LD="$(${XCRUN_FIND} ld)"
             AS="$(${XCRUN_FIND} as)"
             STRIP="$(${XCRUN_FIND} strip)"
             NM="$(${XCRUN_FIND} nm)"
             RANLIB="$(${XCRUN_FIND} ranlib)"
             AS_SFX=.s
 
+            # Special handling of ld for armv6 because libclang_rt.ios.a does
+            # not contain armv6 support in Apple's clang package:
+            #   Apple LLVM version 5.1 (clang-503.0.40) (based on LLVM 3.4svn).
+            # TODO(tomfinegan): Remove this. Our minimum iOS version (6.0)
+            # renders support for armv6 unnecessary because the 3GS and up
+            # support neon.
+            if [ "${tgt_isa}" = "armv6" ]; then
+                LD="$(${XCRUN_FIND} ld)"
+            else
+                LD="${CXX:-$(${XCRUN_FIND} ld)}"
+            fi
+
             # ASFLAGS is written here instead of using check_add_asflags
             # because we need to overwrite all of ASFLAGS and purge the
             # options that were put in above
@@ -989,7 +1002,13 @@
 
             alt_libc="$(xcrun --sdk iphoneos --show-sdk-path)"
             add_cflags -arch ${tgt_isa} -isysroot ${alt_libc}
-            add_ldflags -arch ${tgt_isa} -ios_version_min 7.0
+            add_ldflags -arch ${tgt_isa}
+
+            if [ "${LD}" = "${CXX}" ]; then
+                add_ldflags -miphoneos-version-min="${IOS_VERSION_MIN}"
+            else
+                add_ldflags -ios_version_min "${IOS_VERSION_MIN}"
+            fi
 
             for d in lib usr/lib usr/lib/system; do
                 try_dir="${alt_libc}/${d}"
diff --git a/source/libvpx/configure b/source/libvpx/configure
index 7b9c211..3ed976c 100755
--- a/source/libvpx/configure
+++ b/source/libvpx/configure
@@ -281,7 +281,7 @@
     spatial_svc
     vp9_temporal_denoising
     fp_mb_stats
-    emulate_hardware_highbitdepth
+    emulate_hardware
 "
 CONFIG_LIST="
     external_build
diff --git a/source/libvpx/examples/simple_decoder.c b/source/libvpx/examples/simple_decoder.c
index 3f7d6aa..c58b014 100644
--- a/source/libvpx/examples/simple_decoder.c
+++ b/source/libvpx/examples/simple_decoder.c
@@ -33,24 +33,25 @@
 //
 // Initializing The Codec
 // ----------------------
-// The decoder is initialized by the following code. This is an example for
-// the VP8 decoder, but the code is analogous for all algorithms. Replace
-// `vpx_codec_vp8_dx()` with a pointer to the interface exposed by the
-// algorithm you want to use. The `cfg` argument is left as NULL in this
-// example, because we want the algorithm to determine the stream
-// configuration (width/height) and allocate memory automatically. This
-// parameter is generally only used if you need to preallocate memory,
-// particularly in External Memory Allocation mode.
+// The libvpx decoder is initialized by the call to vpx_codec_dec_init().
+// Determining the codec interface to use is handled by VpxVideoReader and the
+// functions prefixed with vpx_video_reader_. Discussion of those functions is
+// beyond the scope of this example, but the main gist is to open the input file
+// and parse just enough of it to determine if it's a VPx file and which VPx
+// codec is contained within the file.
+// Note the NULL pointer passed to vpx_codec_dec_init(). We do that in this
+// example because we want the algorithm to determine the stream configuration
+// (width/height) and allocate memory automatically.
 //
 // Decoding A Frame
 // ----------------
 // Once the frame has been read into memory, it is decoded using the
 // `vpx_codec_decode` function. The call takes a pointer to the data
-// (`frame`) and the length of the data (`frame_sz`). No application data
+// (`frame`) and the length of the data (`frame_size`). No application data
 // is associated with the frame in this example, so the `user_priv`
 // parameter is NULL. The `deadline` parameter is left at zero for this
-// example. This parameter is generally only used when doing adaptive
-// postprocessing.
+// example. This parameter is generally only used when doing adaptive post
+// processing.
 //
 // Codecs may produce a variable number of output frames for every call to
 // `vpx_codec_decode`. These frames are retrieved by the
@@ -72,14 +73,13 @@
 // --------------
 // This example does not special case any error return codes. If there was
 // an error, a descriptive message is printed and the program exits. With
-// few exeptions, vpx_codec functions return an enumerated error status,
+// few exceptions, vpx_codec functions return an enumerated error status,
 // with the value `0` indicating success.
 
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 
-#include "vpx/vp8dx.h"
 #include "vpx/vpx_decoder.h"
 
 #include "./tools_common.h"
diff --git a/source/libvpx/examples/vp8cx_set_ref.c b/source/libvpx/examples/vp8cx_set_ref.c
index b0961a2..5e29d80 100644
--- a/source/libvpx/examples/vp8cx_set_ref.c
+++ b/source/libvpx/examples/vp8cx_set_ref.c
@@ -178,7 +178,7 @@
   }
 
   // Flush encoder.
-  while (encode_frame(&codec, NULL, -1, writer)) {};
+  while (encode_frame(&codec, NULL, -1, writer)) {}
 
   printf("\n");
   fclose(infile);
diff --git a/source/libvpx/examples/vp9_spatial_svc_encoder.c b/source/libvpx/examples/vp9_spatial_svc_encoder.c
index 9cd716b..53ede94 100644
--- a/source/libvpx/examples/vp9_spatial_svc_encoder.c
+++ b/source/libvpx/examples/vp9_spatial_svc_encoder.c
@@ -61,12 +61,30 @@
 static const arg_def_t max_bitrate_arg =
     ARG_DEF(NULL, "max-bitrate", 1, "Maximum bitrate");
 
+#if CONFIG_VP9_HIGHBITDEPTH
+static const struct arg_enum_list bitdepth_enum[] = {
+  {"8",  VPX_BITS_8},
+  {"10", VPX_BITS_10},
+  {"12", VPX_BITS_12},
+  {NULL, 0}
+};
+
+static const arg_def_t bitdepth_arg =
+    ARG_DEF_ENUM("d", "bit-depth", 1, "Bit depth for codec 8, 10 or 12. ",
+                 bitdepth_enum);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
 static const arg_def_t *svc_args[] = {
   &frames_arg,        &width_arg,         &height_arg,
   &timebase_arg,      &bitrate_arg,       &skip_frames_arg, &spatial_layers_arg,
   &kf_dist_arg,       &scale_factors_arg, &passes_arg,      &pass_arg,
   &fpf_name_arg,      &min_q_arg,         &max_q_arg,       &min_bitrate_arg,
-  &max_bitrate_arg,   &temporal_layers_arg,                 NULL
+  &max_bitrate_arg,   &temporal_layers_arg,
+#if CONFIG_VP9_HIGHBITDEPTH
+  &bitdepth_arg,
+#endif
+  NULL
 };
 
 static const uint32_t default_frames_to_skip = 0;
@@ -165,7 +183,7 @@
       enc_cfg->kf_min_dist = arg_parse_uint(&arg);
       enc_cfg->kf_max_dist = enc_cfg->kf_min_dist;
     } else if (arg_match(&arg, &scale_factors_arg, argi)) {
-      snprintf(string_options, 1024, "%s scale-factors=%s",
+      snprintf(string_options, sizeof(string_options), "%s scale-factors=%s",
                string_options, arg.val);
     } else if (arg_match(&arg, &passes_arg, argi)) {
       passes = arg_parse_uint(&arg);
@@ -180,15 +198,36 @@
     } else if (arg_match(&arg, &fpf_name_arg, argi)) {
       fpf_file_name = arg.val;
     } else if (arg_match(&arg, &min_q_arg, argi)) {
-      snprintf(string_options, 1024, "%s min-quantizers=%s",
+      snprintf(string_options, sizeof(string_options), "%s min-quantizers=%s",
                string_options, arg.val);
     } else if (arg_match(&arg, &max_q_arg, argi)) {
-      snprintf(string_options, 1024, "%s max-quantizers=%s",
+      snprintf(string_options, sizeof(string_options), "%s max-quantizers=%s",
                string_options, arg.val);
     } else if (arg_match(&arg, &min_bitrate_arg, argi)) {
       min_bitrate = arg_parse_uint(&arg);
     } else if (arg_match(&arg, &max_bitrate_arg, argi)) {
       max_bitrate = arg_parse_uint(&arg);
+#if CONFIG_VP9_HIGHBITDEPTH
+    } else if (arg_match(&arg, &bitdepth_arg, argi)) {
+      enc_cfg->g_bit_depth = arg_parse_enum_or_int(&arg);
+      switch (enc_cfg->g_bit_depth) {
+        case VPX_BITS_8:
+          enc_cfg->g_input_bit_depth = 8;
+          enc_cfg->g_profile = 0;
+          break;
+        case VPX_BITS_10:
+          enc_cfg->g_input_bit_depth = 10;
+          enc_cfg->g_profile = 2;
+          break;
+        case VPX_BITS_12:
+          enc_cfg->g_input_bit_depth = 12;
+          enc_cfg->g_profile = 2;
+          break;
+        default:
+          die("Error: Invalid bit depth selected (%d)\n", enc_cfg->g_bit_depth);
+          break;
+      }
+#endif  // CONFIG_VP9_HIGHBITDEPTH
     } else {
       ++argj;
     }
@@ -291,8 +330,17 @@
   parse_command_line(argc, argv, &app_input, &svc_ctx, &enc_cfg);
 
   // Allocate image buffer
-  if (!vpx_img_alloc(&raw, VPX_IMG_FMT_I420, enc_cfg.g_w, enc_cfg.g_h, 32))
+#if CONFIG_VP9_HIGHBITDEPTH
+  if (!vpx_img_alloc(&raw, enc_cfg.g_input_bit_depth == 8 ?
+                         VPX_IMG_FMT_I420 : VPX_IMG_FMT_I42016,
+                     enc_cfg.g_w, enc_cfg.g_h, 32)) {
     die("Failed to allocate image %dx%d\n", enc_cfg.g_w, enc_cfg.g_h);
+  }
+#else
+  if (!vpx_img_alloc(&raw, VPX_IMG_FMT_I420, enc_cfg.g_w, enc_cfg.g_h, 32)) {
+    die("Failed to allocate image %dx%d\n", enc_cfg.g_w, enc_cfg.g_h);
+  }
+#endif  // CONFIG_VP9_HIGHBITDEPTH
 
   if (!(infile = fopen(app_input.input_filename, "rb")))
     die("Failed to open %s for reading\n", app_input.input_filename);
diff --git a/source/libvpx/examples/vpx_temporal_svc_encoder.c b/source/libvpx/examples/vpx_temporal_svc_encoder.c
index 1674804..ecae2fe 100644
--- a/source/libvpx/examples/vpx_temporal_svc_encoder.c
+++ b/source/libvpx/examples/vpx_temporal_svc_encoder.c
@@ -461,13 +461,27 @@
   FILE *infile = NULL;
   struct RateControlMetrics rc;
   int64_t cx_time = 0;
+  const int min_args_base = 11;
+#if CONFIG_VP9_HIGHBITDEPTH
+  vpx_bit_depth_t bit_depth = VPX_BITS_8;
+  int input_bit_depth = 8;
+  const int min_args = min_args_base + 1;
+#else
+  const int min_args = min_args_base;
+#endif  // CONFIG_VP9_HIGHBITDEPTH
 
   exec_name = argv[0];
   // Check usage and arguments.
-  if (argc < 11) {
+  if (argc < min_args) {
+#if CONFIG_VP9_HIGHBITDEPTH
+    die("Usage: %s <infile> <outfile> <codec_type(vp8/vp9)> <width> <height> "
+        "<rate_num> <rate_den> <speed> <frame_drop_threshold> <mode> "
+        "<Rate_0> ... <Rate_nlayers-1> <bit-depth> \n", argv[0]);
+#else
     die("Usage: %s <infile> <outfile> <codec_type(vp8/vp9)> <width> <height> "
         "<rate_num> <rate_den> <speed> <frame_drop_threshold> <mode> "
         "<Rate_0> ... <Rate_nlayers-1> \n", argv[0]);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
   }
 
   encoder = get_vpx_encoder_by_name(argv[3]);
@@ -487,13 +501,38 @@
     die("Invalid layering mode (0..12) %s", argv[10]);
   }
 
-  if (argc != 11 + mode_to_num_layers[layering_mode]) {
+  if (argc != min_args + mode_to_num_layers[layering_mode]) {
     die("Invalid number of arguments");
   }
 
+#if CONFIG_VP9_HIGHBITDEPTH
+  switch (strtol(argv[argc-1], NULL, 0)) {
+    case 8:
+      bit_depth = VPX_BITS_8;
+      input_bit_depth = 8;
+      break;
+    case 10:
+      bit_depth = VPX_BITS_10;
+      input_bit_depth = 10;
+      break;
+    case 12:
+      bit_depth = VPX_BITS_12;
+      input_bit_depth = 12;
+      break;
+    default:
+      die("Invalid bit depth (8, 10, 12) %s", argv[argc-1]);
+  }
+  if (!vpx_img_alloc(&raw,
+                     bit_depth == VPX_BITS_8 ? VPX_IMG_FMT_I420 :
+                                               VPX_IMG_FMT_I42016,
+                     width, height, 32)) {
+    die("Failed to allocate image %dx%d", width, height);
+  }
+#else
   if (!vpx_img_alloc(&raw, VPX_IMG_FMT_I420, width, height, 32)) {
     die("Failed to allocate image", width, height);
   }
+#endif  // CONFIG_VP9_HIGHBITDEPTH
 
   // Populate encoder configuration.
   res = vpx_codec_enc_config_default(encoder->codec_interface(), &cfg, 0);
@@ -506,6 +545,14 @@
   cfg.g_w = width;
   cfg.g_h = height;
 
+#if CONFIG_VP9_HIGHBITDEPTH
+  if (bit_depth != VPX_BITS_8) {
+    cfg.g_bit_depth = bit_depth;
+    cfg.g_input_bit_depth = input_bit_depth;
+    cfg.g_profile = 2;
+  }
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
   // Timebase format e.g. 30fps: numerator=1, denominator = 30.
   cfg.g_timebase.num = strtol(argv[6], NULL, 0);
   cfg.g_timebase.den = strtol(argv[7], NULL, 0);
@@ -515,7 +562,9 @@
     die("Invalid speed setting: must be positive");
   }
 
-  for (i = 11; (int)i < 11 + mode_to_num_layers[layering_mode]; ++i) {
+  for (i = min_args_base;
+       (int)i < min_args_base + mode_to_num_layers[layering_mode];
+       ++i) {
     cfg.ts_target_bitrate[i - min_args_base] = strtol(argv[i], NULL, 0);
   }
 
@@ -576,7 +625,13 @@
   cfg.ss_number_layers = 1;
 
   // Initialize codec.
+#if CONFIG_VP9_HIGHBITDEPTH
+  if (vpx_codec_enc_init(
+          &codec, encoder->codec_interface(), &cfg,
+          bit_depth == VPX_BITS_8 ? 0 : VPX_CODEC_USE_HIGHBITDEPTH))
+#else
   if (vpx_codec_enc_init(&codec, encoder->codec_interface(), &cfg, 0))
+#endif  // CONFIG_VP9_HIGHBITDEPTH
     die_codec(&codec, "Failed to initialize encoder");
 
   if (strncmp(encoder->name, "vp8", 3) == 0) {
diff --git a/source/libvpx/test/convolve_test.cc b/source/libvpx/test/convolve_test.cc
index de947aa..470c436 100644
--- a/source/libvpx/test/convolve_test.cc
+++ b/source/libvpx/test/convolve_test.cc
@@ -36,7 +36,7 @@
                     ConvolveFunc hv8, ConvolveFunc hv8_avg,
                     int bd)
       : h8_(h8), v8_(v8), hv8_(hv8), h8_avg_(h8_avg), v8_avg_(v8_avg),
-        hv8_avg_(hv8_avg), use_high_bd_(bd) {}
+        hv8_avg_(hv8_avg), use_highbd_(bd) {}
 
   ConvolveFunc h8_;
   ConvolveFunc v8_;
@@ -44,7 +44,7 @@
   ConvolveFunc h8_avg_;
   ConvolveFunc v8_avg_;
   ConvolveFunc hv8_avg_;
-  int use_high_bd_;  // 0 if high bitdepth not used, else the actual bit depth.
+  int use_highbd_;  // 0 if high bitdepth not used, else the actual bit depth.
 };
 
 typedef std::tr1::tuple<int, int, const ConvolveFunctions *> ConvolveParam;
@@ -170,15 +170,15 @@
 }
 
 #if CONFIG_VP9_HIGHBITDEPTH
-void high_filter_block2d_8_c(const uint16_t *src_ptr,
-                             const unsigned int src_stride,
-                             const int16_t *HFilter,
-                             const int16_t *VFilter,
-                             uint16_t *dst_ptr,
-                             unsigned int dst_stride,
-                             unsigned int output_width,
-                             unsigned int output_height,
-                             int bd) {
+void highbd_filter_block2d_8_c(const uint16_t *src_ptr,
+                               const unsigned int src_stride,
+                               const int16_t *HFilter,
+                               const int16_t *VFilter,
+                               uint16_t *dst_ptr,
+                               unsigned int dst_stride,
+                               unsigned int output_width,
+                               unsigned int output_height,
+                               int bd) {
   // Between passes, we use an intermediate buffer whose height is extended to
   // have enough horizontally filtered values as input for the vertical pass.
   // This buffer is allocated to be big enough for the largest block type we
@@ -217,7 +217,7 @@
                          (VP9_FILTER_WEIGHT >> 1);  // Rounding
 
         // Normalize back to 0-255...
-        *output_ptr = clip_pixel_high(temp >> VP9_FILTER_SHIFT, bd);
+        *output_ptr = clip_pixel_highbd(temp >> VP9_FILTER_SHIFT, bd);
         ++src_ptr;
         output_ptr += intermediate_height;
       }
@@ -245,7 +245,7 @@
                          (VP9_FILTER_WEIGHT >> 1);  // Rounding
 
         // Normalize back to 0-255...
-        *dst_ptr++ = clip_pixel_high(temp >> VP9_FILTER_SHIFT, bd);
+        *dst_ptr++ = clip_pixel_highbd(temp >> VP9_FILTER_SHIFT, bd);
         src_ptr += intermediate_height;
       }
       src_ptr += intermediate_next_stride;
@@ -254,13 +254,13 @@
   }
 }
 
-void high_block2d_average_c(uint16_t *src,
-                            unsigned int src_stride,
-                            uint16_t *output_ptr,
-                            unsigned int output_stride,
-                            unsigned int output_width,
-                            unsigned int output_height,
-                            int bd) {
+void highbd_block2d_average_c(uint16_t *src,
+                              unsigned int src_stride,
+                              uint16_t *output_ptr,
+                              unsigned int output_stride,
+                              unsigned int output_width,
+                              unsigned int output_height,
+                              int bd) {
   unsigned int i, j;
   for (i = 0; i < output_height; ++i) {
     for (j = 0; j < output_width; ++j) {
@@ -270,23 +270,23 @@
   }
 }
 
-void high_filter_average_block2d_8_c(const uint16_t *src_ptr,
-                                     const unsigned int src_stride,
-                                     const int16_t *HFilter,
-                                     const int16_t *VFilter,
-                                     uint16_t *dst_ptr,
-                                     unsigned int dst_stride,
-                                     unsigned int output_width,
-                                     unsigned int output_height,
-                                     int bd) {
+void highbd_filter_average_block2d_8_c(const uint16_t *src_ptr,
+                                       const unsigned int src_stride,
+                                       const int16_t *HFilter,
+                                       const int16_t *VFilter,
+                                       uint16_t *dst_ptr,
+                                       unsigned int dst_stride,
+                                       unsigned int output_width,
+                                       unsigned int output_height,
+                                       int bd) {
   uint16_t tmp[kMaxDimension * kMaxDimension];
 
   assert(output_width <= kMaxDimension);
   assert(output_height <= kMaxDimension);
-  high_filter_block2d_8_c(src_ptr, src_stride, HFilter, VFilter, tmp, 64,
-                          output_width, output_height, bd);
-  high_block2d_average_c(tmp, 64, dst_ptr, dst_stride,
-                         output_width, output_height, bd);
+  highbd_filter_block2d_8_c(src_ptr, src_stride, HFilter, VFilter, tmp, 64,
+                            output_width, output_height, bd);
+  highbd_block2d_average_c(tmp, 64, dst_ptr, dst_stride,
+                           output_width, output_height, bd);
 }
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 
@@ -346,8 +346,8 @@
   virtual void SetUp() {
     UUT_ = GET_PARAM(2);
 #if CONFIG_VP9_HIGHBITDEPTH
-    if (UUT_->use_high_bd_ != 0)
-      mask_ = (1 << UUT_->use_high_bd_) - 1;
+    if (UUT_->use_highbd_ != 0)
+      mask_ = (1 << UUT_->use_highbd_) - 1;
     else
       mask_ = 255;
 #endif
@@ -391,7 +391,7 @@
 
   uint8_t *input() const {
 #if CONFIG_VP9_HIGHBITDEPTH
-    if (UUT_->use_high_bd_ == 0) {
+    if (UUT_->use_highbd_ == 0) {
       return input_ + BorderTop() * kOuterBlockSize + BorderLeft();
     } else {
       return CONVERT_TO_BYTEPTR(input16_ + BorderTop() * kOuterBlockSize +
@@ -404,7 +404,7 @@
 
   uint8_t *output() const {
 #if CONFIG_VP9_HIGHBITDEPTH
-    if (UUT_->use_high_bd_ == 0) {
+    if (UUT_->use_highbd_ == 0) {
       return output_ + BorderTop() * kOuterBlockSize + BorderLeft();
     } else {
       return CONVERT_TO_BYTEPTR(output16_ + BorderTop() * kOuterBlockSize +
@@ -417,7 +417,7 @@
 
   uint16_t lookup(uint8_t *list, int index) const {
 #if CONFIG_VP9_HIGHBITDEPTH
-    if (UUT_->use_high_bd_ == 0) {
+    if (UUT_->use_highbd_ == 0) {
       return list[index];
     } else {
       return CONVERT_TO_SHORTPTR(list)[index];
@@ -429,7 +429,7 @@
 
   void assign_val(uint8_t *list, int index, uint16_t val) const {
 #if CONFIG_VP9_HIGHBITDEPTH
-    if (UUT_->use_high_bd_ == 0) {
+    if (UUT_->use_highbd_ == 0) {
       list[index] = (uint8_t) val;
     } else {
       CONVERT_TO_SHORTPTR(list)[index] = val;
@@ -448,16 +448,16 @@
                                           unsigned int output_width,
                                           unsigned int output_height) {
 #if CONFIG_VP9_HIGHBITDEPTH
-    if (UUT_->use_high_bd_ == 0) {
+    if (UUT_->use_highbd_ == 0) {
       filter_average_block2d_8_c(src_ptr, src_stride, HFilter, VFilter,
                                  dst_ptr, dst_stride, output_width,
                                  output_height);
     } else {
-      high_filter_average_block2d_8_c(CONVERT_TO_SHORTPTR(src_ptr), src_stride,
-                                      HFilter, VFilter,
-                                      CONVERT_TO_SHORTPTR(dst_ptr), dst_stride,
-                                      output_width, output_height,
-                                      UUT_->use_high_bd_);
+      highbd_filter_average_block2d_8_c(CONVERT_TO_SHORTPTR(src_ptr),
+                                        src_stride, HFilter, VFilter,
+                                        CONVERT_TO_SHORTPTR(dst_ptr),
+                                        dst_stride, output_width, output_height,
+                                        UUT_->use_highbd_);
     }
 #else
     filter_average_block2d_8_c(src_ptr, src_stride, HFilter, VFilter,
@@ -475,14 +475,14 @@
                                   unsigned int output_width,
                                   unsigned int output_height) {
 #if CONFIG_VP9_HIGHBITDEPTH
-    if (UUT_->use_high_bd_ == 0) {
+    if (UUT_->use_highbd_ == 0) {
       filter_block2d_8_c(src_ptr, src_stride, HFilter, VFilter,
                          dst_ptr, dst_stride, output_width, output_height);
     } else {
-      high_filter_block2d_8_c(CONVERT_TO_SHORTPTR(src_ptr), src_stride,
-                              HFilter, VFilter,
-                              CONVERT_TO_SHORTPTR(dst_ptr), dst_stride,
-                              output_width, output_height, UUT_->use_high_bd_);
+      highbd_filter_block2d_8_c(CONVERT_TO_SHORTPTR(src_ptr), src_stride,
+                                HFilter, VFilter,
+                                CONVERT_TO_SHORTPTR(dst_ptr), dst_stride,
+                                output_width, output_height, UUT_->use_highbd_);
     }
 #else
     filter_block2d_8_c(src_ptr, src_stride, HFilter, VFilter,
@@ -598,7 +598,7 @@
   uint8_t ref8[kOutputStride * kMaxDimension];
   uint16_t ref16[kOutputStride * kMaxDimension];
   uint8_t* ref;
-  if (UUT_->use_high_bd_ == 0) {
+  if (UUT_->use_highbd_ == 0) {
     ref = ref8;
   } else {
     ref = CONVERT_TO_BYTEPTR(ref16);
@@ -657,7 +657,7 @@
   uint8_t ref8[kOutputStride * kMaxDimension];
   uint16_t ref16[kOutputStride * kMaxDimension];
   uint8_t* ref;
-  if (UUT_->use_high_bd_ == 0) {
+  if (UUT_->use_highbd_ == 0) {
     ref = ref8;
   } else {
     ref = CONVERT_TO_BYTEPTR(ref16);
@@ -672,7 +672,7 @@
     for (int x = 0; x < Width(); ++x) {
       uint16_t r;
 #if CONFIG_VP9_HIGHBITDEPTH
-      if (UUT_->use_high_bd_ == 0 || UUT_->use_high_bd_ == 8) {
+      if (UUT_->use_highbd_ == 0 || UUT_->use_highbd_ == 8) {
         r = prng.Rand8Extremes();
       } else {
         r = prng.Rand16() & mask_;
@@ -736,7 +736,7 @@
   uint8_t ref8[kOutputStride * kMaxDimension];
   uint16_t ref16[kOutputStride * kMaxDimension];
   uint8_t *ref;
-  if (UUT_->use_high_bd_ == 0) {
+  if (UUT_->use_highbd_ == 0) {
     ref = ref8;
   } else {
     ref = CONVERT_TO_BYTEPTR(ref16);
@@ -751,7 +751,7 @@
     for (int x = 0; x < Width(); ++x) {
       uint16_t r;
 #if CONFIG_VP9_HIGHBITDEPTH
-      if (UUT_->use_high_bd_ == 0 || UUT_->use_high_bd_ == 8) {
+      if (UUT_->use_highbd_ == 0 || UUT_->use_highbd_ == 8) {
         r = prng.Rand8Extremes();
       } else {
         r = prng.Rand16() & mask_;
@@ -978,9 +978,9 @@
                                  const int16_t *filter_y,
                                  int filter_y_stride,
                                  int w, int h) {
-  vp9_high_convolve8_horiz_sse2(src, src_stride, dst, dst_stride, filter_x,
-                                filter_x_stride, filter_y, filter_y_stride,
-                                w, h, 8);
+  vp9_highbd_convolve8_horiz_sse2(src, src_stride, dst, dst_stride, filter_x,
+                                  filter_x_stride, filter_y, filter_y_stride,
+                                  w, h, 8);
 }
 
 void wrap_convolve8_avg_horiz_sse2_8(const uint8_t *src, ptrdiff_t src_stride,
@@ -990,8 +990,9 @@
                                      const int16_t *filter_y,
                                      int filter_y_stride,
                                      int w, int h) {
-  vp9_high_convolve8_avg_horiz_sse2(src, src_stride, dst, dst_stride, filter_x,
-    filter_x_stride, filter_y, filter_y_stride, w, h, 8);
+  vp9_highbd_convolve8_avg_horiz_sse2(src, src_stride, dst, dst_stride,
+                                      filter_x, filter_x_stride,
+                                      filter_y, filter_y_stride, w, h, 8);
 }
 
 void wrap_convolve8_vert_sse2_8(const uint8_t *src, ptrdiff_t src_stride,
@@ -1001,8 +1002,9 @@
                                 const int16_t *filter_y,
                                 int filter_y_stride,
                                 int w, int h) {
-  vp9_high_convolve8_vert_sse2(src, src_stride, dst, dst_stride, filter_x,
-    filter_x_stride, filter_y, filter_y_stride, w, h, 8);
+  vp9_highbd_convolve8_vert_sse2(src, src_stride, dst, dst_stride,
+                                 filter_x, filter_x_stride,
+                                 filter_y, filter_y_stride, w, h, 8);
 }
 
 void wrap_convolve8_avg_vert_sse2_8(const uint8_t *src, ptrdiff_t src_stride,
@@ -1012,9 +1014,9 @@
                                     const int16_t *filter_y,
                                     int filter_y_stride,
                                     int w, int h) {
-  vp9_high_convolve8_avg_vert_sse2(src, src_stride, dst, dst_stride, filter_x,
-                                   filter_x_stride, filter_y, filter_y_stride,
-                                   w, h, 8);
+  vp9_highbd_convolve8_avg_vert_sse2(src, src_stride, dst, dst_stride,
+                                     filter_x, filter_x_stride,
+                                     filter_y, filter_y_stride, w, h, 8);
 }
 
 void wrap_convolve8_sse2_8(const uint8_t *src, ptrdiff_t src_stride,
@@ -1024,8 +1026,9 @@
                            const int16_t *filter_y,
                            int filter_y_stride,
                            int w, int h) {
-  vp9_high_convolve8_sse2(src, src_stride, dst, dst_stride, filter_x,
-    filter_x_stride, filter_y, filter_y_stride, w, h, 8);
+  vp9_highbd_convolve8_sse2(src, src_stride, dst, dst_stride,
+                            filter_x, filter_x_stride,
+                            filter_y, filter_y_stride, w, h, 8);
 }
 
 void wrap_convolve8_avg_sse2_8(const uint8_t *src, ptrdiff_t src_stride,
@@ -1035,8 +1038,9 @@
                                const int16_t *filter_y,
                                int filter_y_stride,
                                int w, int h) {
-  vp9_high_convolve8_avg_sse2(src, src_stride, dst, dst_stride, filter_x,
-    filter_x_stride, filter_y, filter_y_stride, w, h, 8);
+  vp9_highbd_convolve8_avg_sse2(src, src_stride, dst, dst_stride,
+                                filter_x, filter_x_stride,
+                                filter_y, filter_y_stride, w, h, 8);
 }
 
 void wrap_convolve8_horiz_sse2_10(const uint8_t *src, ptrdiff_t src_stride,
@@ -1046,8 +1050,9 @@
                                   const int16_t *filter_y,
                                   int filter_y_stride,
                                   int w, int h) {
-  vp9_high_convolve8_horiz_sse2(src, src_stride, dst, dst_stride, filter_x,
-    filter_x_stride, filter_y, filter_y_stride, w, h, 10);
+  vp9_highbd_convolve8_horiz_sse2(src, src_stride, dst, dst_stride,
+                                  filter_x, filter_x_stride,
+                                  filter_y, filter_y_stride, w, h, 10);
 }
 
 void wrap_convolve8_avg_horiz_sse2_10(const uint8_t *src, ptrdiff_t src_stride,
@@ -1057,8 +1062,9 @@
                                       const int16_t *filter_y,
                                       int filter_y_stride,
                                       int w, int h) {
-  vp9_high_convolve8_avg_horiz_sse2(src, src_stride, dst, dst_stride, filter_x,
-    filter_x_stride, filter_y, filter_y_stride, w, h, 10);
+  vp9_highbd_convolve8_avg_horiz_sse2(src, src_stride, dst, dst_stride,
+                                      filter_x, filter_x_stride,
+                                      filter_y, filter_y_stride, w, h, 10);
 }
 
 void wrap_convolve8_vert_sse2_10(const uint8_t *src, ptrdiff_t src_stride,
@@ -1068,8 +1074,9 @@
                                  const int16_t *filter_y,
                                  int filter_y_stride,
                                  int w, int h) {
-  vp9_high_convolve8_vert_sse2(src, src_stride, dst, dst_stride, filter_x,
-    filter_x_stride, filter_y, filter_y_stride, w, h, 10);
+  vp9_highbd_convolve8_vert_sse2(src, src_stride, dst, dst_stride,
+                                 filter_x, filter_x_stride,
+                                 filter_y, filter_y_stride, w, h, 10);
 }
 
 void wrap_convolve8_avg_vert_sse2_10(const uint8_t *src, ptrdiff_t src_stride,
@@ -1079,8 +1086,9 @@
                                      const int16_t *filter_y,
                                      int filter_y_stride,
                                      int w, int h) {
-  vp9_high_convolve8_avg_vert_sse2(src, src_stride, dst, dst_stride, filter_x,
-    filter_x_stride, filter_y, filter_y_stride, w, h, 10);
+  vp9_highbd_convolve8_avg_vert_sse2(src, src_stride, dst, dst_stride,
+                                     filter_x, filter_x_stride,
+                                     filter_y, filter_y_stride, w, h, 10);
 }
 
 void wrap_convolve8_sse2_10(const uint8_t *src, ptrdiff_t src_stride,
@@ -1090,8 +1098,9 @@
                             const int16_t *filter_y,
                             int filter_y_stride,
                             int w, int h) {
-  vp9_high_convolve8_sse2(src, src_stride, dst, dst_stride, filter_x,
-    filter_x_stride, filter_y, filter_y_stride, w, h, 10);
+  vp9_highbd_convolve8_sse2(src, src_stride, dst, dst_stride,
+                            filter_x, filter_x_stride,
+                            filter_y, filter_y_stride, w, h, 10);
 }
 
 void wrap_convolve8_avg_sse2_10(const uint8_t *src, ptrdiff_t src_stride,
@@ -1101,9 +1110,9 @@
                                 const int16_t *filter_y,
                                 int filter_y_stride,
                                 int w, int h) {
-  vp9_high_convolve8_avg_sse2(src, src_stride, dst, dst_stride, filter_x,
-                              filter_x_stride, filter_y, filter_y_stride,
-                              w, h, 10);
+  vp9_highbd_convolve8_avg_sse2(src, src_stride, dst, dst_stride,
+                                filter_x, filter_x_stride,
+                                filter_y, filter_y_stride, w, h, 10);
 }
 
 void wrap_convolve8_horiz_sse2_12(const uint8_t *src, ptrdiff_t src_stride,
@@ -1113,9 +1122,9 @@
                                   const int16_t *filter_y,
                                   int filter_y_stride,
                                   int w, int h) {
-  vp9_high_convolve8_horiz_sse2(src, src_stride, dst, dst_stride, filter_x,
-                                filter_x_stride, filter_y, filter_y_stride,
-                                w, h, 12);
+  vp9_highbd_convolve8_horiz_sse2(src, src_stride, dst, dst_stride,
+                                  filter_x, filter_x_stride,
+                                  filter_y, filter_y_stride, w, h, 12);
 }
 
 void wrap_convolve8_avg_horiz_sse2_12(const uint8_t *src, ptrdiff_t src_stride,
@@ -1125,9 +1134,9 @@
                                       const int16_t *filter_y,
                                       int filter_y_stride,
                                       int w, int h) {
-  vp9_high_convolve8_avg_horiz_sse2(src, src_stride, dst, dst_stride, filter_x,
-                                    filter_x_stride, filter_y, filter_y_stride,
-                                    w, h, 12);
+  vp9_highbd_convolve8_avg_horiz_sse2(src, src_stride, dst, dst_stride,
+                                      filter_x, filter_x_stride,
+                                      filter_y, filter_y_stride, w, h, 12);
 }
 
 void wrap_convolve8_vert_sse2_12(const uint8_t *src, ptrdiff_t src_stride,
@@ -1137,9 +1146,9 @@
                                  const int16_t *filter_y,
                                  int filter_y_stride,
                                  int w, int h) {
-  vp9_high_convolve8_vert_sse2(src, src_stride, dst, dst_stride, filter_x,
-                               filter_x_stride, filter_y, filter_y_stride,
-                               w, h, 12);
+  vp9_highbd_convolve8_vert_sse2(src, src_stride, dst, dst_stride,
+                                 filter_x, filter_x_stride,
+                                 filter_y, filter_y_stride, w, h, 12);
 }
 
 void wrap_convolve8_avg_vert_sse2_12(const uint8_t *src, ptrdiff_t src_stride,
@@ -1149,8 +1158,9 @@
                                      const int16_t *filter_y,
                                      int filter_y_stride,
                                      int w, int h) {
-  vp9_high_convolve8_avg_vert_sse2(src, src_stride, dst, dst_stride, filter_x,
-                                   filter_x_stride, filter_y, filter_y_stride, w, h, 12);
+  vp9_highbd_convolve8_avg_vert_sse2(src, src_stride, dst, dst_stride,
+                                     filter_x, filter_x_stride,
+                                     filter_y, filter_y_stride, w, h, 12);
 }
 
 void wrap_convolve8_sse2_12(const uint8_t *src, ptrdiff_t src_stride,
@@ -1160,8 +1170,9 @@
                             const int16_t *filter_y,
                             int filter_y_stride,
                             int w, int h) {
-  vp9_high_convolve8_sse2(src, src_stride, dst, dst_stride, filter_x,
-    filter_x_stride, filter_y, filter_y_stride, w, h, 12);
+  vp9_highbd_convolve8_sse2(src, src_stride, dst, dst_stride,
+                            filter_x, filter_x_stride,
+                            filter_y, filter_y_stride, w, h, 12);
 }
 
 void wrap_convolve8_avg_sse2_12(const uint8_t *src, ptrdiff_t src_stride,
@@ -1171,8 +1182,9 @@
                                 const int16_t *filter_y,
                                 int filter_y_stride,
                                 int w, int h) {
-  vp9_high_convolve8_avg_sse2(src, src_stride, dst, dst_stride, filter_x,
-                              filter_x_stride, filter_y, filter_y_stride, w, h, 12);
+  vp9_highbd_convolve8_avg_sse2(src, src_stride, dst, dst_stride,
+                                filter_x, filter_x_stride,
+                                filter_y, filter_y_stride, w, h, 12);
 }
 #endif  // HAVE_SSE2 && ARCH_X86_64
 
@@ -1183,8 +1195,9 @@
                               const int16_t *filter_y,
                               int filter_y_stride,
                               int w, int h) {
-  vp9_high_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter_x,
-                             filter_x_stride, filter_y, filter_y_stride, w, h, 8);
+  vp9_highbd_convolve8_horiz_c(src, src_stride, dst, dst_stride,
+                               filter_x, filter_x_stride,
+                               filter_y, filter_y_stride, w, h, 8);
 }
 
 void wrap_convolve8_avg_horiz_c_8(const uint8_t *src, ptrdiff_t src_stride,
@@ -1194,8 +1207,9 @@
                                   const int16_t *filter_y,
                                   int filter_y_stride,
                                   int w, int h) {
-  vp9_high_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, filter_x,
-                                 filter_x_stride, filter_y, filter_y_stride, w, h, 8);
+  vp9_highbd_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride,
+                                   filter_x, filter_x_stride,
+                                   filter_y, filter_y_stride, w, h, 8);
 }
 
 void wrap_convolve8_vert_c_8(const uint8_t *src, ptrdiff_t src_stride,
@@ -1205,8 +1219,9 @@
                              const int16_t *filter_y,
                              int filter_y_stride,
                              int w, int h) {
-  vp9_high_convolve8_vert_c(src, src_stride, dst, dst_stride, filter_x,
-                            filter_x_stride, filter_y, filter_y_stride, w, h, 8);
+  vp9_highbd_convolve8_vert_c(src, src_stride, dst, dst_stride,
+                              filter_x, filter_x_stride,
+                              filter_y, filter_y_stride, w, h, 8);
 }
 
 void wrap_convolve8_avg_vert_c_8(const uint8_t *src, ptrdiff_t src_stride,
@@ -1216,8 +1231,9 @@
                                  const int16_t *filter_y,
                                  int filter_y_stride,
                                  int w, int h) {
-  vp9_high_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter_x,
-                                filter_x_stride, filter_y, filter_y_stride, w, h, 8);
+  vp9_highbd_convolve8_avg_vert_c(src, src_stride, dst, dst_stride,
+                                  filter_x, filter_x_stride,
+                                  filter_y, filter_y_stride, w, h, 8);
 }
 
 void wrap_convolve8_c_8(const uint8_t *src, ptrdiff_t src_stride,
@@ -1227,8 +1243,9 @@
                         const int16_t *filter_y,
                         int filter_y_stride,
                         int w, int h) {
-  vp9_high_convolve8_c(src, src_stride, dst, dst_stride, filter_x,
-                       filter_x_stride, filter_y, filter_y_stride, w, h, 8);
+  vp9_highbd_convolve8_c(src, src_stride, dst, dst_stride,
+                         filter_x, filter_x_stride,
+                         filter_y, filter_y_stride, w, h, 8);
 }
 
 void wrap_convolve8_avg_c_8(const uint8_t *src, ptrdiff_t src_stride,
@@ -1238,9 +1255,9 @@
                             const int16_t *filter_y,
                             int filter_y_stride,
                             int w, int h) {
-  vp9_high_convolve8_avg_c(src, src_stride, dst, dst_stride, filter_x,
-                           filter_x_stride, filter_y, filter_y_stride,
-                           w, h, 8);
+  vp9_highbd_convolve8_avg_c(src, src_stride, dst, dst_stride,
+                             filter_x, filter_x_stride,
+                             filter_y, filter_y_stride, w, h, 8);
 }
 
 void wrap_convolve8_horiz_c_10(const uint8_t *src, ptrdiff_t src_stride,
@@ -1250,8 +1267,9 @@
                                const int16_t *filter_y,
                                int filter_y_stride,
                                int w, int h) {
-  vp9_high_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter_x,
-                             filter_x_stride, filter_y, filter_y_stride, w, h, 10);
+  vp9_highbd_convolve8_horiz_c(src, src_stride, dst, dst_stride,
+                               filter_x, filter_x_stride,
+                               filter_y, filter_y_stride, w, h, 10);
 }
 
 void wrap_convolve8_avg_horiz_c_10(const uint8_t *src, ptrdiff_t src_stride,
@@ -1261,9 +1279,9 @@
                                    const int16_t *filter_y,
                                    int filter_y_stride,
                                    int w, int h) {
-  vp9_high_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, filter_x,
-                                 filter_x_stride, filter_y, filter_y_stride,
-                                 w, h, 10);
+  vp9_highbd_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride,
+                                   filter_x, filter_x_stride,
+                                   filter_y, filter_y_stride, w, h, 10);
 }
 
 void wrap_convolve8_vert_c_10(const uint8_t *src, ptrdiff_t src_stride,
@@ -1273,8 +1291,9 @@
                               const int16_t *filter_y,
                               int filter_y_stride,
                               int w, int h) {
-  vp9_high_convolve8_vert_c(src, src_stride, dst, dst_stride, filter_x,
-                            filter_x_stride, filter_y, filter_y_stride, w, h, 10);
+  vp9_highbd_convolve8_vert_c(src, src_stride, dst, dst_stride,
+                              filter_x, filter_x_stride,
+                              filter_y, filter_y_stride, w, h, 10);
 }
 
 void wrap_convolve8_avg_vert_c_10(const uint8_t *src, ptrdiff_t src_stride,
@@ -1284,8 +1303,9 @@
                                   const int16_t *filter_y,
                                   int filter_y_stride,
                                   int w, int h) {
-  vp9_high_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter_x,
-                                filter_x_stride, filter_y, filter_y_stride, w, h, 10);
+  vp9_highbd_convolve8_avg_vert_c(src, src_stride, dst, dst_stride,
+                                  filter_x, filter_x_stride,
+                                  filter_y, filter_y_stride, w, h, 10);
 }
 
 void wrap_convolve8_c_10(const uint8_t *src, ptrdiff_t src_stride,
@@ -1295,8 +1315,9 @@
                          const int16_t *filter_y,
                          int filter_y_stride,
                          int w, int h) {
-  vp9_high_convolve8_c(src, src_stride, dst, dst_stride, filter_x,
-    filter_x_stride, filter_y, filter_y_stride, w, h, 10);
+  vp9_highbd_convolve8_c(src, src_stride, dst, dst_stride,
+                         filter_x, filter_x_stride,
+                         filter_y, filter_y_stride, w, h, 10);
 }
 
 void wrap_convolve8_avg_c_10(const uint8_t *src, ptrdiff_t src_stride,
@@ -1306,8 +1327,9 @@
                              const int16_t *filter_y,
                              int filter_y_stride,
                              int w, int h) {
-  vp9_high_convolve8_avg_c(src, src_stride, dst, dst_stride, filter_x,
-                           filter_x_stride, filter_y, filter_y_stride, w, h, 10);
+  vp9_highbd_convolve8_avg_c(src, src_stride, dst, dst_stride,
+                             filter_x, filter_x_stride,
+                             filter_y, filter_y_stride, w, h, 10);
 }
 
 void wrap_convolve8_horiz_c_12(const uint8_t *src, ptrdiff_t src_stride,
@@ -1317,9 +1339,9 @@
                                const int16_t *filter_y,
                                int filter_y_stride,
                                int w, int h) {
-  vp9_high_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter_x,
-                             filter_x_stride, filter_y, filter_y_stride,
-                             w, h, 12);
+  vp9_highbd_convolve8_horiz_c(src, src_stride, dst, dst_stride,
+                               filter_x, filter_x_stride,
+                               filter_y, filter_y_stride, w, h, 12);
 }
 
 void wrap_convolve8_avg_horiz_c_12(const uint8_t *src, ptrdiff_t src_stride,
@@ -1329,9 +1351,9 @@
                                    const int16_t *filter_y,
                                    int filter_y_stride,
                                    int w, int h) {
-  vp9_high_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, filter_x,
-                                 filter_x_stride, filter_y, filter_y_stride,
-                                 w, h, 12);
+  vp9_highbd_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride,
+                                   filter_x, filter_x_stride,
+                                   filter_y, filter_y_stride, w, h, 12);
 }
 
 void wrap_convolve8_vert_c_12(const uint8_t *src, ptrdiff_t src_stride,
@@ -1341,9 +1363,9 @@
                               const int16_t *filter_y,
                               int filter_y_stride,
                               int w, int h) {
-  vp9_high_convolve8_vert_c(src, src_stride, dst, dst_stride, filter_x,
-                            filter_x_stride, filter_y, filter_y_stride,
-                            w, h, 12);
+  vp9_highbd_convolve8_vert_c(src, src_stride, dst, dst_stride,
+                              filter_x, filter_x_stride,
+                              filter_y, filter_y_stride, w, h, 12);
 }
 
 void wrap_convolve8_avg_vert_c_12(const uint8_t *src, ptrdiff_t src_stride,
@@ -1353,9 +1375,9 @@
                                   const int16_t *filter_y,
                                   int filter_y_stride,
                                   int w, int h) {
-  vp9_high_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter_x,
-                                filter_x_stride, filter_y, filter_y_stride,
-                                w, h, 12);
+  vp9_highbd_convolve8_avg_vert_c(src, src_stride, dst, dst_stride,
+                                  filter_x, filter_x_stride,
+                                  filter_y, filter_y_stride, w, h, 12);
 }
 
 void wrap_convolve8_c_12(const uint8_t *src, ptrdiff_t src_stride,
@@ -1365,9 +1387,9 @@
                          const int16_t *filter_y,
                          int filter_y_stride,
                          int w, int h) {
-  vp9_high_convolve8_c(src, src_stride, dst, dst_stride, filter_x,
-                       filter_x_stride, filter_y, filter_y_stride,
-                       w, h, 12);
+  vp9_highbd_convolve8_c(src, src_stride, dst, dst_stride,
+                         filter_x, filter_x_stride,
+                         filter_y, filter_y_stride, w, h, 12);
 }
 
 void wrap_convolve8_avg_c_12(const uint8_t *src, ptrdiff_t src_stride,
@@ -1377,9 +1399,9 @@
                              const int16_t *filter_y,
                              int filter_y_stride,
                              int w, int h) {
-  vp9_high_convolve8_avg_c(src, src_stride, dst, dst_stride, filter_x,
-                           filter_x_stride, filter_y, filter_y_stride,
-                           w, h, 12);
+  vp9_highbd_convolve8_avg_c(src, src_stride, dst, dst_stride,
+                             filter_x, filter_x_stride,
+                             filter_y, filter_y_stride, w, h, 12);
 }
 
 const ConvolveFunctions convolve8_c(
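
Beyond the renames, the highbd helpers above differ from the 8-bit versions only in clamping to the range implied by bd rather than a fixed 0-255 (the "Normalize back to 0-255" comments are a holdover from the 8-bit code). A reference sketch of that clamp, not the library's exact source:

    #include <stdint.h>

    /* Clamp a filtered value to [0, (1 << bd) - 1]. */
    static uint16_t clip_pixel_highbd_sketch(int val, int bd) {
      const int max = (1 << bd) - 1;
      if (val < 0) return 0;
      if (val > max) return (uint16_t)max;
      return (uint16_t)val;
    }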
diff --git a/source/libvpx/test/dct16x16_test.cc b/source/libvpx/test/dct16x16_test.cc
index d1ce109..5222d49 100644
--- a/source/libvpx/test/dct16x16_test.cc
+++ b/source/libvpx/test/dct16x16_test.cc
@@ -287,11 +287,11 @@
 
 #if CONFIG_VP9_HIGHBITDEPTH
 void idct16x16_10(const tran_low_t *in, uint8_t *out, int stride) {
-  vp9_high_idct16x16_256_add_c(in, out, stride, 10);
+  vp9_highbd_idct16x16_256_add_c(in, out, stride, 10);
 }
 
 void idct16x16_12(const tran_low_t *in, uint8_t *out, int stride) {
-  vp9_high_idct16x16_256_add_c(in, out, stride, 12);
+  vp9_highbd_idct16x16_256_add_c(in, out, stride, 12);
 }
 
 void idct16x16_10_ref(const tran_low_t *in, uint8_t *out, int stride,
@@ -305,11 +305,11 @@
 }
 
 void iht16x16_10(const tran_low_t *in, uint8_t *out, int stride, int tx_type) {
-  vp9_high_iht16x16_256_add_c(in, out, stride, tx_type, 10);
+  vp9_highbd_iht16x16_256_add_c(in, out, stride, tx_type, 10);
 }
 
 void iht16x16_12(const tran_low_t *in, uint8_t *out, int stride, int tx_type) {
-  vp9_high_iht16x16_256_add_c(in, out, stride, tx_type, 12);
+  vp9_highbd_iht16x16_256_add_c(in, out, stride, tx_type, 12);
 }
 #endif
 
@@ -443,7 +443,7 @@
 
   void RunQuantCheck(int dc_thred, int ac_thred) {
     ACMRandom rnd(ACMRandom::DeterministicSeed());
-    const int count_test_block = 1000;
+    const int count_test_block = 100000;
     DECLARE_ALIGNED_ARRAY(16, int16_t, input_block, kNumCoeffs);
     DECLARE_ALIGNED_ARRAY(16, int16_t, input_extreme_block, kNumCoeffs);
     DECLARE_ALIGNED_ARRAY(16, tran_low_t, output_ref_block, kNumCoeffs);
@@ -700,7 +700,7 @@
 TEST_P(Trans16x16HT, QuantCheck) {
   // The encoder skips any non-DC intra prediction modes,
   // when the quantization step size goes beyond 988.
-  RunQuantCheck(549, 988);
+  RunQuantCheck(429, 729);
 }
 
 using std::tr1::make_tuple;
@@ -709,8 +709,8 @@
 INSTANTIATE_TEST_CASE_P(
     C, Trans16x16DCT,
     ::testing::Values(
-        make_tuple(&vp9_high_fdct16x16_c, &idct16x16_10, 0, VPX_BITS_10),
-        make_tuple(&vp9_high_fdct16x16_c, &idct16x16_12, 0, VPX_BITS_12),
+        make_tuple(&vp9_highbd_fdct16x16_c, &idct16x16_10, 0, VPX_BITS_10),
+        make_tuple(&vp9_highbd_fdct16x16_c, &idct16x16_12, 0, VPX_BITS_12),
         make_tuple(&vp9_fdct16x16_c, &vp9_idct16x16_256_add_c, 0, VPX_BITS_8)));
 #else
 INSTANTIATE_TEST_CASE_P(
@@ -723,14 +723,14 @@
 INSTANTIATE_TEST_CASE_P(
     C, Trans16x16HT,
     ::testing::Values(
-        make_tuple(&vp9_high_fht16x16_c, &iht16x16_10, 0, VPX_BITS_10),
-        make_tuple(&vp9_high_fht16x16_c, &iht16x16_10, 1, VPX_BITS_10),
-        make_tuple(&vp9_high_fht16x16_c, &iht16x16_10, 2, VPX_BITS_10),
-        make_tuple(&vp9_high_fht16x16_c, &iht16x16_10, 3, VPX_BITS_10),
-        make_tuple(&vp9_high_fht16x16_c, &iht16x16_12, 0, VPX_BITS_12),
-        make_tuple(&vp9_high_fht16x16_c, &iht16x16_12, 1, VPX_BITS_12),
-        make_tuple(&vp9_high_fht16x16_c, &iht16x16_12, 2, VPX_BITS_12),
-        make_tuple(&vp9_high_fht16x16_c, &iht16x16_12, 3, VPX_BITS_12),
+        make_tuple(&vp9_highbd_fht16x16_c, &iht16x16_10, 0, VPX_BITS_10),
+        make_tuple(&vp9_highbd_fht16x16_c, &iht16x16_10, 1, VPX_BITS_10),
+        make_tuple(&vp9_highbd_fht16x16_c, &iht16x16_10, 2, VPX_BITS_10),
+        make_tuple(&vp9_highbd_fht16x16_c, &iht16x16_10, 3, VPX_BITS_10),
+        make_tuple(&vp9_highbd_fht16x16_c, &iht16x16_12, 0, VPX_BITS_12),
+        make_tuple(&vp9_highbd_fht16x16_c, &iht16x16_12, 1, VPX_BITS_12),
+        make_tuple(&vp9_highbd_fht16x16_c, &iht16x16_12, 2, VPX_BITS_12),
+        make_tuple(&vp9_highbd_fht16x16_c, &iht16x16_12, 3, VPX_BITS_12),
         make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_c, 0, VPX_BITS_8),
         make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_c, 1, VPX_BITS_8),
         make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_c, 2, VPX_BITS_8),
@@ -745,7 +745,7 @@
         make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_c, 3, VPX_BITS_8)));
 #endif
 
-#if HAVE_NEON_ASM && !CONFIG_VP9_HIGHBITDEPTH
+#if HAVE_NEON_ASM && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
 INSTANTIATE_TEST_CASE_P(
     NEON, Trans16x16DCT,
     ::testing::Values(
@@ -753,7 +753,7 @@
                    &vp9_idct16x16_256_add_neon, 0, VPX_BITS_8)));
 #endif
 
-#if HAVE_SSE2 && !CONFIG_VP9_HIGHBITDEPTH
+#if HAVE_SSE2 && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
 INSTANTIATE_TEST_CASE_P(
     SSE2, Trans16x16DCT,
     ::testing::Values(
@@ -772,7 +772,7 @@
                    VPX_BITS_8)));
 #endif
 
-#if HAVE_SSSE3 && !CONFIG_VP9_HIGHBITDEPTH
+#if HAVE_SSSE3 && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
 INSTANTIATE_TEST_CASE_P(
     SSSE3, Trans16x16DCT,
     ::testing::Values(
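
The added !CONFIG_EMULATE_HARDWARE guards (repeated for the 32x32, 4x4, and 8x8 tests below) follow from what the renamed configure option does: in emulate-hardware builds the C transforms clamp intermediate values to a narrow fixed width, so their output is intentionally not bit-exact with the SIMD versions and the comparison tests must be compiled out. Conceptually the emulated narrowing is a truncation, roughly as in this illustrative sketch (not the library's macro):

    #include <stdint.h>

    /* Keep only the low 16 bits of an intermediate transform value,
       approximating a fixed-width hardware datapath. */
    static int16_t wrap_intermediate(int32_t v) {
      return (int16_t)v;
    }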
diff --git a/source/libvpx/test/dct32x32_test.cc b/source/libvpx/test/dct32x32_test.cc
index c7a1931..4f08be5 100644
--- a/source/libvpx/test/dct32x32_test.cc
+++ b/source/libvpx/test/dct32x32_test.cc
@@ -80,11 +80,11 @@
 
 #if CONFIG_VP9_HIGHBITDEPTH
 void idct32x32_10(const tran_low_t *in, uint8_t *out, int stride) {
-  vp9_high_idct32x32_1024_add_c(in, out, stride, 10);
+  vp9_highbd_idct32x32_1024_add_c(in, out, stride, 10);
 }
 
 void idct32x32_12(const tran_low_t *in, uint8_t *out, int stride) {
-  vp9_high_idct32x32_1024_add_c(in, out, stride, 12);
+  vp9_highbd_idct32x32_1024_add_c(in, out, stride, 12);
 }
 #endif
 
@@ -311,13 +311,13 @@
 INSTANTIATE_TEST_CASE_P(
     C, Trans32x32Test,
     ::testing::Values(
-        make_tuple(&vp9_high_fdct32x32_c,
+        make_tuple(&vp9_highbd_fdct32x32_c,
                    &idct32x32_10, 0, VPX_BITS_10),
-        make_tuple(&vp9_high_fdct32x32_rd_c,
+        make_tuple(&vp9_highbd_fdct32x32_rd_c,
                    &idct32x32_10, 1, VPX_BITS_10),
-        make_tuple(&vp9_high_fdct32x32_c,
+        make_tuple(&vp9_highbd_fdct32x32_c,
                    &idct32x32_12, 0, VPX_BITS_12),
-        make_tuple(&vp9_high_fdct32x32_rd_c,
+        make_tuple(&vp9_highbd_fdct32x32_rd_c,
                    &idct32x32_12, 1, VPX_BITS_12),
         make_tuple(&vp9_fdct32x32_c,
                    &vp9_idct32x32_1024_add_c, 0, VPX_BITS_8),
@@ -333,7 +333,7 @@
                    &vp9_idct32x32_1024_add_c, 1, VPX_BITS_8)));
 #endif
 
-#if HAVE_NEON_ASM && !CONFIG_VP9_HIGHBITDEPTH
+#if HAVE_NEON_ASM && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
 INSTANTIATE_TEST_CASE_P(
     NEON, Trans32x32Test,
     ::testing::Values(
@@ -343,7 +343,7 @@
                    &vp9_idct32x32_1024_add_neon, 1, VPX_BITS_8)));
 #endif
 
-#if HAVE_SSE2 && !CONFIG_VP9_HIGHBITDEPTH
+#if HAVE_SSE2 && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
 INSTANTIATE_TEST_CASE_P(
     SSE2, Trans32x32Test,
     ::testing::Values(
@@ -353,7 +353,7 @@
                    &vp9_idct32x32_1024_add_sse2, 1, VPX_BITS_8)));
 #endif
 
-#if HAVE_AVX2 && !CONFIG_VP9_HIGHBITDEPTH
+#if HAVE_AVX2 && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
 INSTANTIATE_TEST_CASE_P(
     AVX2, Trans32x32Test,
     ::testing::Values(
diff --git a/source/libvpx/test/fdct4x4_test.cc b/source/libvpx/test/fdct4x4_test.cc
index f803c8e..ef35902 100644
--- a/source/libvpx/test/fdct4x4_test.cc
+++ b/source/libvpx/test/fdct4x4_test.cc
@@ -53,27 +53,27 @@
 
 #if CONFIG_VP9_HIGHBITDEPTH
 void idct4x4_10(const tran_low_t *in, uint8_t *out, int stride) {
-  vp9_high_idct4x4_16_add_c(in, out, stride, 10);
+  vp9_highbd_idct4x4_16_add_c(in, out, stride, 10);
 }
 
 void idct4x4_12(const tran_low_t *in, uint8_t *out, int stride) {
-  vp9_high_idct4x4_16_add_c(in, out, stride, 12);
+  vp9_highbd_idct4x4_16_add_c(in, out, stride, 12);
 }
 
 void iht4x4_10(const tran_low_t *in, uint8_t *out, int stride, int tx_type) {
-  vp9_high_iht4x4_16_add_c(in, out, stride, tx_type, 10);
+  vp9_highbd_iht4x4_16_add_c(in, out, stride, tx_type, 10);
 }
 
 void iht4x4_12(const tran_low_t *in, uint8_t *out, int stride, int tx_type) {
-  vp9_high_iht4x4_16_add_c(in, out, stride, tx_type, 12);
+  vp9_highbd_iht4x4_16_add_c(in, out, stride, tx_type, 12);
 }
 
 void iwht4x4_10(const tran_low_t *in, uint8_t *out, int stride) {
-  vp9_high_iwht4x4_16_add_c(in, out, stride, 10);
+  vp9_highbd_iwht4x4_16_add_c(in, out, stride, 10);
 }
 
 void iwht4x4_12(const tran_low_t *in, uint8_t *out, int stride) {
-  vp9_high_iwht4x4_16_add_c(in, out, stride, 12);
+  vp9_highbd_iwht4x4_16_add_c(in, out, stride, 12);
 }
 #endif
 
@@ -408,8 +408,8 @@
 INSTANTIATE_TEST_CASE_P(
     C, Trans4x4DCT,
     ::testing::Values(
-        make_tuple(&vp9_high_fdct4x4_c, &idct4x4_10, 0, VPX_BITS_10),
-        make_tuple(&vp9_high_fdct4x4_c, &idct4x4_12, 0, VPX_BITS_12),
+        make_tuple(&vp9_highbd_fdct4x4_c, &idct4x4_10, 0, VPX_BITS_10),
+        make_tuple(&vp9_highbd_fdct4x4_c, &idct4x4_12, 0, VPX_BITS_12),
         make_tuple(&vp9_fdct4x4_c, &vp9_idct4x4_16_add_c, 0, VPX_BITS_8)));
 #else
 INSTANTIATE_TEST_CASE_P(
@@ -422,14 +422,14 @@
 INSTANTIATE_TEST_CASE_P(
     C, Trans4x4HT,
     ::testing::Values(
-        make_tuple(&vp9_high_fht4x4_c, &iht4x4_10, 0, VPX_BITS_10),
-        make_tuple(&vp9_high_fht4x4_c, &iht4x4_10, 1, VPX_BITS_10),
-        make_tuple(&vp9_high_fht4x4_c, &iht4x4_10, 2, VPX_BITS_10),
-        make_tuple(&vp9_high_fht4x4_c, &iht4x4_10, 3, VPX_BITS_10),
-        make_tuple(&vp9_high_fht4x4_c, &iht4x4_12, 0, VPX_BITS_12),
-        make_tuple(&vp9_high_fht4x4_c, &iht4x4_12, 1, VPX_BITS_12),
-        make_tuple(&vp9_high_fht4x4_c, &iht4x4_12, 2, VPX_BITS_12),
-        make_tuple(&vp9_high_fht4x4_c, &iht4x4_12, 3, VPX_BITS_12),
+        make_tuple(&vp9_highbd_fht4x4_c, &iht4x4_10, 0, VPX_BITS_10),
+        make_tuple(&vp9_highbd_fht4x4_c, &iht4x4_10, 1, VPX_BITS_10),
+        make_tuple(&vp9_highbd_fht4x4_c, &iht4x4_10, 2, VPX_BITS_10),
+        make_tuple(&vp9_highbd_fht4x4_c, &iht4x4_10, 3, VPX_BITS_10),
+        make_tuple(&vp9_highbd_fht4x4_c, &iht4x4_12, 0, VPX_BITS_12),
+        make_tuple(&vp9_highbd_fht4x4_c, &iht4x4_12, 1, VPX_BITS_12),
+        make_tuple(&vp9_highbd_fht4x4_c, &iht4x4_12, 2, VPX_BITS_12),
+        make_tuple(&vp9_highbd_fht4x4_c, &iht4x4_12, 3, VPX_BITS_12),
         make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_c, 0, VPX_BITS_8),
         make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_c, 1, VPX_BITS_8),
         make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_c, 2, VPX_BITS_8),
@@ -448,8 +448,8 @@
 INSTANTIATE_TEST_CASE_P(
     C, Trans4x4WHT,
     ::testing::Values(
-        make_tuple(&vp9_high_fwht4x4_c, &iwht4x4_10, 0, VPX_BITS_10),
-        make_tuple(&vp9_high_fwht4x4_c, &iwht4x4_12, 0, VPX_BITS_12),
+        make_tuple(&vp9_highbd_fwht4x4_c, &iwht4x4_10, 0, VPX_BITS_10),
+        make_tuple(&vp9_highbd_fwht4x4_c, &iwht4x4_12, 0, VPX_BITS_12),
         make_tuple(&vp9_fwht4x4_c, &vp9_iwht4x4_16_add_c, 0, VPX_BITS_8)));
 #else
 INSTANTIATE_TEST_CASE_P(
@@ -458,7 +458,7 @@
         make_tuple(&vp9_fwht4x4_c, &vp9_iwht4x4_16_add_c, 0, VPX_BITS_8)));
 #endif
 
-#if HAVE_NEON_ASM && !CONFIG_VP9_HIGHBITDEPTH
+#if HAVE_NEON_ASM && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
 INSTANTIATE_TEST_CASE_P(
     NEON, Trans4x4DCT,
     ::testing::Values(
@@ -473,14 +473,15 @@
         make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_neon, 3, VPX_BITS_8)));
 #endif
 
-#if CONFIG_USE_X86INC && HAVE_MMX && !CONFIG_VP9_HIGHBITDEPTH
+#if CONFIG_USE_X86INC && HAVE_MMX && !CONFIG_VP9_HIGHBITDEPTH && \
+    !CONFIG_EMULATE_HARDWARE
 INSTANTIATE_TEST_CASE_P(
     MMX, Trans4x4WHT,
     ::testing::Values(
         make_tuple(&vp9_fwht4x4_mmx, &vp9_iwht4x4_16_add_c, 0, VPX_BITS_8)));
 #endif
 
-#if HAVE_SSE2 && !CONFIG_VP9_HIGHBITDEPTH
+#if HAVE_SSE2 && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
 INSTANTIATE_TEST_CASE_P(
     SSE2, Trans4x4DCT,
     ::testing::Values(
diff --git a/source/libvpx/test/fdct8x8_test.cc b/source/libvpx/test/fdct8x8_test.cc
index 60d0be5..103556d 100644
--- a/source/libvpx/test/fdct8x8_test.cc
+++ b/source/libvpx/test/fdct8x8_test.cc
@@ -82,19 +82,19 @@
 
 #if CONFIG_VP9_HIGHBITDEPTH
 void idct8x8_10(const tran_low_t *in, uint8_t *out, int stride) {
-  vp9_high_idct8x8_64_add_c(in, out, stride, 10);
+  vp9_highbd_idct8x8_64_add_c(in, out, stride, 10);
 }
 
 void idct8x8_12(const tran_low_t *in, uint8_t *out, int stride) {
-  vp9_high_idct8x8_64_add_c(in, out, stride, 12);
+  vp9_highbd_idct8x8_64_add_c(in, out, stride, 12);
 }
 
 void iht8x8_10(const tran_low_t *in, uint8_t *out, int stride, int tx_type) {
-  vp9_high_iht8x8_64_add_c(in, out, stride, tx_type, 10);
+  vp9_highbd_iht8x8_64_add_c(in, out, stride, tx_type, 10);
 }
 
 void iht8x8_12(const tran_low_t *in, uint8_t *out, int stride, int tx_type) {
-  vp9_high_iht8x8_64_add_c(in, out, stride, tx_type, 12);
+  vp9_highbd_iht8x8_64_add_c(in, out, stride, tx_type, 12);
 }
 #endif
 
@@ -532,8 +532,8 @@
 INSTANTIATE_TEST_CASE_P(
     C, FwdTrans8x8DCT,
     ::testing::Values(
-        make_tuple(&vp9_high_fdct8x8_c, &idct8x8_10, 0, VPX_BITS_10),
-        make_tuple(&vp9_high_fdct8x8_c, &idct8x8_12, 0, VPX_BITS_12),
+        make_tuple(&vp9_highbd_fdct8x8_c, &idct8x8_10, 0, VPX_BITS_10),
+        make_tuple(&vp9_highbd_fdct8x8_c, &idct8x8_12, 0, VPX_BITS_12),
         make_tuple(&vp9_fdct8x8_c, &vp9_idct8x8_64_add_c, 0, VPX_BITS_8)));
 #else
 INSTANTIATE_TEST_CASE_P(
@@ -546,14 +546,14 @@
 INSTANTIATE_TEST_CASE_P(
     C, FwdTrans8x8HT,
     ::testing::Values(
-        make_tuple(&vp9_high_fht8x8_c, &iht8x8_10, 0, VPX_BITS_10),
-        make_tuple(&vp9_high_fht8x8_c, &iht8x8_10, 1, VPX_BITS_10),
-        make_tuple(&vp9_high_fht8x8_c, &iht8x8_10, 2, VPX_BITS_10),
-        make_tuple(&vp9_high_fht8x8_c, &iht8x8_10, 3, VPX_BITS_10),
-        make_tuple(&vp9_high_fht8x8_c, &iht8x8_12, 0, VPX_BITS_12),
-        make_tuple(&vp9_high_fht8x8_c, &iht8x8_12, 1, VPX_BITS_12),
-        make_tuple(&vp9_high_fht8x8_c, &iht8x8_12, 2, VPX_BITS_12),
-        make_tuple(&vp9_high_fht8x8_c, &iht8x8_12, 3, VPX_BITS_12),
+        make_tuple(&vp9_highbd_fht8x8_c, &iht8x8_10, 0, VPX_BITS_10),
+        make_tuple(&vp9_highbd_fht8x8_c, &iht8x8_10, 1, VPX_BITS_10),
+        make_tuple(&vp9_highbd_fht8x8_c, &iht8x8_10, 2, VPX_BITS_10),
+        make_tuple(&vp9_highbd_fht8x8_c, &iht8x8_10, 3, VPX_BITS_10),
+        make_tuple(&vp9_highbd_fht8x8_c, &iht8x8_12, 0, VPX_BITS_12),
+        make_tuple(&vp9_highbd_fht8x8_c, &iht8x8_12, 1, VPX_BITS_12),
+        make_tuple(&vp9_highbd_fht8x8_c, &iht8x8_12, 2, VPX_BITS_12),
+        make_tuple(&vp9_highbd_fht8x8_c, &iht8x8_12, 3, VPX_BITS_12),
         make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_c, 0, VPX_BITS_8),
         make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_c, 1, VPX_BITS_8),
         make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_c, 2, VPX_BITS_8),
@@ -568,7 +568,7 @@
         make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_c, 3, VPX_BITS_8)));
 #endif
 
-#if HAVE_NEON_ASM && !CONFIG_VP9_HIGHBITDEPTH
+#if HAVE_NEON_ASM && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
 INSTANTIATE_TEST_CASE_P(
     NEON, FwdTrans8x8DCT,
     ::testing::Values(
@@ -583,7 +583,7 @@
         make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_neon, 3, VPX_BITS_8)));
 #endif
 
-#if HAVE_SSE2 && !CONFIG_VP9_HIGHBITDEPTH
+#if HAVE_SSE2 && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
 INSTANTIATE_TEST_CASE_P(
     SSE2, FwdTrans8x8DCT,
     ::testing::Values(
@@ -598,7 +598,8 @@
         make_tuple(&vp9_fht8x8_sse2, &vp9_iht8x8_64_add_sse2, 3, VPX_BITS_8)));
 #endif
 
-#if HAVE_SSSE3 && ARCH_X86_64 && !CONFIG_VP9_HIGHBITDEPTH
+#if HAVE_SSSE3 && ARCH_X86_64 && !CONFIG_VP9_HIGHBITDEPTH && \
+    !CONFIG_EMULATE_HARDWARE
 INSTANTIATE_TEST_CASE_P(
     SSSE3, FwdTrans8x8DCT,
     ::testing::Values(
diff --git a/source/libvpx/test/frame_size_tests.cc b/source/libvpx/test/frame_size_tests.cc
index 1c9a522..95cc66a 100644
--- a/source/libvpx/test/frame_size_tests.cc
+++ b/source/libvpx/test/frame_size_tests.cc
@@ -84,4 +84,13 @@
   ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
 #endif
 }
+
+TEST_F(VP9FrameSizeTestsLarge, OneByOneVideo) {
+  ::libvpx_test::RandomVideoSource video;
+
+  video.SetSize(1, 1);
+  video.set_limit(2);
+  expected_res_ = VPX_CODEC_OK;
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+}
 }  // namespace
diff --git a/source/libvpx/test/i420_video_source.h b/source/libvpx/test/i420_video_source.h
index c3315f9..0a18480 100644
--- a/source/libvpx/test/i420_video_source.h
+++ b/source/libvpx/test/i420_video_source.h
@@ -13,104 +13,22 @@
 #include <cstdlib>
 #include <string>
 
-#include "test/video_source.h"
+#include "test/yuv_video_source.h"
 
 namespace libvpx_test {
 
 // This class extends VideoSource to allow parsing of raw yv12
 // so that we can do actual file encodes.
-class I420VideoSource : public VideoSource {
+class I420VideoSource : public YUVVideoSource {
  public:
   I420VideoSource(const std::string &file_name,
                   unsigned int width, unsigned int height,
                   int rate_numerator, int rate_denominator,
                   unsigned int start, int limit)
-      : file_name_(file_name),
-        input_file_(NULL),
-        img_(NULL),
-        start_(start),
-        limit_(limit),
-        frame_(0),
-        width_(0),
-        height_(0),
-        framerate_numerator_(rate_numerator),
-        framerate_denominator_(rate_denominator) {
-    // This initializes raw_sz_, width_, height_ and allocates an img.
-    SetSize(width, height);
-  }
-
-  virtual ~I420VideoSource() {
-    vpx_img_free(img_);
-    if (input_file_)
-      fclose(input_file_);
-  }
-
-  virtual void Begin() {
-    if (input_file_)
-      fclose(input_file_);
-    input_file_ = OpenTestDataFile(file_name_);
-    ASSERT_TRUE(input_file_ != NULL) << "Input file open failed. Filename: "
-        << file_name_;
-    if (start_) {
-      fseek(input_file_, static_cast<unsigned>(raw_sz_) * start_, SEEK_SET);
-    }
-
-    frame_ = start_;
-    FillFrame();
-  }
-
-  virtual void Next() {
-    ++frame_;
-    FillFrame();
-  }
-
-  virtual vpx_image_t *img() const { return (frame_ < limit_) ? img_ : NULL;  }
-
-  // Models a stream where Timebase = 1/FPS, so pts == frame.
-  virtual vpx_codec_pts_t pts() const { return frame_; }
-
-  virtual unsigned long duration() const { return 1; }
-
-  virtual vpx_rational_t timebase() const {
-    const vpx_rational_t t = { framerate_denominator_, framerate_numerator_ };
-    return t;
-  }
-
-  virtual unsigned int frame() const { return frame_; }
-
-  virtual unsigned int limit() const { return limit_; }
-
-  void SetSize(unsigned int width, unsigned int height) {
-    if (width != width_ || height != height_) {
-      vpx_img_free(img_);
-      img_ = vpx_img_alloc(NULL, VPX_IMG_FMT_I420, width, height, 1);
-      ASSERT_TRUE(img_ != NULL);
-      width_ = width;
-      height_ = height;
-      raw_sz_ = width * height * 3 / 2;
-    }
-  }
-
-  virtual void FillFrame() {
-    ASSERT_TRUE(input_file_ != NULL);
-    // Read a frame from input_file.
-    if (fread(img_->img_data, raw_sz_, 1, input_file_) == 0) {
-      limit_ = frame_;
-    }
-  }
-
- protected:
-  std::string file_name_;
-  FILE *input_file_;
-  vpx_image_t *img_;
-  size_t raw_sz_;
-  unsigned int start_;
-  unsigned int limit_;
-  unsigned int frame_;
-  unsigned int width_;
-  unsigned int height_;
-  int framerate_numerator_;
-  int framerate_denominator_;
+      : YUVVideoSource(file_name, VPX_IMG_FMT_I420,
+                       width, height,
+                       rate_numerator, rate_denominator,
+                       start, limit) {}
 };
 
 }  // namespace libvpx_test
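
The refactor above preserves every I420VideoSource call site: construction and
the Begin()/Next()/img() iteration protocol shown in the deleted code are now
inherited from YUVVideoSource. A minimal usage sketch (the YUV file is one of
the test clips listed later in this diff; the rate, start, and limit values
are illustrative):

  libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv",
                                     352, 288,   // width, height
                                     30, 1,      // timebase = 1/30 s per frame
                                     0, 30);     // start frame, frame limit
  video.Begin();
  while (video.img() != NULL) {  // img() returns NULL once limit is reached.
    // Hand video.img() to the encoder under test, then advance.
    video.Next();
  }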
diff --git a/source/libvpx/test/lpf_8_test.cc b/source/libvpx/test/lpf_8_test.cc
new file mode 100644
index 0000000..abc4107
--- /dev/null
+++ b/source/libvpx/test/lpf_8_test.cc
@@ -0,0 +1,587 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <cmath>
+#include <cstdlib>
+#include <string>
+
+#include "third_party/googletest/src/include/gtest/gtest.h"
+#include "test/acm_random.h"
+#include "test/clear_system_state.h"
+#include "test/register_state_check.h"
+#include "test/util.h"
+
+#include "./vpx_config.h"
+#include "./vp9_rtcd.h"
+#include "vp9/common/vp9_entropy.h"
+#include "vpx/vpx_integer.h"
+
+using libvpx_test::ACMRandom;
+
+namespace {
+// Horizontally and Vertically need 32x32: 8  Coeffs preceding filtered section
+//                                         16 Coeffs within filtered section
+//                                         8  Coeffs following filtered section
+const int kNumCoeffs = 1024;
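+// With kNumCoeffs = 1024 the buffers below form a 32x32 block with pitch
+// p = kNumCoeffs / 32 = 32; the tests filter starting at s + 8 + p * 8,
+// i.e. 8 columns and 8 rows into the block, matching the layout above.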
+
+const int number_of_iterations = 10000;
+
+#if CONFIG_VP9_HIGHBITDEPTH
+typedef void (*loop_op_t)(uint16_t *s, int p, const uint8_t *blimit,
+                          const uint8_t *limit, const uint8_t *thresh,
+                          int count, int bd);
+typedef void (*dual_loop_op_t)(uint16_t *s, int p, const uint8_t *blimit0,
+                               const uint8_t *limit0, const uint8_t *thresh0,
+                               const uint8_t *blimit1, const uint8_t *limit1,
+                               const uint8_t *thresh1, int bd);
+#else
+typedef void (*loop_op_t)(uint8_t *s, int p, const uint8_t *blimit,
+                          const uint8_t *limit, const uint8_t *thresh,
+                          int count);
+typedef void (*dual_loop_op_t)(uint8_t *s, int p, const uint8_t *blimit0,
+                               const uint8_t *limit0, const uint8_t *thresh0,
+                               const uint8_t *blimit1, const uint8_t *limit1,
+                               const uint8_t *thresh1);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+typedef std::tr1::tuple<loop_op_t, loop_op_t, int> loop8_param_t;
+typedef std::tr1::tuple<dual_loop_op_t, dual_loop_op_t, int> dualloop8_param_t;
+
+#if HAVE_SSE2
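+// vp9_lpf_vertical_16 and its dual variant take no count argument, so they
+// are wrapped below to match the loop_op_t signature used by the
+// parameterized tests; the count parameter is ignored.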
+#if CONFIG_VP9_HIGHBITDEPTH
+void wrapper_vertical_16_sse2(uint16_t *s, int p, const uint8_t *blimit,
+                              const uint8_t *limit, const uint8_t *thresh,
+                              int count, int bd) {
+  vp9_highbd_lpf_vertical_16_sse2(s, p, blimit, limit, thresh, bd);
+}
+
+void wrapper_vertical_16_c(uint16_t *s, int p, const uint8_t *blimit,
+                           const uint8_t *limit, const uint8_t *thresh,
+                           int count, int bd) {
+  vp9_highbd_lpf_vertical_16_c(s, p, blimit, limit, thresh, bd);
+}
+
+void wrapper_vertical_16_dual_sse2(uint16_t *s, int p, const uint8_t *blimit,
+                                   const uint8_t *limit, const uint8_t *thresh,
+                                   int count, int bd) {
+  vp9_highbd_lpf_vertical_16_dual_sse2(s, p, blimit, limit, thresh, bd);
+}
+
+void wrapper_vertical_16_dual_c(uint16_t *s, int p, const uint8_t *blimit,
+                                const uint8_t *limit, const uint8_t *thresh,
+                                int count, int bd) {
+  vp9_highbd_lpf_vertical_16_dual_c(s, p, blimit, limit, thresh, bd);
+}
+#else
+void wrapper_vertical_16_sse2(uint8_t *s, int p, const uint8_t *blimit,
+                              const uint8_t *limit, const uint8_t *thresh,
+                              int count) {
+  vp9_lpf_vertical_16_sse2(s, p, blimit, limit, thresh);
+}
+
+void wrapper_vertical_16_c(uint8_t *s, int p, const uint8_t *blimit,
+                           const uint8_t *limit, const uint8_t *thresh,
+                           int count) {
+  vp9_lpf_vertical_16_c(s, p, blimit, limit, thresh);
+}
+
+void wrapper_vertical_16_dual_sse2(uint8_t *s, int p, const uint8_t *blimit,
+                                   const uint8_t *limit, const uint8_t *thresh,
+                                   int count) {
+  vp9_lpf_vertical_16_dual_sse2(s, p, blimit, limit, thresh);
+}
+
+void wrapper_vertical_16_dual_c(uint8_t *s, int p, const uint8_t *blimit,
+                                const uint8_t *limit, const uint8_t *thresh,
+                                int count) {
+  vp9_lpf_vertical_16_dual_c(s, p, blimit, limit, thresh);
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+#endif  // HAVE_SSE2
+
+class Loop8Test6Param : public ::testing::TestWithParam<loop8_param_t> {
+ public:
+  virtual ~Loop8Test6Param() {}
+  virtual void SetUp() {
+    loopfilter_op_ = GET_PARAM(0);
+    ref_loopfilter_op_ = GET_PARAM(1);
+    bit_depth_ = GET_PARAM(2);
+    mask_ = (1 << bit_depth_) - 1;
+  }
+
+  virtual void TearDown() { libvpx_test::ClearSystemState(); }
+
+ protected:
+  int bit_depth_;
+  int mask_;
+  loop_op_t loopfilter_op_;
+  loop_op_t ref_loopfilter_op_;
+};
+
+class Loop8Test9Param : public ::testing::TestWithParam<dualloop8_param_t> {
+ public:
+  virtual ~Loop8Test9Param() {}
+  virtual void SetUp() {
+    loopfilter_op_ = GET_PARAM(0);
+    ref_loopfilter_op_ = GET_PARAM(1);
+    bit_depth_ = GET_PARAM(2);
+    mask_ = (1 << bit_depth_) - 1;
+  }
+
+  virtual void TearDown() { libvpx_test::ClearSystemState(); }
+
+ protected:
+  int bit_depth_;
+  int mask_;
+  dual_loop_op_t loopfilter_op_;
+  dual_loop_op_t ref_loopfilter_op_;
+};
+
+TEST_P(Loop8Test6Param, OperationCheck) {
+  ACMRandom rnd(ACMRandom::DeterministicSeed());
+  const int count_test_block = number_of_iterations;
+#if CONFIG_VP9_HIGHBITDEPTH
+  int32_t bd = bit_depth_;
+  DECLARE_ALIGNED_ARRAY(16, uint16_t, s, kNumCoeffs);
+  DECLARE_ALIGNED_ARRAY(16, uint16_t, ref_s, kNumCoeffs);
+#else
+  DECLARE_ALIGNED_ARRAY(8, uint8_t, s, kNumCoeffs);
+  DECLARE_ALIGNED_ARRAY(8, uint8_t, ref_s, kNumCoeffs);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+  int err_count_total = 0;
+  int first_failure = -1;
+  for (int i = 0; i < count_test_block; ++i) {
+    int err_count = 0;
+    uint8_t tmp = rnd.Rand8();
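+    // Each filter threshold is replicated across all 16 bytes so SIMD
+    // versions can load it directly into a vector register.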
+    DECLARE_ALIGNED(16, const uint8_t, blimit[16]) = {
+        tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
+        tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp
+    };
+    tmp = rnd.Rand8();
+    DECLARE_ALIGNED(16, const uint8_t, limit[16])  = {
+        tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
+        tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp
+    };
+    tmp = rnd.Rand8();
+    DECLARE_ALIGNED(16, const uint8_t, thresh[16]) = {
+        tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
+        tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp
+    };
+    int32_t p = kNumCoeffs / 32;
+    int count = 1;
+
+    uint16_t tmp_s[kNumCoeffs];
+    int j = 0;
+    while (j < kNumCoeffs) {
+      uint8_t val = rnd.Rand8();
+      if (val & 0x80) {  // 50% chance to choose a new value.
+        tmp_s[j] = rnd.Rand16();
+        j++;
+      } else {  // 50% chance to repeat previous value in row X times.
+        int k = 0;
+        while (k++ < ((val & 0x1f) + 1) && j < kNumCoeffs) {
+          if (j < 1) {
+            tmp_s[j] = rnd.Rand16();
+          } else if (val & 0x20) {  // Increment by a value within the limit.
+            tmp_s[j] = (tmp_s[j - 1] + (*limit - 1));
+          } else {  // Decrement by a value within the limit.
+            tmp_s[j] = (tmp_s[j - 1] - (*limit - 1));
+          }
+          j++;
+        }
+      }
+    }
+    for (j = 0; j < kNumCoeffs; j++) {
+      if (i % 2) {
+        s[j] = tmp_s[j] & mask_;
+      } else {
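+        // Transposed access: walks the 32x32 pattern column-first so the
+        // same data also exercises the orthogonal filter direction.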
+        s[j] = tmp_s[p * (j % p) + j / p] & mask_;
+      }
+      ref_s[j] = s[j];
+    }
+#if CONFIG_VP9_HIGHBITDEPTH
+    ref_loopfilter_op_(ref_s + 8 + p * 8, p, blimit, limit, thresh, count, bd);
+    ASM_REGISTER_STATE_CHECK(
+        loopfilter_op_(s + 8 + p * 8, p, blimit, limit, thresh, count, bd));
+#else
+    ref_loopfilter_op_(ref_s + 8 + p * 8, p, blimit, limit, thresh, count);
+    ASM_REGISTER_STATE_CHECK(
+        loopfilter_op_(s + 8 + p * 8, p, blimit, limit, thresh, count));
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+    for (int j = 0; j < kNumCoeffs; ++j) {
+      err_count += ref_s[j] != s[j];
+    }
+    if (err_count && !err_count_total) {
+      first_failure = i;
+    }
+    err_count_total += err_count;
+  }
+  EXPECT_EQ(0, err_count_total)
+      << "Error: Loop8Test6Param, C output doesn't match SSE2 "
+         "loopfilter output. "
+      << "First failed at test case " << first_failure;
+}
+
+TEST_P(Loop8Test6Param, ValueCheck) {
+  ACMRandom rnd(ACMRandom::DeterministicSeed());
+  const int count_test_block = number_of_iterations;
+#if CONFIG_VP9_HIGHBITDEPTH
+  const int32_t bd = bit_depth_;
+  DECLARE_ALIGNED_ARRAY(16, uint16_t, s, kNumCoeffs);
+  DECLARE_ALIGNED_ARRAY(16, uint16_t, ref_s, kNumCoeffs);
+#else
+  DECLARE_ALIGNED_ARRAY(8, uint8_t, s, kNumCoeffs);
+  DECLARE_ALIGNED_ARRAY(8, uint8_t, ref_s, kNumCoeffs);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+  int err_count_total = 0;
+  int first_failure = -1;
+  for (int i = 0; i < count_test_block; ++i) {
+    int err_count = 0;
+    uint8_t tmp = rnd.Rand8();
+    DECLARE_ALIGNED(16, const uint8_t, blimit[16]) = {
+        tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
+        tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp
+    };
+    tmp = rnd.Rand8();
+    DECLARE_ALIGNED(16, const uint8_t, limit[16])  = {
+        tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
+        tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp
+    };
+    tmp = rnd.Rand8();
+    DECLARE_ALIGNED(16, const uint8_t, thresh[16]) = {
+        tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
+        tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp
+    };
+    int32_t p = kNumCoeffs / 32;
+    int count = 1;
+    for (int j = 0; j < kNumCoeffs; ++j) {
+      s[j] = rnd.Rand16() & mask_;
+      ref_s[j] = s[j];
+    }
+#if CONFIG_VP9_HIGHBITDEPTH
+    ref_loopfilter_op_(ref_s + 8 + p * 8, p, blimit, limit, thresh, count, bd);
+    ASM_REGISTER_STATE_CHECK(
+        loopfilter_op_(s + 8 + p * 8, p, blimit, limit, thresh, count, bd));
+#else
+    ref_loopfilter_op_(ref_s + 8 + p * 8, p, blimit, limit, thresh, count);
+    ASM_REGISTER_STATE_CHECK(
+        loopfilter_op_(s + 8 + p * 8, p, blimit, limit, thresh, count));
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+    for (int j = 0; j < kNumCoeffs; ++j) {
+      err_count += ref_s[j] != s[j];
+    }
+    if (err_count && !err_count_total) {
+      first_failure = i;
+    }
+    err_count_total += err_count;
+  }
+  EXPECT_EQ(0, err_count_total)
+      << "Error: Loop8Test6Param, C output doesn't match SSE2 "
+         "loopfilter output. "
+      << "First failed at test case " << first_failure;
+}
+
+TEST_P(Loop8Test9Param, OperationCheck) {
+  ACMRandom rnd(ACMRandom::DeterministicSeed());
+  const int count_test_block = number_of_iterations;
+#if CONFIG_VP9_HIGHBITDEPTH
+  const int32_t bd = bit_depth_;
+  DECLARE_ALIGNED_ARRAY(16, uint16_t, s, kNumCoeffs);
+  DECLARE_ALIGNED_ARRAY(16, uint16_t, ref_s, kNumCoeffs);
+#else
+  DECLARE_ALIGNED_ARRAY(8,  uint8_t,  s, kNumCoeffs);
+  DECLARE_ALIGNED_ARRAY(8,  uint8_t,  ref_s, kNumCoeffs);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+  int err_count_total = 0;
+  int first_failure = -1;
+  for (int i = 0; i < count_test_block; ++i) {
+    int err_count = 0;
+    uint8_t tmp = rnd.Rand8();
+    DECLARE_ALIGNED(16, const uint8_t, blimit0[16]) = {
+        tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
+        tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp
+    };
+    tmp = rnd.Rand8();
+    DECLARE_ALIGNED(16, const uint8_t, limit0[16])  = {
+        tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
+        tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp
+    };
+    tmp = rnd.Rand8();
+    DECLARE_ALIGNED(16, const uint8_t, thresh0[16]) = {
+        tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
+        tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp
+    };
+    tmp = rnd.Rand8();
+    DECLARE_ALIGNED(16, const uint8_t, blimit1[16]) = {
+        tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
+        tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp
+    };
+    tmp = rnd.Rand8();
+    DECLARE_ALIGNED(16, const uint8_t, limit1[16])  = {
+        tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
+        tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp
+    };
+    tmp = rnd.Rand8();
+    DECLARE_ALIGNED(16, const uint8_t, thresh1[16]) = {
+        tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
+        tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp
+    };
+    int32_t p = kNumCoeffs / 32;
+    uint16_t tmp_s[kNumCoeffs];
+    int j = 0;
+    const uint8_t limit = *limit0 < *limit1 ? *limit0 : *limit1;
+    while (j < kNumCoeffs) {
+      uint8_t val = rnd.Rand8();
+      if (val & 0x80) {  // 50% chance to choose a new value.
+        tmp_s[j] = rnd.Rand16();
+        j++;
+      } else {  // 50% chance to repeat previous value in row X times.
+        int k = 0;
+        while (k++ < ((val & 0x1f) + 1) && j < kNumCoeffs) {
+          if (j < 1) {
+            tmp_s[j] = rnd.Rand16();
+          } else if (val & 0x20) {  // Increment by a value within the limit.
+            tmp_s[j] = (tmp_s[j - 1] + (limit - 1));
+          } else {  // Decrement by a value within the limit.
+            tmp_s[j] = (tmp_s[j - 1] - (limit - 1));
+          }
+          j++;
+        }
+      }
+    }
+    for (j = 0; j < kNumCoeffs; j++) {
+      if (i % 2) {
+        s[j] = tmp_s[j] & mask_;
+      } else {
+        s[j] = tmp_s[p * (j % p) + j / p] & mask_;
+      }
+      ref_s[j] = s[j];
+    }
+#if CONFIG_VP9_HIGHBITDEPTH
+    ref_loopfilter_op_(ref_s + 8 + p * 8, p, blimit0, limit0, thresh0,
+                       blimit1, limit1, thresh1, bd);
+    ASM_REGISTER_STATE_CHECK(
+        loopfilter_op_(s + 8 + p * 8, p, blimit0, limit0, thresh0,
+                       blimit1, limit1, thresh1, bd));
+#else
+    ref_loopfilter_op_(ref_s + 8 + p * 8, p, blimit0, limit0, thresh0,
+                       blimit1, limit1, thresh1);
+    ASM_REGISTER_STATE_CHECK(
+        loopfilter_op_(s + 8 + p * 8, p, blimit0, limit0, thresh0,
+                       blimit1, limit1, thresh1));
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+    for (int j = 0; j < kNumCoeffs; ++j) {
+      err_count += ref_s[j] != s[j];
+    }
+    if (err_count && !err_count_total) {
+      first_failure = i;
+    }
+    err_count_total += err_count;
+  }
+  EXPECT_EQ(0, err_count_total)
+      << "Error: Loop8Test9Param, C output doesn't match SSE2 "
+         "loopfilter output. "
+      << "First failed at test case " << first_failure;
+}
+
+TEST_P(Loop8Test9Param, ValueCheck) {
+  ACMRandom rnd(ACMRandom::DeterministicSeed());
+  const int count_test_block = number_of_iterations;
+#if CONFIG_VP9_HIGHBITDEPTH
+  DECLARE_ALIGNED_ARRAY(16, uint16_t, s, kNumCoeffs);
+  DECLARE_ALIGNED_ARRAY(16, uint16_t, ref_s, kNumCoeffs);
+#else
+  DECLARE_ALIGNED_ARRAY(8,  uint8_t, s, kNumCoeffs);
+  DECLARE_ALIGNED_ARRAY(8,  uint8_t, ref_s, kNumCoeffs);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+  int err_count_total = 0;
+  int first_failure = -1;
+  for (int i = 0; i < count_test_block; ++i) {
+    int err_count = 0;
+    uint8_t tmp = rnd.Rand8();
+    DECLARE_ALIGNED(16, const uint8_t, blimit0[16]) = {
+        tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
+        tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp
+    };
+    tmp = rnd.Rand8();
+    DECLARE_ALIGNED(16, const uint8_t, limit0[16])  = {
+        tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
+        tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp
+    };
+    tmp = rnd.Rand8();
+    DECLARE_ALIGNED(16, const uint8_t, thresh0[16]) = {
+        tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
+        tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp
+    };
+    tmp = rnd.Rand8();
+    DECLARE_ALIGNED(16, const uint8_t, blimit1[16]) = {
+        tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
+        tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp
+    };
+    tmp = rnd.Rand8();
+    DECLARE_ALIGNED(16, const uint8_t, limit1[16])  = {
+        tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
+        tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp
+    };
+    tmp = rnd.Rand8();
+    DECLARE_ALIGNED(16, const uint8_t, thresh1[16]) = {
+        tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
+        tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp
+    };
+    int32_t p = kNumCoeffs / 32;  // TODO(pdlf) can we have non-square here?
+    for (int j = 0; j < kNumCoeffs; ++j) {
+      s[j] = rnd.Rand16() & mask_;
+      ref_s[j] = s[j];
+    }
+#if CONFIG_VP9_HIGHBITDEPTH
+    const int32_t bd = bit_depth_;
+    ref_loopfilter_op_(ref_s + 8 + p * 8, p, blimit0, limit0, thresh0,
+                       blimit1, limit1, thresh1, bd);
+    ASM_REGISTER_STATE_CHECK(
+        loopfilter_op_(s + 8 + p * 8, p, blimit0, limit0,
+                       thresh0, blimit1, limit1, thresh1, bd));
+#else
+    ref_loopfilter_op_(ref_s + 8 + p * 8, p, blimit0, limit0, thresh0,
+                       blimit1, limit1, thresh1);
+    ASM_REGISTER_STATE_CHECK(
+        loopfilter_op_(s + 8 + p * 8, p, blimit0, limit0, thresh0,
+                       blimit1, limit1, thresh1));
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+    for (int j = 0; j < kNumCoeffs; ++j) {
+      err_count += ref_s[j] != s[j];
+    }
+    if (err_count && !err_count_total) {
+      first_failure = i;
+    }
+    err_count_total += err_count;
+  }
+  EXPECT_EQ(0, err_count_total)
+      << "Error: Loop8Test9Param, C output doesn't match SSE2"
+         "loopfilter output. "
+      << "First failed at test case " << first_failure;
+}
+
+using std::tr1::make_tuple;
+
+#if HAVE_SSE2
+#if CONFIG_VP9_HIGHBITDEPTH
+INSTANTIATE_TEST_CASE_P(
+    SSE2_C_COMPARE_SINGLE, Loop8Test6Param,
+    ::testing::Values(
+        make_tuple(&vp9_highbd_lpf_horizontal_4_sse2,
+                   &vp9_highbd_lpf_horizontal_4_c, 8),
+        make_tuple(&vp9_highbd_lpf_vertical_4_sse2,
+                   &vp9_highbd_lpf_vertical_4_c, 8),
+        make_tuple(&vp9_highbd_lpf_horizontal_8_sse2,
+                   &vp9_highbd_lpf_horizontal_8_c, 8),
+        make_tuple(&vp9_highbd_lpf_horizontal_16_sse2,
+                   &vp9_highbd_lpf_horizontal_16_c, 8),
+        make_tuple(&vp9_highbd_lpf_vertical_8_sse2,
+                   &vp9_highbd_lpf_vertical_8_c, 8),
+        make_tuple(&wrapper_vertical_16_sse2,
+                   &wrapper_vertical_16_c, 8),
+        make_tuple(&vp9_highbd_lpf_horizontal_4_sse2,
+                   &vp9_highbd_lpf_horizontal_4_c, 10),
+        make_tuple(&vp9_highbd_lpf_vertical_4_sse2,
+                   &vp9_highbd_lpf_vertical_4_c, 10),
+        make_tuple(&vp9_highbd_lpf_horizontal_8_sse2,
+                   &vp9_highbd_lpf_horizontal_8_c, 10),
+        make_tuple(&vp9_highbd_lpf_horizontal_16_sse2,
+                   &vp9_highbd_lpf_horizontal_16_c, 10),
+        make_tuple(&vp9_highbd_lpf_vertical_8_sse2,
+                   &vp9_highbd_lpf_vertical_8_c, 10),
+        make_tuple(&wrapper_vertical_16_sse2,
+                   &wrapper_vertical_16_c, 10),
+        make_tuple(&vp9_highbd_lpf_horizontal_4_sse2,
+                   &vp9_highbd_lpf_horizontal_4_c, 12),
+        make_tuple(&vp9_highbd_lpf_vertical_4_sse2,
+                   &vp9_highbd_lpf_vertical_4_c, 12),
+        make_tuple(&vp9_highbd_lpf_horizontal_8_sse2,
+                   &vp9_highbd_lpf_horizontal_8_c, 12),
+        make_tuple(&vp9_highbd_lpf_horizontal_16_sse2,
+                   &vp9_highbd_lpf_horizontal_16_c, 12),
+        make_tuple(&vp9_highbd_lpf_vertical_8_sse2,
+                   &vp9_highbd_lpf_vertical_8_c, 12),
+        make_tuple(&wrapper_vertical_16_sse2,
+                   &wrapper_vertical_16_c, 12)));
+#else
+INSTANTIATE_TEST_CASE_P(
+    SSE2_C_COMPARE_SINGLE, Loop8Test6Param,
+    ::testing::Values(
+        make_tuple(&vp9_lpf_horizontal_8_sse2, &vp9_lpf_horizontal_8_c, 8),
+        make_tuple(&vp9_lpf_horizontal_16_sse2, &vp9_lpf_horizontal_16_c, 8),
+        make_tuple(&vp9_lpf_vertical_8_sse2, &vp9_lpf_vertical_8_c, 8)));
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+#endif  // HAVE_SSE2
+
+#if HAVE_SSE2
+#if CONFIG_VP9_HIGHBITDEPTH
+INSTANTIATE_TEST_CASE_P(
+    SSE2_C_COMPARE_DUAL, Loop8Test6Param,
+    ::testing::Values(
+        make_tuple(&wrapper_vertical_16_dual_sse2,
+                   &wrapper_vertical_16_dual_c, 8),
+        make_tuple(&wrapper_vertical_16_dual_sse2,
+                   &wrapper_vertical_16_dual_c, 10),
+        make_tuple(&wrapper_vertical_16_dual_sse2,
+                   &wrapper_vertical_16_dual_c, 12)));
+#else
+INSTANTIATE_TEST_CASE_P(
+    SSE2_C_COMPARE_DUAL, Loop8Test6Param,
+    ::testing::Values(
+        make_tuple(&wrapper_vertical_16_sse2, &wrapper_vertical_16_c, 8)));
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+#endif  // HAVE_SSE2
+
+#if HAVE_SSE2
+#if CONFIG_VP9_HIGHBITDEPTH
+INSTANTIATE_TEST_CASE_P(
+    SSE2_C_COMPARE_DUAL, Loop8Test9Param,
+    ::testing::Values(
+        make_tuple(&vp9_highbd_lpf_horizontal_4_dual_sse2,
+                   &vp9_highbd_lpf_horizontal_4_dual_c, 8),
+        make_tuple(&vp9_highbd_lpf_horizontal_8_dual_sse2,
+                   &vp9_highbd_lpf_horizontal_8_dual_c, 8),
+        make_tuple(&vp9_highbd_lpf_vertical_4_dual_sse2,
+                   &vp9_highbd_lpf_vertical_4_dual_c, 8),
+        make_tuple(&vp9_highbd_lpf_vertical_8_dual_sse2,
+                   &vp9_highbd_lpf_vertical_8_dual_c, 8),
+        make_tuple(&vp9_highbd_lpf_horizontal_4_dual_sse2,
+                   &vp9_highbd_lpf_horizontal_4_dual_c, 10),
+        make_tuple(&vp9_highbd_lpf_horizontal_8_dual_sse2,
+                   &vp9_highbd_lpf_horizontal_8_dual_c, 10),
+        make_tuple(&vp9_highbd_lpf_vertical_4_dual_sse2,
+                   &vp9_highbd_lpf_vertical_4_dual_c, 10),
+        make_tuple(&vp9_highbd_lpf_vertical_8_dual_sse2,
+                   &vp9_highbd_lpf_vertical_8_dual_c, 10),
+        make_tuple(&vp9_highbd_lpf_horizontal_4_dual_sse2,
+                   &vp9_highbd_lpf_horizontal_4_dual_c, 12),
+        make_tuple(&vp9_highbd_lpf_horizontal_8_dual_sse2,
+                   &vp9_highbd_lpf_horizontal_8_dual_c, 12),
+        make_tuple(&vp9_highbd_lpf_vertical_4_dual_sse2,
+                   &vp9_highbd_lpf_vertical_4_dual_c, 12),
+        make_tuple(&vp9_highbd_lpf_vertical_8_dual_sse2,
+                   &vp9_highbd_lpf_vertical_8_dual_c, 12)));
+#else
+INSTANTIATE_TEST_CASE_P(
+    SSE2_C_COMPARE_DUAL, Loop8Test9Param,
+    ::testing::Values(
+        make_tuple(&vp9_lpf_horizontal_4_dual_sse2,
+                   &vp9_lpf_horizontal_4_dual_c, 8),
+        make_tuple(&vp9_lpf_horizontal_8_dual_sse2,
+                   &vp9_lpf_horizontal_8_dual_c, 8),
+        make_tuple(&vp9_lpf_vertical_4_dual_sse2,
+                   &vp9_lpf_vertical_4_dual_c, 8),
+        make_tuple(&vp9_lpf_vertical_8_dual_sse2,
+                   &vp9_lpf_vertical_8_dual_c, 8)));
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+#endif  // HAVE_SSE2
+}  // namespace
diff --git a/source/libvpx/test/partial_idct_test.cc b/source/libvpx/test/partial_idct_test.cc
index 9c24fee..536273e 100644
--- a/source/libvpx/test/partial_idct_test.cc
+++ b/source/libvpx/test/partial_idct_test.cc
@@ -260,7 +260,7 @@
                    TX_4X4, 1)));
 #endif
 
-#if HAVE_SSE2 && !CONFIG_VP9_HIGHBITDEPTH
+#if HAVE_SSE2 && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
 INSTANTIATE_TEST_CASE_P(
     SSE2, PartialIDctTest,
     ::testing::Values(
@@ -294,7 +294,8 @@
                    TX_4X4, 1)));
 #endif
 
-#if HAVE_SSSE3 && ARCH_X86_64 && !CONFIG_VP9_HIGHBITDEPTH
+#if HAVE_SSSE3 && ARCH_X86_64 && !CONFIG_VP9_HIGHBITDEPTH && \
+    !CONFIG_EMULATE_HARDWARE
 INSTANTIATE_TEST_CASE_P(
     SSSE3_64, PartialIDctTest,
     ::testing::Values(
@@ -304,7 +305,7 @@
                    TX_8X8, 12)));
 #endif
 
-#if HAVE_SSSE3 && !CONFIG_VP9_HIGHBITDEPTH
+#if HAVE_SSSE3 && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
 INSTANTIATE_TEST_CASE_P(
     SSSE3, PartialIDctTest,
     ::testing::Values(
diff --git a/source/libvpx/test/sad_test.cc b/source/libvpx/test/sad_test.cc
index 5377c1e..c7042fe 100644
--- a/source/libvpx/test/sad_test.cc
+++ b/source/libvpx/test/sad_test.cc
@@ -625,6 +625,20 @@
 
 #if HAVE_AVX2
 #if CONFIG_VP9_ENCODER
+const SadMxNVp9Func sad_64x64_avx2_vp9 = vp9_sad64x64_avx2;
+const SadMxNVp9Func sad_64x32_avx2_vp9 = vp9_sad64x32_avx2;
+const SadMxNVp9Func sad_32x64_avx2_vp9 = vp9_sad32x64_avx2;
+const SadMxNVp9Func sad_32x32_avx2_vp9 = vp9_sad32x32_avx2;
+const SadMxNVp9Func sad_32x16_avx2_vp9 = vp9_sad32x16_avx2;
+const SadMxNVp9Param avx2_vp9_tests[] = {
+  make_tuple(64, 64, sad_64x64_avx2_vp9),
+  make_tuple(64, 32, sad_64x32_avx2_vp9),
+  make_tuple(32, 64, sad_32x64_avx2_vp9),
+  make_tuple(32, 32, sad_32x32_avx2_vp9),
+  make_tuple(32, 16, sad_32x16_avx2_vp9),
+};
+INSTANTIATE_TEST_CASE_P(AVX2, SADVP9Test, ::testing::ValuesIn(avx2_vp9_tests));
+
 const SadMxNx4Func sad_64x64x4d_avx2 = vp9_sad64x64x4d_avx2;
 const SadMxNx4Func sad_32x32x4d_avx2 = vp9_sad32x32x4d_avx2;
 INSTANTIATE_TEST_CASE_P(AVX2, SADx4Test, ::testing::Values(
diff --git a/source/libvpx/test/svc_test.cc b/source/libvpx/test/svc_test.cc
index fdde702..67e83e3 100644
--- a/source/libvpx/test/svc_test.cc
+++ b/source/libvpx/test/svc_test.cc
@@ -225,10 +225,9 @@
     EXPECT_EQ(received_frames, n);
   }
 
-  void DropLayersAndMakeItVP9Comaptible(struct vpx_fixed_buf *const inputs,
-                                        const int num_super_frames,
-                                        const int remained_spatial_layers,
-                                        const bool is_multiple_frame_contexts) {
+  void DropEnhancementLayers(struct vpx_fixed_buf *const inputs,
+                             const int num_super_frames,
+                             const int remained_spatial_layers) {
     ASSERT_TRUE(inputs != NULL);
     ASSERT_GT(num_super_frames, 0);
     ASSERT_GT(remained_spatial_layers, 0);
@@ -250,45 +249,6 @@
       if (frame_count == 0) {
         // There's no super frame but only a single frame.
         ASSERT_EQ(1, remained_spatial_layers);
-        if (is_multiple_frame_contexts) {
-          // Make a new super frame.
-          uint8_t marker = 0xc1;
-          unsigned int mask;
-          int mag;
-
-          // Choose the magnitude.
-          for (mag = 0, mask = 0xff; mag < 4; ++mag) {
-            if (inputs[i].sz < mask)
-              break;
-            mask <<= 8;
-            mask |= 0xff;
-          }
-          marker |= mag << 3;
-          int index_sz = 2 + (mag + 1) * 2;
-
-          inputs[i].buf = realloc(inputs[i].buf, inputs[i].sz + index_sz + 16);
-          ASSERT_TRUE(inputs[i].buf != NULL);
-          uint8_t *frame_data = static_cast<uint8_t*>(inputs[i].buf);
-          frame_data[0] &= ~2;      // Set the show_frame flag to 0.
-          frame_data += inputs[i].sz;
-          // Add an one byte frame with show_existing_frame.
-          *frame_data++ = 0x88;
-
-          // Write the super frame index.
-          *frame_data++ = marker;
-
-          frame_sizes[0] = inputs[i].sz;
-          frame_sizes[1] = 1;
-          for (int j = 0; j < 2; ++j) {
-            unsigned int this_sz = frame_sizes[j];
-            for (int k = 0; k <= mag; k++) {
-              *frame_data++ = this_sz & 0xff;
-              this_sz >>= 8;
-            }
-          }
-          *frame_data++ = marker;
-          inputs[i].sz += index_sz + 1;
-        }
       } else {
         // Found a super frame.
         uint8_t *frame_data = static_cast<uint8_t*>(inputs[i].buf);
@@ -304,16 +264,13 @@
         }
         ASSERT_LT(frame, frame_count) << "Couldn't find a visible frame. "
             << "remained_spatial_layers: " << remained_spatial_layers
-            << "    super_frame: " << i
-            << "    is_multiple_frame_context: " << is_multiple_frame_contexts;
-        if (frame == frame_count - 1 && !is_multiple_frame_contexts)
+            << "    super_frame: " << i;
+        if (frame == frame_count - 1)
           continue;
 
         frame_data += frame_sizes[frame];
 
         // We need to add one more frame for multiple frame contexts.
-        if (is_multiple_frame_contexts)
-          ++frame;
         uint8_t marker =
             static_cast<const uint8_t*>(inputs[i].buf)[inputs[i].sz - 1];
         const uint32_t mag = ((marker >> 3) & 0x3) + 1;
@@ -323,35 +280,14 @@
         marker |= frame;
 
         // Copy existing frame sizes.
-        memmove(frame_data + (is_multiple_frame_contexts ? 2 : 1),
-                frame_start + inputs[i].sz - index_sz + 1, new_index_sz - 2);
-        if (is_multiple_frame_contexts) {
-          // Add a one byte frame with flag show_existing_frame.
-          *frame_data++ = 0x88 | (remained_spatial_layers - 1);
-        }
+        memmove(frame_data + 1, frame_start + inputs[i].sz - index_sz + 1,
+                new_index_sz - 2);
         // New marker.
         frame_data[0] = marker;
         frame_data += (mag * (frame + 1) + 1);
 
-        if (is_multiple_frame_contexts) {
-          // Write the frame size for the one byte frame.
-          frame_data -= mag;
-          *frame_data++ = 1;
-          for (uint32_t j = 1; j < mag; ++j) {
-            *frame_data++ = 0;
-          }
-        }
-
         *frame_data++ = marker;
         inputs[i].sz = frame_data - frame_start;
-
-        if (is_multiple_frame_contexts) {
-          // Change the show frame flag to 0 for all frames.
-          for (int j = 0; j < frame; ++j) {
-            frame_start[0] &= ~2;
-            frame_start += frame_sizes[j];
-          }
-        }
       }
     }
   }
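
The marker arithmetic in DropEnhancementLayers follows the VP9 superframe
index layout: the final byte of a superframe is a marker with bit pattern 110
in its top three bits, the bytes-per-size minus one in bits 3-4, and the frame
count minus one in bits 0-2; the same marker byte also opens the index. A
minimal parsing sketch of that layout (not part of this diff; bounds checking
trimmed for brevity):

  #include <stddef.h>
  #include <stdint.h>

  // Returns the number of frames in the superframe, or 0 if buf does not
  // end in a valid index. sizes[] receives each frame's length in bytes.
  static int ParseSuperframeIndex(const uint8_t *buf, size_t sz,
                                  uint32_t sizes[8]) {
    const uint8_t marker = buf[sz - 1];
    if ((marker & 0xe0) != 0xc0) return 0;           // expect 110xxxxx
    const uint32_t mag = ((marker >> 3) & 0x3) + 1;  // bytes per size field
    const int frames = (marker & 0x7) + 1;
    const size_t index_sz = 2 + mag * frames;        // two markers + sizes
    if (sz < index_sz || buf[sz - index_sz] != marker) return 0;
    const uint8_t *x = buf + sz - index_sz + 1;      // skip leading marker
    for (int i = 0; i < frames; ++i) {
      uint32_t this_sz = 0;
      for (uint32_t j = 0; j < mag; ++j)
        this_sz |= static_cast<uint32_t>(*x++) << (8 * j);  // little endian
      sizes[i] = this_sz;
    }
    return frames;
  }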
@@ -555,7 +491,7 @@
   vpx_fixed_buf outputs[10];
   memset(&outputs[0], 0, sizeof(outputs));
   Pass2EncodeNFrames(&stats_buf, 10, 2, &outputs[0]);
-  DropLayersAndMakeItVP9Comaptible(&outputs[0], 10, 1, false);
+  DropEnhancementLayers(&outputs[0], 10, 1);
   DecodeNFrames(&outputs[0], 10);
   FreeBitstreamBuffers(&outputs[0], 10);
 }
@@ -573,13 +509,13 @@
   Pass2EncodeNFrames(&stats_buf, 10, 5, &outputs[0]);
 
   DecodeNFrames(&outputs[0], 10);
-  DropLayersAndMakeItVP9Comaptible(&outputs[0], 10, 4, false);
+  DropEnhancementLayers(&outputs[0], 10, 4);
   DecodeNFrames(&outputs[0], 10);
-  DropLayersAndMakeItVP9Comaptible(&outputs[0], 10, 3, false);
+  DropEnhancementLayers(&outputs[0], 10, 3);
   DecodeNFrames(&outputs[0], 10);
-  DropLayersAndMakeItVP9Comaptible(&outputs[0], 10, 2, false);
+  DropEnhancementLayers(&outputs[0], 10, 2);
   DecodeNFrames(&outputs[0], 10);
-  DropLayersAndMakeItVP9Comaptible(&outputs[0], 10, 1, false);
+  DropEnhancementLayers(&outputs[0], 10, 1);
   DecodeNFrames(&outputs[0], 10);
 
   FreeBitstreamBuffers(&outputs[0], 10);
@@ -616,9 +552,9 @@
   memset(&outputs[0], 0, sizeof(outputs));
   Pass2EncodeNFrames(&stats_buf, 20, 3, &outputs[0]);
   DecodeNFrames(&outputs[0], 20);
-  DropLayersAndMakeItVP9Comaptible(&outputs[0], 20, 2, false);
+  DropEnhancementLayers(&outputs[0], 20, 2);
   DecodeNFrames(&outputs[0], 20);
-  DropLayersAndMakeItVP9Comaptible(&outputs[0], 20, 1, false);
+  DropEnhancementLayers(&outputs[0], 20, 1);
   DecodeNFrames(&outputs[0], 20);
 
   FreeBitstreamBuffers(&outputs[0], 20);
@@ -649,7 +585,6 @@
   vpx_fixed_buf outputs[10];
   memset(&outputs[0], 0, sizeof(outputs));
   Pass2EncodeNFrames(&stats_buf, 10, 2, &outputs[0]);
-  DropLayersAndMakeItVP9Comaptible(&outputs[0], 10, 2, true);
   DecodeNFrames(&outputs[0], 10);
   FreeBitstreamBuffers(&outputs[0], 10);
 }
@@ -667,7 +602,7 @@
   vpx_fixed_buf outputs[10];
   memset(&outputs[0], 0, sizeof(outputs));
   Pass2EncodeNFrames(&stats_buf, 10, 2, &outputs[0]);
-  DropLayersAndMakeItVP9Comaptible(&outputs[0], 10, 1, true);
+  DropEnhancementLayers(&outputs[0], 10, 1);
   DecodeNFrames(&outputs[0], 10);
   FreeBitstreamBuffers(&outputs[0], 10);
 }
@@ -686,7 +621,6 @@
   vpx_fixed_buf outputs[10];
   memset(&outputs[0], 0, sizeof(outputs));
   Pass2EncodeNFrames(&stats_buf, 10, 2, &outputs[0]);
-  DropLayersAndMakeItVP9Comaptible(&outputs[0], 10, 2, true);
   DecodeNFrames(&outputs[0], 10);
   FreeBitstreamBuffers(&outputs[0], 10);
 }
@@ -707,32 +641,13 @@
   memset(&outputs[0], 0, sizeof(outputs));
   Pass2EncodeNFrames(&stats_buf, 10, 3, &outputs[0]);
 
-  vpx_fixed_buf outputs_new[10];
-  for (int i = 0; i < 10; ++i) {
-    outputs_new[i].buf = malloc(outputs[i].sz + 16);
-    ASSERT_TRUE(outputs_new[i].buf != NULL);
-    memcpy(outputs_new[i].buf, outputs[i].buf, outputs[i].sz);
-    outputs_new[i].sz = outputs[i].sz;
-  }
-  DropLayersAndMakeItVP9Comaptible(&outputs_new[0], 10, 3, true);
-  DecodeNFrames(&outputs_new[0], 10);
-
-  for (int i = 0; i < 10; ++i) {
-    memcpy(outputs_new[i].buf, outputs[i].buf, outputs[i].sz);
-    outputs_new[i].sz = outputs[i].sz;
-  }
-  DropLayersAndMakeItVP9Comaptible(&outputs_new[0], 10, 2, true);
-  DecodeNFrames(&outputs_new[0], 10);
-
-  for (int i = 0; i < 10; ++i) {
-    memcpy(outputs_new[i].buf, outputs[i].buf, outputs[i].sz);
-    outputs_new[i].sz = outputs[i].sz;
-  }
-  DropLayersAndMakeItVP9Comaptible(&outputs_new[0], 10, 1, true);
-  DecodeNFrames(&outputs_new[0], 10);
+  DecodeNFrames(&outputs[0], 10);
+  DropEnhancementLayers(&outputs[0], 10, 2);
+  DecodeNFrames(&outputs[0], 10);
+  DropEnhancementLayers(&outputs[0], 10, 1);
+  DecodeNFrames(&outputs[0], 10);
 
   FreeBitstreamBuffers(&outputs[0], 10);
-  FreeBitstreamBuffers(&outputs_new[0], 10);
 }
 
 TEST_F(SvcTest, TwoPassEncode2TemporalLayers) {
@@ -769,7 +684,6 @@
   vpx_fixed_buf outputs[10];
   memset(&outputs[0], 0, sizeof(outputs));
   Pass2EncodeNFrames(&stats_buf, 10, 1, &outputs[0]);
-  DropLayersAndMakeItVP9Comaptible(&outputs[0], 10, 1, true);
   DecodeNFrames(&outputs[0], 10);
   FreeBitstreamBuffers(&outputs[0], 10);
 }
@@ -814,7 +728,6 @@
   vpx_fixed_buf outputs[10];
   memset(&outputs[0], 0, sizeof(outputs));
   Pass2EncodeNFrames(&stats_buf, 10, 1, &outputs[0]);
-  DropLayersAndMakeItVP9Comaptible(&outputs[0], 10, 1, true);
 
   vpx_fixed_buf base_layer[5];
   for (int i = 0; i < 5; ++i)
diff --git a/source/libvpx/test/test-data.mk b/source/libvpx/test/test-data.mk
new file mode 100644
index 0000000..e4dae3a
--- /dev/null
+++ b/source/libvpx/test/test-data.mk
@@ -0,0 +1,726 @@
+LIBVPX_TEST_SRCS-yes += test-data.mk
+
+# Encoder test source
+LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += hantro_collage_w352h288.yuv
+LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += hantro_odd.yuv
+
+LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += park_joy_90p_10_420.y4m
+LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += park_joy_90p_10_422.y4m
+LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += park_joy_90p_10_444.y4m
+LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += park_joy_90p_10_440.yuv
+LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += park_joy_90p_12_420.y4m
+LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += park_joy_90p_12_422.y4m
+LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += park_joy_90p_12_444.y4m
+LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += park_joy_90p_12_440.yuv
+LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += park_joy_90p_8_420.y4m
+LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += park_joy_90p_8_422.y4m
+LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += park_joy_90p_8_444.y4m
+LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += park_joy_90p_8_440.yuv
+
+LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += rush_hour_444.y4m
+LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += screendata.y4m
+
+# Test vectors
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-001.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-001.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-002.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-002.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-003.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-003.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-004.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-004.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-005.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-005.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-006.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-006.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-007.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-007.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-008.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-008.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-009.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-009.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-010.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-010.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-011.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-011.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-012.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-012.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-013.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-013.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-014.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-014.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-015.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-015.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-016.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-016.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-017.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-017.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-018.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-018.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-01-intra-1400.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-01-intra-1400.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-01-intra-1411.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-01-intra-1411.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-01-intra-1416.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-01-intra-1416.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-01-intra-1417.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-01-intra-1417.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-02-inter-1402.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-02-inter-1402.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-02-inter-1412.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-02-inter-1412.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-02-inter-1418.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-02-inter-1418.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-02-inter-1424.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-02-inter-1424.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-01.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-01.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-02.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-02.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-03.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-03.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-04.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-04.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1401.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1401.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1403.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1403.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1407.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1407.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1408.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1408.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1409.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1409.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1410.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1410.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1413.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1413.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1414.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1414.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1415.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1415.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1425.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1425.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1426.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1426.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1427.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1427.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1432.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1432.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1435.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1435.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1436.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1436.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1437.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1437.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1441.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1441.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1442.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1442.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-04-partitions-1404.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-04-partitions-1404.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-04-partitions-1405.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-04-partitions-1405.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-04-partitions-1406.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-04-partitions-1406.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-05-sharpness-1428.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-05-sharpness-1428.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-05-sharpness-1429.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-05-sharpness-1429.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-05-sharpness-1430.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-05-sharpness-1430.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-05-sharpness-1431.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-05-sharpness-1431.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-05-sharpness-1433.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-05-sharpness-1433.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-05-sharpness-1434.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-05-sharpness-1434.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-05-sharpness-1438.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-05-sharpness-1438.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-05-sharpness-1439.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-05-sharpness-1439.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-05-sharpness-1440.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-05-sharpness-1440.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-05-sharpness-1443.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-05-sharpness-1443.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-06-smallsize.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-06-smallsize.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-00.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-00.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-01.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-01.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-02.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-02.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-03.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-03.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-04.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-04.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-05.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-05.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-06.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-06.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-07.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-07.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-08.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-08.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-09.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-09.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-10.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-10.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-11.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-11.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-12.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-12.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-13.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-13.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-14.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-14.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-15.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-15.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-16.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-16.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-17.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-17.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-18.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-18.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-19.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-19.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-20.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-20.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-21.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-21.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-22.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-22.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-23.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-23.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-24.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-24.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-25.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-25.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-26.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-26.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-27.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-27.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-28.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-28.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-29.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-29.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-30.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-30.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-31.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-31.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-32.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-32.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-33.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-33.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-34.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-34.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-35.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-35.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-36.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-36.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-37.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-37.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-38.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-38.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-39.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-39.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-40.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-40.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-41.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-41.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-42.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-42.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-43.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-43.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-44.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-44.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-45.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-45.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-46.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-46.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-47.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-47.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-48.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-48.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-49.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-49.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-50.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-50.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-51.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-51.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-52.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-52.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-53.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-53.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-54.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-54.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-55.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-55.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-56.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-56.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-57.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-57.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-58.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-58.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-59.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-59.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-60.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-60.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-61.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-61.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-62.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-62.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-63.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-63.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-01-sharpness-1.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-01-sharpness-1.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-01-sharpness-2.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-01-sharpness-2.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-01-sharpness-3.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-01-sharpness-3.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-01-sharpness-4.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-01-sharpness-4.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-01-sharpness-5.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-01-sharpness-5.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-01-sharpness-6.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-01-sharpness-6.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-01-sharpness-7.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-01-sharpness-7.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-08x08.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-08x08.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-08x10.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-08x10.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-08x16.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-08x16.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-08x18.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-08x18.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-08x32.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-08x32.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-08x34.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-08x34.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-08x64.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-08x64.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-08x66.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-08x66.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-10x08.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-10x08.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-10x10.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-10x10.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-10x16.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-10x16.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-10x18.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-10x18.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-10x32.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-10x32.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-10x34.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-10x34.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-10x64.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-10x64.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-10x66.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-10x66.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-16x08.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-16x08.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-16x10.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-16x10.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-16x16.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-16x16.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-16x18.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-16x18.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-16x32.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-16x32.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-16x34.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-16x34.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-16x64.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-16x64.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-16x66.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-16x66.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-18x08.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-18x08.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-18x10.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-18x10.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-18x16.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-18x16.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-18x18.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-18x18.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-18x32.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-18x32.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-18x34.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-18x34.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-18x64.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-18x64.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-18x66.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-18x66.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-32x08.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-32x08.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-32x10.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-32x10.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-32x16.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-32x16.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-32x18.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-32x18.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-32x32.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-32x32.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-32x34.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-32x34.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-32x64.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-32x64.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-32x66.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-32x66.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-34x08.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-34x08.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-34x10.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-34x10.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-34x16.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-34x16.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-34x18.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-34x18.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-34x32.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-34x32.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-34x34.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-34x34.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-34x64.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-34x64.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-34x66.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-34x66.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-64x08.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-64x08.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-64x10.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-64x10.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-64x16.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-64x16.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-64x18.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-64x18.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-64x32.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-64x32.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-64x34.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-64x34.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-64x64.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-64x64.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-64x66.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-64x66.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-66x08.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-66x08.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-66x10.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-66x10.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-66x16.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-66x16.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-66x18.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-66x18.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-66x32.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-66x32.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-66x34.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-66x34.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-66x64.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-66x64.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-66x66.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-66x66.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-lf-1920x1080.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-lf-1920x1080.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-deltaq.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-deltaq.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-196x196.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-196x196.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-196x198.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-196x198.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-196x200.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-196x200.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-196x202.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-196x202.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-196x208.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-196x208.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-196x210.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-196x210.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-196x224.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-196x224.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-196x226.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-196x226.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-198x196.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-198x196.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-198x198.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-198x198.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-198x200.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-198x200.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-198x202.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-198x202.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-198x208.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-198x208.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-198x210.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-198x210.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-198x224.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-198x224.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-198x226.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-198x226.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-200x196.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-200x196.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-200x198.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-200x198.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-200x200.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-200x200.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-200x202.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-200x202.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-200x208.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-200x208.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-200x210.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-200x210.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-200x224.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-200x224.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-200x226.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-200x226.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-202x196.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-202x196.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-202x198.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-202x198.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-202x200.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-202x200.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-202x202.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-202x202.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-202x208.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-202x208.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-202x210.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-202x210.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-202x224.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-202x224.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-202x226.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-202x226.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-208x196.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-208x196.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-208x198.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-208x198.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-208x200.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-208x200.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-208x202.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-208x202.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-208x208.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-208x208.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-208x210.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-208x210.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-208x224.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-208x224.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-208x226.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-208x226.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-210x196.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-210x196.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-210x198.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-210x198.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-210x200.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-210x200.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-210x202.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-210x202.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-210x208.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-210x208.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-210x210.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-210x210.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-210x224.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-210x224.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-210x226.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-210x226.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-224x196.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-224x196.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-224x198.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-224x198.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-224x200.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-224x200.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-224x202.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-224x202.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-224x208.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-224x208.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-224x210.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-224x210.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-224x224.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-224x224.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-224x226.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-224x226.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-226x196.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-226x196.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-226x198.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-226x198.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-226x200.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-226x200.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-226x202.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-226x202.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-226x208.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-226x208.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-226x210.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-226x210.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-226x224.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-226x224.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-226x226.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-226x226.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-05-resize.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-05-resize.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-06-bilinear.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-06-bilinear.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-07-frame_parallel.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-07-frame_parallel.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-08-tile-4x1.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-08-tile-4x1.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-08-tile-4x4.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-08-tile-4x4.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-08-tile_1x2.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-08-tile_1x2.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-08-tile_1x2_frame_parallel.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-08-tile_1x2_frame_parallel.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-08-tile_1x4.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-08-tile_1x4.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-08-tile_1x4_frame_parallel.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-08-tile_1x4_frame_parallel.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-08-tile_1x8.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-08-tile_1x8.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-08-tile_1x8_frame_parallel.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-08-tile_1x8_frame_parallel.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-09-aq2.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-09-aq2.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-09-lf_deltas.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-09-lf_deltas.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-09-subpixel-00.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-09-subpixel-00.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-10-show-existing-frame.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-10-show-existing-frame.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-10-show-existing-frame2.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-10-show-existing-frame2.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-11-size-351x287.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-11-size-351x287.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-11-size-351x288.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-11-size-351x288.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-11-size-352x287.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-11-size-352x287.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-12-droppable_1.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-12-droppable_1.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-12-droppable_2.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-12-droppable_2.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-12-droppable_3.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-12-droppable_3.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-13-largescaling.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-13-largescaling.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-1-16.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-1-16.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-1-2-4-8-16.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-1-2-4-8-16.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-1-2.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-1-2.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-1-4.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-1-4.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-1-8.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-1-8.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-16-1.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-16-1.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-16-2.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-16-2.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-16-4.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-16-4.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-16-8-4-2-1.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-16-8-4-2-1.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-16-8.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-16-8.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-2-1.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-2-1.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-2-16.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-2-16.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-2-4.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-2-4.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-2-8.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-2-8.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-4-1.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-4-1.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-4-16.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-4-16.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-4-2.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-4-2.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-4-8.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-4-8.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-8-1.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-8-1.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-8-16.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-8-16.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-8-2.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-8-2.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-8-4.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-8-4.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-15-segkey.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-15-segkey.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-15-segkey_adpq.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-15-segkey_adpq.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-16-intra-only.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-16-intra-only.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-17-show-existing-frame.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-17-show-existing-frame.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-18-resize.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-18-resize.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-19-skip.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-19-skip.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-19-skip-01.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-19-skip-01.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-19-skip-02.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-19-skip-02.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp91-2-04-yuv444.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp91-2-04-yuv444.webm.md5
+
+# Invalid files for testing libvpx error checking.
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-01-v2.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-01-v2.webm.res
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-02-v2.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-02-v2.webm.res
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-03-v3.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-03-v3.webm.res
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-00-quantizer-00.webm.ivf.s5861_r01-05_b6-.v2.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-00-quantizer-00.webm.ivf.s5861_r01-05_b6-.v2.ivf.res
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-00-quantizer-11.webm.ivf.s52984_r01-05_b6-.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-00-quantizer-11.webm.ivf.s52984_r01-05_b6-.ivf.res
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-00-quantizer-11.webm.ivf.s52984_r01-05_b6-z.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-00-quantizer-11.webm.ivf.s52984_r01-05_b6-z.ivf.res
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-05-resize.ivf.s59293_r01-05_b6-.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-05-resize.ivf.s59293_r01-05_b6-.ivf.res
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-08-tile_1x2_frame_parallel.webm.ivf.s47039_r01-05_b6-.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-08-tile_1x2_frame_parallel.webm.ivf.s47039_r01-05_b6-.ivf.res
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-08-tile_1x4_frame_parallel_all_key.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-08-tile_1x4_frame_parallel_all_key.webm.res
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-09-aq2.webm.ivf.s3984_r01-05_b6-.v2.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-09-aq2.webm.ivf.s3984_r01-05_b6-.v2.ivf.res
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-09-subpixel-00.ivf.s19552_r01-05_b6-.v2.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-09-subpixel-00.ivf.s19552_r01-05_b6-.v2.ivf.res
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-09-subpixel-00.ivf.s20492_r01-05_b6-.v2.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-09-subpixel-00.ivf.s20492_r01-05_b6-.v2.ivf.res
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-12-droppable_1.ivf.s3676_r01-05_b6-.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-12-droppable_1.ivf.s3676_r01-05_b6-.ivf.res
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-12-droppable_1.ivf.s73804_r01-05_b6-.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-12-droppable_1.ivf.s73804_r01-05_b6-.ivf.res
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp91-2-mixedrefcsp-444to420.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp91-2-mixedrefcsp-444to420.ivf.res
+
+ifeq ($(CONFIG_DECODE_PERF_TESTS),yes)
+# BBB VP9 streams
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-bbb_426x240_tile_1x1_180kbps.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-bbb_640x360_tile_1x2_337kbps.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-bbb_854x480_tile_1x2_651kbps.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-bbb_1280x720_tile_1x4_1310kbps.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-bbb_1920x1080_tile_1x1_2581kbps.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-bbb_1920x1080_tile_1x4_2586kbps.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-bbb_1920x1080_tile_1x4_fpm_2304kbps.webm
+# Sintel VP9 streams
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-sintel_426x182_tile_1x1_171kbps.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-sintel_640x272_tile_1x2_318kbps.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-sintel_854x364_tile_1x2_621kbps.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-sintel_1280x546_tile_1x4_1257kbps.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-sintel_1920x818_tile_1x4_fpm_2279kbps.webm
+# TOS VP9 streams
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-tos_426x178_tile_1x1_181kbps.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-tos_640x266_tile_1x2_336kbps.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-tos_854x356_tile_1x2_656kbps.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-tos_854x356_tile_1x2_fpm_546kbps.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-tos_1280x534_tile_1x4_1306kbps.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-tos_1280x534_tile_1x4_fpm_952kbps.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-tos_1920x800_tile_1x4_fpm_2335kbps.webm
+endif  # CONFIG_DECODE_PERF_TESTS
+
+ifeq ($(CONFIG_ENCODE_PERF_TESTS),yes)
+LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += desktop_640_360_30.yuv
+LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += kirland_640_480_30.yuv
+LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += macmarcomoving_640_480_30.yuv
+LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += macmarcostationary_640_480_30.yuv
+LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += niklas_1280_720_30.yuv
+LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += niklas_640_480_30.yuv
+LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += tacomanarrows_640_480_30.yuv
+LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += tacomasmallcameramovement_640_480_30.yuv
+LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += thaloundeskmtg_640_480_30.yuv
+endif  # CONFIG_ENCODE_PERF_TESTS
diff --git a/source/libvpx/test/test-data.sha1 b/source/libvpx/test/test-data.sha1
index e6114ab..69d1d2f 100644
--- a/source/libvpx/test/test-data.sha1
+++ b/source/libvpx/test/test-data.sha1
@@ -17,12 +17,15 @@
 a432f96ff0a787268e2f94a8092ab161a18d1b06  park_joy_90p_10_420.y4m
 0b194cc312c3a2e84d156a221b0a5eb615dfddc5  park_joy_90p_10_422.y4m
 ff0e0a21dc2adc95b8c1b37902713700655ced17  park_joy_90p_10_444.y4m
+c934da6fb8cc54ee2a8c17c54cf6076dac37ead0  park_joy_90p_10_440.yuv
 614c32ae1eca391e867c70d19974f0d62664dd99  park_joy_90p_12_420.y4m
 c92825f1ea25c5c37855083a69faac6ac4641a9e  park_joy_90p_12_422.y4m
 b592189b885b6cc85db55cc98512a197d73d3b34  park_joy_90p_12_444.y4m
+82c1bfcca368c2f22bad7d693d690d5499ecdd11  park_joy_90p_12_440.yuv
 4e0eb61e76f0684188d9bc9f3ce61f6b6b77bb2c  park_joy_90p_8_420.y4m
 7a193ff7dfeb96ba5f82b2afd7afa9e1fe83d947  park_joy_90p_8_422.y4m
 bdb7856e6bc93599bdda05c2e773a9f22b6c6d03  park_joy_90p_8_444.y4m
+81e1f3843748438b8f2e71db484eb22daf72e939  park_joy_90p_8_440.yuv
 b1f1c3ec79114b9a0651af24ce634afb44a9a419  rush_hour_444.y4m
 5184c46ddca8b1fadd16742e8500115bc8f749da  vp80-00-comprehensive-001.ivf
 65bf1bbbced81b97bd030f376d1b7f61a224793f  vp80-00-comprehensive-002.ivf
@@ -691,6 +694,8 @@
 368dccdde5288c13c25695d2eacdc7402cadf613  vp90-2-19-skip.webm.md5
 ffe460282df2b0e7d4603c2158653ad96f574b02  vp90-2-19-skip-01.webm
 bd21bc9eda4a4a36b221d71ede3a139fc3c7bd85  vp90-2-19-skip-01.webm.md5
+178f5bd239e38cc1cc2657a7a5e1a9f52ad2d3fe  vp90-2-19-skip-02.webm
+9020d5e260bd7df08e2b3d4b86f8623cee3daea2  vp90-2-19-skip-02.webm.md5
 b03c408cf23158638da18dbc3323b99a1635c68a  invalid-vp90-2-12-droppable_1.ivf.s3676_r01-05_b6-.ivf
 0a3884edb3fd8f9d9b500223e650f7de257b67d8  invalid-vp90-2-12-droppable_1.ivf.s3676_r01-05_b6-.ivf.res
 5e67e24e7f53fd189e565513cef8519b1bd6c712  invalid-vp90-2-05-resize.ivf.s59293_r01-05_b6-.ivf
diff --git a/source/libvpx/test/test.mk b/source/libvpx/test/test.mk
index abf815c..30c13a1 100644
--- a/source/libvpx/test/test.mk
+++ b/source/libvpx/test/test.mk
@@ -23,6 +23,7 @@
 LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS)    += error_resilience_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS)    += i420_video_source.h
 LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS)    += y4m_video_source.h
+LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS)    += yuv_video_source.h
 
 LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += altref_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += config_test.cc
@@ -30,6 +31,7 @@
 LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += keyframe_test.cc
 
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_DECODER) += external_frame_buffer_test.cc
+LIBVPX_TEST_SRCS-$(CONFIG_VP9_DECODER) += invalid_file_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_DECODER) += user_priv_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += active_map_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += borders_test.cc
@@ -37,6 +39,7 @@
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += frame_size_tests.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += resize_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_lossless_test.cc
+LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_end_to_end_test.cc
 
 LIBVPX_TEST_SRCS-yes                   += decode_test_driver.cc
 LIBVPX_TEST_SRCS-yes                   += decode_test_driver.h
@@ -60,7 +63,6 @@
 endif
 
 LIBVPX_TEST_SRCS-$(CONFIG_DECODERS)    += decode_api_test.cc
-LIBVPX_TEST_SRCS-$(CONFIG_DECODERS)    += invalid_file_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_DECODERS)    += test_vector_test.cc
 
 # Currently we only support decoder perf tests for vp9. Also they read from WebM
@@ -103,6 +105,10 @@
 LIBVPX_TEST_SRCS-yes                   += sixtap_predict_test.cc
 LIBVPX_TEST_SRCS-yes                   += vpx_scale_test.cc
 
+ifeq ($(CONFIG_VP8_ENCODER)$(CONFIG_TEMPORAL_DENOISING),yesyes)
+LIBVPX_TEST_SRCS-$(HAVE_SSE2) += vp8_denoiser_sse2_test.cc
+endif
+
 endif # VP8
 
 ## VP9
@@ -128,755 +134,22 @@
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += fdct8x8_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += variance_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_subtract_test.cc
+LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += lpf_8_test.cc
+LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_avg_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP9)         += vp9_intrapred_test.cc
 
 ifeq ($(CONFIG_VP9_ENCODER),yes)
 LIBVPX_TEST_SRCS-$(CONFIG_SPATIAL_SVC) += svc_test.cc
 endif
 
+ifeq ($(CONFIG_VP9_ENCODER)$(CONFIG_VP9_TEMPORAL_DENOISING),yesyes)
+LIBVPX_TEST_SRCS-$(HAVE_SSE2) += vp9_denoiser_sse2_test.cc
+endif
+
 endif # VP9
 
 LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS)    += sad_test.cc
 
 endif # CONFIG_SHARED
 
-
-##
-## TEST DATA
-##
-LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += hantro_collage_w352h288.yuv
-LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += hantro_odd.yuv
-
-LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += park_joy_90p_10_420.y4m
-LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += park_joy_90p_10_422.y4m
-LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += park_joy_90p_10_444.y4m
-LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += park_joy_90p_12_420.y4m
-LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += park_joy_90p_12_422.y4m
-LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += park_joy_90p_12_444.y4m
-LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += park_joy_90p_8_420.y4m
-LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += park_joy_90p_8_422.y4m
-LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += park_joy_90p_8_444.y4m
-
-LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += rush_hour_444.y4m
-LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += screendata.y4m
-
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-001.ivf
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-001.ivf.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-002.ivf
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-002.ivf.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-003.ivf
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-003.ivf.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-004.ivf
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-004.ivf.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-005.ivf
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-005.ivf.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-006.ivf
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-006.ivf.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-007.ivf
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-007.ivf.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-008.ivf
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-008.ivf.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-009.ivf
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-009.ivf.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-010.ivf
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-010.ivf.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-011.ivf
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-011.ivf.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-012.ivf
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-012.ivf.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-013.ivf
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-013.ivf.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-014.ivf
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-014.ivf.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-015.ivf
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-015.ivf.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-016.ivf
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-016.ivf.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-017.ivf
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-017.ivf.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-018.ivf
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-018.ivf.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-01-intra-1400.ivf
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-01-intra-1400.ivf.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-01-intra-1411.ivf
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-01-intra-1411.ivf.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-01-intra-1416.ivf
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-01-intra-1416.ivf.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-01-intra-1417.ivf
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-01-intra-1417.ivf.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-02-inter-1402.ivf
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-02-inter-1402.ivf.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-02-inter-1412.ivf
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-02-inter-1412.ivf.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-02-inter-1418.ivf
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-02-inter-1418.ivf.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-02-inter-1424.ivf
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-02-inter-1424.ivf.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-01.ivf
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-01.ivf.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-02.ivf
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-02.ivf.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-03.ivf
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-03.ivf.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-04.ivf
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-04.ivf.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1401.ivf
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1401.ivf.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1403.ivf
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1403.ivf.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1407.ivf
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1407.ivf.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1408.ivf
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1408.ivf.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1409.ivf
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1409.ivf.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1410.ivf
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1410.ivf.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1413.ivf
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1413.ivf.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1414.ivf
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1414.ivf.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1415.ivf
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1415.ivf.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1425.ivf
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1425.ivf.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1426.ivf
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1426.ivf.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1427.ivf
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1427.ivf.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1432.ivf
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1432.ivf.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1435.ivf
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1435.ivf.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1436.ivf
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1436.ivf.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1437.ivf
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1437.ivf.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1441.ivf
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1441.ivf.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1442.ivf
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1442.ivf.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-04-partitions-1404.ivf
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-04-partitions-1404.ivf.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-04-partitions-1405.ivf
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-04-partitions-1405.ivf.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-04-partitions-1406.ivf
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-04-partitions-1406.ivf.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-05-sharpness-1428.ivf
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-05-sharpness-1428.ivf.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-05-sharpness-1429.ivf
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-05-sharpness-1429.ivf.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-05-sharpness-1430.ivf
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-05-sharpness-1430.ivf.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-05-sharpness-1431.ivf
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-05-sharpness-1431.ivf.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-05-sharpness-1433.ivf
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-05-sharpness-1433.ivf.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-05-sharpness-1434.ivf
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-05-sharpness-1434.ivf.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-05-sharpness-1438.ivf
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-05-sharpness-1438.ivf.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-05-sharpness-1439.ivf
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-05-sharpness-1439.ivf.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-05-sharpness-1440.ivf
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-05-sharpness-1440.ivf.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-05-sharpness-1443.ivf
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-05-sharpness-1443.ivf.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-06-smallsize.ivf
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-06-smallsize.ivf.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-00.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-00.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-01.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-01.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-02.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-02.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-03.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-03.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-04.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-04.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-05.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-05.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-06.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-06.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-07.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-07.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-08.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-08.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-09.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-09.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-10.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-10.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-11.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-11.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-12.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-12.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-13.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-13.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-14.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-14.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-15.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-15.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-16.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-16.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-17.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-17.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-18.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-18.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-19.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-19.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-20.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-20.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-21.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-21.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-22.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-22.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-23.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-23.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-24.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-24.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-25.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-25.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-26.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-26.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-27.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-27.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-28.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-28.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-29.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-29.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-30.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-30.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-31.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-31.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-32.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-32.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-33.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-33.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-34.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-34.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-35.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-35.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-36.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-36.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-37.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-37.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-38.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-38.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-39.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-39.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-40.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-40.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-41.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-41.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-42.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-42.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-43.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-43.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-44.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-44.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-45.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-45.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-46.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-46.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-47.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-47.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-48.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-48.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-49.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-49.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-50.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-50.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-51.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-51.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-52.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-52.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-53.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-53.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-54.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-54.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-55.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-55.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-56.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-56.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-57.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-57.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-58.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-58.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-59.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-59.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-60.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-60.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-61.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-61.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-62.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-62.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-63.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-63.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-01-sharpness-1.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-01-sharpness-1.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-01-sharpness-2.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-01-sharpness-2.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-01-sharpness-3.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-01-sharpness-3.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-01-sharpness-4.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-01-sharpness-4.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-01-sharpness-5.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-01-sharpness-5.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-01-sharpness-6.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-01-sharpness-6.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-01-sharpness-7.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-01-sharpness-7.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-08x08.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-08x08.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-08x10.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-08x10.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-08x16.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-08x16.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-08x18.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-08x18.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-08x32.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-08x32.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-08x34.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-08x34.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-08x64.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-08x64.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-08x66.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-08x66.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-10x08.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-10x08.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-10x10.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-10x10.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-10x16.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-10x16.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-10x18.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-10x18.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-10x32.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-10x32.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-10x34.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-10x34.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-10x64.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-10x64.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-10x66.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-10x66.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-16x08.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-16x08.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-16x10.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-16x10.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-16x16.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-16x16.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-16x18.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-16x18.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-16x32.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-16x32.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-16x34.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-16x34.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-16x64.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-16x64.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-16x66.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-16x66.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-18x08.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-18x08.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-18x10.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-18x10.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-18x16.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-18x16.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-18x18.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-18x18.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-18x32.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-18x32.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-18x34.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-18x34.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-18x64.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-18x64.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-18x66.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-18x66.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-32x08.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-32x08.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-32x10.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-32x10.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-32x16.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-32x16.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-32x18.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-32x18.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-32x32.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-32x32.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-32x34.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-32x34.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-32x64.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-32x64.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-32x66.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-32x66.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-34x08.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-34x08.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-34x10.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-34x10.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-34x16.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-34x16.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-34x18.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-34x18.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-34x32.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-34x32.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-34x34.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-34x34.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-34x64.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-34x64.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-34x66.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-34x66.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-64x08.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-64x08.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-64x10.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-64x10.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-64x16.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-64x16.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-64x18.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-64x18.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-64x32.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-64x32.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-64x34.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-64x34.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-64x64.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-64x64.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-64x66.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-64x66.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-66x08.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-66x08.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-66x10.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-66x10.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-66x16.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-66x16.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-66x18.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-66x18.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-66x32.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-66x32.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-66x34.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-66x34.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-66x64.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-66x64.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-66x66.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-66x66.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-lf-1920x1080.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-lf-1920x1080.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-deltaq.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-deltaq.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-196x196.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-196x196.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-196x198.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-196x198.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-196x200.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-196x200.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-196x202.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-196x202.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-196x208.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-196x208.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-196x210.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-196x210.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-196x224.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-196x224.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-196x226.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-196x226.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-198x196.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-198x196.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-198x198.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-198x198.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-198x200.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-198x200.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-198x202.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-198x202.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-198x208.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-198x208.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-198x210.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-198x210.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-198x224.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-198x224.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-198x226.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-198x226.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-200x196.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-200x196.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-200x198.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-200x198.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-200x200.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-200x200.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-200x202.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-200x202.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-200x208.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-200x208.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-200x210.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-200x210.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-200x224.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-200x224.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-200x226.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-200x226.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-202x196.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-202x196.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-202x198.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-202x198.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-202x200.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-202x200.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-202x202.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-202x202.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-202x208.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-202x208.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-202x210.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-202x210.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-202x224.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-202x224.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-202x226.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-202x226.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-208x196.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-208x196.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-208x198.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-208x198.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-208x200.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-208x200.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-208x202.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-208x202.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-208x208.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-208x208.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-208x210.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-208x210.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-208x224.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-208x224.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-208x226.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-208x226.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-210x196.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-210x196.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-210x198.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-210x198.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-210x200.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-210x200.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-210x202.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-210x202.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-210x208.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-210x208.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-210x210.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-210x210.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-210x224.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-210x224.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-210x226.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-210x226.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-224x196.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-224x196.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-224x198.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-224x198.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-224x200.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-224x200.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-224x202.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-224x202.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-224x208.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-224x208.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-224x210.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-224x210.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-224x224.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-224x224.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-224x226.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-224x226.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-226x196.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-226x196.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-226x198.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-226x198.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-226x200.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-226x200.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-226x202.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-226x202.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-226x208.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-226x208.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-226x210.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-226x210.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-226x224.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-226x224.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-226x226.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-226x226.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-05-resize.ivf
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-05-resize.ivf.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-06-bilinear.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-06-bilinear.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-07-frame_parallel.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-07-frame_parallel.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-08-tile-4x1.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-08-tile-4x1.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-08-tile-4x4.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-08-tile-4x4.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-08-tile_1x2.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-08-tile_1x2.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-08-tile_1x2_frame_parallel.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-08-tile_1x2_frame_parallel.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-08-tile_1x4.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-08-tile_1x4.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-08-tile_1x4_frame_parallel.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-08-tile_1x4_frame_parallel.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-08-tile_1x8.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-08-tile_1x8.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-08-tile_1x8_frame_parallel.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-08-tile_1x8_frame_parallel.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-09-aq2.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-09-aq2.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-09-lf_deltas.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-09-lf_deltas.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-09-subpixel-00.ivf
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-09-subpixel-00.ivf.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-10-show-existing-frame.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-10-show-existing-frame.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-10-show-existing-frame2.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-10-show-existing-frame2.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-11-size-351x287.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-11-size-351x287.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-11-size-351x288.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-11-size-351x288.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-11-size-352x287.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-11-size-352x287.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-12-droppable_1.ivf
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-12-droppable_1.ivf.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-12-droppable_2.ivf
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-12-droppable_2.ivf.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-12-droppable_3.ivf
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-12-droppable_3.ivf.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-13-largescaling.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-13-largescaling.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-1-2.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-1-2.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-1-4.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-1-4.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-1-8.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-1-8.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-1-16.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-1-16.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-2-1.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-2-1.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-4-1.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-4-1.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-8-1.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-8-1.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-16-1.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-16-1.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-2-4.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-2-4.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-2-8.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-2-8.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-2-16.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-2-16.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-4-2.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-4-2.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-8-2.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-8-2.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-16-2.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-16-2.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-4-8.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-4-8.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-4-16.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-4-16.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-8-4.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-8-4.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-16-4.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-16-4.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-8-16.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-8-16.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-16-8.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-16-8.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-1-2-4-8-16.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-1-2-4-8-16.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-16-8-4-2-1.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-16-8-4-2-1.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-15-segkey.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-15-segkey.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-15-segkey_adpq.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-15-segkey_adpq.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-16-intra-only.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-16-intra-only.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-17-show-existing-frame.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-17-show-existing-frame.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-18-resize.ivf
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-18-resize.ivf.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-19-skip.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-19-skip.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-19-skip-01.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-19-skip-01.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp91-2-04-yuv444.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp91-2-04-yuv444.webm.md5
-
-# Invalid files for testing libvpx error checking.
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-01-v2.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-01-v2.webm.res
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-02-v2.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-02-v2.webm.res
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-03-v3.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-03-v3.webm.res
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-00-quantizer-00.webm.ivf.s5861_r01-05_b6-.v2.ivf
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-00-quantizer-00.webm.ivf.s5861_r01-05_b6-.v2.ivf.res
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-00-quantizer-11.webm.ivf.s52984_r01-05_b6-.ivf
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-00-quantizer-11.webm.ivf.s52984_r01-05_b6-.ivf.res
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-00-quantizer-11.webm.ivf.s52984_r01-05_b6-z.ivf
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-00-quantizer-11.webm.ivf.s52984_r01-05_b6-z.ivf.res
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-05-resize.ivf.s59293_r01-05_b6-.ivf
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-05-resize.ivf.s59293_r01-05_b6-.ivf.res
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-08-tile_1x2_frame_parallel.webm.ivf.s47039_r01-05_b6-.ivf
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-08-tile_1x2_frame_parallel.webm.ivf.s47039_r01-05_b6-.ivf.res
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-08-tile_1x4_frame_parallel_all_key.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-08-tile_1x4_frame_parallel_all_key.webm.res
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-09-aq2.webm.ivf.s3984_r01-05_b6-.v2.ivf
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-09-aq2.webm.ivf.s3984_r01-05_b6-.v2.ivf.res
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-09-subpixel-00.ivf.s19552_r01-05_b6-.v2.ivf
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-09-subpixel-00.ivf.s19552_r01-05_b6-.v2.ivf.res
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-09-subpixel-00.ivf.s20492_r01-05_b6-.v2.ivf
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-09-subpixel-00.ivf.s20492_r01-05_b6-.v2.ivf.res
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-12-droppable_1.ivf.s3676_r01-05_b6-.ivf
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-12-droppable_1.ivf.s3676_r01-05_b6-.ivf.res
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-12-droppable_1.ivf.s73804_r01-05_b6-.ivf
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-12-droppable_1.ivf.s73804_r01-05_b6-.ivf.res
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp91-2-mixedrefcsp-444to420.ivf
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp91-2-mixedrefcsp-444to420.ivf.res
-
-ifeq ($(CONFIG_DECODE_PERF_TESTS),yes)
-# BBB VP9 streams
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += \
-  vp90-2-bbb_426x240_tile_1x1_180kbps.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += \
-  vp90-2-bbb_640x360_tile_1x2_337kbps.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += \
-  vp90-2-bbb_854x480_tile_1x2_651kbps.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += \
-  vp90-2-bbb_1280x720_tile_1x4_1310kbps.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += \
-  vp90-2-bbb_1920x1080_tile_1x1_2581kbps.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += \
-  vp90-2-bbb_1920x1080_tile_1x4_2586kbps.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += \
-  vp90-2-bbb_1920x1080_tile_1x4_fpm_2304kbps.webm
-#Sintel VP9 streams
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += \
-  vp90-2-sintel_426x182_tile_1x1_171kbps.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += \
-  vp90-2-sintel_640x272_tile_1x2_318kbps.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += \
-  vp90-2-sintel_854x364_tile_1x2_621kbps.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += \
-  vp90-2-sintel_1280x546_tile_1x4_1257kbps.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += \
-  vp90-2-sintel_1920x818_tile_1x4_fpm_2279kbps.webm
-# TOS VP9 streams
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += \
-  vp90-2-tos_426x178_tile_1x1_181kbps.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += \
-  vp90-2-tos_640x266_tile_1x2_336kbps.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += \
-  vp90-2-tos_854x356_tile_1x2_656kbps.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += \
-  vp90-2-tos_854x356_tile_1x2_fpm_546kbps.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += \
-  vp90-2-tos_1280x534_tile_1x4_1306kbps.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += \
-  vp90-2-tos_1280x534_tile_1x4_fpm_952kbps.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += \
-  vp90-2-tos_1920x800_tile_1x4_fpm_2335kbps.webm
-endif  # CONFIG_DECODE_PERF_TESTS
-
-ifeq ($(CONFIG_ENCODE_PERF_TESTS),yes)
-LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += desktop_640_360_30.yuv
-LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += kirland_640_480_30.yuv
-LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += macmarcomoving_640_480_30.yuv
-LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += macmarcostationary_640_480_30.yuv
-LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += niklas_640_480_30.yuv
-LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += tacomanarrows_640_480_30.yuv
-LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += tacomasmallcameramovement_640_480_30.yuv
-LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += thaloundeskmtg_640_480_30.yuv
-LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += niklas_1280_720_30.yuv
-endif  # CONFIG_ENCODE_PERF_TESTS
+include $(SRC_PATH_BARE)/test/test-data.mk
diff --git a/source/libvpx/test/test_vectors.cc b/source/libvpx/test/test_vectors.cc
index cccebf8..7efa8c0 100644
--- a/source/libvpx/test/test_vectors.cc
+++ b/source/libvpx/test/test_vectors.cc
@@ -182,7 +182,8 @@
   "vp90-2-15-segkey.webm", "vp90-2-15-segkey_adpq.webm",
   "vp90-2-16-intra-only.webm", "vp90-2-17-show-existing-frame.webm",
   "vp90-2-18-resize.ivf", "vp90-2-19-skip.webm",
-  "vp90-2-19-skip-01.webm", "vp91-2-04-yuv444.webm",
+  "vp90-2-19-skip-01.webm", "vp90-2-19-skip-02.webm",
+  "vp91-2-04-yuv444.webm",
 };
 const int kNumVP9TestVectors = NELEMENTS(kVP9TestVectors);
 #endif  // CONFIG_VP9_DECODER
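
Note: kNumVP9TestVectors is computed from the array above with the NELEMENTS
macro, so appending "vp90-2-19-skip-02.webm" to the initializer list is all
that is needed to grow the count. A minimal sketch of that idiom, assuming a
definition along the lines of the one in the libvpx test headers:

  // Element count of a C array, computed at compile time.
  #define NELEMENTS(x) (static_cast<int>(sizeof(x) / sizeof((x)[0])))

  static const char *const kNames[] = { "a.webm", "b.webm" };
  static const int kNumNames = NELEMENTS(kNames);  // 2
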
diff --git a/source/libvpx/test/vp8_denoiser_sse2_test.cc b/source/libvpx/test/vp8_denoiser_sse2_test.cc
new file mode 100644
index 0000000..d4abdad
--- /dev/null
+++ b/source/libvpx/test/vp8_denoiser_sse2_test.cc
@@ -0,0 +1,116 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <math.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "third_party/googletest/src/include/gtest/gtest.h"
+#include "test/acm_random.h"
+#include "test/clear_system_state.h"
+#include "test/register_state_check.h"
+#include "test/util.h"
+
+#include "vp8/encoder/denoising.h"
+#include "vp8/common/reconinter.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_mem/vpx_mem.h"
+
+using libvpx_test::ACMRandom;
+
+namespace {
+
+const int kNumPixels = 16 * 16;
+class VP8DenoiserTest : public ::testing::TestWithParam<int> {
+ public:
+  virtual ~VP8DenoiserTest() {}
+
+  virtual void SetUp() {
+    increase_denoising_ = GetParam();
+  }
+
+  virtual void TearDown() { libvpx_test::ClearSystemState(); }
+
+ protected:
+  int increase_denoising_;
+};
+
+TEST_P(VP8DenoiserTest, BitexactCheck) {
+  ACMRandom rnd(ACMRandom::DeterministicSeed());
+  const int count_test_block = 4000;
+  const int stride = 16;
+
+  // Allocate space for the input and output,
+  // where sig_block_c/_sse2 is the block to be denoised,
+  // mc_avg_block is the denoised reference block,
+  // avg_block_c is the denoised result from the C code, and
+  // avg_block_sse2 is the denoised result from the SSE2 code.
+  DECLARE_ALIGNED_ARRAY(16, uint8_t, sig_block_c, kNumPixels);
+  // Because the VP8 denoiser modifies the source signal in place, a second
+  // copy of the source is needed as the input to the SSE2 code.
+  DECLARE_ALIGNED_ARRAY(16, uint8_t, sig_block_sse2, kNumPixels);
+  DECLARE_ALIGNED_ARRAY(16, uint8_t, mc_avg_block, kNumPixels);
+  DECLARE_ALIGNED_ARRAY(16, uint8_t, avg_block_c, kNumPixels);
+  DECLARE_ALIGNED_ARRAY(16, uint8_t, avg_block_sse2, kNumPixels);
+
+  for (int i = 0; i < count_test_block; ++i) {
+    // Generate a random motion magnitude spanning 20% past the threshold.
+    const int motion_magnitude_ran =
+        rnd.Rand8() % static_cast<int>(MOTION_MAGNITUDE_THRESHOLD * 1.2);
+
+    // Initialize a test block with random numbers in the range [0, 255].
+    for (int j = 0; j < kNumPixels; ++j) {
+      int temp = 0;
+      sig_block_sse2[j] = sig_block_c[j] = rnd.Rand8();
+      // The pixels in mc_avg_block are generated by adding a random number
+      // in the range [-19, 19] to the corresponding pixels in sig_block.
+      temp = sig_block_c[j] + (rnd.Rand8() % 2 == 0 ? -1 : 1) *
+             (rnd.Rand8() % 20);
+      // Clip.
+      mc_avg_block[j] = (temp < 0) ? 0 : ((temp > 255) ? 255 : temp);
+    }
+
+    // Test denoiser on Y component.
+    ASM_REGISTER_STATE_CHECK(vp8_denoiser_filter_c(
+        mc_avg_block, stride, avg_block_c, stride, sig_block_c, stride,
+        motion_magnitude_ran, increase_denoising_));
+
+    ASM_REGISTER_STATE_CHECK(vp8_denoiser_filter_sse2(
+        mc_avg_block, stride, avg_block_sse2, stride, sig_block_sse2, stride,
+        motion_magnitude_ran, increase_denoising_));
+
+    // Check bitexactness.
+    for (int h = 0; h < 16; ++h) {
+      for (int w = 0; w < 16; ++w) {
+        EXPECT_EQ(avg_block_c[h * stride + w], avg_block_sse2[h * stride + w]);
+      }
+    }
+
+    // Test denoiser on UV component.
+    ASM_REGISTER_STATE_CHECK(vp8_denoiser_filter_uv_c(
+        mc_avg_block, stride, avg_block_c, stride, sig_block_c, stride,
+        motion_magnitude_ran, increase_denoising_));
+
+    ASM_REGISTER_STATE_CHECK(vp8_denoiser_filter_uv_sse2(
+        mc_avg_block, stride, avg_block_sse2, stride, sig_block_sse2, stride,
+        motion_magnitude_ran, increase_denoising_));
+
+    // Check bitexactness.
+    for (int h = 0; h < 16; ++h) {
+      for (int w = 0; w < 16; ++w) {
+        EXPECT_EQ(avg_block_c[h * stride + w], avg_block_sse2[h * stride + w]);
+      }
+    }
+  }
+}
+
+// Test both denoising levels (increase_denoising = 0 and 1).
+INSTANTIATE_TEST_CASE_P(SSE2, VP8DenoiserTest, ::testing::Values(0, 1));
+}  // namespace
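
Note: the loop above builds mc_avg_block by perturbing each source pixel with
a signed random offset and clamping to the 8-bit range. A standalone sketch of
that step, using std::rand in place of the test's ACMRandom (the helper name
PerturbPixel is hypothetical):

  #include <algorithm>
  #include <cstdint>
  #include <cstdlib>

  // Denoised-reference pixel: the source pixel plus a random offset in
  // [-19, 19], clamped to [0, 255].
  uint8_t PerturbPixel(uint8_t src) {
    const int sign = (std::rand() % 2 == 0) ? -1 : 1;  // random sign
    const int magnitude = std::rand() % 20;            // offset in [0, 19]
    const int value = src + sign * magnitude;
    return static_cast<uint8_t>(std::min(255, std::max(0, value)));
  }
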
diff --git a/source/libvpx/test/vp9_avg_test.cc b/source/libvpx/test/vp9_avg_test.cc
new file mode 100644
index 0000000..c2e472b
--- /dev/null
+++ b/source/libvpx/test/vp9_avg_test.cc
@@ -0,0 +1,150 @@
+/*
+ *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include <string.h>
+#include <limits.h>
+#include <stdio.h>
+
+#include "./vpx_config.h"
+#if CONFIG_VP9_ENCODER
+#include "./vp9_rtcd.h"
+#endif
+#include "vpx_mem/vpx_mem.h"
+
+#include "test/acm_random.h"
+#include "test/clear_system_state.h"
+#include "test/register_state_check.h"
+#include "test/util.h"
+#include "third_party/googletest/src/include/gtest/gtest.h"
+
+using libvpx_test::ACMRandom;
+
+namespace {
+class AverageTestBase : public ::testing::Test {
+ public:
+  AverageTestBase(int width, int height) : width_(width), height_(height) {}
+
+  static void SetUpTestCase() {
+    source_data_ = reinterpret_cast<uint8_t*>(
+        vpx_memalign(kDataAlignment, kDataBlockSize));
+  }
+
+  static void TearDownTestCase() {
+    vpx_free(source_data_);
+    source_data_ = NULL;
+  }
+
+  virtual void TearDown() {
+    libvpx_test::ClearSystemState();
+  }
+
+ protected:
+  // Handle up to 4 blocks of 64x64 with a stride of up to 128.
+  static const int kDataAlignment = 16;
+  static const int kDataBlockSize = 64 * 128;
+
+  virtual void SetUp() {
+    source_stride_ = (width_ + 31) & ~31;
+    rnd_.Reset(ACMRandom::DeterministicSeed());
+  }
+
+  // Compute the rounded average of the 8x8 block at |source|.
+  unsigned int ReferenceAverage(const uint8_t* source, int pitch) {
+    unsigned int average = 0;
+    for (int h = 0; h < 8; ++h)
+      for (int w = 0; w < 8; ++w)
+        average += source[h * source_stride_ + w];
+    return ((average + 32) >> 6);
+  }
+
+  void FillConstant(uint8_t fill_constant) {
+    for (int i = 0; i < width_ * height_; ++i) {
+        source_data_[i] = fill_constant;
+    }
+  }
+
+  void FillRandom() {
+    for (int i = 0; i < width_ * height_; ++i) {
+        source_data_[i] = rnd_.Rand8();
+    }
+  }
+
+  int width_, height_;
+  static uint8_t* source_data_;
+  int source_stride_;
+
+  ACMRandom rnd_;
+};
+typedef unsigned int (*AverageFunction)(const uint8_t* s, int pitch);
+
+typedef std::tr1::tuple<int, int, int, AverageFunction> AvgFunc;
+
+class AverageTest
+    : public AverageTestBase,
+      public ::testing::WithParamInterface<AvgFunc> {
+ public:
+  AverageTest() : AverageTestBase(GET_PARAM(0), GET_PARAM(1)) {}
+
+ protected:
+  void CheckAverages() {
+    unsigned int expected = ReferenceAverage(source_data_ + GET_PARAM(2),
+                                             source_stride_);
+
+    ASM_REGISTER_STATE_CHECK(GET_PARAM(3)(source_data_ + GET_PARAM(2),
+                                          source_stride_));
+    unsigned int actual = GET_PARAM(3)(source_data_ + GET_PARAM(2),
+                                       source_stride_);
+
+    EXPECT_EQ(expected, actual);
+  }
+};
+
+
+uint8_t* AverageTestBase::source_data_ = NULL;
+
+TEST_P(AverageTest, MinValue) {
+  FillConstant(0);
+  CheckAverages();
+}
+
+TEST_P(AverageTest, MaxValue) {
+  FillConstant(255);
+  CheckAverages();
+}
+
+TEST_P(AverageTest, Random) {
+  // The offset parameter makes the source pointer unaligned for some
+  // instantiations, as can happen in certain encoder searches.
+  for (int i = 0; i < 1000; i++) {
+    FillRandom();
+    CheckAverages();
+  }
+}
+
+using std::tr1::make_tuple;
+
+INSTANTIATE_TEST_CASE_P(
+    C, AverageTest,
+    ::testing::Values(
+        make_tuple(16, 16, 1, &vp9_avg_8x8_c)));
+
+
+#if HAVE_SSE2
+INSTANTIATE_TEST_CASE_P(
+    SSE2, AverageTest,
+    ::testing::Values(
+        make_tuple(16, 16, 0, &vp9_avg_8x8_sse2),
+        make_tuple(16, 16, 5, &vp9_avg_8x8_sse2),
+        make_tuple(32, 32, 15, &vp9_avg_8x8_sse2)));
+
+#endif
+
+}  // namespace
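
Note: ReferenceAverage above divides the 8x8 pixel sum by 64 with rounding;
adding half the divisor (32) before the shift turns truncation into
round-to-nearest. A sketch of the arithmetic (the helper name is
hypothetical):

  // Rounded average of 64 pixels: (sum + 32) >> 6 equals (sum + 32) / 64,
  // so a sum of 96 gives 2 where plain truncation (96 / 64) would give 1.
  unsigned int RoundedAverage64(unsigned int sum) {
    return (sum + 32) >> 6;
  }
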
diff --git a/source/libvpx/test/vp9_denoiser_sse2_test.cc b/source/libvpx/test/vp9_denoiser_sse2_test.cc
new file mode 100644
index 0000000..0ecba07
--- /dev/null
+++ b/source/libvpx/test/vp9_denoiser_sse2_test.cc
@@ -0,0 +1,101 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <math.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "third_party/googletest/src/include/gtest/gtest.h"
+#include "test/acm_random.h"
+#include "test/clear_system_state.h"
+#include "test/register_state_check.h"
+#include "test/util.h"
+
+#include "vpx_scale/yv12config.h"
+#include "vpx/vpx_integer.h"
+#include "vp9/common/vp9_reconinter.h"
+#include "vp9/encoder/vp9_context_tree.h"
+#include "vp9/encoder/vp9_denoiser.h"
+
+using libvpx_test::ACMRandom;
+
+namespace {
+
+const int kNumPixels = 64 * 64;
+class VP9DenoiserTest : public ::testing::TestWithParam<BLOCK_SIZE> {
+ public:
+  virtual ~VP9DenoiserTest() {}
+
+  virtual void SetUp() {
+    bs_ = GetParam();
+  }
+
+  virtual void TearDown() { libvpx_test::ClearSystemState(); }
+
+ protected:
+  BLOCK_SIZE bs_;
+};
+
+TEST_P(VP9DenoiserTest, BitexactCheck) {
+  ACMRandom rnd(ACMRandom::DeterministicSeed());
+  const int count_test_block = 4000;
+
+  // Allocate space for the input and output,
+  // where sig_block is the block to be denoised,
+  // mc_avg_block is the denoised reference block,
+  // avg_block_c is the denoised result from the C code, and
+  // avg_block_sse2 is the denoised result from the SSE2 code.
+  DECLARE_ALIGNED_ARRAY(16, uint8_t, sig_block, kNumPixels);
+  DECLARE_ALIGNED_ARRAY(16, uint8_t, mc_avg_block, kNumPixels);
+  DECLARE_ALIGNED_ARRAY(16, uint8_t, avg_block_c, kNumPixels);
+  DECLARE_ALIGNED_ARRAY(16, uint8_t, avg_block_sse2, kNumPixels);
+
+  for (int i = 0; i < count_test_block; ++i) {
+    // Generate a random motion magnitude spanning 20% past the threshold.
+    const int motion_magnitude_random =
+        rnd.Rand8() % static_cast<int>(MOTION_MAGNITUDE_THRESHOLD * 1.2);
+
+    // Initialize a test block with random numbers in the range [0, 255].
+    for (int j = 0; j < kNumPixels; ++j) {
+      int temp = 0;
+      sig_block[j] = rnd.Rand8();
+      // The pixels in mc_avg_block are generated by adding a random number
+      // in the range [-19, 19] to the corresponding pixels in sig_block.
+      temp = sig_block[j] + ((rnd.Rand8() % 2 == 0) ? -1 : 1) *
+             (rnd.Rand8() % 20);
+      // Clip.
+      mc_avg_block[j] = (temp < 0) ? 0 : ((temp > 255) ? 255 : temp);
+    }
+
+    ASM_REGISTER_STATE_CHECK(vp9_denoiser_filter_c(
+        sig_block, 64, mc_avg_block, 64, avg_block_c,
+        64, 0, bs_, motion_magnitude_random));
+
+    ASM_REGISTER_STATE_CHECK(vp9_denoiser_filter_sse2(
+        sig_block, 64, mc_avg_block, 64, avg_block_sse2,
+        64, 0, bs_, motion_magnitude_random));
+
+    // Test bitexactness.
+    for (int h = 0; h < (4 << b_height_log2_lookup[bs_]); ++h) {
+      for (int w = 0; w < (4 << b_width_log2_lookup[bs_]); ++w) {
+        EXPECT_EQ(avg_block_c[h * 64 + w], avg_block_sse2[h * 64 + w]);
+      }
+    }
+  }
+}
+
+// Test for all block sizes.
+INSTANTIATE_TEST_CASE_P(
+    SSE2, VP9DenoiserTest,
+    ::testing::Values(BLOCK_4X4, BLOCK_4X8, BLOCK_8X4, BLOCK_8X8,
+                      BLOCK_8X16, BLOCK_16X8, BLOCK_16X16, BLOCK_16X32,
+                      BLOCK_32X16, BLOCK_32X32, BLOCK_32X64, BLOCK_64X32,
+                      BLOCK_64X64));
+}  // namespace
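
Note: the comparison loops above convert the BLOCK_SIZE enum to pixel
dimensions as 4 << b_width_log2_lookup[bs_] (and likewise for height), i.e.
the lookup stores the block width in 4-pixel units as a log2. A sketch,
assuming that upstream convention (the BLOCK_16X32 value is illustrative):

  // Width in pixels for a block whose width is (1 << b_width_log2) units of
  // 4 pixels; e.g. b_width_log2_lookup[BLOCK_16X32] == 2 gives 4 << 2 = 16.
  int BlockWidthPixels(int b_width_log2) {
    return 4 << b_width_log2;
  }
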
diff --git a/source/libvpx/test/vp9_end_to_end_test.cc b/source/libvpx/test/vp9_end_to_end_test.cc
new file mode 100644
index 0000000..a8f6793
--- /dev/null
+++ b/source/libvpx/test/vp9_end_to_end_test.cc
@@ -0,0 +1,155 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "test/codec_factory.h"
+#include "test/encode_test_driver.h"
+#include "test/y4m_video_source.h"
+#include "test/yuv_video_source.h"
+#include "test/util.h"
+#include "third_party/googletest/src/include/gtest/gtest.h"
+
+namespace {
+
+const unsigned int kWidth  = 160;
+const unsigned int kHeight = 90;
+const unsigned int kFramerate = 50;
+const unsigned int kFrames = 10;
+const int kBitrate = 500;
+const int kCpuUsed = 2;
+const double psnr_threshold = 35.0;
+
+typedef struct {
+  const char *filename;
+  unsigned int input_bit_depth;
+  vpx_img_fmt fmt;
+  vpx_bit_depth_t bit_depth;
+  unsigned int profile;
+} TestVideoParam;
+
+const TestVideoParam TestVectors[] = {
+  {"park_joy_90p_8_420.y4m", 8, VPX_IMG_FMT_I420, VPX_BITS_8, 0},
+  {"park_joy_90p_8_422.y4m", 8, VPX_IMG_FMT_I422, VPX_BITS_8, 1},
+  {"park_joy_90p_8_444.y4m", 8, VPX_IMG_FMT_I444, VPX_BITS_8, 1},
+  {"park_joy_90p_8_440.yuv", 8, VPX_IMG_FMT_I440, VPX_BITS_8, 1},
+#if CONFIG_VP9_HIGHBITDEPTH
+  {"park_joy_90p_10_420.y4m", 10, VPX_IMG_FMT_I42016, VPX_BITS_10, 2},
+  {"park_joy_90p_10_422.y4m", 10, VPX_IMG_FMT_I42216, VPX_BITS_10, 3},
+  {"park_joy_90p_10_444.y4m", 10, VPX_IMG_FMT_I44416, VPX_BITS_10, 3},
+  {"park_joy_90p_10_440.yuv", 10, VPX_IMG_FMT_I44016, VPX_BITS_10, 3},
+  {"park_joy_90p_12_420.y4m", 12, VPX_IMG_FMT_I42016, VPX_BITS_12, 2},
+  {"park_joy_90p_12_422.y4m", 12, VPX_IMG_FMT_I42216, VPX_BITS_12, 3},
+  {"park_joy_90p_12_444.y4m", 12, VPX_IMG_FMT_I44416, VPX_BITS_12, 3},
+  {"park_joy_90p_12_440.yuv", 12, VPX_IMG_FMT_I44016, VPX_BITS_12, 3},
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+};
+
+int is_extension_y4m(const char *filename) {
+  const char *dot = strrchr(filename, '.');
+  if (!dot || dot == filename)
+    return 0;
+  else
+    return !strcmp(dot, ".y4m");
+}
+
+class EndToEndTestLarge
+    : public ::libvpx_test::EncoderTest,
+      public ::libvpx_test::CodecTestWith2Params<libvpx_test::TestMode,
+                                                 TestVideoParam> {
+ protected:
+  EndToEndTestLarge()
+      : EncoderTest(GET_PARAM(0)),
+        psnr_(0.0),
+        nframes_(0),
+        encoding_mode_(GET_PARAM(1)) {
+  }
+
+  virtual ~EndToEndTestLarge() {}
+
+  virtual void SetUp() {
+    InitializeConfig();
+    SetMode(encoding_mode_);
+    if (encoding_mode_ != ::libvpx_test::kRealTime) {
+      cfg_.g_lag_in_frames = 5;
+      cfg_.rc_end_usage = VPX_VBR;
+    } else {
+      cfg_.g_lag_in_frames = 0;
+      cfg_.rc_end_usage = VPX_CBR;
+    }
+    test_video_param_ = GET_PARAM(2);
+  }
+
+  virtual void BeginPassHook(unsigned int) {
+    psnr_ = 0.0;
+    nframes_ = 0;
+  }
+
+  virtual void PSNRPktHook(const vpx_codec_cx_pkt_t *pkt) {
+    psnr_ += pkt->data.psnr.psnr[0];
+    nframes_++;
+  }
+
+  virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
+                                  ::libvpx_test::Encoder *encoder) {
+    if (video->frame() == 1) {
+      encoder->Control(VP8E_SET_CPUUSED, kCpuUsed);
+      if (encoding_mode_ != ::libvpx_test::kRealTime) {
+        encoder->Control(VP8E_SET_ENABLEAUTOALTREF, 1);
+        encoder->Control(VP8E_SET_ARNR_MAXFRAMES, 7);
+        encoder->Control(VP8E_SET_ARNR_STRENGTH, 5);
+        encoder->Control(VP8E_SET_ARNR_TYPE, 3);
+      }
+    }
+  }
+
+  double GetAveragePsnr() const {
+    if (nframes_)
+      return psnr_ / nframes_;
+    return 0.0;
+  }
+
+  TestVideoParam test_video_param_;
+
+ private:
+  double psnr_;
+  unsigned int nframes_;
+  libvpx_test::TestMode encoding_mode_;
+};
+
+TEST_P(EndToEndTestLarge, EndtoEndPSNRTest) {
+  cfg_.rc_target_bitrate = kBitrate;
+  cfg_.g_error_resilient = 0;
+  cfg_.g_profile = test_video_param_.profile;
+  cfg_.g_input_bit_depth = test_video_param_.input_bit_depth;
+  cfg_.g_bit_depth = test_video_param_.bit_depth;
+  init_flags_ = VPX_CODEC_USE_PSNR;
+
+  libvpx_test::VideoSource *video;
+  if (is_extension_y4m(test_video_param_.filename)) {
+    video = new libvpx_test::Y4mVideoSource(test_video_param_.filename,
+                                            0, kFrames);
+  } else {
+    video = new libvpx_test::YUVVideoSource(test_video_param_.filename,
+                                            test_video_param_.fmt,
+                                            kWidth, kHeight,
+                                            kFramerate, 1, 0, kFrames);
+  }
+
+  ASSERT_NO_FATAL_FAILURE(RunLoop(video));
+  const double psnr = GetAveragePsnr();
+  EXPECT_GT(psnr, psnr_threshold);
+  delete video;
+}
+
+VP9_INSTANTIATE_TEST_CASE(
+    EndToEndTestLarge,
+    ::testing::Values(::libvpx_test::kTwoPassGood, ::libvpx_test::kOnePassGood),
+    ::testing::ValuesIn(TestVectors));
+
+}  // namespace
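
Note: the test above accumulates the overall per-frame PSNR
(pkt->data.psnr.psnr[0]) and passes only if the mean across the encoded
frames exceeds the 35 dB threshold. A sketch of that criterion (the helper
name is hypothetical):

  // True if the mean PSNR across nframes exceeds the threshold; a run with
  // no frames yields 0.0 and fails, mirroring GetAveragePsnr above.
  bool MeetsPsnrThreshold(double psnr_sum, unsigned int nframes,
                          double threshold) {
    const double average = (nframes != 0) ? psnr_sum / nframes : 0.0;
    return average > threshold;
  }
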
diff --git a/source/libvpx/test/vp9_intrapred_test.cc b/source/libvpx/test/vp9_intrapred_test.cc
index 7d08d9e..694db1b 100644
--- a/source/libvpx/test/vp9_intrapred_test.cc
+++ b/source/libvpx/test/vp9_intrapred_test.cc
@@ -134,150 +134,164 @@
 #if ARCH_X86_64
 INSTANTIATE_TEST_CASE_P(SSE2_TO_C_8, VP9IntraPredTest,
                         ::testing::Values(
-                            make_tuple(&vp9_high_dc_predictor_32x32_sse2,
-                                       &vp9_high_dc_predictor_32x32_c, 32, 8),
-                            make_tuple(&vp9_high_tm_predictor_16x16_sse2,
-                                       &vp9_high_tm_predictor_16x16_c, 16, 8),
-                            make_tuple(&vp9_high_tm_predictor_32x32_sse2,
-                                       &vp9_high_tm_predictor_32x32_c, 32, 8),
-                            make_tuple(&vp9_high_dc_predictor_4x4_sse,
-                                       &vp9_high_dc_predictor_4x4_c, 4, 8),
-                            make_tuple(&vp9_high_dc_predictor_8x8_sse2,
-                                       &vp9_high_dc_predictor_8x8_c, 8, 8),
-                            make_tuple(&vp9_high_dc_predictor_16x16_sse2,
-                                       &vp9_high_dc_predictor_16x16_c, 16, 8),
-                            make_tuple(&vp9_high_v_predictor_4x4_sse,
-                                       &vp9_high_v_predictor_4x4_c, 4, 8),
-                            make_tuple(&vp9_high_v_predictor_8x8_sse2,
-                                       &vp9_high_v_predictor_8x8_c, 8, 8),
-                            make_tuple(&vp9_high_v_predictor_16x16_sse2,
-                                       &vp9_high_v_predictor_16x16_c, 16, 8),
-                            make_tuple(&vp9_high_v_predictor_32x32_sse2,
-                                       &vp9_high_v_predictor_32x32_c, 32, 8),
-                            make_tuple(&vp9_high_tm_predictor_4x4_sse,
-                                       &vp9_high_tm_predictor_4x4_c, 4, 8),
-                            make_tuple(&vp9_high_tm_predictor_8x8_sse2,
-                                       &vp9_high_tm_predictor_8x8_c, 8, 8)));
+                            make_tuple(&vp9_highbd_dc_predictor_32x32_sse2,
+                                       &vp9_highbd_dc_predictor_32x32_c, 32, 8),
+                            make_tuple(&vp9_highbd_tm_predictor_16x16_sse2,
+                                       &vp9_highbd_tm_predictor_16x16_c, 16, 8),
+                            make_tuple(&vp9_highbd_tm_predictor_32x32_sse2,
+                                       &vp9_highbd_tm_predictor_32x32_c, 32, 8),
+                            make_tuple(&vp9_highbd_dc_predictor_4x4_sse,
+                                       &vp9_highbd_dc_predictor_4x4_c, 4, 8),
+                            make_tuple(&vp9_highbd_dc_predictor_8x8_sse2,
+                                       &vp9_highbd_dc_predictor_8x8_c, 8, 8),
+                            make_tuple(&vp9_highbd_dc_predictor_16x16_sse2,
+                                       &vp9_highbd_dc_predictor_16x16_c, 16, 8),
+                            make_tuple(&vp9_highbd_v_predictor_4x4_sse,
+                                       &vp9_highbd_v_predictor_4x4_c, 4, 8),
+                            make_tuple(&vp9_highbd_v_predictor_8x8_sse2,
+                                       &vp9_highbd_v_predictor_8x8_c, 8, 8),
+                            make_tuple(&vp9_highbd_v_predictor_16x16_sse2,
+                                       &vp9_highbd_v_predictor_16x16_c, 16, 8),
+                            make_tuple(&vp9_highbd_v_predictor_32x32_sse2,
+                                       &vp9_highbd_v_predictor_32x32_c, 32, 8),
+                            make_tuple(&vp9_highbd_tm_predictor_4x4_sse,
+                                       &vp9_highbd_tm_predictor_4x4_c, 4, 8),
+                            make_tuple(&vp9_highbd_tm_predictor_8x8_sse2,
+                                       &vp9_highbd_tm_predictor_8x8_c, 8, 8)));
 #else
 INSTANTIATE_TEST_CASE_P(SSE2_TO_C_8, VP9IntraPredTest,
                         ::testing::Values(
-                            make_tuple(&vp9_high_dc_predictor_4x4_sse,
-                                       &vp9_high_dc_predictor_4x4_c, 4, 8),
-                            make_tuple(&vp9_high_dc_predictor_8x8_sse2,
-                                       &vp9_high_dc_predictor_8x8_c, 8, 8),
-                            make_tuple(&vp9_high_dc_predictor_16x16_sse2,
-                                       &vp9_high_dc_predictor_16x16_c, 16, 8),
-                            make_tuple(&vp9_high_v_predictor_4x4_sse,
-                                       &vp9_high_v_predictor_4x4_c, 4, 8),
-                            make_tuple(&vp9_high_v_predictor_8x8_sse2,
-                                       &vp9_high_v_predictor_8x8_c, 8, 8),
-                            make_tuple(&vp9_high_v_predictor_16x16_sse2,
-                                       &vp9_high_v_predictor_16x16_c, 16, 8),
-                            make_tuple(&vp9_high_v_predictor_32x32_sse2,
-                                       &vp9_high_v_predictor_32x32_c, 32, 8),
-                            make_tuple(&vp9_high_tm_predictor_4x4_sse,
-                                       &vp9_high_tm_predictor_4x4_c, 4, 8),
-                            make_tuple(&vp9_high_tm_predictor_8x8_sse2,
-                                       &vp9_high_tm_predictor_8x8_c, 8, 8)));
+                            make_tuple(&vp9_highbd_dc_predictor_4x4_sse,
+                                       &vp9_highbd_dc_predictor_4x4_c, 4, 8),
+                            make_tuple(&vp9_highbd_dc_predictor_8x8_sse2,
+                                       &vp9_highbd_dc_predictor_8x8_c, 8, 8),
+                            make_tuple(&vp9_highbd_dc_predictor_16x16_sse2,
+                                       &vp9_highbd_dc_predictor_16x16_c, 16, 8),
+                            make_tuple(&vp9_highbd_v_predictor_4x4_sse,
+                                       &vp9_highbd_v_predictor_4x4_c, 4, 8),
+                            make_tuple(&vp9_highbd_v_predictor_8x8_sse2,
+                                       &vp9_highbd_v_predictor_8x8_c, 8, 8),
+                            make_tuple(&vp9_highbd_v_predictor_16x16_sse2,
+                                       &vp9_highbd_v_predictor_16x16_c, 16, 8),
+                            make_tuple(&vp9_highbd_v_predictor_32x32_sse2,
+                                       &vp9_highbd_v_predictor_32x32_c, 32, 8),
+                            make_tuple(&vp9_highbd_tm_predictor_4x4_sse,
+                                       &vp9_highbd_tm_predictor_4x4_c, 4, 8),
+                            make_tuple(&vp9_highbd_tm_predictor_8x8_sse2,
+                                       &vp9_highbd_tm_predictor_8x8_c, 8, 8)));
 #endif
 #if ARCH_X86_64
 INSTANTIATE_TEST_CASE_P(SSE2_TO_C_10, VP9IntraPredTest,
                         ::testing::Values(
-                            make_tuple(&vp9_high_dc_predictor_32x32_sse2,
-                                       &vp9_high_dc_predictor_32x32_c, 32, 10),
-                            make_tuple(&vp9_high_tm_predictor_16x16_sse2,
-                                       &vp9_high_tm_predictor_16x16_c, 16, 10),
-                            make_tuple(&vp9_high_tm_predictor_32x32_sse2,
-                                       &vp9_high_tm_predictor_32x32_c, 32, 10),
-                            make_tuple(&vp9_high_dc_predictor_4x4_sse,
-                                       &vp9_high_dc_predictor_4x4_c, 4, 10),
-                            make_tuple(&vp9_high_dc_predictor_8x8_sse2,
-                                       &vp9_high_dc_predictor_8x8_c, 8, 10),
-                            make_tuple(&vp9_high_dc_predictor_16x16_sse2,
-                                   &vp9_high_dc_predictor_16x16_c, 16, 10),
-                            make_tuple(&vp9_high_v_predictor_4x4_sse,
-                                       &vp9_high_v_predictor_4x4_c, 4, 10),
-                            make_tuple(&vp9_high_v_predictor_8x8_sse2,
-                                       &vp9_high_v_predictor_8x8_c, 8, 10),
-                            make_tuple(&vp9_high_v_predictor_16x16_sse2,
-                                       &vp9_high_v_predictor_16x16_c, 16, 10),
-                            make_tuple(&vp9_high_v_predictor_32x32_sse2,
-                                       &vp9_high_v_predictor_32x32_c, 32, 10),
-                            make_tuple(&vp9_high_tm_predictor_4x4_sse,
-                                       &vp9_high_tm_predictor_4x4_c, 4, 10),
-                            make_tuple(&vp9_high_tm_predictor_8x8_sse2,
-                                       &vp9_high_tm_predictor_8x8_c, 8, 10)));
+                            make_tuple(&vp9_highbd_dc_predictor_32x32_sse2,
+                                       &vp9_highbd_dc_predictor_32x32_c, 32,
+                                       10),
+                            make_tuple(&vp9_highbd_tm_predictor_16x16_sse2,
+                                       &vp9_highbd_tm_predictor_16x16_c, 16,
+                                       10),
+                            make_tuple(&vp9_highbd_tm_predictor_32x32_sse2,
+                                       &vp9_highbd_tm_predictor_32x32_c, 32,
+                                       10),
+                            make_tuple(&vp9_highbd_dc_predictor_4x4_sse,
+                                       &vp9_highbd_dc_predictor_4x4_c, 4, 10),
+                            make_tuple(&vp9_highbd_dc_predictor_8x8_sse2,
+                                       &vp9_highbd_dc_predictor_8x8_c, 8, 10),
+                            make_tuple(&vp9_highbd_dc_predictor_16x16_sse2,
+                                       &vp9_highbd_dc_predictor_16x16_c, 16,
+                                       10),
+                            make_tuple(&vp9_highbd_v_predictor_4x4_sse,
+                                       &vp9_highbd_v_predictor_4x4_c, 4, 10),
+                            make_tuple(&vp9_highbd_v_predictor_8x8_sse2,
+                                       &vp9_highbd_v_predictor_8x8_c, 8, 10),
+                            make_tuple(&vp9_highbd_v_predictor_16x16_sse2,
+                                       &vp9_highbd_v_predictor_16x16_c, 16,
+                                       10),
+                            make_tuple(&vp9_highbd_v_predictor_32x32_sse2,
+                                       &vp9_highbd_v_predictor_32x32_c, 32,
+                                       10),
+                            make_tuple(&vp9_highbd_tm_predictor_4x4_sse,
+                                       &vp9_highbd_tm_predictor_4x4_c, 4, 10),
+                            make_tuple(&vp9_highbd_tm_predictor_8x8_sse2,
+                                       &vp9_highbd_tm_predictor_8x8_c, 8, 10)));
 #else
 INSTANTIATE_TEST_CASE_P(SSE2_TO_C_10, VP9IntraPredTest,
                         ::testing::Values(
-                            make_tuple(&vp9_high_dc_predictor_4x4_sse,
-                                       &vp9_high_dc_predictor_4x4_c, 4, 10),
-                            make_tuple(&vp9_high_dc_predictor_8x8_sse2,
-                                       &vp9_high_dc_predictor_8x8_c, 8, 10),
-                            make_tuple(&vp9_high_dc_predictor_16x16_sse2,
-                                       &vp9_high_dc_predictor_16x16_c, 16, 10),
-                            make_tuple(&vp9_high_v_predictor_4x4_sse,
-                                       &vp9_high_v_predictor_4x4_c, 4, 10),
-                            make_tuple(&vp9_high_v_predictor_8x8_sse2,
-                                       &vp9_high_v_predictor_8x8_c, 8, 10),
-                            make_tuple(&vp9_high_v_predictor_16x16_sse2,
-                                       &vp9_high_v_predictor_16x16_c, 16, 10),
-                            make_tuple(&vp9_high_v_predictor_32x32_sse2,
-                                       &vp9_high_v_predictor_32x32_c, 32, 10),
-                            make_tuple(&vp9_high_tm_predictor_4x4_sse,
-                                   &vp9_high_tm_predictor_4x4_c, 4, 10),
-                            make_tuple(&vp9_high_tm_predictor_8x8_sse2,
-                                       &vp9_high_tm_predictor_8x8_c, 8, 10)));
+                            make_tuple(&vp9_highbd_dc_predictor_4x4_sse,
+                                       &vp9_highbd_dc_predictor_4x4_c, 4, 10),
+                            make_tuple(&vp9_highbd_dc_predictor_8x8_sse2,
+                                       &vp9_highbd_dc_predictor_8x8_c, 8, 10),
+                            make_tuple(&vp9_highbd_dc_predictor_16x16_sse2,
+                                       &vp9_highbd_dc_predictor_16x16_c, 16,
+                                       10),
+                            make_tuple(&vp9_highbd_v_predictor_4x4_sse,
+                                       &vp9_highbd_v_predictor_4x4_c, 4, 10),
+                            make_tuple(&vp9_highbd_v_predictor_8x8_sse2,
+                                       &vp9_highbd_v_predictor_8x8_c, 8, 10),
+                            make_tuple(&vp9_highbd_v_predictor_16x16_sse2,
+                                       &vp9_highbd_v_predictor_16x16_c, 16, 10),
+                            make_tuple(&vp9_highbd_v_predictor_32x32_sse2,
+                                       &vp9_highbd_v_predictor_32x32_c, 32, 10),
+                            make_tuple(&vp9_highbd_tm_predictor_4x4_sse,
+                                       &vp9_highbd_tm_predictor_4x4_c, 4, 10),
+                            make_tuple(&vp9_highbd_tm_predictor_8x8_sse2,
+                                       &vp9_highbd_tm_predictor_8x8_c, 8, 10)));
 #endif
 
 #if ARCH_X86_64
 INSTANTIATE_TEST_CASE_P(SSE2_TO_C_12, VP9IntraPredTest,
                         ::testing::Values(
-                            make_tuple(&vp9_high_dc_predictor_32x32_sse2,
-                                       &vp9_high_dc_predictor_32x32_c, 32, 12),
-                            make_tuple(&vp9_high_tm_predictor_16x16_sse2,
-                                       &vp9_high_tm_predictor_16x16_c, 16, 12),
-                            make_tuple(&vp9_high_tm_predictor_32x32_sse2,
-                                       &vp9_high_tm_predictor_32x32_c, 32, 12),
-                            make_tuple(&vp9_high_dc_predictor_4x4_sse,
-                                       &vp9_high_dc_predictor_4x4_c, 4, 12),
-                            make_tuple(&vp9_high_dc_predictor_8x8_sse2,
-                                       &vp9_high_dc_predictor_8x8_c, 8, 12),
-                            make_tuple(&vp9_high_dc_predictor_16x16_sse2,
-                                       &vp9_high_dc_predictor_16x16_c, 16, 12),
-                            make_tuple(&vp9_high_v_predictor_4x4_sse,
-                                       &vp9_high_v_predictor_4x4_c, 4, 12),
-                            make_tuple(&vp9_high_v_predictor_8x8_sse2,
-                                       &vp9_high_v_predictor_8x8_c, 8, 12),
-                            make_tuple(&vp9_high_v_predictor_16x16_sse2,
-                                       &vp9_high_v_predictor_16x16_c, 16, 12),
-                            make_tuple(&vp9_high_v_predictor_32x32_sse2,
-                                       &vp9_high_v_predictor_32x32_c, 32, 12),
-                            make_tuple(&vp9_high_tm_predictor_4x4_sse,
-                                       &vp9_high_tm_predictor_4x4_c, 4, 12),
-                            make_tuple(&vp9_high_tm_predictor_8x8_sse2,
-                                       &vp9_high_tm_predictor_8x8_c, 8, 12)));
+                            make_tuple(&vp9_highbd_dc_predictor_32x32_sse2,
+                                       &vp9_highbd_dc_predictor_32x32_c, 32,
+                                       12),
+                            make_tuple(&vp9_highbd_tm_predictor_16x16_sse2,
+                                       &vp9_highbd_tm_predictor_16x16_c, 16,
+                                       12),
+                            make_tuple(&vp9_highbd_tm_predictor_32x32_sse2,
+                                       &vp9_highbd_tm_predictor_32x32_c, 32,
+                                       12),
+                            make_tuple(&vp9_highbd_dc_predictor_4x4_sse,
+                                       &vp9_highbd_dc_predictor_4x4_c, 4, 12),
+                            make_tuple(&vp9_highbd_dc_predictor_8x8_sse2,
+                                       &vp9_highbd_dc_predictor_8x8_c, 8, 12),
+                            make_tuple(&vp9_highbd_dc_predictor_16x16_sse2,
+                                       &vp9_highbd_dc_predictor_16x16_c, 16,
+                                       12),
+                            make_tuple(&vp9_highbd_v_predictor_4x4_sse,
+                                       &vp9_highbd_v_predictor_4x4_c, 4, 12),
+                            make_tuple(&vp9_highbd_v_predictor_8x8_sse2,
+                                       &vp9_highbd_v_predictor_8x8_c, 8, 12),
+                            make_tuple(&vp9_highbd_v_predictor_16x16_sse2,
+                                       &vp9_highbd_v_predictor_16x16_c, 16,
+                                       12),
+                            make_tuple(&vp9_highbd_v_predictor_32x32_sse2,
+                                       &vp9_highbd_v_predictor_32x32_c, 32,
+                                       12),
+                            make_tuple(&vp9_highbd_tm_predictor_4x4_sse,
+                                       &vp9_highbd_tm_predictor_4x4_c, 4, 12),
+                            make_tuple(&vp9_highbd_tm_predictor_8x8_sse2,
+                                       &vp9_highbd_tm_predictor_8x8_c, 8, 12)));
 #else
 INSTANTIATE_TEST_CASE_P(SSE2_TO_C_12, VP9IntraPredTest,
                         ::testing::Values(
-                            make_tuple(&vp9_high_dc_predictor_4x4_sse,
-                                       &vp9_high_dc_predictor_4x4_c, 4, 12),
-                            make_tuple(&vp9_high_dc_predictor_8x8_sse2,
-                                       &vp9_high_dc_predictor_8x8_c, 8, 12),
-                            make_tuple(&vp9_high_dc_predictor_16x16_sse2,
-                                       &vp9_high_dc_predictor_16x16_c, 16, 12),
-                            make_tuple(&vp9_high_v_predictor_4x4_sse,
-                                       &vp9_high_v_predictor_4x4_c, 4, 12),
-                            make_tuple(&vp9_high_v_predictor_8x8_sse2,
-                                       &vp9_high_v_predictor_8x8_c, 8, 12),
-                            make_tuple(&vp9_high_v_predictor_16x16_sse2,
-                                       &vp9_high_v_predictor_16x16_c, 16, 12),
-                            make_tuple(&vp9_high_v_predictor_32x32_sse2,
-                                       &vp9_high_v_predictor_32x32_c, 32, 12),
-                            make_tuple(&vp9_high_tm_predictor_4x4_sse,
-                                       &vp9_high_tm_predictor_4x4_c, 4, 12),
-                            make_tuple(&vp9_high_tm_predictor_8x8_sse2,
-                                       &vp9_high_tm_predictor_8x8_c, 8, 12)));
+                            make_tuple(&vp9_highbd_dc_predictor_4x4_sse,
+                                       &vp9_highbd_dc_predictor_4x4_c, 4, 12),
+                            make_tuple(&vp9_highbd_dc_predictor_8x8_sse2,
+                                       &vp9_highbd_dc_predictor_8x8_c, 8, 12),
+                            make_tuple(&vp9_highbd_dc_predictor_16x16_sse2,
+                                       &vp9_highbd_dc_predictor_16x16_c, 16,
+                                       12),
+                            make_tuple(&vp9_highbd_v_predictor_4x4_sse,
+                                       &vp9_highbd_v_predictor_4x4_c, 4, 12),
+                            make_tuple(&vp9_highbd_v_predictor_8x8_sse2,
+                                       &vp9_highbd_v_predictor_8x8_c, 8, 12),
+                            make_tuple(&vp9_highbd_v_predictor_16x16_sse2,
+                                       &vp9_highbd_v_predictor_16x16_c, 16, 12),
+                            make_tuple(&vp9_highbd_v_predictor_32x32_sse2,
+                                       &vp9_highbd_v_predictor_32x32_c, 32, 12),
+                            make_tuple(&vp9_highbd_tm_predictor_4x4_sse,
+                                       &vp9_highbd_tm_predictor_4x4_c, 4, 12),
+                            make_tuple(&vp9_highbd_tm_predictor_8x8_sse2,
+                                       &vp9_highbd_tm_predictor_8x8_c, 8, 12)));
 #endif
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 #endif  // HAVE_SSE2
diff --git a/source/libvpx/test/vp9_lossless_test.cc b/source/libvpx/test/vp9_lossless_test.cc
index b3b9c92..67215d3 100644
--- a/source/libvpx/test/vp9_lossless_test.cc
+++ b/source/libvpx/test/vp9_lossless_test.cc
@@ -19,17 +19,17 @@
 
 const int kMaxPsnr = 100;
 
-class LosslessTestLarge : public ::libvpx_test::EncoderTest,
+class LosslessTest : public ::libvpx_test::EncoderTest,
     public ::libvpx_test::CodecTestWithParam<libvpx_test::TestMode> {
  protected:
-  LosslessTestLarge()
+  LosslessTest()
       : EncoderTest(GET_PARAM(0)),
         psnr_(kMaxPsnr),
         nframes_(0),
         encoding_mode_(GET_PARAM(1)) {
   }
 
-  virtual ~LosslessTestLarge() {}
+  virtual ~LosslessTest() {}
 
   virtual void SetUp() {
     InitializeConfig();
@@ -67,7 +67,7 @@
   libvpx_test::TestMode encoding_mode_;
 };
 
-TEST_P(LosslessTestLarge, TestLossLessEncoding) {
+TEST_P(LosslessTest, TestLossLessEncoding) {
   const vpx_rational timebase = { 33333333, 1000000000 };
   cfg_.g_timebase = timebase;
   cfg_.rc_target_bitrate = 2000;
@@ -85,7 +85,7 @@
   EXPECT_GE(psnr_lossless, kMaxPsnr);
 }
 
-TEST_P(LosslessTestLarge, TestLossLessEncoding444) {
+TEST_P(LosslessTest, TestLossLessEncoding444) {
   libvpx_test::Y4mVideoSource video("rush_hour_444.y4m", 0, 10);
 
   cfg_.g_profile = 1;
@@ -102,7 +102,7 @@
   EXPECT_GE(psnr_lossless, kMaxPsnr);
 }
 
-TEST_P(LosslessTestLarge, TestLossLessEncodingCtrl) {
+TEST_P(LosslessTest, TestLossLessEncodingCtrl) {
   const vpx_rational timebase = { 33333333, 1000000000 };
   cfg_.g_timebase = timebase;
   cfg_.rc_target_bitrate = 2000;
@@ -121,5 +121,8 @@
   EXPECT_GE(psnr_lossless, kMaxPsnr);
 }
 
-VP9_INSTANTIATE_TEST_CASE(LosslessTestLarge, ALL_TEST_MODES);
+VP9_INSTANTIATE_TEST_CASE(LosslessTest,
+                          ::testing::Values(::libvpx_test::kRealTime,
+                                            ::libvpx_test::kOnePassGood,
+                                            ::libvpx_test::kTwoPassGood));
 }  // namespace
diff --git a/source/libvpx/test/yuv_video_source.h b/source/libvpx/test/yuv_video_source.h
new file mode 100644
index 0000000..3c852b2
--- /dev/null
+++ b/source/libvpx/test/yuv_video_source.h
@@ -0,0 +1,151 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+#ifndef TEST_YUV_VIDEO_SOURCE_H_
+#define TEST_YUV_VIDEO_SOURCE_H_
+
+#include <cstdio>
+#include <cstdlib>
+#include <string>
+
+#include "test/video_source.h"
+#include "vpx/vpx_image.h"
+
+namespace libvpx_test {
+
+// This class extends VideoSource to allow parsing of raw YUV
+// formats of various color sampling and bit-depths so that we can
+// do actual file encodes.
+class YUVVideoSource : public VideoSource {
+ public:
+  YUVVideoSource(const std::string &file_name, vpx_img_fmt format,
+                 unsigned int width, unsigned int height,
+                 int rate_numerator, int rate_denominator,
+                 unsigned int start, int limit)
+      : file_name_(file_name),
+        input_file_(NULL),
+        img_(NULL),
+        start_(start),
+        limit_(limit),
+        frame_(0),
+        width_(0),
+        height_(0),
+        format_(VPX_IMG_FMT_NONE),
+        framerate_numerator_(rate_numerator),
+        framerate_denominator_(rate_denominator) {
+    // This initializes format_, raw_size_, width_, height_ and allocates img.
+    SetSize(width, height, format);
+  }
+
+  virtual ~YUVVideoSource() {
+    vpx_img_free(img_);
+    if (input_file_)
+      fclose(input_file_);
+  }
+
+  virtual void Begin() {
+    if (input_file_)
+      fclose(input_file_);
+    input_file_ = OpenTestDataFile(file_name_);
+    ASSERT_TRUE(input_file_ != NULL) << "Input file open failed. Filename: "
+                                     << file_name_;
+    if (start_)
+      fseek(input_file_, static_cast<unsigned>(raw_size_) * start_, SEEK_SET);
+
+    frame_ = start_;
+    FillFrame();
+  }
+
+  virtual void Next() {
+    ++frame_;
+    FillFrame();
+  }
+
+  virtual vpx_image_t *img() const { return (frame_ < limit_) ? img_ : NULL;  }
+
+  // Models a stream where Timebase = 1/FPS, so pts == frame.
+  virtual vpx_codec_pts_t pts() const { return frame_; }
+
+  virtual unsigned long duration() const { return 1; }
+
+  virtual vpx_rational_t timebase() const {
+    const vpx_rational_t t = { framerate_denominator_, framerate_numerator_ };
+    return t;
+  }
+
+  virtual unsigned int frame() const { return frame_; }
+
+  virtual unsigned int limit() const { return limit_; }
+
+  virtual void SetSize(unsigned int width, unsigned int height,
+                       vpx_img_fmt format) {
+    if (width != width_ || height != height_ || format != format_) {
+      vpx_img_free(img_);
+      img_ = vpx_img_alloc(NULL, format, width, height, 1);
+      ASSERT_TRUE(img_ != NULL);
+      width_ = width;
+      height_ = height;
+      format_ = format;
+      switch (format) {
+        case VPX_IMG_FMT_I420:
+          raw_size_ = width * height * 3 / 2;
+          break;
+        case VPX_IMG_FMT_I422:
+          raw_size_ = width * height * 2;
+          break;
+        case VPX_IMG_FMT_I440:
+          raw_size_ = width * height * 2;
+          break;
+        case VPX_IMG_FMT_I444:
+          raw_size_ = width * height * 3;
+          break;
+        case VPX_IMG_FMT_I42016:
+          raw_size_ = width * height * 3;
+          break;
+        case VPX_IMG_FMT_I42216:
+          raw_size_ = width * height * 4;
+          break;
+        case VPX_IMG_FMT_I44016:
+          raw_size_ = width * height * 4;
+          break;
+        case VPX_IMG_FMT_I44416:
+          raw_size_ = width * height * 6;
+          break;
+        default:
+          ASSERT_TRUE(0);
+      }
+    }
+  }
+
+  virtual void FillFrame() {
+    ASSERT_TRUE(input_file_ != NULL);
+    // Read a frame from input_file.
+    if (fread(img_->img_data, raw_size_, 1, input_file_) == 0) {
+      limit_ = frame_;
+    }
+  }
+
+ protected:
+  std::string file_name_;
+  FILE *input_file_;
+  vpx_image_t *img_;
+  size_t raw_size_;
+  unsigned int start_;
+  unsigned int limit_;
+  unsigned int frame_;
+  unsigned int width_;
+  unsigned int height_;
+  vpx_img_fmt format_;
+  int framerate_numerator_;
+  int framerate_denominator_;
+};
+
+}  // namespace libvpx_test
+
+#endif  // TEST_YUV_VIDEO_SOURCE_H_
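
Editor's note: the class above is self-contained, so a minimal usage sketch may
help; it is not part of the diff, and the clip name, dimensions, frame rate and
frame count are hypothetical.

    #include "test/yuv_video_source.h"

    namespace {

    void EncodeAllFrames() {
      // 352x288 I420 clip at 30000/1001 fps, starting at frame 0, 60 frames.
      libvpx_test::YUVVideoSource video("clip_w352h288.yuv", VPX_IMG_FMT_I420,
                                        352, 288, 30000, 1001, 0, 60);
      video.Begin();
      while (video.img() != NULL) {
        // Hand video.img(), video.pts() and video.duration() to the encoder.
        video.Next();
      }
    }

    }  // namespace
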
diff --git a/source/libvpx/tools_common.c b/source/libvpx/tools_common.c
index 2ec1711..e243a91 100644
--- a/source/libvpx/tools_common.c
+++ b/source/libvpx/tools_common.c
@@ -224,7 +224,8 @@
   for (plane = 0; plane < 3; ++plane) {
     const unsigned char *buf = img->planes[plane];
     const int stride = img->stride[plane];
-    const int w = vpx_img_plane_width(img, plane);
+    const int w = vpx_img_plane_width(img, plane) *
+        ((img->fmt & VPX_IMG_FMT_HIGHBITDEPTH) ? 2 : 1);
     const int h = vpx_img_plane_height(img, plane);
     int y;
 
@@ -267,3 +268,219 @@
     return kMaxPSNR;
   }
 }
+
+// TODO(debargha): Consolidate the functions below into a separate file.
+#if CONFIG_VP9 && CONFIG_VP9_HIGHBITDEPTH
+static void highbd_img_upshift(vpx_image_t *dst, vpx_image_t *src,
+                               int input_shift) {
+  // Note the offset is 1 less than half.
+  const int offset = input_shift > 0 ? (1 << (input_shift - 1)) - 1 : 0;
+  int plane;
+  if (dst->d_w != src->d_w || dst->d_h != src->d_h ||
+      dst->x_chroma_shift != src->x_chroma_shift ||
+      dst->y_chroma_shift != src->y_chroma_shift ||
+      dst->fmt != src->fmt || input_shift < 0) {
+    fatal("Unsupported image conversion");
+  }
+  switch (src->fmt) {
+    case VPX_IMG_FMT_I42016:
+    case VPX_IMG_FMT_I42216:
+    case VPX_IMG_FMT_I44416:
+    case VPX_IMG_FMT_I44016:
+      break;
+    default:
+      fatal("Unsupported image conversion");
+      break;
+  }
+  for (plane = 0; plane < 3; plane++) {
+    int w = src->d_w;
+    int h = src->d_h;
+    int x, y;
+    if (plane) {
+      w = (w + src->x_chroma_shift) >> src->x_chroma_shift;
+      h = (h + src->y_chroma_shift) >> src->y_chroma_shift;
+    }
+    for (y = 0; y < h; y++) {
+      uint16_t *p_src =
+          (uint16_t *)(src->planes[plane] + y * src->stride[plane]);
+      uint16_t *p_dst =
+          (uint16_t *)(dst->planes[plane] + y * dst->stride[plane]);
+      for (x = 0; x < w; x++)
+        *p_dst++ = (*p_src++ << input_shift) + offset;
+    }
+  }
+}
+
+static void lowbd_img_upshift(vpx_image_t *dst, vpx_image_t *src,
+                              int input_shift) {
+  // Note the offset is 1 less than half.
+  const int offset = input_shift > 0 ? (1 << (input_shift - 1)) - 1 : 0;
+  int plane;
+  if (dst->d_w != src->d_w || dst->d_h != src->d_h ||
+      dst->x_chroma_shift != src->x_chroma_shift ||
+      dst->y_chroma_shift != src->y_chroma_shift ||
+      dst->fmt != src->fmt + VPX_IMG_FMT_HIGHBITDEPTH ||
+      input_shift < 0) {
+    fatal("Unsupported image conversion");
+  }
+  switch (src->fmt) {
+    case VPX_IMG_FMT_I420:
+    case VPX_IMG_FMT_I422:
+    case VPX_IMG_FMT_I444:
+    case VPX_IMG_FMT_I440:
+      break;
+    default:
+      fatal("Unsupported image conversion");
+      break;
+  }
+  for (plane = 0; plane < 3; plane++) {
+    int w = src->d_w;
+    int h = src->d_h;
+    int x, y;
+    if (plane) {
+      w = (w + src->x_chroma_shift) >> src->x_chroma_shift;
+      h = (h + src->y_chroma_shift) >> src->y_chroma_shift;
+    }
+    for (y = 0; y < h; y++) {
+      uint8_t *p_src = src->planes[plane] + y * src->stride[plane];
+      uint16_t *p_dst =
+          (uint16_t *)(dst->planes[plane] + y * dst->stride[plane]);
+      for (x = 0; x < w; x++) {
+        *p_dst++ = (*p_src++ << input_shift) + offset;
+      }
+    }
+  }
+}
+
+void vpx_img_upshift(vpx_image_t *dst, vpx_image_t *src,
+                     int input_shift) {
+  if (src->fmt & VPX_IMG_FMT_HIGHBITDEPTH) {
+    highbd_img_upshift(dst, src, input_shift);
+  } else {
+    lowbd_img_upshift(dst, src, input_shift);
+  }
+}
+
+void vpx_img_truncate_16_to_8(vpx_image_t *dst, vpx_image_t *src) {
+  int plane;
+  if (dst->fmt + VPX_IMG_FMT_HIGHBITDEPTH != src->fmt ||
+      dst->d_w != src->d_w || dst->d_h != src->d_h ||
+      dst->x_chroma_shift != src->x_chroma_shift ||
+      dst->y_chroma_shift != src->y_chroma_shift) {
+    fatal("Unsupported image conversion");
+  }
+  switch (dst->fmt) {
+    case VPX_IMG_FMT_I420:
+    case VPX_IMG_FMT_I422:
+    case VPX_IMG_FMT_I444:
+    case VPX_IMG_FMT_I440:
+      break;
+    default:
+      fatal("Unsupported image conversion");
+      break;
+  }
+  for (plane = 0; plane < 3; plane++) {
+    int w = src->d_w;
+    int h = src->d_h;
+    int x, y;
+    if (plane) {
+      w = (w + src->x_chroma_shift) >> src->x_chroma_shift;
+      h = (h + src->y_chroma_shift) >> src->y_chroma_shift;
+    }
+    for (y = 0; y < h; y++) {
+      uint16_t *p_src =
+          (uint16_t *)(src->planes[plane] + y * src->stride[plane]);
+      uint8_t *p_dst = dst->planes[plane] + y * dst->stride[plane];
+      for (x = 0; x < w; x++) {
+        *p_dst++ = *p_src++;
+      }
+    }
+  }
+}
+
+static void highbd_img_downshift(vpx_image_t *dst, vpx_image_t *src,
+                                 int down_shift) {
+  int plane;
+  if (dst->d_w != src->d_w || dst->d_h != src->d_h ||
+      dst->x_chroma_shift != src->x_chroma_shift ||
+      dst->y_chroma_shift != src->y_chroma_shift ||
+      dst->fmt != src->fmt || down_shift < 0) {
+    fatal("Unsupported image conversion");
+  }
+  switch (src->fmt) {
+    case VPX_IMG_FMT_I42016:
+    case VPX_IMG_FMT_I42216:
+    case VPX_IMG_FMT_I44416:
+    case VPX_IMG_FMT_I44016:
+      break;
+    default:
+      fatal("Unsupported image conversion");
+      break;
+  }
+  for (plane = 0; plane < 3; plane++) {
+    int w = src->d_w;
+    int h = src->d_h;
+    int x, y;
+    if (plane) {
+      w = (w + src->x_chroma_shift) >> src->x_chroma_shift;
+      h = (h + src->y_chroma_shift) >> src->y_chroma_shift;
+    }
+    for (y = 0; y < h; y++) {
+      uint16_t *p_src =
+          (uint16_t *)(src->planes[plane] + y * src->stride[plane]);
+      uint16_t *p_dst =
+          (uint16_t *)(dst->planes[plane] + y * dst->stride[plane]);
+      for (x = 0; x < w; x++)
+        *p_dst++ = *p_src++ >> down_shift;
+    }
+  }
+}
+
+static void lowbd_img_downshift(vpx_image_t *dst, vpx_image_t *src,
+                                int down_shift) {
+  int plane;
+  if (dst->d_w != src->d_w || dst->d_h != src->d_h ||
+      dst->x_chroma_shift != src->x_chroma_shift ||
+      dst->y_chroma_shift != src->y_chroma_shift ||
+      src->fmt != dst->fmt + VPX_IMG_FMT_HIGHBITDEPTH ||
+      down_shift < 0) {
+    fatal("Unsupported image conversion");
+  }
+  switch (dst->fmt) {
+    case VPX_IMG_FMT_I420:
+    case VPX_IMG_FMT_I422:
+    case VPX_IMG_FMT_I444:
+    case VPX_IMG_FMT_I440:
+      break;
+    default:
+      fatal("Unsupported image conversion");
+      break;
+  }
+  for (plane = 0; plane < 3; plane++) {
+    int w = src->d_w;
+    int h = src->d_h;
+    int x, y;
+    if (plane) {
+      w = (w + src->x_chroma_shift) >> src->x_chroma_shift;
+      h = (h + src->y_chroma_shift) >> src->y_chroma_shift;
+    }
+    for (y = 0; y < h; y++) {
+      uint16_t *p_src =
+          (uint16_t *)(src->planes[plane] + y * src->stride[plane]);
+      uint8_t *p_dst = dst->planes[plane] + y * dst->stride[plane];
+      for (x = 0; x < w; x++) {
+        *p_dst++ = *p_src++ >> down_shift;
+      }
+    }
+  }
+}
+
+void vpx_img_downshift(vpx_image_t *dst, vpx_image_t *src,
+                       int down_shift) {
+  if (dst->fmt & VPX_IMG_FMT_HIGHBITDEPTH) {
+    highbd_img_downshift(dst, src, down_shift);
+  } else {
+    lowbd_img_downshift(dst, src, down_shift);
+  }
+}
+#endif  // CONFIG_VP9 && CONFIG_VP9_HIGHBITDEPTH
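
Editor's note: the offset used by the shift helpers above is one less than half
the new step size, which keeps an upshift followed by the matching downshift
lossless. A self-contained sketch of the per-sample arithmetic (plain integer
code, no libvpx types assumed):

    #include <assert.h>
    #include <stdint.h>

    int main(void) {
      const int input_shift = 2;                        /* 8-bit -> 10-bit */
      const int offset = (1 << (input_shift - 1)) - 1;  /* 1 less than half */
      const uint8_t src = 100;
      const uint16_t up = (uint16_t)((src << input_shift) + offset);  /* 401 */
      const uint8_t down = (uint8_t)(up >> input_shift);              /* 100 */
      assert(offset < (1 << input_shift) && down == src);  /* exact round trip */
      return 0;
    }
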
diff --git a/source/libvpx/tools_common.h b/source/libvpx/tools_common.h
index c1f466b..de6c38f 100644
--- a/source/libvpx/tools_common.h
+++ b/source/libvpx/tools_common.h
@@ -149,6 +149,12 @@
 
 double sse_to_psnr(double samples, double peak, double mse);
 
+#if CONFIG_VP9 && CONFIG_VP9_HIGHBITDEPTH
+void vpx_img_upshift(vpx_image_t *dst, vpx_image_t *src, int input_shift);
+void vpx_img_downshift(vpx_image_t *dst, vpx_image_t *src, int down_shift);
+void vpx_img_truncate_16_to_8(vpx_image_t *dst, vpx_image_t *src);
+#endif
+
 #ifdef __cplusplus
 }  /* extern "C" */
 #endif
diff --git a/source/libvpx/vp8/common/arm/neon/loopfilter_neon.c b/source/libvpx/vp8/common/arm/neon/loopfilter_neon.c
index e103476..9d6807a 100644
--- a/source/libvpx/vp8/common/arm/neon/loopfilter_neon.c
+++ b/source/libvpx/vp8/common/arm/neon/loopfilter_neon.c
@@ -10,6 +10,7 @@
 
 #include <arm_neon.h>
 #include "./vpx_config.h"
+#include "vpx_ports/arm.h"
 
 static INLINE void vp8_loop_filter_neon(
         uint8x16_t qblimit,  // flimit
@@ -251,38 +252,56 @@
     return;
 }
 
-#if (__GNUC__ == 4 && (__GNUC_MINOR__ == 6))
-#warning Using GCC 4.6 is not recommended
-// Some versions of gcc4.6 do not correctly process vst4_lane_u8. When built
-// with any gcc4.6, use the C code.
-extern void vp8_loop_filter_vertical_edge_c(unsigned char *s, int p,
-                                            const unsigned char *blimit,
-                                            const unsigned char *limit,
-                                            const unsigned char *thresh,
-                                            int count);
-
-void vp8_loop_filter_vertical_edge_y_neon(
-        unsigned char *src,
-        int pitch,
-        unsigned char blimit,
-        unsigned char limit,
-        unsigned char thresh) {
-  vp8_loop_filter_vertical_edge_c(src, pitch, &blimit, &limit, &thresh, 2);
-}
-
-void vp8_loop_filter_vertical_edge_uv_neon(
-        unsigned char *u,
-        int pitch,
-        unsigned char blimit,
-        unsigned char limit,
-        unsigned char thresh,
-        unsigned char *v) {
-  vp8_loop_filter_vertical_edge_c(u, pitch, &blimit, &limit, &thresh, 1);
-  vp8_loop_filter_vertical_edge_c(v, pitch, &blimit, &limit, &thresh, 1);
-}
-#else
 static INLINE void write_4x8(unsigned char *dst, int pitch,
                              const uint8x8x4_t result) {
+#ifdef VPX_INCOMPATIBLE_GCC
+    /*
+     * uint8x8x4_t result
+    00 01 02 03 | 04 05 06 07
+    10 11 12 13 | 14 15 16 17
+    20 21 22 23 | 24 25 26 27
+    30 31 32 33 | 34 35 36 37
+    ---
+    * after vtrn_u16
+    00 01 20 21 | 04 05 24 25
+    02 03 22 23 | 06 07 26 27
+    10 11 30 31 | 14 15 34 35
+    12 13 32 33 | 16 17 36 37
+    ---
+    * after vtrn_u8
+    00 10 20 30 | 04 14 24 34
+    01 11 21 31 | 05 15 25 35
+    02 12 22 32 | 06 16 26 36
+    03 13 23 33 | 07 17 27 37
+    */
+    const uint16x4x2_t r02_u16 = vtrn_u16(vreinterpret_u16_u8(result.val[0]),
+                                          vreinterpret_u16_u8(result.val[2]));
+    const uint16x4x2_t r13_u16 = vtrn_u16(vreinterpret_u16_u8(result.val[1]),
+                                          vreinterpret_u16_u8(result.val[3]));
+    const uint8x8x2_t r01_u8 = vtrn_u8(vreinterpret_u8_u16(r02_u16.val[0]),
+                                       vreinterpret_u8_u16(r13_u16.val[0]));
+    const uint8x8x2_t r23_u8 = vtrn_u8(vreinterpret_u8_u16(r02_u16.val[1]),
+                                       vreinterpret_u8_u16(r13_u16.val[1]));
+    const uint32x2_t x_0_4 = vreinterpret_u32_u8(r01_u8.val[0]);
+    const uint32x2_t x_1_5 = vreinterpret_u32_u8(r01_u8.val[1]);
+    const uint32x2_t x_2_6 = vreinterpret_u32_u8(r23_u8.val[0]);
+    const uint32x2_t x_3_7 = vreinterpret_u32_u8(r23_u8.val[1]);
+    vst1_lane_u32((uint32_t *)dst, x_0_4, 0);
+    dst += pitch;
+    vst1_lane_u32((uint32_t *)dst, x_1_5, 0);
+    dst += pitch;
+    vst1_lane_u32((uint32_t *)dst, x_2_6, 0);
+    dst += pitch;
+    vst1_lane_u32((uint32_t *)dst, x_3_7, 0);
+    dst += pitch;
+    vst1_lane_u32((uint32_t *)dst, x_0_4, 1);
+    dst += pitch;
+    vst1_lane_u32((uint32_t *)dst, x_1_5, 1);
+    dst += pitch;
+    vst1_lane_u32((uint32_t *)dst, x_2_6, 1);
+    dst += pitch;
+    vst1_lane_u32((uint32_t *)dst, x_3_7, 1);
+#else
     vst4_lane_u8(dst, result, 0);
     dst += pitch;
     vst4_lane_u8(dst, result, 1);
@@ -298,6 +317,7 @@
     vst4_lane_u8(dst, result, 6);
     dst += pitch;
     vst4_lane_u8(dst, result, 7);
+#endif  // VPX_INCOMPATIBLE_GCC
 }
 
 void vp8_loop_filter_vertical_edge_y_neon(
@@ -528,4 +548,3 @@
     vd = v - 2;
     write_4x8(vd, pitch, q4ResultH);
 }
-#endif  // (__GNUC__ == 4 && (__GNUC_MINOR__ == 6))
diff --git a/source/libvpx/vp8/common/arm/neon/loopfiltersimpleverticaledge_neon.c b/source/libvpx/vp8/common/arm/neon/loopfiltersimpleverticaledge_neon.c
index d5178bb..e1c8609 100644
--- a/source/libvpx/vp8/common/arm/neon/loopfiltersimpleverticaledge_neon.c
+++ b/source/libvpx/vp8/common/arm/neon/loopfiltersimpleverticaledge_neon.c
@@ -10,45 +10,9 @@
 
 #include <arm_neon.h>
 #include "./vpx_config.h"
+#include "vpx_ports/arm.h"
 
-#if (__GNUC__ == 4 && (__GNUC_MINOR__ >= 7))
-static INLINE void write_2x8(unsigned char *dst, int pitch,
-                             const uint8x8x2_t result,
-                             const uint8x8x2_t result2) {
-  vst2_lane_u8(dst, result, 0);
-  dst += pitch;
-  vst2_lane_u8(dst, result, 1);
-  dst += pitch;
-  vst2_lane_u8(dst, result, 2);
-  dst += pitch;
-  vst2_lane_u8(dst, result, 3);
-  dst += pitch;
-  vst2_lane_u8(dst, result, 4);
-  dst += pitch;
-  vst2_lane_u8(dst, result, 5);
-  dst += pitch;
-  vst2_lane_u8(dst, result, 6);
-  dst += pitch;
-  vst2_lane_u8(dst, result, 7);
-  dst += pitch;
-
-  vst2_lane_u8(dst, result2, 0);
-  dst += pitch;
-  vst2_lane_u8(dst, result2, 1);
-  dst += pitch;
-  vst2_lane_u8(dst, result2, 2);
-  dst += pitch;
-  vst2_lane_u8(dst, result2, 3);
-  dst += pitch;
-  vst2_lane_u8(dst, result2, 4);
-  dst += pitch;
-  vst2_lane_u8(dst, result2, 5);
-  dst += pitch;
-  vst2_lane_u8(dst, result2, 6);
-  dst += pitch;
-  vst2_lane_u8(dst, result2, 7);
-}
-#else
+#ifdef VPX_INCOMPATIBLE_GCC
 static INLINE void write_2x4(unsigned char *dst, int pitch,
                              const uint8x8x2_t result) {
     /*
@@ -88,30 +52,47 @@
   dst += pitch * 8;
   write_2x4(dst, pitch, result2);
 }
-#endif
-
-
-#if (__GNUC__ == 4 && (__GNUC_MINOR__ >= 7))
-static INLINE
-uint8x8x4_t read_4x8(unsigned char *src, int pitch, uint8x8x4_t x) {
-    x = vld4_lane_u8(src, x, 0);
-    src += pitch;
-    x = vld4_lane_u8(src, x, 1);
-    src += pitch;
-    x = vld4_lane_u8(src, x, 2);
-    src += pitch;
-    x = vld4_lane_u8(src, x, 3);
-    src += pitch;
-    x = vld4_lane_u8(src, x, 4);
-    src += pitch;
-    x = vld4_lane_u8(src, x, 5);
-    src += pitch;
-    x = vld4_lane_u8(src, x, 6);
-    src += pitch;
-    x = vld4_lane_u8(src, x, 7);
-    return x;
-}
 #else
+static INLINE void write_2x8(unsigned char *dst, int pitch,
+                             const uint8x8x2_t result,
+                             const uint8x8x2_t result2) {
+  vst2_lane_u8(dst, result, 0);
+  dst += pitch;
+  vst2_lane_u8(dst, result, 1);
+  dst += pitch;
+  vst2_lane_u8(dst, result, 2);
+  dst += pitch;
+  vst2_lane_u8(dst, result, 3);
+  dst += pitch;
+  vst2_lane_u8(dst, result, 4);
+  dst += pitch;
+  vst2_lane_u8(dst, result, 5);
+  dst += pitch;
+  vst2_lane_u8(dst, result, 6);
+  dst += pitch;
+  vst2_lane_u8(dst, result, 7);
+  dst += pitch;
+
+  vst2_lane_u8(dst, result2, 0);
+  dst += pitch;
+  vst2_lane_u8(dst, result2, 1);
+  dst += pitch;
+  vst2_lane_u8(dst, result2, 2);
+  dst += pitch;
+  vst2_lane_u8(dst, result2, 3);
+  dst += pitch;
+  vst2_lane_u8(dst, result2, 4);
+  dst += pitch;
+  vst2_lane_u8(dst, result2, 5);
+  dst += pitch;
+  vst2_lane_u8(dst, result2, 6);
+  dst += pitch;
+  vst2_lane_u8(dst, result2, 7);
+}
+#endif  // VPX_INCOMPATIBLE_GCC
+
+
+#ifdef VPX_INCOMPATIBLE_GCC
 static INLINE
 uint8x8x4_t read_4x8(unsigned char *src, int pitch, uint8x8x4_t x) {
     const uint8x8_t a = vld1_u8(src);
@@ -169,7 +150,27 @@
 
     return x;
 }
-#endif
+#else
+static INLINE
+uint8x8x4_t read_4x8(unsigned char *src, int pitch, uint8x8x4_t x) {
+    x = vld4_lane_u8(src, x, 0);
+    src += pitch;
+    x = vld4_lane_u8(src, x, 1);
+    src += pitch;
+    x = vld4_lane_u8(src, x, 2);
+    src += pitch;
+    x = vld4_lane_u8(src, x, 3);
+    src += pitch;
+    x = vld4_lane_u8(src, x, 4);
+    src += pitch;
+    x = vld4_lane_u8(src, x, 5);
+    src += pitch;
+    x = vld4_lane_u8(src, x, 6);
+    src += pitch;
+    x = vld4_lane_u8(src, x, 7);
+    return x;
+}
+#endif  // VPX_INCOMPATIBLE_GCC
 
 static INLINE void vp8_loop_filter_simple_vertical_edge_neon(
         unsigned char *s,
diff --git a/source/libvpx/vp8/encoder/arm/neon/vp8_shortwalsh4x4_neon.c b/source/libvpx/vp8/encoder/arm/neon/vp8_shortwalsh4x4_neon.c
index ffa3d91..5ad9465 100644
--- a/source/libvpx/vp8/encoder/arm/neon/vp8_shortwalsh4x4_neon.c
+++ b/source/libvpx/vp8/encoder/arm/neon/vp8_shortwalsh4x4_neon.c
@@ -9,11 +9,9 @@
  */
 
 #include <arm_neon.h>
+#include "vpx_ports/arm.h"
 
-#if (__GNUC__ == 4 && (__GNUC_MINOR__ == 6))
-#warning Using GCC 4.6 is not recommended
-// Some versions of gcc4.6 do not correctly process this function. When built
-// with any gcc4.6, use the C code.
+#ifdef VPX_INCOMPATIBLE_GCC
 #include "./vp8_rtcd.h"
 void vp8_short_walsh4x4_neon(
         int16_t *input,
@@ -128,4 +126,4 @@
     vst1q_s16(output + 8, q1s16);
     return;
 }
-#endif  // (__GNUC__ == 4 && (__GNUC_MINOR__ == 6))
+#endif  // VPX_INCOMPATIBLE_GCC
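
Editor's note: the NEON files now share one gate instead of repeating per-file
GCC version tests (and the inverted sense in loopfiltersimpleverticaledge_neon.c
is normalized). The macro itself lives in vpx_ports/arm.h, which this diff does
not show; it is assumed to look roughly like the following sketch.

    /* Hypothetical sketch of the gate in vpx_ports/arm.h (not in this diff):
     * compilers with known-bad NEON code generation take the fallback paths. */
    #if defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ <= 6
    #define VPX_INCOMPATIBLE_GCC
    #endif
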
diff --git a/source/libvpx/vp8/encoder/denoising.c b/source/libvpx/vp8/encoder/denoising.c
index 12f9734..75b2a3b 100644
--- a/source/libvpx/vp8/encoder/denoising.c
+++ b/source/libvpx/vp8/encoder/denoising.c
@@ -390,9 +390,9 @@
     denoiser->denoise_pars.scale_motion_thresh = 16;
     denoiser->denoise_pars.scale_increase_filter = 1;
     denoiser->denoise_pars.denoise_mv_bias = 60;
-    denoiser->denoise_pars.pickmode_mv_bias = 60;
-    denoiser->denoise_pars.qp_thresh = 100;
-    denoiser->denoise_pars.consec_zerolast = 10;
+    denoiser->denoise_pars.pickmode_mv_bias = 75;
+    denoiser->denoise_pars.qp_thresh = 85;
+    denoiser->denoise_pars.consec_zerolast = 15;
     denoiser->denoise_pars.spatial_blur = 20;
   }
 }
@@ -453,17 +453,17 @@
     // Bitrate thresholds and noise metric (nmse) thresholds for switching to
     // aggressive mode.
     // TODO(marpan): Adjust thresholds, including effect on resolution.
-    denoiser->bitrate_threshold = 200000;  // (bits/sec).
+    denoiser->bitrate_threshold = 300000;  // (bits/sec).
     denoiser->threshold_aggressive_mode = 35;
-    if (width * height > 640 * 480) {
-      denoiser->bitrate_threshold = 500000;
-      denoiser->threshold_aggressive_mode = 100;
+    if (width * height > 1280 * 720) {
+      denoiser->bitrate_threshold = 2000000;
+      denoiser->threshold_aggressive_mode = 1400;
     } else if (width * height > 960 * 540) {
       denoiser->bitrate_threshold = 800000;
       denoiser->threshold_aggressive_mode = 150;
-    } else if (width * height > 1280 * 720) {
-      denoiser->bitrate_threshold = 2000000;
-      denoiser->threshold_aggressive_mode = 1400;
+    } else if (width * height > 640 * 480) {
+      denoiser->bitrate_threshold = 500000;
+      denoiser->threshold_aggressive_mode = 100;
     }
     return 0;
 }
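
Editor's note: the reordering above fixes an unreachable branch. With the
640x480 test first, any HD frame matched it and the 1280x720 case could never
fire; the largest area must be tested first. A condensed restatement (function
name is illustrative; threshold values are copied from the diff):

    /* Cascade from the largest resolution down, otherwise later branches
     * are dead code. */
    void pick_denoiser_thresholds(int width, int height,
                                  int *bitrate_threshold,
                                  int *aggressive_thresh) {
      *bitrate_threshold = 300000;  /* default, bits/sec */
      *aggressive_thresh = 35;
      if (width * height > 1280 * 720) {
        *bitrate_threshold = 2000000;
        *aggressive_thresh = 1400;
      } else if (width * height > 960 * 540) {
        *bitrate_threshold = 800000;
        *aggressive_thresh = 150;
      } else if (width * height > 640 * 480) {
        *bitrate_threshold = 500000;
        *aggressive_thresh = 100;
      }
    }
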
diff --git a/source/libvpx/vp8/encoder/denoising.h b/source/libvpx/vp8/encoder/denoising.h
index fb7930b..6c1f9e2 100644
--- a/source/libvpx/vp8/encoder/denoising.h
+++ b/source/libvpx/vp8/encoder/denoising.h
@@ -27,6 +27,8 @@
 #define SUM_DIFF_FROM_AVG_THRESH_UV (8 * 8 * 8)
 #define MOTION_MAGNITUDE_THRESHOLD_UV (8*3)
 
+#define MAX_GF_ARF_DENOISE_RANGE (16)
+
 enum vp8_denoiser_decision
 {
   COPY_BLOCK,
diff --git a/source/libvpx/vp8/encoder/mcomp.c b/source/libvpx/vp8/encoder/mcomp.c
index 54abe76..545f2c8 100644
--- a/source/libvpx/vp8/encoder/mcomp.c
+++ b/source/libvpx/vp8/encoder/mcomp.c
@@ -393,8 +393,8 @@
 #endif
 
     /* central mv */
-    bestmv->as_mv.row <<= 3;
-    bestmv->as_mv.col <<= 3;
+    bestmv->as_mv.row *= 8;
+    bestmv->as_mv.col *= 8;
     startmv = *bestmv;
 
     /* calculate central point error */
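
Editor's note: the shift-to-multiply change above is not cosmetic. Motion
vector components are signed and can be negative, and left-shifting a negative
signed integer is undefined behavior in C; multiplying by 8 expresses the same
full-pel to 1/8-pel scaling with defined semantics. A minimal illustration:

    #include <assert.h>

    int main(void) {
      short row = -3;                   /* full-pel MV component */
      short scaled = (short)(row * 8);  /* -24: well defined */
      /* "row <<= 3" would be undefined behavior for negative row. */
      assert(scaled == -24);
      return 0;
    }
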
diff --git a/source/libvpx/vp8/encoder/pickinter.c b/source/libvpx/vp8/encoder/pickinter.c
index 43f8957..9d5556d 100644
--- a/source/libvpx/vp8/encoder/pickinter.c
+++ b/source/libvpx/vp8/encoder/pickinter.c
@@ -516,9 +516,8 @@
     // Adjust rd for ZEROMV and LAST, if LAST is the closest reference frame.
     if (this_mode == ZEROMV &&
         x->e_mbd.mode_info_context->mbmi.ref_frame == LAST_FRAME &&
-        (denoise_aggressive || cpi->closest_reference_frame == LAST_FRAME))
-    {
-        this_rd = ((int64_t)this_rd) * rd_adj / 100;
+        (denoise_aggressive || cpi->closest_reference_frame == LAST_FRAME)) {
+      this_rd = ((int64_t)this_rd) * rd_adj / 100;
     }
 
     check_for_encode_breakout(*sse, x);
@@ -1083,7 +1082,14 @@
         {
 
             /* Store for later use by denoiser. */
-            if (this_mode == ZEROMV && sse < zero_mv_sse )
+            // Don't denoise with GOLDEN or ALTREF if they are old reference
+            // frames (more than MAX_GF_ARF_DENOISE_RANGE frames in the past).
+            int skip_old_reference = ((this_ref_frame != LAST_FRAME) &&
+                (cpi->common.current_video_frame -
+                 cpi->current_ref_frames[this_ref_frame] >
+                 MAX_GF_ARF_DENOISE_RANGE)) ? 1 : 0;
+            if (this_mode == ZEROMV && sse < zero_mv_sse &&
+                !skip_old_reference)
             {
                 zero_mv_sse = sse;
                 x->best_zeromv_reference_frame =
@@ -1092,7 +1098,7 @@
 
             /* Store the best NEWMV in x for later use in the denoiser. */
             if (x->e_mbd.mode_info_context->mbmi.mode == NEWMV &&
-                    sse < best_sse)
+                sse < best_sse && !skip_old_reference)
             {
                 best_sse = sse;
                 x->best_sse_inter_mode = NEWMV;
diff --git a/source/libvpx/vp8/vp8_dx_iface.c b/source/libvpx/vp8/vp8_dx_iface.c
index 3ab8ed0..5aa274d 100644
--- a/source/libvpx/vp8/vp8_dx_iface.c
+++ b/source/libvpx/vp8/vp8_dx_iface.c
@@ -112,22 +112,19 @@
      * structure. More memory may be required at the time the stream
      * information becomes known.
      */
-    if (!ctx->priv)
-    {
-        vp8_init_ctx(ctx);
-        priv = (vpx_codec_alg_priv_t *)ctx->priv;
+    if (!ctx->priv) {
+      vp8_init_ctx(ctx);
+      priv = (vpx_codec_alg_priv_t *)ctx->priv;
 
-        /* initialize number of fragments to zero */
-        priv->fragments.count = 0;
-        /* is input fragments enabled? */
-        priv->fragments.enabled =
-            (priv->base.init_flags & VPX_CODEC_USE_INPUT_FRAGMENTS);
+      /* initialize number of fragments to zero */
+      priv->fragments.count = 0;
+      /* is input fragments enabled? */
+      priv->fragments.enabled =
+          (priv->base.init_flags & VPX_CODEC_USE_INPUT_FRAGMENTS);
 
-        /*post processing level initialized to do nothing */
-    }
-    else
-    {
-        priv = (vpx_codec_alg_priv_t *)ctx->priv;
+      /*post processing level initialized to do nothing */
+    } else {
+      priv = (vpx_codec_alg_priv_t *)ctx->priv;
     }
 
     priv->yv12_frame_buffers.use_frame_threads =
@@ -138,11 +135,10 @@
 
     if (priv->yv12_frame_buffers.use_frame_threads &&
         ((ctx->priv->init_flags & VPX_CODEC_USE_ERROR_CONCEALMENT) ||
-         (ctx->priv->init_flags & VPX_CODEC_USE_INPUT_FRAGMENTS)))
-    {
-        /* row-based threading, error concealment, and input fragments will
-         * not be supported when using frame-based threading */
-        res = VPX_CODEC_INVALID_PARAM;
+         (ctx->priv->init_flags & VPX_CODEC_USE_INPUT_FRAGMENTS))) {
+      /* row-based threading, error concealment, and input fragments will
+       * not be supported when using frame-based threading */
+      res = VPX_CODEC_INVALID_PARAM;
     }
 
     return res;
diff --git a/source/libvpx/vp9/common/vp9_blockd.c b/source/libvpx/vp9/common/vp9_blockd.c
index e13445f..7094a01 100644
--- a/source/libvpx/vp9/common/vp9_blockd.c
+++ b/source/libvpx/vp9/common/vp9_blockd.c
@@ -92,7 +92,7 @@
                                    void *arg) {
   int plane;
 
-  for (plane = 0; plane < MAX_MB_PLANE; plane++)
+  for (plane = 0; plane < MAX_MB_PLANE; ++plane)
     vp9_foreach_transformed_block_in_plane(xd, bsize, plane, visit, arg);
 }
 
diff --git a/source/libvpx/vp9/common/vp9_blockd.h b/source/libvpx/vp9/common/vp9_blockd.h
index 702efe0..1234d54 100644
--- a/source/libvpx/vp9/common/vp9_blockd.h
+++ b/source/libvpx/vp9/common/vp9_blockd.h
@@ -17,14 +17,10 @@
 #include "vpx_ports/mem.h"
 #include "vpx_scale/yv12config.h"
 
-#include "vp9/common/vp9_common.h"
 #include "vp9/common/vp9_common_data.h"
-#include "vp9/common/vp9_enums.h"
 #include "vp9/common/vp9_filter.h"
-#include "vp9/common/vp9_idct.h"
 #include "vp9/common/vp9_mv.h"
 #include "vp9/common/vp9_scale.h"
-#include "vp9/common/vp9_seg_common.h"
 
 #ifdef __cplusplus
 extern "C" {
@@ -47,9 +43,9 @@
   PLANE_TYPES
 } PLANE_TYPE;
 
-typedef char ENTROPY_CONTEXT;
+#define MAX_MB_PLANE 3
 
-typedef char PARTITION_CONTEXT;
+typedef char ENTROPY_CONTEXT;
 
 static INLINE int combine_entropy_contexts(ENTROPY_CONTEXT a,
                                            ENTROPY_CONTEXT b) {
@@ -111,17 +107,6 @@
   MAX_REF_FRAMES = 4
 } MV_REFERENCE_FRAME;
 
-static INLINE int b_width_log2(BLOCK_SIZE sb_type) {
-  return b_width_log2_lookup[sb_type];
-}
-static INLINE int b_height_log2(BLOCK_SIZE sb_type) {
-  return b_height_log2_lookup[sb_type];
-}
-
-static INLINE int mi_width_log2(BLOCK_SIZE sb_type) {
-  return mi_width_log2_lookup[sb_type];
-}
-
 // This structure now relates to 8x8 block regions.
 typedef struct {
   // Common for both INTER and INTRA blocks
@@ -173,8 +158,6 @@
   MV_PRECISION_Q4
 };
 
-enum { MAX_MB_PLANE = 3 };
-
 struct buf_2d {
   uint8_t *buf;
   int stride;
@@ -312,7 +295,7 @@
 static INLINE void txfrm_block_to_raster_xy(BLOCK_SIZE plane_bsize,
                                             TX_SIZE tx_size, int block,
                                             int *x, int *y) {
-  const int bwl = b_width_log2(plane_bsize);
+  const int bwl = b_width_log2_lookup[plane_bsize];
   const int tx_cols_log2 = bwl - tx_size;
   const int tx_cols = 1 << tx_cols_log2;
   const int raster_mb = block >> (tx_size << 1);
diff --git a/source/libvpx/vp9/common/vp9_common.h b/source/libvpx/vp9/common/vp9_common.h
index 8305e7f..6801dd3 100644
--- a/source/libvpx/vp9/common/vp9_common.h
+++ b/source/libvpx/vp9/common/vp9_common.h
@@ -65,7 +65,7 @@
 }
 
 #if CONFIG_VP9_HIGHBITDEPTH
-static INLINE uint16_t clip_pixel_high(int val, int bd) {
+static INLINE uint16_t clip_pixel_highbd(int val, int bd) {
   switch (bd) {
     case 8:
     default:
@@ -77,8 +77,22 @@
   }
 }
 
+// Note:
+// tran_low_t  is the datatype used for final transform coefficients.
+// tran_high_t is the datatype used for intermediate transform stages.
+typedef int64_t tran_high_t;
+typedef int32_t tran_low_t;
+
 #define CONVERT_TO_SHORTPTR(x) ((uint16_t*)(((uintptr_t)x) << 1))
 #define CONVERT_TO_BYTEPTR(x) ((uint8_t*)(((uintptr_t)x) >> 1 ))
+
+#else
+
+// Note:
+// tran_low_t  is the datatype used for final transform coefficients.
+// tran_high_t is the datatype used for intermediate transform stages.
+typedef int32_t tran_high_t;
+typedef int16_t tran_low_t;
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 
 #if CONFIG_DEBUG
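
Editor's note: these typedefs give transform code a narrow type for stored
coefficients and a wider one for in-flight arithmetic. A hedged sketch of the
pattern they enable (round_shift is an illustrative name, not from this diff):

    #include <stdint.h>

    typedef int32_t tran_high_t;  /* intermediate stages, 8-bit build */
    typedef int16_t tran_low_t;   /* final coefficients, 8-bit build */

    /* Accumulate butterflies in the wide type so the multiply-adds cannot
     * overflow, then round and narrow back to coefficient precision. */
    tran_low_t round_shift(tran_high_t x, int bits) {
      return (tran_low_t)((x + (1 << (bits - 1))) >> bits);
    }

    int main(void) {
      const tran_high_t sum = 181 * 181 * 2;  /* 65522: already > INT16_MAX */
      const tran_low_t coeff = round_shift(sum, 8);
      return coeff == 256 ? 0 : 1;
    }
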
diff --git a/source/libvpx/vp9/common/vp9_common_data.c b/source/libvpx/vp9/common/vp9_common_data.c
index d4c1b71..2aaa009 100644
--- a/source/libvpx/vp9/common/vp9_common_data.c
+++ b/source/libvpx/vp9/common/vp9_common_data.c
@@ -8,8 +8,6 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-
-#include "vp9/common/vp9_blockd.h"
 #include "vp9/common/vp9_common_data.h"
 
 // Log 2 conversion lookup tables for block width and height
@@ -36,7 +34,6 @@
 const int num_pels_log2_lookup[BLOCK_SIZES] =
   {4, 5, 5, 6, 7, 7, 8, 9, 9, 10, 11, 11, 12};
 
-
 const PARTITION_TYPE partition_lookup[][BLOCK_SIZES] = {
   {  // 4X4
     // 4X4, 4X8,8X4,8X8,8X16,16X8,16X16,16X32,32X16,32X32,32X64,64X32,64X64
diff --git a/source/libvpx/vp9/common/vp9_convolve.c b/source/libvpx/vp9/common/vp9_convolve.c
index ad70e59..7b65651 100644
--- a/source/libvpx/vp9/common/vp9_convolve.c
+++ b/source/libvpx/vp9/common/vp9_convolve.c
@@ -130,16 +130,14 @@
   // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails.
   // --((64 - 1) * 32 + 15) >> 4 + 8 = 135.
   uint8_t temp[135 * 64];
-  int intermediate_height = (((h - 1) * y_step_q4 + 15) >> 4) + SUBPEL_TAPS;
+  int intermediate_height =
+          (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
 
   assert(w <= 64);
   assert(h <= 64);
   assert(y_step_q4 <= 32);
   assert(x_step_q4 <= 32);
 
-  if (intermediate_height < h)
-    intermediate_height = h;
-
   convolve_horiz(src - src_stride * (SUBPEL_TAPS / 2 - 1), src_stride, temp, 64,
                  x_filters, x0_q4, x_step_q4, w, intermediate_height);
   convolve_vert(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst, dst_stride,
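
Editor's note: the new row count folds in the actual initial phase y0_q4
instead of assuming the worst-case phase of 15, which is also why the clamp to
h removed above is no longer needed. A worked check of both formulas
(SUBPEL_BITS is 4 and SUBPEL_TAPS is 8, matching the code):

    #include <assert.h>

    int main(void) {
      const int h = 64, y_step_q4 = 8, y0_q4 = 4;  /* 2x vertical upscale */
      int old_h = (((h - 1) * y_step_q4 + 15) >> 4) + 8;           /* 40 */
      if (old_h < h) old_h = h;                    /* old clamp -> 64 */
      const int new_h = (((h - 1) * y_step_q4 + y0_q4) >> 4) + 8;  /* 39 */
      assert(old_h == 64 && new_h == 39);  /* exact rows the filter reads */
      return 0;
    }
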
@@ -284,11 +282,11 @@
 }
 
 #if CONFIG_VP9_HIGHBITDEPTH
-static void high_convolve_horiz(const uint8_t *src8, ptrdiff_t src_stride,
-                                uint8_t *dst8, ptrdiff_t dst_stride,
-                                const InterpKernel *x_filters,
-                                int x0_q4, int x_step_q4,
-                                int w, int h, int bd) {
+static void highbd_convolve_horiz(const uint8_t *src8, ptrdiff_t src_stride,
+                                  uint8_t *dst8, ptrdiff_t dst_stride,
+                                  const InterpKernel *x_filters,
+                                  int x0_q4, int x_step_q4,
+                                  int w, int h, int bd) {
   int x, y;
   uint16_t *src = CONVERT_TO_SHORTPTR(src8);
   uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
@@ -301,7 +299,7 @@
       int k, sum = 0;
       for (k = 0; k < SUBPEL_TAPS; ++k)
         sum += src_x[k] * x_filter[k];
-      dst[x] = clip_pixel_high(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd);
+      dst[x] = clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd);
       x_q4 += x_step_q4;
     }
     src += src_stride;
@@ -309,11 +307,11 @@
   }
 }
 
-static void high_convolve_avg_horiz(const uint8_t *src8, ptrdiff_t src_stride,
-                                    uint8_t *dst8, ptrdiff_t dst_stride,
-                                    const InterpKernel *x_filters,
-                                    int x0_q4, int x_step_q4,
-                                    int w, int h, int bd) {
+static void highbd_convolve_avg_horiz(const uint8_t *src8, ptrdiff_t src_stride,
+                                      uint8_t *dst8, ptrdiff_t dst_stride,
+                                      const InterpKernel *x_filters,
+                                      int x0_q4, int x_step_q4,
+                                      int w, int h, int bd) {
   int x, y;
   uint16_t *src = CONVERT_TO_SHORTPTR(src8);
   uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
@@ -327,7 +325,7 @@
       for (k = 0; k < SUBPEL_TAPS; ++k)
         sum += src_x[k] * x_filter[k];
       dst[x] = ROUND_POWER_OF_TWO(dst[x] +
-          clip_pixel_high(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd), 1);
+          clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd), 1);
       x_q4 += x_step_q4;
     }
     src += src_stride;
@@ -335,11 +333,11 @@
   }
 }
 
-static void high_convolve_vert(const uint8_t *src8, ptrdiff_t src_stride,
-                               uint8_t *dst8, ptrdiff_t dst_stride,
-                               const InterpKernel *y_filters,
-                               int y0_q4, int y_step_q4, int w, int h,
-                               int bd) {
+static void highbd_convolve_vert(const uint8_t *src8, ptrdiff_t src_stride,
+                                 uint8_t *dst8, ptrdiff_t dst_stride,
+                                 const InterpKernel *y_filters,
+                                 int y0_q4, int y_step_q4, int w, int h,
+                                 int bd) {
   int x, y;
   uint16_t *src = CONVERT_TO_SHORTPTR(src8);
   uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
@@ -352,7 +350,7 @@
       int k, sum = 0;
       for (k = 0; k < SUBPEL_TAPS; ++k)
         sum += src_y[k * src_stride] * y_filter[k];
-      dst[y * dst_stride] = clip_pixel_high(
+      dst[y * dst_stride] = clip_pixel_highbd(
           ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd);
       y_q4 += y_step_q4;
     }
@@ -361,11 +359,11 @@
   }
 }
 
-static void high_convolve_avg_vert(const uint8_t *src8, ptrdiff_t src_stride,
-                                   uint8_t *dst8, ptrdiff_t dst_stride,
-                                   const InterpKernel *y_filters,
-                                   int y0_q4, int y_step_q4, int w, int h,
-                                   int bd) {
+static void highbd_convolve_avg_vert(const uint8_t *src8, ptrdiff_t src_stride,
+                                     uint8_t *dst8, ptrdiff_t dst_stride,
+                                     const InterpKernel *y_filters,
+                                     int y0_q4, int y_step_q4, int w, int h,
+                                     int bd) {
   int x, y;
   uint16_t *src = CONVERT_TO_SHORTPTR(src8);
   uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
@@ -379,7 +377,7 @@
       for (k = 0; k < SUBPEL_TAPS; ++k)
         sum += src_y[k * src_stride] * y_filter[k];
       dst[y * dst_stride] = ROUND_POWER_OF_TWO(dst[y * dst_stride] +
-          clip_pixel_high(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd), 1);
+          clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd), 1);
       y_q4 += y_step_q4;
     }
     ++src;
@@ -387,13 +385,13 @@
   }
 }
 
-static void high_convolve(const uint8_t *src, ptrdiff_t src_stride,
-                          uint8_t *dst, ptrdiff_t dst_stride,
-                          const InterpKernel *const x_filters,
-                          int x0_q4, int x_step_q4,
-                          const InterpKernel *const y_filters,
-                          int y0_q4, int y_step_q4,
-                          int w, int h, int bd) {
+static void highbd_convolve(const uint8_t *src, ptrdiff_t src_stride,
+                            uint8_t *dst, ptrdiff_t dst_stride,
+                            const InterpKernel *const x_filters,
+                            int x0_q4, int x_step_q4,
+                            const InterpKernel *const y_filters,
+                            int y0_q4, int y_step_q4,
+                            int w, int h, int bd) {
   // Note: Fixed size intermediate buffer, temp, places limits on parameters.
   // 2d filtering proceeds in 2 steps:
   //   (1) Interpolate horizontally into an intermediate buffer, temp.
@@ -407,119 +405,117 @@
   // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails.
   // --((64 - 1) * 32 + 15) >> 4 + 8 = 135.
   uint16_t temp[64 * 135];
-  int intermediate_height = (((h - 1) * y_step_q4 + 15) >> 4) + SUBPEL_TAPS;
+  int intermediate_height =
+          (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
 
   assert(w <= 64);
   assert(h <= 64);
   assert(y_step_q4 <= 32);
   assert(x_step_q4 <= 32);
 
-  if (intermediate_height < h)
-    intermediate_height = h;
-
-  high_convolve_horiz(src - src_stride * (SUBPEL_TAPS / 2 - 1),
-                      src_stride, CONVERT_TO_BYTEPTR(temp), 64,
-                      x_filters, x0_q4, x_step_q4, w,
-                      intermediate_height, bd);
-  high_convolve_vert(CONVERT_TO_BYTEPTR(temp) + 64 * (SUBPEL_TAPS / 2 - 1),
-                     64, dst, dst_stride, y_filters, y0_q4, y_step_q4,
-                     w, h, bd);
+  highbd_convolve_horiz(src - src_stride * (SUBPEL_TAPS / 2 - 1),
+                        src_stride, CONVERT_TO_BYTEPTR(temp), 64,
+                        x_filters, x0_q4, x_step_q4, w,
+                        intermediate_height, bd);
+  highbd_convolve_vert(CONVERT_TO_BYTEPTR(temp) + 64 * (SUBPEL_TAPS / 2 - 1),
+                       64, dst, dst_stride, y_filters, y0_q4, y_step_q4,
+                       w, h, bd);
 }
 
 
-void vp9_high_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
+void vp9_highbd_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
+                                  uint8_t *dst, ptrdiff_t dst_stride,
+                                  const int16_t *filter_x, int x_step_q4,
+                                  const int16_t *filter_y, int y_step_q4,
+                                  int w, int h, int bd) {
+  const InterpKernel *const filters_x = get_filter_base(filter_x);
+  const int x0_q4 = get_filter_offset(filter_x, filters_x);
+  (void)filter_y;
+  (void)y_step_q4;
+
+  highbd_convolve_horiz(src, src_stride, dst, dst_stride, filters_x,
+                        x0_q4, x_step_q4, w, h, bd);
+}
+
+void vp9_highbd_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
+                                      uint8_t *dst, ptrdiff_t dst_stride,
+                                      const int16_t *filter_x, int x_step_q4,
+                                      const int16_t *filter_y, int y_step_q4,
+                                      int w, int h, int bd) {
+  const InterpKernel *const filters_x = get_filter_base(filter_x);
+  const int x0_q4 = get_filter_offset(filter_x, filters_x);
+  (void)filter_y;
+  (void)y_step_q4;
+
+  highbd_convolve_avg_horiz(src, src_stride, dst, dst_stride, filters_x,
+                            x0_q4, x_step_q4, w, h, bd);
+}
+
+void vp9_highbd_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride,
+                                 uint8_t *dst, ptrdiff_t dst_stride,
+                                 const int16_t *filter_x, int x_step_q4,
+                                 const int16_t *filter_y, int y_step_q4,
+                                 int w, int h, int bd) {
+  const InterpKernel *const filters_y = get_filter_base(filter_y);
+  const int y0_q4 = get_filter_offset(filter_y, filters_y);
+  (void)filter_x;
+  (void)x_step_q4;
+
+  highbd_convolve_vert(src, src_stride, dst, dst_stride, filters_y,
+                       y0_q4, y_step_q4, w, h, bd);
+}
+
+void vp9_highbd_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride,
+                                     uint8_t *dst, ptrdiff_t dst_stride,
+                                     const int16_t *filter_x, int x_step_q4,
+                                     const int16_t *filter_y, int y_step_q4,
+                                     int w, int h, int bd) {
+  const InterpKernel *const filters_y = get_filter_base(filter_y);
+  const int y0_q4 = get_filter_offset(filter_y, filters_y);
+  (void)filter_x;
+  (void)x_step_q4;
+
+  highbd_convolve_avg_vert(src, src_stride, dst, dst_stride, filters_y,
+                           y0_q4, y_step_q4, w, h, bd);
+}
+
+void vp9_highbd_convolve8_c(const uint8_t *src, ptrdiff_t src_stride,
+                            uint8_t *dst, ptrdiff_t dst_stride,
+                            const int16_t *filter_x, int x_step_q4,
+                            const int16_t *filter_y, int y_step_q4,
+                            int w, int h, int bd) {
+  const InterpKernel *const filters_x = get_filter_base(filter_x);
+  const int x0_q4 = get_filter_offset(filter_x, filters_x);
+
+  const InterpKernel *const filters_y = get_filter_base(filter_y);
+  const int y0_q4 = get_filter_offset(filter_y, filters_y);
+
+  highbd_convolve(src, src_stride, dst, dst_stride,
+                  filters_x, x0_q4, x_step_q4,
+                  filters_y, y0_q4, y_step_q4, w, h, bd);
+}
+
+void vp9_highbd_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride,
                                 uint8_t *dst, ptrdiff_t dst_stride,
                                 const int16_t *filter_x, int x_step_q4,
                                 const int16_t *filter_y, int y_step_q4,
                                 int w, int h, int bd) {
-  const InterpKernel *const filters_x = get_filter_base(filter_x);
-  const int x0_q4 = get_filter_offset(filter_x, filters_x);
-  (void)filter_y;
-  (void)y_step_q4;
-
-  high_convolve_horiz(src, src_stride, dst, dst_stride, filters_x,
-                      x0_q4, x_step_q4, w, h, bd);
-}
-
-void vp9_high_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
-                                    uint8_t *dst, ptrdiff_t dst_stride,
-                                    const int16_t *filter_x, int x_step_q4,
-                                    const int16_t *filter_y, int y_step_q4,
-                                    int w, int h, int bd) {
-  const InterpKernel *const filters_x = get_filter_base(filter_x);
-  const int x0_q4 = get_filter_offset(filter_x, filters_x);
-  (void)filter_y;
-  (void)y_step_q4;
-
-  high_convolve_avg_horiz(src, src_stride, dst, dst_stride, filters_x,
-                     x0_q4, x_step_q4, w, h, bd);
-}
-
-void vp9_high_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride,
-                               uint8_t *dst, ptrdiff_t dst_stride,
-                               const int16_t *filter_x, int x_step_q4,
-                               const int16_t *filter_y, int y_step_q4,
-                               int w, int h, int bd) {
-  const InterpKernel *const filters_y = get_filter_base(filter_y);
-  const int y0_q4 = get_filter_offset(filter_y, filters_y);
-  (void)filter_x;
-  (void)x_step_q4;
-
-  high_convolve_vert(src, src_stride, dst, dst_stride, filters_y,
-                     y0_q4, y_step_q4, w, h, bd);
-}
-
-void vp9_high_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride,
-                                   uint8_t *dst, ptrdiff_t dst_stride,
-                                   const int16_t *filter_x, int x_step_q4,
-                                   const int16_t *filter_y, int y_step_q4,
-                                   int w, int h, int bd) {
-  const InterpKernel *const filters_y = get_filter_base(filter_y);
-  const int y0_q4 = get_filter_offset(filter_y, filters_y);
-  (void)filter_x;
-  (void)x_step_q4;
-
-  high_convolve_avg_vert(src, src_stride, dst, dst_stride, filters_y,
-                         y0_q4, y_step_q4, w, h, bd);
-}
-
-void vp9_high_convolve8_c(const uint8_t *src, ptrdiff_t src_stride,
-                          uint8_t *dst, ptrdiff_t dst_stride,
-                          const int16_t *filter_x, int x_step_q4,
-                          const int16_t *filter_y, int y_step_q4,
-                          int w, int h, int bd) {
-  const InterpKernel *const filters_x = get_filter_base(filter_x);
-  const int x0_q4 = get_filter_offset(filter_x, filters_x);
-
-  const InterpKernel *const filters_y = get_filter_base(filter_y);
-  const int y0_q4 = get_filter_offset(filter_y, filters_y);
-
-  high_convolve(src, src_stride, dst, dst_stride,
-                filters_x, x0_q4, x_step_q4,
-                filters_y, y0_q4, y_step_q4, w, h, bd);
-}
-
-void vp9_high_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride,
-                              uint8_t *dst, ptrdiff_t dst_stride,
-                              const int16_t *filter_x, int x_step_q4,
-                              const int16_t *filter_y, int y_step_q4,
-                              int w, int h, int bd) {
   // Fixed size intermediate buffer places limits on parameters.
   DECLARE_ALIGNED_ARRAY(16, uint16_t, temp, 64 * 64);
   assert(w <= 64);
   assert(h <= 64);
 
-  vp9_high_convolve8_c(src, src_stride, CONVERT_TO_BYTEPTR(temp), 64,
-                       filter_x, x_step_q4, filter_y, y_step_q4, w, h, bd);
-  vp9_high_convolve_avg_c(CONVERT_TO_BYTEPTR(temp), 64, dst, dst_stride,
-                          NULL, 0, NULL, 0, w, h, bd);
+  vp9_highbd_convolve8_c(src, src_stride, CONVERT_TO_BYTEPTR(temp), 64,
+                         filter_x, x_step_q4, filter_y, y_step_q4, w, h, bd);
+  vp9_highbd_convolve_avg_c(CONVERT_TO_BYTEPTR(temp), 64, dst, dst_stride,
+                            NULL, 0, NULL, 0, w, h, bd);
 }
 
-void vp9_high_convolve_copy_c(const uint8_t *src8, ptrdiff_t src_stride,
-                              uint8_t *dst8, ptrdiff_t dst_stride,
-                              const int16_t *filter_x, int filter_x_stride,
-                              const int16_t *filter_y, int filter_y_stride,
-                              int w, int h, int bd) {
+void vp9_highbd_convolve_copy_c(const uint8_t *src8, ptrdiff_t src_stride,
+                                uint8_t *dst8, ptrdiff_t dst_stride,
+                                const int16_t *filter_x, int filter_x_stride,
+                                const int16_t *filter_y, int filter_y_stride,
+                                int w, int h, int bd) {
   int r;
   uint16_t *src = CONVERT_TO_SHORTPTR(src8);
   uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
@@ -536,11 +532,11 @@
   }
 }
 
-void vp9_high_convolve_avg_c(const uint8_t *src8, ptrdiff_t src_stride,
-                             uint8_t *dst8, ptrdiff_t dst_stride,
-                             const int16_t *filter_x, int filter_x_stride,
-                             const int16_t *filter_y, int filter_y_stride,
-                             int w, int h, int bd) {
+void vp9_highbd_convolve_avg_c(const uint8_t *src8, ptrdiff_t src_stride,
+                               uint8_t *dst8, ptrdiff_t dst_stride,
+                               const int16_t *filter_x, int filter_x_stride,
+                               const int16_t *filter_y, int filter_y_stride,
+                               int w, int h, int bd) {
   int x, y;
   uint16_t *src = CONVERT_TO_SHORTPTR(src8);
   uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
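
The intermediate_height change above is the substantive fix in this file: the
horizontal pass must cover every source row the vertical pass will read, which
depends on the initial subpel phase y0_q4, not just on y_step_q4. A minimal
standalone sketch of the bound, restating SUBPEL_BITS == 4 and SUBPEL_TAPS == 8
from vp9_filter.h (the constants and the main() harness here are for
illustration only):

#include <assert.h>
#include <stdio.h>

#define SUBPEL_BITS 4
#define SUBPEL_TAPS 8

static int intermediate_height(int h, int y_step_q4, int y0_q4) {
  /* Highest source row touched by output row (h - 1), in q4 units,
   * plus the 8-tap filter tail. */
  return (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
}

int main(void) {
  /* Worst case permitted by the asserts: h == 64, y_step_q4 == 32, and a
   * maximal starting phase of 15/16 pel. */
  const int rows = intermediate_height(64, 32, 15);
  printf("%d\n", rows);  /* 134, safely inside the 64 * 135 temp buffer */
  assert(rows <= 135);
  return 0;
}

The removed clamp to h was conservative: the vertical pass only reads the
rows this bound already accounts for.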
diff --git a/source/libvpx/vp9/common/vp9_convolve.h b/source/libvpx/vp9/common/vp9_convolve.h
index faf70b1..8b044c8 100644
--- a/source/libvpx/vp9/common/vp9_convolve.h
+++ b/source/libvpx/vp9/common/vp9_convolve.h
@@ -24,11 +24,11 @@
                               int w, int h);
 
 #if CONFIG_VP9_HIGHBITDEPTH
-typedef void (*high_convolve_fn_t)(const uint8_t *src, ptrdiff_t src_stride,
-                                   uint8_t *dst, ptrdiff_t dst_stride,
-                                   const int16_t *filter_x, int x_step_q4,
-                                   const int16_t *filter_y, int y_step_q4,
-                                   int w, int h, int bd);
+typedef void (*highbd_convolve_fn_t)(const uint8_t *src, ptrdiff_t src_stride,
+                                     uint8_t *dst, ptrdiff_t dst_stride,
+                                     const int16_t *filter_x, int x_step_q4,
+                                     const int16_t *filter_y, int y_step_q4,
+                                     int w, int h, int bd);
 #endif
 
 #ifdef __cplusplus
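
The typedef rename is signature-preserving, so call sites only need the new
spelling. A small sketch of binding one of the renamed C functions through the
pointer type, assuming the declarations above (the variable name predict is
invented for illustration):

#include <stddef.h>
#include <stdint.h>

typedef void (*highbd_convolve_fn_t)(const uint8_t *src, ptrdiff_t src_stride,
                                     uint8_t *dst, ptrdiff_t dst_stride,
                                     const int16_t *filter_x, int x_step_q4,
                                     const int16_t *filter_y, int y_step_q4,
                                     int w, int h, int bd);

void vp9_highbd_convolve8_c(const uint8_t *src, ptrdiff_t src_stride,
                            uint8_t *dst, ptrdiff_t dst_stride,
                            const int16_t *filter_x, int x_step_q4,
                            const int16_t *filter_y, int y_step_q4,
                            int w, int h, int bd);

/* Hypothetical dispatch slot; the signatures match, so this compiles. */
static highbd_convolve_fn_t predict = vp9_highbd_convolve8_c;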
diff --git a/source/libvpx/vp9/common/vp9_debugmodes.c b/source/libvpx/vp9/common/vp9_debugmodes.c
index 3f16841..d9dace6 100644
--- a/source/libvpx/vp9/common/vp9_debugmodes.c
+++ b/source/libvpx/vp9/common/vp9_debugmodes.c
@@ -27,7 +27,7 @@
   int mi_row, mi_col;
   int mi_index = 0;
   // TODO(hkuang): Fix this debug function.
-  MODE_INFO **mi = NULL;
+  MODE_INFO **mi = &cm->mi;
   int rows = cm->mi_rows;
   int cols = cm->mi_cols;
   char prefix = descriptor[0];
@@ -53,7 +53,7 @@
   int mi_index = 0;
   FILE *mvs = fopen(file, "a");
   // TODO(hkuang): Fix this debug function.
-  MODE_INFO **mi = NULL;
+  MODE_INFO **mi = &cm->mi;
   int rows = cm->mi_rows;
   int cols = cm->mi_cols;
 
diff --git a/source/libvpx/vp9/common/vp9_entropy.h b/source/libvpx/vp9/common/vp9_entropy.h
index 8cdfc5c..239c049 100644
--- a/source/libvpx/vp9/common/vp9_entropy.h
+++ b/source/libvpx/vp9/common/vp9_entropy.h
@@ -13,8 +13,8 @@
 
 #include "vpx/vpx_integer.h"
 
-#include "vp9/common/vp9_blockd.h"
 #include "vp9/common/vp9_common.h"
+#include "vp9/common/vp9_prob.h"
 #include "vp9/common/vp9_scan.h"
 
 #ifdef __cplusplus
diff --git a/source/libvpx/vp9/common/vp9_entropymode.h b/source/libvpx/vp9/common/vp9_entropymode.h
index 533757b..6831d3f 100644
--- a/source/libvpx/vp9/common/vp9_entropymode.h
+++ b/source/libvpx/vp9/common/vp9_entropymode.h
@@ -20,8 +20,6 @@
 #endif
 
 #define TX_SIZE_CONTEXTS 2
-#define SWITCHABLE_FILTERS 3   // number of switchable filters
-#define SWITCHABLE_FILTER_CONTEXTS (SWITCHABLE_FILTERS + 1)
 
 struct VP9Common;
 
diff --git a/source/libvpx/vp9/common/vp9_entropymv.h b/source/libvpx/vp9/common/vp9_entropymv.h
index e7033e4..75e6861 100644
--- a/source/libvpx/vp9/common/vp9_entropymv.h
+++ b/source/libvpx/vp9/common/vp9_entropymv.h
@@ -13,7 +13,9 @@
 #define VP9_COMMON_VP9_ENTROPYMV_H_
 
 #include "./vpx_config.h"
-#include "vp9/common/vp9_blockd.h"
+
+#include "vp9/common/vp9_mv.h"
+#include "vp9/common/vp9_prob.h"
 
 #ifdef __cplusplus
 extern "C" {
diff --git a/source/libvpx/vp9/common/vp9_enums.h b/source/libvpx/vp9/common/vp9_enums.h
index 8817fdb..f83d21f 100644
--- a/source/libvpx/vp9/common/vp9_enums.h
+++ b/source/libvpx/vp9/common/vp9_enums.h
@@ -67,6 +67,7 @@
   PARTITION_INVALID = PARTITION_TYPES
 } PARTITION_TYPE;
 
+typedef char PARTITION_CONTEXT;
 #define PARTITION_PLOFFSET   4  // number of probability models per block size
 #define PARTITION_CONTEXTS (4 * PARTITION_PLOFFSET)
 
diff --git a/source/libvpx/vp9/common/vp9_filter.h b/source/libvpx/vp9/common/vp9_filter.h
index 8c359c7..3377d45 100644
--- a/source/libvpx/vp9/common/vp9_filter.h
+++ b/source/libvpx/vp9/common/vp9_filter.h
@@ -35,6 +35,13 @@
   SWITCHABLE = 4  /* should be the last one */
 } INTERP_FILTER;
 
+// Number of switchable filters
+#define SWITCHABLE_FILTERS 3
+
+// The codec can operate in four possible inter prediction filter modes:
+// 8-tap, 8-tap-smooth, 8-tap-sharp, and switching between the three.
+#define SWITCHABLE_FILTER_CONTEXTS (SWITCHABLE_FILTERS + 1)
+
 typedef int16_t InterpKernel[SUBPEL_TAPS];
 
 const InterpKernel *vp9_get_interp_kernel(INTERP_FILTER filter);
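
The two SWITCHABLE_* macros move here from vp9_entropymode.h so they sit next
to the INTERP_FILTER enum they describe. A small consistency sketch restating
the values above (enum entries as in vp9_filter.h):

typedef enum {
  EIGHTTAP = 0,
  EIGHTTAP_SMOOTH = 1,
  EIGHTTAP_SHARP = 2,
  BILINEAR = 3,
  SWITCHABLE = 4  /* should be the last one */
} INTERP_FILTER;

#define SWITCHABLE_FILTERS 3
#define SWITCHABLE_FILTER_CONTEXTS (SWITCHABLE_FILTERS + 1)

/* Only the three 8-tap variants (indices 0..2) participate in switchable
 * filtering; the +1 covers the fourth mode, switching between the three. */
typedef char check_filter_contexts[SWITCHABLE_FILTER_CONTEXTS == 4 ? 1 : -1];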
diff --git a/source/libvpx/vp9/common/vp9_idct.c b/source/libvpx/vp9/common/vp9_idct.c
index b196fc5..d5b6f39 100644
--- a/source/libvpx/vp9/common/vp9_idct.c
+++ b/source/libvpx/vp9/common/vp9_idct.c
@@ -8,49 +8,48 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#include <assert.h>
 #include <math.h>
 
-#include "./vpx_config.h"
 #include "./vp9_rtcd.h"
 #include "vp9/common/vp9_systemdependent.h"
 #include "vp9/common/vp9_blockd.h"
-#include "vp9/common/vp9_common.h"
 #include "vp9/common/vp9_idct.h"
 
-#if CONFIG_EMULATE_HARDWARE_HIGHBITDEPTH
-// When CONFIG_EMULATE_HW_HIGHBITDEPTH is 1 the transform performs strict
-// overflow wrapping to match expected hardware implementations.
+#if CONFIG_EMULATE_HARDWARE
+// When CONFIG_EMULATE_HARDWARE is 1, the transform uses a
+// non-normative method to handle overflows. A stream that causes
+// overflows in the inverse transform is considered invalid in VP9,
+// and a hardware implementer is free to choose any reasonable
+// method to handle them. However, to aid in hardware verification,
+// implementers can use a specific implementation of the WRAPLOW()
+// macro below that is identical to their intended hardware
+// implementation (and also use configure options to trigger the
+// C implementation of the transform).
+//
+// The particular WRAPLOW implementation below performs strict
+// overflow wrapping to match common hardware implementations.
 // bd of 8 uses trans_low with 16bits, need to remove 16bits
 // bd of 10 uses trans_low with 18bits, need to remove 14bits
 // bd of 12 uses trans_low with 20bits, need to remove 12bits
 // bd of x uses trans_low with 8+x bits, need to remove 24-x bits
-#define WRAPLOW(x) ((((int32_t)x) << (24 - bd)) >> (24 - bd))
+#define WRAPLOW(x, bd) ((((int32_t)(x)) << (24 - bd)) >> (24 - bd))
 #else
-#define WRAPLOW(x) (x)
-#endif  // CONFIG_EMULATE_HARDWARE_HIGHBITDEPTH
+#define WRAPLOW(x, bd) (x)
+#endif  // CONFIG_EMULATE_HARDWARE
 
 #if CONFIG_VP9_HIGHBITDEPTH
-static INLINE tran_low_t clamp_high(tran_high_t value, tran_low_t low,
-                                    tran_low_t high) {
-  return value < low ? low : (value > high ? high : value);
-}
-
-static INLINE tran_low_t clip_pixel_bd_high(tran_high_t dest,
-                                            tran_high_t trans, int bd) {
-  trans = WRAPLOW(trans);
-  switch (bd) {
-    case 8:
-    default:
-      return clamp_high(WRAPLOW(dest + trans), 0, 255);
-    case 10:
-      return clamp_high(WRAPLOW(dest + trans), 0, 1023);
-    case 12:
-      return clamp_high(WRAPLOW(dest + trans), 0, 4095);
-  }
+static INLINE uint16_t highbd_clip_pixel_add(uint16_t dest, tran_high_t trans,
+                                             int bd) {
+  trans = WRAPLOW(trans, bd);
+  return clip_pixel_highbd(WRAPLOW(dest + trans, bd), bd);
 }
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 
+static INLINE uint8_t clip_pixel_add(uint8_t dest, tran_high_t trans) {
+  trans = WRAPLOW(trans, 8);
+  return clip_pixel(WRAPLOW(dest + trans, 8));
+}
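
With both helpers in place, every intermediate of the inverse transforms is
funnelled through WRAPLOW(). A standalone sketch of the bd == 8 emulation
behaviour, reusing the macro definition from above (the printf harness is
illustrative; like the original, the shifts rely on two's-complement
wrapping):

#include <stdint.h>
#include <stdio.h>

/* Keep the low (8 + bd) bits of x, sign-extended: for bd == 8 this is
 * ordinary int16_t wraparound. */
#define WRAPLOW(x, bd) ((((int32_t)(x)) << (24 - (bd))) >> (24 - (bd)))

int main(void) {
  printf("%d\n", WRAPLOW(32767, 8));    /* 32767: in range, unchanged   */
  printf("%d\n", WRAPLOW(32768, 8));    /* -32768: wraps past INT16_MAX */
  printf("%d\n", WRAPLOW(-32769, 8));   /* 32767: wraps past INT16_MIN  */
  printf("%d\n", WRAPLOW(1000, 10));    /* 1000: 18-bit range for bd 10 */
  return 0;
}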
+
 void vp9_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
 /* 4-point reversible, orthonormal inverse Walsh-Hadamard in 3.5 adds,
    0.5 shifts per pixel. */
@@ -72,10 +71,10 @@
     c1 = e1 - c1;
     a1 -= b1;
     d1 += c1;
-    op[0] = a1;
-    op[1] = b1;
-    op[2] = c1;
-    op[3] = d1;
+    op[0] = WRAPLOW(a1, 8);
+    op[1] = WRAPLOW(b1, 8);
+    op[2] = WRAPLOW(c1, 8);
+    op[3] = WRAPLOW(d1, 8);
     ip += 4;
     op += 4;
   }
@@ -93,10 +92,10 @@
     c1 = e1 - c1;
     a1 -= b1;
     d1 += c1;
-    dest[stride * 0] = clip_pixel(dest[stride * 0] + a1);
-    dest[stride * 1] = clip_pixel(dest[stride * 1] + b1);
-    dest[stride * 2] = clip_pixel(dest[stride * 2] + c1);
-    dest[stride * 3] = clip_pixel(dest[stride * 3] + d1);
+    dest[stride * 0] = clip_pixel_add(dest[stride * 0], a1);
+    dest[stride * 1] = clip_pixel_add(dest[stride * 1], b1);
+    dest[stride * 2] = clip_pixel_add(dest[stride * 2], c1);
+    dest[stride * 3] = clip_pixel_add(dest[stride * 3], d1);
 
     ip++;
     dest++;
@@ -113,17 +112,17 @@
   a1 = ip[0] >> UNIT_QUANT_SHIFT;
   e1 = a1 >> 1;
   a1 -= e1;
-  op[0] = a1;
-  op[1] = op[2] = op[3] = e1;
+  op[0] = WRAPLOW(a1, 8);
+  op[1] = op[2] = op[3] = WRAPLOW(e1, 8);
 
   ip = tmp;
   for (i = 0; i < 4; i++) {
     e1 = ip[0] >> 1;
     a1 = ip[0] - e1;
-    dest[dest_stride * 0] = clip_pixel(dest[dest_stride * 0] + a1);
-    dest[dest_stride * 1] = clip_pixel(dest[dest_stride * 1] + e1);
-    dest[dest_stride * 2] = clip_pixel(dest[dest_stride * 2] + e1);
-    dest[dest_stride * 3] = clip_pixel(dest[dest_stride * 3] + e1);
+    dest[dest_stride * 0] = clip_pixel_add(dest[dest_stride * 0], a1);
+    dest[dest_stride * 1] = clip_pixel_add(dest[dest_stride * 1], e1);
+    dest[dest_stride * 2] = clip_pixel_add(dest[dest_stride * 2], e1);
+    dest[dest_stride * 3] = clip_pixel_add(dest[dest_stride * 3], e1);
     ip++;
     dest++;
   }
@@ -135,18 +134,18 @@
   // stage 1
   temp1 = (input[0] + input[2]) * cospi_16_64;
   temp2 = (input[0] - input[2]) * cospi_16_64;
-  step[0] = dct_const_round_shift(temp1);
-  step[1] = dct_const_round_shift(temp2);
+  step[0] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step[1] = WRAPLOW(dct_const_round_shift(temp2), 8);
   temp1 = input[1] * cospi_24_64 - input[3] * cospi_8_64;
   temp2 = input[1] * cospi_8_64 + input[3] * cospi_24_64;
-  step[2] = dct_const_round_shift(temp1);
-  step[3] = dct_const_round_shift(temp2);
+  step[2] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step[3] = WRAPLOW(dct_const_round_shift(temp2), 8);
 
   // stage 2
-  output[0] = step[0] + step[3];
-  output[1] = step[1] + step[2];
-  output[2] = step[1] - step[2];
-  output[3] = step[0] - step[3];
+  output[0] = WRAPLOW(step[0] + step[3], 8);
+  output[1] = WRAPLOW(step[1] + step[2], 8);
+  output[2] = WRAPLOW(step[1] - step[2], 8);
+  output[3] = WRAPLOW(step[0] - step[3], 8);
 }
 
 void vp9_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
@@ -167,9 +166,10 @@
     for (j = 0; j < 4; ++j)
       temp_in[j] = out[j * 4 + i];
     idct4(temp_in, temp_out);
-    for (j = 0; j < 4; ++j)
-      dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 4)
-                                  + dest[j * stride + i]);
+    for (j = 0; j < 4; ++j) {
+      dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
+                                            ROUND_POWER_OF_TWO(temp_out[j], 4));
+    }
   }
 }
 
@@ -177,15 +177,15 @@
                          int dest_stride) {
   int i;
   tran_high_t a1;
-  tran_low_t out = dct_const_round_shift(input[0] * cospi_16_64);
-  out = dct_const_round_shift(out * cospi_16_64);
+  tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), 8);
+  out = WRAPLOW(dct_const_round_shift(out * cospi_16_64), 8);
   a1 = ROUND_POWER_OF_TWO(out, 4);
 
   for (i = 0; i < 4; i++) {
-    dest[0] = clip_pixel(dest[0] + a1);
-    dest[1] = clip_pixel(dest[1] + a1);
-    dest[2] = clip_pixel(dest[2] + a1);
-    dest[3] = clip_pixel(dest[3] + a1);
+    dest[0] = clip_pixel_add(dest[0], a1);
+    dest[1] = clip_pixel_add(dest[1], a1);
+    dest[2] = clip_pixel_add(dest[2], a1);
+    dest[3] = clip_pixel_add(dest[3], a1);
     dest += dest_stride;
   }
 }
@@ -200,39 +200,39 @@
   step1[3] = input[6];
   temp1 = input[1] * cospi_28_64 - input[7] * cospi_4_64;
   temp2 = input[1] * cospi_4_64 + input[7] * cospi_28_64;
-  step1[4] = dct_const_round_shift(temp1);
-  step1[7] = dct_const_round_shift(temp2);
+  step1[4] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step1[7] = WRAPLOW(dct_const_round_shift(temp2), 8);
   temp1 = input[5] * cospi_12_64 - input[3] * cospi_20_64;
   temp2 = input[5] * cospi_20_64 + input[3] * cospi_12_64;
-  step1[5] = dct_const_round_shift(temp1);
-  step1[6] = dct_const_round_shift(temp2);
+  step1[5] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step1[6] = WRAPLOW(dct_const_round_shift(temp2), 8);
 
   // stage 2 & stage 3 - even half
   idct4(step1, step1);
 
   // stage 2 - odd half
-  step2[4] = step1[4] + step1[5];
-  step2[5] = step1[4] - step1[5];
-  step2[6] = -step1[6] + step1[7];
-  step2[7] = step1[6] + step1[7];
+  step2[4] = WRAPLOW(step1[4] + step1[5], 8);
+  step2[5] = WRAPLOW(step1[4] - step1[5], 8);
+  step2[6] = WRAPLOW(-step1[6] + step1[7], 8);
+  step2[7] = WRAPLOW(step1[6] + step1[7], 8);
 
   // stage 3 -odd half
   step1[4] = step2[4];
   temp1 = (step2[6] - step2[5]) * cospi_16_64;
   temp2 = (step2[5] + step2[6]) * cospi_16_64;
-  step1[5] = dct_const_round_shift(temp1);
-  step1[6] = dct_const_round_shift(temp2);
+  step1[5] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step1[6] = WRAPLOW(dct_const_round_shift(temp2), 8);
   step1[7] = step2[7];
 
   // stage 4
-  output[0] = step1[0] + step1[7];
-  output[1] = step1[1] + step1[6];
-  output[2] = step1[2] + step1[5];
-  output[3] = step1[3] + step1[4];
-  output[4] = step1[3] - step1[4];
-  output[5] = step1[2] - step1[5];
-  output[6] = step1[1] - step1[6];
-  output[7] = step1[0] - step1[7];
+  output[0] = WRAPLOW(step1[0] + step1[7], 8);
+  output[1] = WRAPLOW(step1[1] + step1[6], 8);
+  output[2] = WRAPLOW(step1[2] + step1[5], 8);
+  output[3] = WRAPLOW(step1[3] + step1[4], 8);
+  output[4] = WRAPLOW(step1[3] - step1[4], 8);
+  output[5] = WRAPLOW(step1[2] - step1[5], 8);
+  output[6] = WRAPLOW(step1[1] - step1[6], 8);
+  output[7] = WRAPLOW(step1[0] - step1[7], 8);
 }
 
 void vp9_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
@@ -253,21 +253,22 @@
     for (j = 0; j < 8; ++j)
       temp_in[j] = out[j * 8 + i];
     idct8(temp_in, temp_out);
-    for (j = 0; j < 8; ++j)
-      dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 5)
-                                  + dest[j * stride + i]);
+    for (j = 0; j < 8; ++j) {
+      dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
+                                            ROUND_POWER_OF_TWO(temp_out[j], 5));
+    }
   }
 }
 
 void vp9_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
   int i, j;
   tran_high_t a1;
-  tran_low_t out = dct_const_round_shift(input[0] * cospi_16_64);
-  out = dct_const_round_shift(out * cospi_16_64);
+  tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), 8);
+  out = WRAPLOW(dct_const_round_shift(out * cospi_16_64), 8);
   a1 = ROUND_POWER_OF_TWO(out, 5);
   for (j = 0; j < 8; ++j) {
     for (i = 0; i < 8; ++i)
-      dest[i] = clip_pixel(dest[i] + a1);
+      dest[i] = clip_pixel_add(dest[i], a1);
     dest += stride;
   }
 }
@@ -308,10 +309,10 @@
   // The overall dynamic range is 14b (input) + 14b (multiplication scaling)
   // + 1b (addition) = 29b.
   // Hence the output bit depth is 15b.
-  output[0] = dct_const_round_shift(s0);
-  output[1] = dct_const_round_shift(s1);
-  output[2] = dct_const_round_shift(s2);
-  output[3] = dct_const_round_shift(s3);
+  output[0] = WRAPLOW(dct_const_round_shift(s0), 8);
+  output[1] = WRAPLOW(dct_const_round_shift(s1), 8);
+  output[2] = WRAPLOW(dct_const_round_shift(s2), 8);
+  output[3] = WRAPLOW(dct_const_round_shift(s3), 8);
 }
 
 void vp9_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride,
@@ -340,11 +341,13 @@
     for (j = 0; j < 4; ++j)
       temp_in[j] = out[j * 4 + i];
     IHT_4[tx_type].cols(temp_in, temp_out);
-    for (j = 0; j < 4; ++j)
-      dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 4)
-                                  + dest[j * stride + i]);
+    for (j = 0; j < 4; ++j) {
+      dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
+                                            ROUND_POWER_OF_TWO(temp_out[j], 4));
+    }
   }
 }
+
 static void iadst8(const tran_low_t *input, tran_low_t *output) {
   int s0, s1, s2, s3, s4, s5, s6, s7;
 
@@ -373,14 +376,14 @@
   s6 = cospi_26_64 * x6 + cospi_6_64  * x7;
   s7 = cospi_6_64  * x6 - cospi_26_64 * x7;
 
-  x0 = dct_const_round_shift(s0 + s4);
-  x1 = dct_const_round_shift(s1 + s5);
-  x2 = dct_const_round_shift(s2 + s6);
-  x3 = dct_const_round_shift(s3 + s7);
-  x4 = dct_const_round_shift(s0 - s4);
-  x5 = dct_const_round_shift(s1 - s5);
-  x6 = dct_const_round_shift(s2 - s6);
-  x7 = dct_const_round_shift(s3 - s7);
+  x0 = WRAPLOW(dct_const_round_shift(s0 + s4), 8);
+  x1 = WRAPLOW(dct_const_round_shift(s1 + s5), 8);
+  x2 = WRAPLOW(dct_const_round_shift(s2 + s6), 8);
+  x3 = WRAPLOW(dct_const_round_shift(s3 + s7), 8);
+  x4 = WRAPLOW(dct_const_round_shift(s0 - s4), 8);
+  x5 = WRAPLOW(dct_const_round_shift(s1 - s5), 8);
+  x6 = WRAPLOW(dct_const_round_shift(s2 - s6), 8);
+  x7 = WRAPLOW(dct_const_round_shift(s3 - s7), 8);
 
   // stage 2
   s0 = x0;
@@ -392,14 +395,14 @@
   s6 = -cospi_24_64 * x6 + cospi_8_64  * x7;
   s7 =  cospi_8_64  * x6 + cospi_24_64 * x7;
 
-  x0 = s0 + s2;
-  x1 = s1 + s3;
-  x2 = s0 - s2;
-  x3 = s1 - s3;
-  x4 = dct_const_round_shift(s4 + s6);
-  x5 = dct_const_round_shift(s5 + s7);
-  x6 = dct_const_round_shift(s4 - s6);
-  x7 = dct_const_round_shift(s5 - s7);
+  x0 = WRAPLOW(s0 + s2, 8);
+  x1 = WRAPLOW(s1 + s3, 8);
+  x2 = WRAPLOW(s0 - s2, 8);
+  x3 = WRAPLOW(s1 - s3, 8);
+  x4 = WRAPLOW(dct_const_round_shift(s4 + s6), 8);
+  x5 = WRAPLOW(dct_const_round_shift(s5 + s7), 8);
+  x6 = WRAPLOW(dct_const_round_shift(s4 - s6), 8);
+  x7 = WRAPLOW(dct_const_round_shift(s5 - s7), 8);
 
   // stage 3
   s2 = cospi_16_64 * (x2 + x3);
@@ -407,19 +410,19 @@
   s6 = cospi_16_64 * (x6 + x7);
   s7 = cospi_16_64 * (x6 - x7);
 
-  x2 = dct_const_round_shift(s2);
-  x3 = dct_const_round_shift(s3);
-  x6 = dct_const_round_shift(s6);
-  x7 = dct_const_round_shift(s7);
+  x2 = WRAPLOW(dct_const_round_shift(s2), 8);
+  x3 = WRAPLOW(dct_const_round_shift(s3), 8);
+  x6 = WRAPLOW(dct_const_round_shift(s6), 8);
+  x7 = WRAPLOW(dct_const_round_shift(s7), 8);
 
-  output[0] =  x0;
-  output[1] = -x4;
-  output[2] =  x6;
-  output[3] = -x2;
-  output[4] =  x3;
-  output[5] = -x7;
-  output[6] =  x5;
-  output[7] = -x1;
+  output[0] = WRAPLOW(x0, 8);
+  output[1] = WRAPLOW(-x4, 8);
+  output[2] = WRAPLOW(x6, 8);
+  output[3] = WRAPLOW(-x2, 8);
+  output[4] = WRAPLOW(x3, 8);
+  output[5] = WRAPLOW(-x7, 8);
+  output[6] = WRAPLOW(x5, 8);
+  output[7] = WRAPLOW(-x1, 8);
 }
 
 static const transform_2d IHT_8[] = {
@@ -449,9 +452,10 @@
     for (j = 0; j < 8; ++j)
       temp_in[j] = out[j * 8 + i];
     ht.cols(temp_in, temp_out);
-    for (j = 0; j < 8; ++j)
-      dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 5)
-                                  + dest[j * stride + i]);
+    for (j = 0; j < 8; ++j) {
+      dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
+                                            ROUND_POWER_OF_TWO(temp_out[j], 5));
+    }
   }
 }
 
@@ -474,9 +478,10 @@
     for (j = 0; j < 8; ++j)
       temp_in[j] = out[j * 8 + i];
     idct8(temp_in, temp_out);
-    for (j = 0; j < 8; ++j)
-      dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 5)
-                                  + dest[j * stride + i]);
+    for (j = 0; j < 8; ++j) {
+      dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
+                                            ROUND_POWER_OF_TWO(temp_out[j], 5));
+    }
   }
 }
 
@@ -514,23 +519,23 @@
 
   temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
   temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
-  step2[8] = dct_const_round_shift(temp1);
-  step2[15] = dct_const_round_shift(temp2);
+  step2[8] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step2[15] = WRAPLOW(dct_const_round_shift(temp2), 8);
 
   temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
   temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
-  step2[9] = dct_const_round_shift(temp1);
-  step2[14] = dct_const_round_shift(temp2);
+  step2[9] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step2[14] = WRAPLOW(dct_const_round_shift(temp2), 8);
 
   temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
   temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
-  step2[10] = dct_const_round_shift(temp1);
-  step2[13] = dct_const_round_shift(temp2);
+  step2[10] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step2[13] = WRAPLOW(dct_const_round_shift(temp2), 8);
 
   temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
   temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
-  step2[11] = dct_const_round_shift(temp1);
-  step2[12] = dct_const_round_shift(temp2);
+  step2[11] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step2[12] = WRAPLOW(dct_const_round_shift(temp2), 8);
 
   // stage 3
   step1[0] = step2[0];
@@ -540,109 +545,109 @@
 
   temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
   temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
-  step1[4] = dct_const_round_shift(temp1);
-  step1[7] = dct_const_round_shift(temp2);
+  step1[4] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step1[7] = WRAPLOW(dct_const_round_shift(temp2), 8);
   temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
   temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
-  step1[5] = dct_const_round_shift(temp1);
-  step1[6] = dct_const_round_shift(temp2);
+  step1[5] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step1[6] = WRAPLOW(dct_const_round_shift(temp2), 8);
 
-  step1[8] = step2[8] + step2[9];
-  step1[9] = step2[8] - step2[9];
-  step1[10] = -step2[10] + step2[11];
-  step1[11] = step2[10] + step2[11];
-  step1[12] = step2[12] + step2[13];
-  step1[13] = step2[12] - step2[13];
-  step1[14] = -step2[14] + step2[15];
-  step1[15] = step2[14] + step2[15];
+  step1[8] = WRAPLOW(step2[8] + step2[9], 8);
+  step1[9] = WRAPLOW(step2[8] - step2[9], 8);
+  step1[10] = WRAPLOW(-step2[10] + step2[11], 8);
+  step1[11] = WRAPLOW(step2[10] + step2[11], 8);
+  step1[12] = WRAPLOW(step2[12] + step2[13], 8);
+  step1[13] = WRAPLOW(step2[12] - step2[13], 8);
+  step1[14] = WRAPLOW(-step2[14] + step2[15], 8);
+  step1[15] = WRAPLOW(step2[14] + step2[15], 8);
 
   // stage 4
   temp1 = (step1[0] + step1[1]) * cospi_16_64;
   temp2 = (step1[0] - step1[1]) * cospi_16_64;
-  step2[0] = dct_const_round_shift(temp1);
-  step2[1] = dct_const_round_shift(temp2);
+  step2[0] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step2[1] = WRAPLOW(dct_const_round_shift(temp2), 8);
   temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
   temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
-  step2[2] = dct_const_round_shift(temp1);
-  step2[3] = dct_const_round_shift(temp2);
-  step2[4] = step1[4] + step1[5];
-  step2[5] = step1[4] - step1[5];
-  step2[6] = -step1[6] + step1[7];
-  step2[7] = step1[6] + step1[7];
+  step2[2] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step2[3] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step2[4] = WRAPLOW(step1[4] + step1[5], 8);
+  step2[5] = WRAPLOW(step1[4] - step1[5], 8);
+  step2[6] = WRAPLOW(-step1[6] + step1[7], 8);
+  step2[7] = WRAPLOW(step1[6] + step1[7], 8);
 
   step2[8] = step1[8];
   step2[15] = step1[15];
   temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
   temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
-  step2[9] = dct_const_round_shift(temp1);
-  step2[14] = dct_const_round_shift(temp2);
+  step2[9] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step2[14] = WRAPLOW(dct_const_round_shift(temp2), 8);
   temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
   temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
-  step2[10] = dct_const_round_shift(temp1);
-  step2[13] = dct_const_round_shift(temp2);
+  step2[10] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step2[13] = WRAPLOW(dct_const_round_shift(temp2), 8);
   step2[11] = step1[11];
   step2[12] = step1[12];
 
   // stage 5
-  step1[0] = step2[0] + step2[3];
-  step1[1] = step2[1] + step2[2];
-  step1[2] = step2[1] - step2[2];
-  step1[3] = step2[0] - step2[3];
+  step1[0] = WRAPLOW(step2[0] + step2[3], 8);
+  step1[1] = WRAPLOW(step2[1] + step2[2], 8);
+  step1[2] = WRAPLOW(step2[1] - step2[2], 8);
+  step1[3] = WRAPLOW(step2[0] - step2[3], 8);
   step1[4] = step2[4];
   temp1 = (step2[6] - step2[5]) * cospi_16_64;
   temp2 = (step2[5] + step2[6]) * cospi_16_64;
-  step1[5] = dct_const_round_shift(temp1);
-  step1[6] = dct_const_round_shift(temp2);
+  step1[5] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step1[6] = WRAPLOW(dct_const_round_shift(temp2), 8);
   step1[7] = step2[7];
 
-  step1[8] = step2[8] + step2[11];
-  step1[9] = step2[9] + step2[10];
-  step1[10] = step2[9] - step2[10];
-  step1[11] = step2[8] - step2[11];
-  step1[12] = -step2[12] + step2[15];
-  step1[13] = -step2[13] + step2[14];
-  step1[14] = step2[13] + step2[14];
-  step1[15] = step2[12] + step2[15];
+  step1[8] = WRAPLOW(step2[8] + step2[11], 8);
+  step1[9] = WRAPLOW(step2[9] + step2[10], 8);
+  step1[10] = WRAPLOW(step2[9] - step2[10], 8);
+  step1[11] = WRAPLOW(step2[8] - step2[11], 8);
+  step1[12] = WRAPLOW(-step2[12] + step2[15], 8);
+  step1[13] = WRAPLOW(-step2[13] + step2[14], 8);
+  step1[14] = WRAPLOW(step2[13] + step2[14], 8);
+  step1[15] = WRAPLOW(step2[12] + step2[15], 8);
 
   // stage 6
-  step2[0] = step1[0] + step1[7];
-  step2[1] = step1[1] + step1[6];
-  step2[2] = step1[2] + step1[5];
-  step2[3] = step1[3] + step1[4];
-  step2[4] = step1[3] - step1[4];
-  step2[5] = step1[2] - step1[5];
-  step2[6] = step1[1] - step1[6];
-  step2[7] = step1[0] - step1[7];
+  step2[0] = WRAPLOW(step1[0] + step1[7], 8);
+  step2[1] = WRAPLOW(step1[1] + step1[6], 8);
+  step2[2] = WRAPLOW(step1[2] + step1[5], 8);
+  step2[3] = WRAPLOW(step1[3] + step1[4], 8);
+  step2[4] = WRAPLOW(step1[3] - step1[4], 8);
+  step2[5] = WRAPLOW(step1[2] - step1[5], 8);
+  step2[6] = WRAPLOW(step1[1] - step1[6], 8);
+  step2[7] = WRAPLOW(step1[0] - step1[7], 8);
   step2[8] = step1[8];
   step2[9] = step1[9];
   temp1 = (-step1[10] + step1[13]) * cospi_16_64;
   temp2 = (step1[10] + step1[13]) * cospi_16_64;
-  step2[10] = dct_const_round_shift(temp1);
-  step2[13] = dct_const_round_shift(temp2);
+  step2[10] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step2[13] = WRAPLOW(dct_const_round_shift(temp2), 8);
   temp1 = (-step1[11] + step1[12]) * cospi_16_64;
   temp2 = (step1[11] + step1[12]) * cospi_16_64;
-  step2[11] = dct_const_round_shift(temp1);
-  step2[12] = dct_const_round_shift(temp2);
+  step2[11] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step2[12] = WRAPLOW(dct_const_round_shift(temp2), 8);
   step2[14] = step1[14];
   step2[15] = step1[15];
 
   // stage 7
-  output[0] = step2[0] + step2[15];
-  output[1] = step2[1] + step2[14];
-  output[2] = step2[2] + step2[13];
-  output[3] = step2[3] + step2[12];
-  output[4] = step2[4] + step2[11];
-  output[5] = step2[5] + step2[10];
-  output[6] = step2[6] + step2[9];
-  output[7] = step2[7] + step2[8];
-  output[8] = step2[7] - step2[8];
-  output[9] = step2[6] - step2[9];
-  output[10] = step2[5] - step2[10];
-  output[11] = step2[4] - step2[11];
-  output[12] = step2[3] - step2[12];
-  output[13] = step2[2] - step2[13];
-  output[14] = step2[1] - step2[14];
-  output[15] = step2[0] - step2[15];
+  output[0] = WRAPLOW(step2[0] + step2[15], 8);
+  output[1] = WRAPLOW(step2[1] + step2[14], 8);
+  output[2] = WRAPLOW(step2[2] + step2[13], 8);
+  output[3] = WRAPLOW(step2[3] + step2[12], 8);
+  output[4] = WRAPLOW(step2[4] + step2[11], 8);
+  output[5] = WRAPLOW(step2[5] + step2[10], 8);
+  output[6] = WRAPLOW(step2[6] + step2[9], 8);
+  output[7] = WRAPLOW(step2[7] + step2[8], 8);
+  output[8] = WRAPLOW(step2[7] - step2[8], 8);
+  output[9] = WRAPLOW(step2[6] - step2[9], 8);
+  output[10] = WRAPLOW(step2[5] - step2[10], 8);
+  output[11] = WRAPLOW(step2[4] - step2[11], 8);
+  output[12] = WRAPLOW(step2[3] - step2[12], 8);
+  output[13] = WRAPLOW(step2[2] - step2[13], 8);
+  output[14] = WRAPLOW(step2[1] - step2[14], 8);
+  output[15] = WRAPLOW(step2[0] - step2[15], 8);
 }
 
 void vp9_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest,
@@ -664,9 +669,10 @@
     for (j = 0; j < 16; ++j)
       temp_in[j] = out[j * 16 + i];
     idct16(temp_in, temp_out);
-    for (j = 0; j < 16; ++j)
-      dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6)
-                                  + dest[j * stride + i]);
+    for (j = 0; j < 16; ++j) {
+      dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
+                                            ROUND_POWER_OF_TWO(temp_out[j], 6));
+    }
   }
 }
 
@@ -718,22 +724,22 @@
   s14 = x14 * cospi_29_64 + x15 * cospi_3_64;
   s15 = x14 * cospi_3_64  - x15 * cospi_29_64;
 
-  x0 = dct_const_round_shift(s0 + s8);
-  x1 = dct_const_round_shift(s1 + s9);
-  x2 = dct_const_round_shift(s2 + s10);
-  x3 = dct_const_round_shift(s3 + s11);
-  x4 = dct_const_round_shift(s4 + s12);
-  x5 = dct_const_round_shift(s5 + s13);
-  x6 = dct_const_round_shift(s6 + s14);
-  x7 = dct_const_round_shift(s7 + s15);
-  x8  = dct_const_round_shift(s0 - s8);
-  x9  = dct_const_round_shift(s1 - s9);
-  x10 = dct_const_round_shift(s2 - s10);
-  x11 = dct_const_round_shift(s3 - s11);
-  x12 = dct_const_round_shift(s4 - s12);
-  x13 = dct_const_round_shift(s5 - s13);
-  x14 = dct_const_round_shift(s6 - s14);
-  x15 = dct_const_round_shift(s7 - s15);
+  x0 = WRAPLOW(dct_const_round_shift(s0 + s8), 8);
+  x1 = WRAPLOW(dct_const_round_shift(s1 + s9), 8);
+  x2 = WRAPLOW(dct_const_round_shift(s2 + s10), 8);
+  x3 = WRAPLOW(dct_const_round_shift(s3 + s11), 8);
+  x4 = WRAPLOW(dct_const_round_shift(s4 + s12), 8);
+  x5 = WRAPLOW(dct_const_round_shift(s5 + s13), 8);
+  x6 = WRAPLOW(dct_const_round_shift(s6 + s14), 8);
+  x7 = WRAPLOW(dct_const_round_shift(s7 + s15), 8);
+  x8 = WRAPLOW(dct_const_round_shift(s0 - s8), 8);
+  x9 = WRAPLOW(dct_const_round_shift(s1 - s9), 8);
+  x10 = WRAPLOW(dct_const_round_shift(s2 - s10), 8);
+  x11 = WRAPLOW(dct_const_round_shift(s3 - s11), 8);
+  x12 = WRAPLOW(dct_const_round_shift(s4 - s12), 8);
+  x13 = WRAPLOW(dct_const_round_shift(s5 - s13), 8);
+  x14 = WRAPLOW(dct_const_round_shift(s6 - s14), 8);
+  x15 = WRAPLOW(dct_const_round_shift(s7 - s15), 8);
 
   // stage 2
   s0 = x0;
@@ -753,22 +759,22 @@
   s14 = - x14 * cospi_12_64 + x15 * cospi_20_64;
   s15 =   x14 * cospi_20_64 + x15 * cospi_12_64;
 
-  x0 = s0 + s4;
-  x1 = s1 + s5;
-  x2 = s2 + s6;
-  x3 = s3 + s7;
-  x4 = s0 - s4;
-  x5 = s1 - s5;
-  x6 = s2 - s6;
-  x7 = s3 - s7;
-  x8 = dct_const_round_shift(s8 + s12);
-  x9 = dct_const_round_shift(s9 + s13);
-  x10 = dct_const_round_shift(s10 + s14);
-  x11 = dct_const_round_shift(s11 + s15);
-  x12 = dct_const_round_shift(s8 - s12);
-  x13 = dct_const_round_shift(s9 - s13);
-  x14 = dct_const_round_shift(s10 - s14);
-  x15 = dct_const_round_shift(s11 - s15);
+  x0 = WRAPLOW(s0 + s4, 8);
+  x1 = WRAPLOW(s1 + s5, 8);
+  x2 = WRAPLOW(s2 + s6, 8);
+  x3 = WRAPLOW(s3 + s7, 8);
+  x4 = WRAPLOW(s0 - s4, 8);
+  x5 = WRAPLOW(s1 - s5, 8);
+  x6 = WRAPLOW(s2 - s6, 8);
+  x7 = WRAPLOW(s3 - s7, 8);
+  x8 = WRAPLOW(dct_const_round_shift(s8 + s12), 8);
+  x9 = WRAPLOW(dct_const_round_shift(s9 + s13), 8);
+  x10 = WRAPLOW(dct_const_round_shift(s10 + s14), 8);
+  x11 = WRAPLOW(dct_const_round_shift(s11 + s15), 8);
+  x12 = WRAPLOW(dct_const_round_shift(s8 - s12), 8);
+  x13 = WRAPLOW(dct_const_round_shift(s9 - s13), 8);
+  x14 = WRAPLOW(dct_const_round_shift(s10 - s14), 8);
+  x15 = WRAPLOW(dct_const_round_shift(s11 - s15), 8);
 
   // stage 3
   s0 = x0;
@@ -788,22 +794,22 @@
   s14 = - x14 * cospi_24_64 + x15 * cospi_8_64;
   s15 =   x14 * cospi_8_64  + x15 * cospi_24_64;
 
-  x0 = s0 + s2;
-  x1 = s1 + s3;
-  x2 = s0 - s2;
-  x3 = s1 - s3;
-  x4 = dct_const_round_shift(s4 + s6);
-  x5 = dct_const_round_shift(s5 + s7);
-  x6 = dct_const_round_shift(s4 - s6);
-  x7 = dct_const_round_shift(s5 - s7);
-  x8 = s8 + s10;
-  x9 = s9 + s11;
-  x10 = s8 - s10;
-  x11 = s9 - s11;
-  x12 = dct_const_round_shift(s12 + s14);
-  x13 = dct_const_round_shift(s13 + s15);
-  x14 = dct_const_round_shift(s12 - s14);
-  x15 = dct_const_round_shift(s13 - s15);
+  x0 = WRAPLOW(check_range(s0 + s2), 8);
+  x1 = WRAPLOW(check_range(s1 + s3), 8);
+  x2 = WRAPLOW(check_range(s0 - s2), 8);
+  x3 = WRAPLOW(check_range(s1 - s3), 8);
+  x4 = WRAPLOW(dct_const_round_shift(s4 + s6), 8);
+  x5 = WRAPLOW(dct_const_round_shift(s5 + s7), 8);
+  x6 = WRAPLOW(dct_const_round_shift(s4 - s6), 8);
+  x7 = WRAPLOW(dct_const_round_shift(s5 - s7), 8);
+  x8 = WRAPLOW(check_range(s8 + s10), 8);
+  x9 = WRAPLOW(check_range(s9 + s11), 8);
+  x10 = WRAPLOW(check_range(s8 - s10), 8);
+  x11 = WRAPLOW(check_range(s9 - s11), 8);
+  x12 = WRAPLOW(dct_const_round_shift(s12 + s14), 8);
+  x13 = WRAPLOW(dct_const_round_shift(s13 + s15), 8);
+  x14 = WRAPLOW(dct_const_round_shift(s12 - s14), 8);
+  x15 = WRAPLOW(dct_const_round_shift(s13 - s15), 8);
 
   // stage 4
   s2 = (- cospi_16_64) * (x2 + x3);
@@ -815,31 +821,31 @@
   s14 = (- cospi_16_64) * (x14 + x15);
   s15 = cospi_16_64 * (x14 - x15);
 
-  x2 = dct_const_round_shift(s2);
-  x3 = dct_const_round_shift(s3);
-  x6 = dct_const_round_shift(s6);
-  x7 = dct_const_round_shift(s7);
-  x10 = dct_const_round_shift(s10);
-  x11 = dct_const_round_shift(s11);
-  x14 = dct_const_round_shift(s14);
-  x15 = dct_const_round_shift(s15);
+  x2 = WRAPLOW(dct_const_round_shift(s2), 8);
+  x3 = WRAPLOW(dct_const_round_shift(s3), 8);
+  x6 = WRAPLOW(dct_const_round_shift(s6), 8);
+  x7 = WRAPLOW(dct_const_round_shift(s7), 8);
+  x10 = WRAPLOW(dct_const_round_shift(s10), 8);
+  x11 = WRAPLOW(dct_const_round_shift(s11), 8);
+  x14 = WRAPLOW(dct_const_round_shift(s14), 8);
+  x15 = WRAPLOW(dct_const_round_shift(s15), 8);
 
-  output[0] =  x0;
-  output[1] = -x8;
-  output[2] =  x12;
-  output[3] = -x4;
-  output[4] =  x6;
-  output[5] =  x14;
-  output[6] =  x10;
-  output[7] =  x2;
-  output[8] =  x3;
-  output[9] =  x11;
-  output[10] =  x15;
-  output[11] =  x7;
-  output[12] =  x5;
-  output[13] = -x13;
-  output[14] =  x9;
-  output[15] = -x1;
+  output[0] = WRAPLOW(x0, 8);
+  output[1] = WRAPLOW(-x8, 8);
+  output[2] = WRAPLOW(x12, 8);
+  output[3] = WRAPLOW(-x4, 8);
+  output[4] = WRAPLOW(x6, 8);
+  output[5] = WRAPLOW(x14, 8);
+  output[6] = WRAPLOW(x10, 8);
+  output[7] = WRAPLOW(x2, 8);
+  output[8] = WRAPLOW(x3, 8);
+  output[9] = WRAPLOW(x11, 8);
+  output[10] = WRAPLOW(x15, 8);
+  output[11] = WRAPLOW(x7, 8);
+  output[12] = WRAPLOW(x5, 8);
+  output[13] = WRAPLOW(-x13, 8);
+  output[14] = WRAPLOW(x9, 8);
+  output[15] = WRAPLOW(-x1, 8);
 }
 
 static const transform_2d IHT_16[] = {
@@ -869,9 +875,10 @@
     for (j = 0; j < 16; ++j)
       temp_in[j] = out[j * 16 + i];
     ht.cols(temp_in, temp_out);
-    for (j = 0; j < 16; ++j)
-      dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6)
-                                        + dest[j * stride + i]);
+    for (j = 0; j < 16; ++j) {
+      dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
+                                            ROUND_POWER_OF_TWO(temp_out[j], 6));
+    }
   }
 }
 
@@ -895,21 +902,22 @@
     for (j = 0; j < 16; ++j)
       temp_in[j] = out[j*16 + i];
     idct16(temp_in, temp_out);
-    for (j = 0; j < 16; ++j)
-      dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6)
-                                  + dest[j * stride + i]);
+    for (j = 0; j < 16; ++j) {
+      dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
+                                            ROUND_POWER_OF_TWO(temp_out[j], 6));
+    }
   }
 }
 
 void vp9_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
   int i, j;
   tran_high_t a1;
-  tran_low_t out = dct_const_round_shift(input[0] * cospi_16_64);
-  out = dct_const_round_shift(out * cospi_16_64);
+  tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), 8);
+  out = WRAPLOW(dct_const_round_shift(out * cospi_16_64), 8);
   a1 = ROUND_POWER_OF_TWO(out, 6);
   for (j = 0; j < 16; ++j) {
     for (i = 0; i < 16; ++i)
-      dest[i] = clip_pixel(dest[i] + a1);
+      dest[i] = clip_pixel_add(dest[i], a1);
     dest += stride;
   }
 }
@@ -938,43 +946,43 @@
 
   temp1 = input[1] * cospi_31_64 - input[31] * cospi_1_64;
   temp2 = input[1] * cospi_1_64 + input[31] * cospi_31_64;
-  step1[16] = dct_const_round_shift(temp1);
-  step1[31] = dct_const_round_shift(temp2);
+  step1[16] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step1[31] = WRAPLOW(dct_const_round_shift(temp2), 8);
 
   temp1 = input[17] * cospi_15_64 - input[15] * cospi_17_64;
   temp2 = input[17] * cospi_17_64 + input[15] * cospi_15_64;
-  step1[17] = dct_const_round_shift(temp1);
-  step1[30] = dct_const_round_shift(temp2);
+  step1[17] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step1[30] = WRAPLOW(dct_const_round_shift(temp2), 8);
 
   temp1 = input[9] * cospi_23_64 - input[23] * cospi_9_64;
   temp2 = input[9] * cospi_9_64 + input[23] * cospi_23_64;
-  step1[18] = dct_const_round_shift(temp1);
-  step1[29] = dct_const_round_shift(temp2);
+  step1[18] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step1[29] = WRAPLOW(dct_const_round_shift(temp2), 8);
 
   temp1 = input[25] * cospi_7_64 - input[7] * cospi_25_64;
   temp2 = input[25] * cospi_25_64 + input[7] * cospi_7_64;
-  step1[19] = dct_const_round_shift(temp1);
-  step1[28] = dct_const_round_shift(temp2);
+  step1[19] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step1[28] = WRAPLOW(dct_const_round_shift(temp2), 8);
 
   temp1 = input[5] * cospi_27_64 - input[27] * cospi_5_64;
   temp2 = input[5] * cospi_5_64 + input[27] * cospi_27_64;
-  step1[20] = dct_const_round_shift(temp1);
-  step1[27] = dct_const_round_shift(temp2);
+  step1[20] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step1[27] = WRAPLOW(dct_const_round_shift(temp2), 8);
 
   temp1 = input[21] * cospi_11_64 - input[11] * cospi_21_64;
   temp2 = input[21] * cospi_21_64 + input[11] * cospi_11_64;
-  step1[21] = dct_const_round_shift(temp1);
-  step1[26] = dct_const_round_shift(temp2);
+  step1[21] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step1[26] = WRAPLOW(dct_const_round_shift(temp2), 8);
 
   temp1 = input[13] * cospi_19_64 - input[19] * cospi_13_64;
   temp2 = input[13] * cospi_13_64 + input[19] * cospi_19_64;
-  step1[22] = dct_const_round_shift(temp1);
-  step1[25] = dct_const_round_shift(temp2);
+  step1[22] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step1[25] = WRAPLOW(dct_const_round_shift(temp2), 8);
 
   temp1 = input[29] * cospi_3_64 - input[3] * cospi_29_64;
   temp2 = input[29] * cospi_29_64 + input[3] * cospi_3_64;
-  step1[23] = dct_const_round_shift(temp1);
-  step1[24] = dct_const_round_shift(temp2);
+  step1[23] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step1[24] = WRAPLOW(dct_const_round_shift(temp2), 8);
 
   // stage 2
   step2[0] = step1[0];
@@ -988,40 +996,40 @@
 
   temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
   temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
-  step2[8] = dct_const_round_shift(temp1);
-  step2[15] = dct_const_round_shift(temp2);
+  step2[8] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step2[15] = WRAPLOW(dct_const_round_shift(temp2), 8);
 
   temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
   temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
-  step2[9] = dct_const_round_shift(temp1);
-  step2[14] = dct_const_round_shift(temp2);
+  step2[9] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step2[14] = WRAPLOW(dct_const_round_shift(temp2), 8);
 
   temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
   temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
-  step2[10] = dct_const_round_shift(temp1);
-  step2[13] = dct_const_round_shift(temp2);
+  step2[10] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step2[13] = WRAPLOW(dct_const_round_shift(temp2), 8);
 
   temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
   temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
-  step2[11] = dct_const_round_shift(temp1);
-  step2[12] = dct_const_round_shift(temp2);
+  step2[11] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step2[12] = WRAPLOW(dct_const_round_shift(temp2), 8);
 
-  step2[16] = step1[16] + step1[17];
-  step2[17] = step1[16] - step1[17];
-  step2[18] = -step1[18] + step1[19];
-  step2[19] = step1[18] + step1[19];
-  step2[20] = step1[20] + step1[21];
-  step2[21] = step1[20] - step1[21];
-  step2[22] = -step1[22] + step1[23];
-  step2[23] = step1[22] + step1[23];
-  step2[24] = step1[24] + step1[25];
-  step2[25] = step1[24] - step1[25];
-  step2[26] = -step1[26] + step1[27];
-  step2[27] = step1[26] + step1[27];
-  step2[28] = step1[28] + step1[29];
-  step2[29] = step1[28] - step1[29];
-  step2[30] = -step1[30] + step1[31];
-  step2[31] = step1[30] + step1[31];
+  step2[16] = WRAPLOW(step1[16] + step1[17], 8);
+  step2[17] = WRAPLOW(step1[16] - step1[17], 8);
+  step2[18] = WRAPLOW(-step1[18] + step1[19], 8);
+  step2[19] = WRAPLOW(step1[18] + step1[19], 8);
+  step2[20] = WRAPLOW(step1[20] + step1[21], 8);
+  step2[21] = WRAPLOW(step1[20] - step1[21], 8);
+  step2[22] = WRAPLOW(-step1[22] + step1[23], 8);
+  step2[23] = WRAPLOW(step1[22] + step1[23], 8);
+  step2[24] = WRAPLOW(step1[24] + step1[25], 8);
+  step2[25] = WRAPLOW(step1[24] - step1[25], 8);
+  step2[26] = WRAPLOW(-step1[26] + step1[27], 8);
+  step2[27] = WRAPLOW(step1[26] + step1[27], 8);
+  step2[28] = WRAPLOW(step1[28] + step1[29], 8);
+  step2[29] = WRAPLOW(step1[28] - step1[29], 8);
+  step2[30] = WRAPLOW(-step1[30] + step1[31], 8);
+  step2[31] = WRAPLOW(step1[30] + step1[31], 8);
 
   // stage 3
   step1[0] = step2[0];
@@ -1031,42 +1039,42 @@
 
   temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
   temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
-  step1[4] = dct_const_round_shift(temp1);
-  step1[7] = dct_const_round_shift(temp2);
+  step1[4] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step1[7] = WRAPLOW(dct_const_round_shift(temp2), 8);
   temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
   temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
-  step1[5] = dct_const_round_shift(temp1);
-  step1[6] = dct_const_round_shift(temp2);
+  step1[5] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step1[6] = WRAPLOW(dct_const_round_shift(temp2), 8);
 
-  step1[8] = step2[8] + step2[9];
-  step1[9] = step2[8] - step2[9];
-  step1[10] = -step2[10] + step2[11];
-  step1[11] = step2[10] + step2[11];
-  step1[12] = step2[12] + step2[13];
-  step1[13] = step2[12] - step2[13];
-  step1[14] = -step2[14] + step2[15];
-  step1[15] = step2[14] + step2[15];
+  step1[8] = WRAPLOW(step2[8] + step2[9], 8);
+  step1[9] = WRAPLOW(step2[8] - step2[9], 8);
+  step1[10] = WRAPLOW(-step2[10] + step2[11], 8);
+  step1[11] = WRAPLOW(step2[10] + step2[11], 8);
+  step1[12] = WRAPLOW(step2[12] + step2[13], 8);
+  step1[13] = WRAPLOW(step2[12] - step2[13], 8);
+  step1[14] = WRAPLOW(-step2[14] + step2[15], 8);
+  step1[15] = WRAPLOW(step2[14] + step2[15], 8);
 
   step1[16] = step2[16];
   step1[31] = step2[31];
   temp1 = -step2[17] * cospi_4_64 + step2[30] * cospi_28_64;
   temp2 = step2[17] * cospi_28_64 + step2[30] * cospi_4_64;
-  step1[17] = dct_const_round_shift(temp1);
-  step1[30] = dct_const_round_shift(temp2);
+  step1[17] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step1[30] = WRAPLOW(dct_const_round_shift(temp2), 8);
   temp1 = -step2[18] * cospi_28_64 - step2[29] * cospi_4_64;
   temp2 = -step2[18] * cospi_4_64 + step2[29] * cospi_28_64;
-  step1[18] = dct_const_round_shift(temp1);
-  step1[29] = dct_const_round_shift(temp2);
+  step1[18] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step1[29] = WRAPLOW(dct_const_round_shift(temp2), 8);
   step1[19] = step2[19];
   step1[20] = step2[20];
   temp1 = -step2[21] * cospi_20_64 + step2[26] * cospi_12_64;
   temp2 = step2[21] * cospi_12_64 + step2[26] * cospi_20_64;
-  step1[21] = dct_const_round_shift(temp1);
-  step1[26] = dct_const_round_shift(temp2);
+  step1[21] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step1[26] = WRAPLOW(dct_const_round_shift(temp2), 8);
   temp1 = -step2[22] * cospi_12_64 - step2[25] * cospi_20_64;
   temp2 = -step2[22] * cospi_20_64 + step2[25] * cospi_12_64;
-  step1[22] = dct_const_round_shift(temp1);
-  step1[25] = dct_const_round_shift(temp2);
+  step1[22] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step1[25] = WRAPLOW(dct_const_round_shift(temp2), 8);
   step1[23] = step2[23];
   step1[24] = step2[24];
   step1[27] = step2[27];
@@ -1075,87 +1083,87 @@
   // stage 4
   temp1 = (step1[0] + step1[1]) * cospi_16_64;
   temp2 = (step1[0] - step1[1]) * cospi_16_64;
-  step2[0] = dct_const_round_shift(temp1);
-  step2[1] = dct_const_round_shift(temp2);
+  step2[0] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step2[1] = WRAPLOW(dct_const_round_shift(temp2), 8);
   temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
   temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
-  step2[2] = dct_const_round_shift(temp1);
-  step2[3] = dct_const_round_shift(temp2);
-  step2[4] = step1[4] + step1[5];
-  step2[5] = step1[4] - step1[5];
-  step2[6] = -step1[6] + step1[7];
-  step2[7] = step1[6] + step1[7];
+  step2[2] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step2[3] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step2[4] = WRAPLOW(step1[4] + step1[5], 8);
+  step2[5] = WRAPLOW(step1[4] - step1[5], 8);
+  step2[6] = WRAPLOW(-step1[6] + step1[7], 8);
+  step2[7] = WRAPLOW(step1[6] + step1[7], 8);
 
   step2[8] = step1[8];
   step2[15] = step1[15];
   temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
   temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
-  step2[9] = dct_const_round_shift(temp1);
-  step2[14] = dct_const_round_shift(temp2);
+  step2[9] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step2[14] = WRAPLOW(dct_const_round_shift(temp2), 8);
   temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
   temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
-  step2[10] = dct_const_round_shift(temp1);
-  step2[13] = dct_const_round_shift(temp2);
+  step2[10] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step2[13] = WRAPLOW(dct_const_round_shift(temp2), 8);
   step2[11] = step1[11];
   step2[12] = step1[12];
 
-  step2[16] = step1[16] + step1[19];
-  step2[17] = step1[17] + step1[18];
-  step2[18] = step1[17] - step1[18];
-  step2[19] = step1[16] - step1[19];
-  step2[20] = -step1[20] + step1[23];
-  step2[21] = -step1[21] + step1[22];
-  step2[22] = step1[21] + step1[22];
-  step2[23] = step1[20] + step1[23];
+  step2[16] = WRAPLOW(step1[16] + step1[19], 8);
+  step2[17] = WRAPLOW(step1[17] + step1[18], 8);
+  step2[18] = WRAPLOW(step1[17] - step1[18], 8);
+  step2[19] = WRAPLOW(step1[16] - step1[19], 8);
+  step2[20] = WRAPLOW(-step1[20] + step1[23], 8);
+  step2[21] = WRAPLOW(-step1[21] + step1[22], 8);
+  step2[22] = WRAPLOW(step1[21] + step1[22], 8);
+  step2[23] = WRAPLOW(step1[20] + step1[23], 8);
 
-  step2[24] = step1[24] + step1[27];
-  step2[25] = step1[25] + step1[26];
-  step2[26] = step1[25] - step1[26];
-  step2[27] = step1[24] - step1[27];
-  step2[28] = -step1[28] + step1[31];
-  step2[29] = -step1[29] + step1[30];
-  step2[30] = step1[29] + step1[30];
-  step2[31] = step1[28] + step1[31];
+  step2[24] = WRAPLOW(step1[24] + step1[27], 8);
+  step2[25] = WRAPLOW(step1[25] + step1[26], 8);
+  step2[26] = WRAPLOW(step1[25] - step1[26], 8);
+  step2[27] = WRAPLOW(step1[24] - step1[27], 8);
+  step2[28] = WRAPLOW(-step1[28] + step1[31], 8);
+  step2[29] = WRAPLOW(-step1[29] + step1[30], 8);
+  step2[30] = WRAPLOW(step1[29] + step1[30], 8);
+  step2[31] = WRAPLOW(step1[28] + step1[31], 8);
 
   // stage 5
-  step1[0] = step2[0] + step2[3];
-  step1[1] = step2[1] + step2[2];
-  step1[2] = step2[1] - step2[2];
-  step1[3] = step2[0] - step2[3];
+  step1[0] = WRAPLOW(step2[0] + step2[3], 8);
+  step1[1] = WRAPLOW(step2[1] + step2[2], 8);
+  step1[2] = WRAPLOW(step2[1] - step2[2], 8);
+  step1[3] = WRAPLOW(step2[0] - step2[3], 8);
   step1[4] = step2[4];
   temp1 = (step2[6] - step2[5]) * cospi_16_64;
   temp2 = (step2[5] + step2[6]) * cospi_16_64;
-  step1[5] = dct_const_round_shift(temp1);
-  step1[6] = dct_const_round_shift(temp2);
+  step1[5] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step1[6] = WRAPLOW(dct_const_round_shift(temp2), 8);
   step1[7] = step2[7];
 
-  step1[8] = step2[8] + step2[11];
-  step1[9] = step2[9] + step2[10];
-  step1[10] = step2[9] - step2[10];
-  step1[11] = step2[8] - step2[11];
-  step1[12] = -step2[12] + step2[15];
-  step1[13] = -step2[13] + step2[14];
-  step1[14] = step2[13] + step2[14];
-  step1[15] = step2[12] + step2[15];
+  step1[8] = WRAPLOW(step2[8] + step2[11], 8);
+  step1[9] = WRAPLOW(step2[9] + step2[10], 8);
+  step1[10] = WRAPLOW(step2[9] - step2[10], 8);
+  step1[11] = WRAPLOW(step2[8] - step2[11], 8);
+  step1[12] = WRAPLOW(-step2[12] + step2[15], 8);
+  step1[13] = WRAPLOW(-step2[13] + step2[14], 8);
+  step1[14] = WRAPLOW(step2[13] + step2[14], 8);
+  step1[15] = WRAPLOW(step2[12] + step2[15], 8);
 
   step1[16] = step2[16];
   step1[17] = step2[17];
   temp1 = -step2[18] * cospi_8_64 + step2[29] * cospi_24_64;
   temp2 = step2[18] * cospi_24_64 + step2[29] * cospi_8_64;
-  step1[18] = dct_const_round_shift(temp1);
-  step1[29] = dct_const_round_shift(temp2);
+  step1[18] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step1[29] = WRAPLOW(dct_const_round_shift(temp2), 8);
   temp1 = -step2[19] * cospi_8_64 + step2[28] * cospi_24_64;
   temp2 = step2[19] * cospi_24_64 + step2[28] * cospi_8_64;
-  step1[19] = dct_const_round_shift(temp1);
-  step1[28] = dct_const_round_shift(temp2);
+  step1[19] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step1[28] = WRAPLOW(dct_const_round_shift(temp2), 8);
   temp1 = -step2[20] * cospi_24_64 - step2[27] * cospi_8_64;
   temp2 = -step2[20] * cospi_8_64 + step2[27] * cospi_24_64;
-  step1[20] = dct_const_round_shift(temp1);
-  step1[27] = dct_const_round_shift(temp2);
+  step1[20] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step1[27] = WRAPLOW(dct_const_round_shift(temp2), 8);
   temp1 = -step2[21] * cospi_24_64 - step2[26] * cospi_8_64;
   temp2 = -step2[21] * cospi_8_64 + step2[26] * cospi_24_64;
-  step1[21] = dct_const_round_shift(temp1);
-  step1[26] = dct_const_round_shift(temp2);
+  step1[21] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step1[26] = WRAPLOW(dct_const_round_shift(temp2), 8);
   step1[22] = step2[22];
   step1[23] = step2[23];
   step1[24] = step2[24];
@@ -1164,62 +1172,62 @@
   step1[31] = step2[31];
 
   // stage 6
-  step2[0] = step1[0] + step1[7];
-  step2[1] = step1[1] + step1[6];
-  step2[2] = step1[2] + step1[5];
-  step2[3] = step1[3] + step1[4];
-  step2[4] = step1[3] - step1[4];
-  step2[5] = step1[2] - step1[5];
-  step2[6] = step1[1] - step1[6];
-  step2[7] = step1[0] - step1[7];
+  step2[0] = WRAPLOW(step1[0] + step1[7], 8);
+  step2[1] = WRAPLOW(step1[1] + step1[6], 8);
+  step2[2] = WRAPLOW(step1[2] + step1[5], 8);
+  step2[3] = WRAPLOW(step1[3] + step1[4], 8);
+  step2[4] = WRAPLOW(step1[3] - step1[4], 8);
+  step2[5] = WRAPLOW(step1[2] - step1[5], 8);
+  step2[6] = WRAPLOW(step1[1] - step1[6], 8);
+  step2[7] = WRAPLOW(step1[0] - step1[7], 8);
   step2[8] = step1[8];
   step2[9] = step1[9];
   temp1 = (-step1[10] + step1[13]) * cospi_16_64;
   temp2 = (step1[10] + step1[13]) * cospi_16_64;
-  step2[10] = dct_const_round_shift(temp1);
-  step2[13] = dct_const_round_shift(temp2);
+  step2[10] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step2[13] = WRAPLOW(dct_const_round_shift(temp2), 8);
   temp1 = (-step1[11] + step1[12]) * cospi_16_64;
   temp2 = (step1[11] + step1[12]) * cospi_16_64;
-  step2[11] = dct_const_round_shift(temp1);
-  step2[12] = dct_const_round_shift(temp2);
+  step2[11] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step2[12] = WRAPLOW(dct_const_round_shift(temp2), 8);
   step2[14] = step1[14];
   step2[15] = step1[15];
 
-  step2[16] = step1[16] + step1[23];
-  step2[17] = step1[17] + step1[22];
-  step2[18] = step1[18] + step1[21];
-  step2[19] = step1[19] + step1[20];
-  step2[20] = step1[19] - step1[20];
-  step2[21] = step1[18] - step1[21];
-  step2[22] = step1[17] - step1[22];
-  step2[23] = step1[16] - step1[23];
+  step2[16] = WRAPLOW(step1[16] + step1[23], 8);
+  step2[17] = WRAPLOW(step1[17] + step1[22], 8);
+  step2[18] = WRAPLOW(step1[18] + step1[21], 8);
+  step2[19] = WRAPLOW(step1[19] + step1[20], 8);
+  step2[20] = WRAPLOW(step1[19] - step1[20], 8);
+  step2[21] = WRAPLOW(step1[18] - step1[21], 8);
+  step2[22] = WRAPLOW(step1[17] - step1[22], 8);
+  step2[23] = WRAPLOW(step1[16] - step1[23], 8);
 
-  step2[24] = -step1[24] + step1[31];
-  step2[25] = -step1[25] + step1[30];
-  step2[26] = -step1[26] + step1[29];
-  step2[27] = -step1[27] + step1[28];
-  step2[28] = step1[27] + step1[28];
-  step2[29] = step1[26] + step1[29];
-  step2[30] = step1[25] + step1[30];
-  step2[31] = step1[24] + step1[31];
+  step2[24] = WRAPLOW(-step1[24] + step1[31], 8);
+  step2[25] = WRAPLOW(-step1[25] + step1[30], 8);
+  step2[26] = WRAPLOW(-step1[26] + step1[29], 8);
+  step2[27] = WRAPLOW(-step1[27] + step1[28], 8);
+  step2[28] = WRAPLOW(step1[27] + step1[28], 8);
+  step2[29] = WRAPLOW(step1[26] + step1[29], 8);
+  step2[30] = WRAPLOW(step1[25] + step1[30], 8);
+  step2[31] = WRAPLOW(step1[24] + step1[31], 8);
 
   // stage 7
-  step1[0] = step2[0] + step2[15];
-  step1[1] = step2[1] + step2[14];
-  step1[2] = step2[2] + step2[13];
-  step1[3] = step2[3] + step2[12];
-  step1[4] = step2[4] + step2[11];
-  step1[5] = step2[5] + step2[10];
-  step1[6] = step2[6] + step2[9];
-  step1[7] = step2[7] + step2[8];
-  step1[8] = step2[7] - step2[8];
-  step1[9] = step2[6] - step2[9];
-  step1[10] = step2[5] - step2[10];
-  step1[11] = step2[4] - step2[11];
-  step1[12] = step2[3] - step2[12];
-  step1[13] = step2[2] - step2[13];
-  step1[14] = step2[1] - step2[14];
-  step1[15] = step2[0] - step2[15];
+  step1[0] = WRAPLOW(step2[0] + step2[15], 8);
+  step1[1] = WRAPLOW(step2[1] + step2[14], 8);
+  step1[2] = WRAPLOW(step2[2] + step2[13], 8);
+  step1[3] = WRAPLOW(step2[3] + step2[12], 8);
+  step1[4] = WRAPLOW(step2[4] + step2[11], 8);
+  step1[5] = WRAPLOW(step2[5] + step2[10], 8);
+  step1[6] = WRAPLOW(step2[6] + step2[9], 8);
+  step1[7] = WRAPLOW(step2[7] + step2[8], 8);
+  step1[8] = WRAPLOW(step2[7] - step2[8], 8);
+  step1[9] = WRAPLOW(step2[6] - step2[9], 8);
+  step1[10] = WRAPLOW(step2[5] - step2[10], 8);
+  step1[11] = WRAPLOW(step2[4] - step2[11], 8);
+  step1[12] = WRAPLOW(step2[3] - step2[12], 8);
+  step1[13] = WRAPLOW(step2[2] - step2[13], 8);
+  step1[14] = WRAPLOW(step2[1] - step2[14], 8);
+  step1[15] = WRAPLOW(step2[0] - step2[15], 8);
 
   step1[16] = step2[16];
   step1[17] = step2[17];
@@ -1227,58 +1235,58 @@
   step1[19] = step2[19];
   temp1 = (-step2[20] + step2[27]) * cospi_16_64;
   temp2 = (step2[20] + step2[27]) * cospi_16_64;
-  step1[20] = dct_const_round_shift(temp1);
-  step1[27] = dct_const_round_shift(temp2);
+  step1[20] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step1[27] = WRAPLOW(dct_const_round_shift(temp2), 8);
   temp1 = (-step2[21] + step2[26]) * cospi_16_64;
   temp2 = (step2[21] + step2[26]) * cospi_16_64;
-  step1[21] = dct_const_round_shift(temp1);
-  step1[26] = dct_const_round_shift(temp2);
+  step1[21] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step1[26] = WRAPLOW(dct_const_round_shift(temp2), 8);
   temp1 = (-step2[22] + step2[25]) * cospi_16_64;
   temp2 = (step2[22] + step2[25]) * cospi_16_64;
-  step1[22] = dct_const_round_shift(temp1);
-  step1[25] = dct_const_round_shift(temp2);
+  step1[22] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step1[25] = WRAPLOW(dct_const_round_shift(temp2), 8);
   temp1 = (-step2[23] + step2[24]) * cospi_16_64;
   temp2 = (step2[23] + step2[24]) * cospi_16_64;
-  step1[23] = dct_const_round_shift(temp1);
-  step1[24] = dct_const_round_shift(temp2);
+  step1[23] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step1[24] = WRAPLOW(dct_const_round_shift(temp2), 8);
   step1[28] = step2[28];
   step1[29] = step2[29];
   step1[30] = step2[30];
   step1[31] = step2[31];
 
   // final stage
-  output[0] = step1[0] + step1[31];
-  output[1] = step1[1] + step1[30];
-  output[2] = step1[2] + step1[29];
-  output[3] = step1[3] + step1[28];
-  output[4] = step1[4] + step1[27];
-  output[5] = step1[5] + step1[26];
-  output[6] = step1[6] + step1[25];
-  output[7] = step1[7] + step1[24];
-  output[8] = step1[8] + step1[23];
-  output[9] = step1[9] + step1[22];
-  output[10] = step1[10] + step1[21];
-  output[11] = step1[11] + step1[20];
-  output[12] = step1[12] + step1[19];
-  output[13] = step1[13] + step1[18];
-  output[14] = step1[14] + step1[17];
-  output[15] = step1[15] + step1[16];
-  output[16] = step1[15] - step1[16];
-  output[17] = step1[14] - step1[17];
-  output[18] = step1[13] - step1[18];
-  output[19] = step1[12] - step1[19];
-  output[20] = step1[11] - step1[20];
-  output[21] = step1[10] - step1[21];
-  output[22] = step1[9] - step1[22];
-  output[23] = step1[8] - step1[23];
-  output[24] = step1[7] - step1[24];
-  output[25] = step1[6] - step1[25];
-  output[26] = step1[5] - step1[26];
-  output[27] = step1[4] - step1[27];
-  output[28] = step1[3] - step1[28];
-  output[29] = step1[2] - step1[29];
-  output[30] = step1[1] - step1[30];
-  output[31] = step1[0] - step1[31];
+  output[0] = WRAPLOW(step1[0] + step1[31], 8);
+  output[1] = WRAPLOW(step1[1] + step1[30], 8);
+  output[2] = WRAPLOW(step1[2] + step1[29], 8);
+  output[3] = WRAPLOW(step1[3] + step1[28], 8);
+  output[4] = WRAPLOW(step1[4] + step1[27], 8);
+  output[5] = WRAPLOW(step1[5] + step1[26], 8);
+  output[6] = WRAPLOW(step1[6] + step1[25], 8);
+  output[7] = WRAPLOW(step1[7] + step1[24], 8);
+  output[8] = WRAPLOW(step1[8] + step1[23], 8);
+  output[9] = WRAPLOW(step1[9] + step1[22], 8);
+  output[10] = WRAPLOW(step1[10] + step1[21], 8);
+  output[11] = WRAPLOW(step1[11] + step1[20], 8);
+  output[12] = WRAPLOW(step1[12] + step1[19], 8);
+  output[13] = WRAPLOW(step1[13] + step1[18], 8);
+  output[14] = WRAPLOW(step1[14] + step1[17], 8);
+  output[15] = WRAPLOW(step1[15] + step1[16], 8);
+  output[16] = WRAPLOW(step1[15] - step1[16], 8);
+  output[17] = WRAPLOW(step1[14] - step1[17], 8);
+  output[18] = WRAPLOW(step1[13] - step1[18], 8);
+  output[19] = WRAPLOW(step1[12] - step1[19], 8);
+  output[20] = WRAPLOW(step1[11] - step1[20], 8);
+  output[21] = WRAPLOW(step1[10] - step1[21], 8);
+  output[22] = WRAPLOW(step1[9] - step1[22], 8);
+  output[23] = WRAPLOW(step1[8] - step1[23], 8);
+  output[24] = WRAPLOW(step1[7] - step1[24], 8);
+  output[25] = WRAPLOW(step1[6] - step1[25], 8);
+  output[26] = WRAPLOW(step1[5] - step1[26], 8);
+  output[27] = WRAPLOW(step1[4] - step1[27], 8);
+  output[28] = WRAPLOW(step1[3] - step1[28], 8);
+  output[29] = WRAPLOW(step1[2] - step1[29], 8);
+  output[30] = WRAPLOW(step1[1] - step1[30], 8);
+  output[31] = WRAPLOW(step1[0] - step1[31], 8);
 }
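
For readers tracking the mechanical change in this diff: WRAPLOW(x, bd) is the wrapping helper this pull threads through every butterfly stage, making the overflow behavior of the widened intermediates explicit instead of relying on int16_t wraparound. A minimal sketch of the idea, assuming the libvpx macro of this era (the _sketch name is mine, not libvpx's):

/* Illustrative only: wrap a widened value back to the signed range
 * implied by bit depth bd (16 bits when bd == 8), emulating the
 * int16_t wraparound the 8-bit path previously got implicitly.
 * Like libvpx, this leans on arithmetic shifts of signed values. */
static int32_t wraplow_sketch(int32_t x, int bd) {
  const int shift = 24 - bd;     /* 16 for bd == 8, 12 for bd == 12 */
  return (x << shift) >> shift;  /* keep low (32 - shift) bits, signed */
}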
 
 void vp9_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest,
@@ -1313,9 +1321,10 @@
     for (j = 0; j < 32; ++j)
       temp_in[j] = out[j * 32 + i];
     idct32(temp_in, temp_out);
-    for (j = 0; j < 32; ++j)
-      dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6)
-                                        + dest[j * stride + i]);
+    for (j = 0; j < 32; ++j) {
+      dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
+                                            ROUND_POWER_OF_TWO(temp_out[j], 6));
+    }
   }
 }
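
The same pattern swaps clip_pixel(residual + dest) for clip_pixel_add(dest, residual) throughout the add loops; the two should be equivalent for 8-bit data. A sketch of what the new helper is assumed to do (hypothetical name; the real helper lives in the libvpx headers):

/* Add the rounded inverse-transform residual to the predictor and
 * clamp the result to the 8-bit pixel range [0, 255]. */
static uint8_t clip_pixel_add_sketch(uint8_t dest, int trans) {
  const int v = dest + trans;
  return (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
}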
 
@@ -1339,9 +1348,10 @@
     for (j = 0; j < 32; ++j)
       temp_in[j] = out[j * 32 + i];
     idct32(temp_in, temp_out);
-    for (j = 0; j < 32; ++j)
-      dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6)
-                                  + dest[j * stride + i]);
+    for (j = 0; j < 32; ++j) {
+      dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
+                                            ROUND_POWER_OF_TWO(temp_out[j], 6));
+    }
   }
 }
 
@@ -1349,13 +1359,13 @@
   int i, j;
   tran_high_t a1;
 
-  tran_low_t out = dct_const_round_shift(input[0] * cospi_16_64);
-  out = dct_const_round_shift(out * cospi_16_64);
+  tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), 8);
+  out = WRAPLOW(dct_const_round_shift(out * cospi_16_64), 8);
   a1 = ROUND_POWER_OF_TWO(out, 6);
 
   for (j = 0; j < 32; ++j) {
     for (i = 0; i < 32; ++i)
-      dest[i] = clip_pixel(dest[i] + a1);
+      dest[i] = clip_pixel_add(dest[i], a1);
     dest += stride;
   }
 }
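
The DC-only function above is the easiest place to see the fixed-point scheme: each cospi_N_64 constant is a cosine scaled by 2^14, e.g. cospi_16_64 == 11585 ~= cos(pi/4) * 16384, and dct_const_round_shift() removes those 14 fractional bits with rounding. Assuming the standard libvpx definitions, reproduced here for orientation only:

#define ROUND_POWER_OF_TWO(value, n) (((value) + (1 << ((n) - 1))) >> (n))
#define DCT_CONST_BITS 14

static tran_high_t dct_const_round_shift(tran_high_t input) {
  /* Round to nearest and drop the 14 fractional bits a cospi multiply
   * introduces; two such multiplies scale the DC value by ~1/2, and the
   * final ROUND_POWER_OF_TWO(out, 6) above applies the 32x32 transform's
   * normalizing right-shift. */
  return ROUND_POWER_OF_TWO(input, DCT_CONST_BITS);
}
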
@@ -1448,8 +1458,8 @@
 }
 
 #if CONFIG_VP9_HIGHBITDEPTH
-void vp9_high_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest8,
-                               int stride, int bd) {
+void vp9_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest8,
+                                 int stride, int bd) {
   /* 4-point reversible, orthonormal inverse Walsh-Hadamard in 3.5 adds,
      0.5 shifts per pixel. */
   int i;
@@ -1471,10 +1481,10 @@
     c1 = e1 - c1;
     a1 -= b1;
     d1 += c1;
-    op[0] = WRAPLOW(a1);
-    op[1] = WRAPLOW(b1);
-    op[2] = WRAPLOW(c1);
-    op[3] = WRAPLOW(d1);
+    op[0] = WRAPLOW(a1, bd);
+    op[1] = WRAPLOW(b1, bd);
+    op[2] = WRAPLOW(c1, bd);
+    op[3] = WRAPLOW(d1, bd);
     ip += 4;
     op += 4;
   }
@@ -1492,39 +1502,18 @@
     c1 = e1 - c1;
     a1 -= b1;
     d1 += c1;
-    dest[stride * 0] = clip_pixel_bd_high(dest[stride * 0], a1, bd);
-    dest[stride * 1] = clip_pixel_bd_high(dest[stride * 1], b1, bd);
-    dest[stride * 2] = clip_pixel_bd_high(dest[stride * 2], c1, bd);
-    dest[stride * 3] = clip_pixel_bd_high(dest[stride * 3], d1, bd);
+    dest[stride * 0] = highbd_clip_pixel_add(dest[stride * 0], a1, bd);
+    dest[stride * 1] = highbd_clip_pixel_add(dest[stride * 1], b1, bd);
+    dest[stride * 2] = highbd_clip_pixel_add(dest[stride * 2], c1, bd);
+    dest[stride * 3] = highbd_clip_pixel_add(dest[stride * 3], d1, bd);
 
     ip++;
     dest++;
   }
 }
 
-static void high_idct4(const tran_low_t *input, tran_low_t *output, int bd) {
-  tran_low_t step[4];
-  tran_high_t temp1, temp2;
-  (void) bd;
-  // stage 1
-  temp1 = (input[0] + input[2]) * cospi_16_64;
-  temp2 = (input[0] - input[2]) * cospi_16_64;
-  step[0] = WRAPLOW(dct_const_round_shift(temp1));
-  step[1] = WRAPLOW(dct_const_round_shift(temp2));
-  temp1 = input[1] * cospi_24_64 - input[3] * cospi_8_64;
-  temp2 = input[1] * cospi_8_64 + input[3] * cospi_24_64;
-  step[2] = WRAPLOW(dct_const_round_shift(temp1));
-  step[3] = WRAPLOW(dct_const_round_shift(temp2));
-
-  // stage 2
-  output[0] = WRAPLOW(step[0] + step[3]);
-  output[1] = WRAPLOW(step[1] + step[2]);
-  output[2] = WRAPLOW(step[1] - step[2]);
-  output[3] = WRAPLOW(step[0] - step[3]);
-}
-
-void vp9_high_iwht4x4_1_add_c(const tran_low_t *in, uint8_t *dest8,
-                              int dest_stride, int bd) {
+void vp9_highbd_iwht4x4_1_add_c(const tran_low_t *in, uint8_t *dest8,
+                                int dest_stride, int bd) {
   int i;
   tran_high_t a1, e1;
   tran_low_t tmp[4];
@@ -1536,24 +1525,49 @@
   a1 = ip[0] >> UNIT_QUANT_SHIFT;
   e1 = a1 >> 1;
   a1 -= e1;
-  op[0] = WRAPLOW(a1);
-  op[1] = op[2] = op[3] = WRAPLOW(e1);
+  op[0] = WRAPLOW(a1, bd);
+  op[1] = op[2] = op[3] = WRAPLOW(e1, bd);
 
   ip = tmp;
   for (i = 0; i < 4; i++) {
     e1 = ip[0] >> 1;
     a1 = ip[0] - e1;
-    dest[dest_stride * 0] = clip_pixel_bd_high(dest[dest_stride * 0], a1, bd);
-    dest[dest_stride * 1] = clip_pixel_bd_high(dest[dest_stride * 1], e1, bd);
-    dest[dest_stride * 2] = clip_pixel_bd_high(dest[dest_stride * 2], e1, bd);
-    dest[dest_stride * 3] = clip_pixel_bd_high(dest[dest_stride * 3], e1, bd);
+    dest[dest_stride * 0] = highbd_clip_pixel_add(
+        dest[dest_stride * 0], a1, bd);
+    dest[dest_stride * 1] = highbd_clip_pixel_add(
+        dest[dest_stride * 1], e1, bd);
+    dest[dest_stride * 2] = highbd_clip_pixel_add(
+        dest[dest_stride * 2], e1, bd);
+    dest[dest_stride * 3] = highbd_clip_pixel_add(
+        dest[dest_stride * 3], e1, bd);
     ip++;
     dest++;
   }
 }
 
-void vp9_high_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest8,
-                               int stride, int bd) {
+static void highbd_idct4(const tran_low_t *input, tran_low_t *output, int bd) {
+  tran_low_t step[4];
+  tran_high_t temp1, temp2;
+  (void) bd;
+  // stage 1
+  temp1 = (input[0] + input[2]) * cospi_16_64;
+  temp2 = (input[0] - input[2]) * cospi_16_64;
+  step[0] = WRAPLOW(dct_const_round_shift(temp1), bd);
+  step[1] = WRAPLOW(dct_const_round_shift(temp2), bd);
+  temp1 = input[1] * cospi_24_64 - input[3] * cospi_8_64;
+  temp2 = input[1] * cospi_8_64 + input[3] * cospi_24_64;
+  step[2] = WRAPLOW(dct_const_round_shift(temp1), bd);
+  step[3] = WRAPLOW(dct_const_round_shift(temp2), bd);
+
+  // stage 2
+  output[0] = WRAPLOW(step[0] + step[3], bd);
+  output[1] = WRAPLOW(step[1] + step[2], bd);
+  output[2] = WRAPLOW(step[1] - step[2], bd);
+  output[3] = WRAPLOW(step[0] - step[3], bd);
+}
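
As a cross-check on the butterfly above, here is a floating-point reference of the same 4-point inverse DCT (illustrative only; the real code works in 14-bit fixed point and wraps each stage with WRAPLOW):

#include <math.h>

static void idct4_ref(const double in[4], double out[4]) {
  const double e0 = (in[0] + in[2]) * cos(M_PI / 4);  /* cospi_16_64 */
  const double e1 = (in[0] - in[2]) * cos(M_PI / 4);
  const double o0 = in[1] * cos(3 * M_PI / 8) - in[3] * cos(M_PI / 8);
  const double o1 = in[1] * cos(M_PI / 8) + in[3] * cos(3 * M_PI / 8);
  out[0] = e0 + o1;  /* step[0] + step[3] */
  out[1] = e1 + o0;  /* step[1] + step[2] */
  out[2] = e1 - o0;  /* step[1] - step[2] */
  out[3] = e0 - o1;  /* step[0] - step[3] */
}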
+
+void vp9_highbd_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest8,
+                                 int stride, int bd) {
   tran_low_t out[4 * 4];
   tran_low_t *outptr = out;
   int i, j;
@@ -1562,7 +1576,7 @@
 
   // Rows
   for (i = 0; i < 4; ++i) {
-    high_idct4(input, outptr, bd);
+    highbd_idct4(input, outptr, bd);
     input += 4;
     outptr += 4;
   }
@@ -1571,33 +1585,34 @@
   for (i = 0; i < 4; ++i) {
     for (j = 0; j < 4; ++j)
       temp_in[j] = out[j * 4 + i];
-    high_idct4(temp_in, temp_out, bd);
-    for (j = 0; j < 4; ++j)
-      dest[j * stride + i] = clip_pixel_bd_high(
+    highbd_idct4(temp_in, temp_out, bd);
+    for (j = 0; j < 4; ++j) {
+      dest[j * stride + i] = highbd_clip_pixel_add(
           dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 4), bd);
+    }
   }
 }
 
-void vp9_high_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest8,
-                              int dest_stride, int bd) {
+void vp9_highbd_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest8,
+                                int dest_stride, int bd) {
   int i;
   tran_high_t a1;
-  tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64));
+  tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd);
   uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
 
-  out = WRAPLOW(dct_const_round_shift(out * cospi_16_64));
+  out = WRAPLOW(dct_const_round_shift(out * cospi_16_64), bd);
   a1 = ROUND_POWER_OF_TWO(out, 4);
 
   for (i = 0; i < 4; i++) {
-    dest[0] = clip_pixel_bd_high(dest[0], a1, bd);
-    dest[1] = clip_pixel_bd_high(dest[1], a1, bd);
-    dest[2] = clip_pixel_bd_high(dest[2], a1, bd);
-    dest[3] = clip_pixel_bd_high(dest[3], a1, bd);
+    dest[0] = highbd_clip_pixel_add(dest[0], a1, bd);
+    dest[1] = highbd_clip_pixel_add(dest[1], a1, bd);
+    dest[2] = highbd_clip_pixel_add(dest[2], a1, bd);
+    dest[3] = highbd_clip_pixel_add(dest[3], a1, bd);
     dest += dest_stride;
   }
 }
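
highbd_clip_pixel_add (renamed from clip_pixel_bd_high throughout this diff) is the high-bit-depth counterpart of clip_pixel_add: the same add-and-clamp, but with the ceiling derived from the coding bit depth. A sketch under that assumption:

static uint16_t highbd_clip_pixel_add_sketch(uint16_t dest, int trans,
                                             int bd) {
  const int v = dest + trans;
  const int max = (1 << bd) - 1;  /* 255 / 1023 / 4095 for bd 8 / 10 / 12 */
  return (uint16_t)(v < 0 ? 0 : (v > max ? max : v));
}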
 
-static void high_idct8(const tran_low_t *input, tran_low_t *output, int bd) {
+static void highbd_idct8(const tran_low_t *input, tran_low_t *output, int bd) {
   tran_low_t step1[8], step2[8];
   tran_high_t temp1, temp2;
   // stage 1
@@ -1607,43 +1622,43 @@
   step1[3] = input[6];
   temp1 = input[1] * cospi_28_64 - input[7] * cospi_4_64;
   temp2 = input[1] * cospi_4_64 + input[7] * cospi_28_64;
-  step1[4] = WRAPLOW(dct_const_round_shift(temp1));
-  step1[7] = WRAPLOW(dct_const_round_shift(temp2));
+  step1[4] = WRAPLOW(dct_const_round_shift(temp1), bd);
+  step1[7] = WRAPLOW(dct_const_round_shift(temp2), bd);
   temp1 = input[5] * cospi_12_64 - input[3] * cospi_20_64;
   temp2 = input[5] * cospi_20_64 + input[3] * cospi_12_64;
-  step1[5] = WRAPLOW(dct_const_round_shift(temp1));
-  step1[6] = WRAPLOW(dct_const_round_shift(temp2));
+  step1[5] = WRAPLOW(dct_const_round_shift(temp1), bd);
+  step1[6] = WRAPLOW(dct_const_round_shift(temp2), bd);
 
   // stage 2 & stage 3 - even half
-  high_idct4(step1, step1, bd);
+  highbd_idct4(step1, step1, bd);
 
   // stage 2 - odd half
-  step2[4] = WRAPLOW(step1[4] + step1[5]);
-  step2[5] = WRAPLOW(step1[4] - step1[5]);
-  step2[6] = WRAPLOW(-step1[6] + step1[7]);
-  step2[7] = WRAPLOW(step1[6] + step1[7]);
+  step2[4] = WRAPLOW(step1[4] + step1[5], bd);
+  step2[5] = WRAPLOW(step1[4] - step1[5], bd);
+  step2[6] = WRAPLOW(-step1[6] + step1[7], bd);
+  step2[7] = WRAPLOW(step1[6] + step1[7], bd);
 
   // stage 3 - odd half
   step1[4] = step2[4];
   temp1 = (step2[6] - step2[5]) * cospi_16_64;
   temp2 = (step2[5] + step2[6]) * cospi_16_64;
-  step1[5] = WRAPLOW(dct_const_round_shift(temp1));
-  step1[6] = WRAPLOW(dct_const_round_shift(temp2));
+  step1[5] = WRAPLOW(dct_const_round_shift(temp1), bd);
+  step1[6] = WRAPLOW(dct_const_round_shift(temp2), bd);
   step1[7] = step2[7];
 
   // stage 4
-  output[0] = WRAPLOW(step1[0] + step1[7]);
-  output[1] = WRAPLOW(step1[1] + step1[6]);
-  output[2] = WRAPLOW(step1[2] + step1[5]);
-  output[3] = WRAPLOW(step1[3] + step1[4]);
-  output[4] = WRAPLOW(step1[3] - step1[4]);
-  output[5] = WRAPLOW(step1[2] - step1[5]);
-  output[6] = WRAPLOW(step1[1] - step1[6]);
-  output[7] = WRAPLOW(step1[0] - step1[7]);
+  output[0] = WRAPLOW(step1[0] + step1[7], bd);
+  output[1] = WRAPLOW(step1[1] + step1[6], bd);
+  output[2] = WRAPLOW(step1[2] + step1[5], bd);
+  output[3] = WRAPLOW(step1[3] + step1[4], bd);
+  output[4] = WRAPLOW(step1[3] - step1[4], bd);
+  output[5] = WRAPLOW(step1[2] - step1[5], bd);
+  output[6] = WRAPLOW(step1[1] - step1[6], bd);
+  output[7] = WRAPLOW(step1[0] - step1[7], bd);
 }
 
-void vp9_high_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest8,
-                               int stride, int bd) {
+void vp9_highbd_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest8,
+                                 int stride, int bd) {
   tran_low_t out[8 * 8];
   tran_low_t *outptr = out;
   int i, j;
@@ -1652,7 +1667,7 @@
 
   // First transform rows.
   for (i = 0; i < 8; ++i) {
-    high_idct8(input, outptr, bd);
+    highbd_idct8(input, outptr, bd);
     input += 8;
     outptr += 8;
   }
@@ -1661,30 +1676,30 @@
   for (i = 0; i < 8; ++i) {
     for (j = 0; j < 8; ++j)
       temp_in[j] = out[j * 8 + i];
-    high_idct8(temp_in, temp_out, bd);
-    for (j = 0; j < 8; ++j)
-      dest[j * stride + i] = clip_pixel_bd_high(dest[j * stride + i],
-                                        ROUND_POWER_OF_TWO(temp_out[j], 5),
-                                        bd);
+    highbd_idct8(temp_in, temp_out, bd);
+    for (j = 0; j < 8; ++j) {
+      dest[j * stride + i] = highbd_clip_pixel_add(
+          dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd);
+    }
   }
 }
 
-void vp9_high_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest8,
-                              int stride, int bd) {
+void vp9_highbd_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest8,
+                                int stride, int bd) {
   int i, j;
   tran_high_t a1;
-  tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64));
+  tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd);
   uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
-  out = WRAPLOW(dct_const_round_shift(out * cospi_16_64));
+  out = WRAPLOW(dct_const_round_shift(out * cospi_16_64), bd);
   a1 = ROUND_POWER_OF_TWO(out, 5);
   for (j = 0; j < 8; ++j) {
     for (i = 0; i < 8; ++i)
-      dest[i] = clip_pixel_bd_high(dest[i], a1, bd);
+      dest[i] = highbd_clip_pixel_add(dest[i], a1, bd);
     dest += stride;
   }
 }
 
-static void high_iadst4(const tran_low_t *input, tran_low_t *output, int bd) {
+static void highbd_iadst4(const tran_low_t *input, tran_low_t *output, int bd) {
   tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;
 
   tran_high_t x0 = input[0];
@@ -1721,19 +1736,19 @@
   // The overall dynamic range is 14b (input) + 14b (multiplication scaling)
   // + 1b (addition) = 29b.
   // Hence the output bit depth is 15b.
-  output[0] = WRAPLOW(dct_const_round_shift(s0));
-  output[1] = WRAPLOW(dct_const_round_shift(s1));
-  output[2] = WRAPLOW(dct_const_round_shift(s2));
-  output[3] = WRAPLOW(dct_const_round_shift(s3));
+  output[0] = WRAPLOW(dct_const_round_shift(s0), bd);
+  output[1] = WRAPLOW(dct_const_round_shift(s1), bd);
+  output[2] = WRAPLOW(dct_const_round_shift(s2), bd);
+  output[3] = WRAPLOW(dct_const_round_shift(s3), bd);
 }
 
-void vp9_high_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest8,
-                              int stride, int tx_type, int bd) {
-  const high_transform_2d IHT_4[] = {
-    { high_idct4, high_idct4  },    // DCT_DCT  = 0
-    { high_iadst4, high_idct4 },    // ADST_DCT = 1
-    { high_idct4, high_iadst4 },    // DCT_ADST = 2
-    { high_iadst4, high_iadst4 }    // ADST_ADST = 3
+void vp9_highbd_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest8,
+                                int stride, int tx_type, int bd) {
+  const highbd_transform_2d IHT_4[] = {
+    { highbd_idct4, highbd_idct4  },    // DCT_DCT  = 0
+    { highbd_iadst4, highbd_idct4 },    // ADST_DCT = 1
+    { highbd_idct4, highbd_iadst4 },    // DCT_ADST = 2
+    { highbd_iadst4, highbd_iadst4 }    // ADST_ADST = 3
   };
   uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
 
@@ -1754,13 +1769,14 @@
     for (j = 0; j < 4; ++j)
       temp_in[j] = out[j * 4 + i];
     IHT_4[tx_type].cols(temp_in, temp_out, bd);
-    for (j = 0; j < 4; ++j)
-      dest[j * stride + i] = clip_pixel_bd_high(
+    for (j = 0; j < 4; ++j) {
+      dest[j * stride + i] = highbd_clip_pixel_add(
           dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 4), bd);
+    }
   }
 }
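
The IHT_4 table above (and HIGH_IHT_8 / HIGH_IHT_16 further down) picks a pair of 1-D transforms per tx_type and applies them as separable passes over rows and then columns. A sketch of the dispatch type being renamed in this hunk; the field order is from memory and should be treated as illustrative, not authoritative:

typedef void (*highbd_transform_1d)(const tran_low_t *in, tran_low_t *out,
                                    int bd);

typedef struct {
  highbd_transform_1d cols;  /* vertical pass, e.g. highbd_iadst4 */
  highbd_transform_1d rows;  /* horizontal pass, e.g. highbd_idct4 */
} highbd_transform_2d;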
 
-static void high_iadst8(const tran_low_t *input, tran_low_t *output, int bd) {
+static void highbd_iadst8(const tran_low_t *input, tran_low_t *output, int bd) {
   tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;
 
   tran_high_t x0 = input[7];
@@ -1788,14 +1804,14 @@
   s6 = cospi_26_64 * x6 + cospi_6_64  * x7;
   s7 = cospi_6_64  * x6 - cospi_26_64 * x7;
 
-  x0 = WRAPLOW(dct_const_round_shift(s0 + s4));
-  x1 = WRAPLOW(dct_const_round_shift(s1 + s5));
-  x2 = WRAPLOW(dct_const_round_shift(s2 + s6));
-  x3 = WRAPLOW(dct_const_round_shift(s3 + s7));
-  x4 = WRAPLOW(dct_const_round_shift(s0 - s4));
-  x5 = WRAPLOW(dct_const_round_shift(s1 - s5));
-  x6 = WRAPLOW(dct_const_round_shift(s2 - s6));
-  x7 = WRAPLOW(dct_const_round_shift(s3 - s7));
+  x0 = WRAPLOW(dct_const_round_shift(s0 + s4), bd);
+  x1 = WRAPLOW(dct_const_round_shift(s1 + s5), bd);
+  x2 = WRAPLOW(dct_const_round_shift(s2 + s6), bd);
+  x3 = WRAPLOW(dct_const_round_shift(s3 + s7), bd);
+  x4 = WRAPLOW(dct_const_round_shift(s0 - s4), bd);
+  x5 = WRAPLOW(dct_const_round_shift(s1 - s5), bd);
+  x6 = WRAPLOW(dct_const_round_shift(s2 - s6), bd);
+  x7 = WRAPLOW(dct_const_round_shift(s3 - s7), bd);
 
   // stage 2
   s0 = x0;
@@ -1807,14 +1823,14 @@
   s6 = -cospi_24_64 * x6 + cospi_8_64  * x7;
   s7 =  cospi_8_64  * x6 + cospi_24_64 * x7;
 
-  x0 = s0 + s2;
-  x1 = s1 + s3;
-  x2 = s0 - s2;
-  x3 = s1 - s3;
-  x4 = WRAPLOW(dct_const_round_shift(s4 + s6));
-  x5 = WRAPLOW(dct_const_round_shift(s5 + s7));
-  x6 = WRAPLOW(dct_const_round_shift(s4 - s6));
-  x7 = WRAPLOW(dct_const_round_shift(s5 - s7));
+  x0 = WRAPLOW(s0 + s2, bd);
+  x1 = WRAPLOW(s1 + s3, bd);
+  x2 = WRAPLOW(s0 - s2, bd);
+  x3 = WRAPLOW(s1 - s3, bd);
+  x4 = WRAPLOW(dct_const_round_shift(s4 + s6), bd);
+  x5 = WRAPLOW(dct_const_round_shift(s5 + s7), bd);
+  x6 = WRAPLOW(dct_const_round_shift(s4 - s6), bd);
+  x7 = WRAPLOW(dct_const_round_shift(s5 - s7), bd);
 
   // stage 3
   s2 = cospi_16_64 * (x2 + x3);
@@ -1822,35 +1838,35 @@
   s6 = cospi_16_64 * (x6 + x7);
   s7 = cospi_16_64 * (x6 - x7);
 
-  x2 = WRAPLOW(dct_const_round_shift(s2));
-  x3 = WRAPLOW(dct_const_round_shift(s3));
-  x6 = WRAPLOW(dct_const_round_shift(s6));
-  x7 = WRAPLOW(dct_const_round_shift(s7));
+  x2 = WRAPLOW(dct_const_round_shift(s2), bd);
+  x3 = WRAPLOW(dct_const_round_shift(s3), bd);
+  x6 = WRAPLOW(dct_const_round_shift(s6), bd);
+  x7 = WRAPLOW(dct_const_round_shift(s7), bd);
 
-  output[0] = WRAPLOW(x0);
-  output[1] = WRAPLOW(-x4);
-  output[2] = WRAPLOW(x6);
-  output[3] = WRAPLOW(-x2);
-  output[4] = WRAPLOW(x3);
-  output[5] = WRAPLOW(-x7);
-  output[6] = WRAPLOW(x5);
-  output[7] = WRAPLOW(-x1);
+  output[0] = WRAPLOW(x0, bd);
+  output[1] = WRAPLOW(-x4, bd);
+  output[2] = WRAPLOW(x6, bd);
+  output[3] = WRAPLOW(-x2, bd);
+  output[4] = WRAPLOW(x3, bd);
+  output[5] = WRAPLOW(-x7, bd);
+  output[6] = WRAPLOW(x5, bd);
+  output[7] = WRAPLOW(-x1, bd);
 }
 
-static const high_transform_2d HIGH_IHT_8[] = {
-  { high_idct8,  high_idct8  },  // DCT_DCT  = 0
-  { high_iadst8, high_idct8  },  // ADST_DCT = 1
-  { high_idct8,  high_iadst8 },  // DCT_ADST = 2
-  { high_iadst8, high_iadst8 }   // ADST_ADST = 3
+static const highbd_transform_2d HIGH_IHT_8[] = {
+  { highbd_idct8,  highbd_idct8  },  // DCT_DCT  = 0
+  { highbd_iadst8, highbd_idct8  },  // ADST_DCT = 1
+  { highbd_idct8,  highbd_iadst8 },  // DCT_ADST = 2
+  { highbd_iadst8, highbd_iadst8 }   // ADST_ADST = 3
 };
 
-void vp9_high_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest8,
-                              int stride, int tx_type, int bd) {
+void vp9_highbd_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest8,
+                                int stride, int tx_type, int bd) {
   int i, j;
   tran_low_t out[8 * 8];
   tran_low_t *outptr = out;
   tran_low_t temp_in[8], temp_out[8];
-  const high_transform_2d ht = HIGH_IHT_8[tx_type];
+  const highbd_transform_2d ht = HIGH_IHT_8[tx_type];
   uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
 
   // Inverse transform row vectors.
@@ -1865,14 +1881,15 @@
     for (j = 0; j < 8; ++j)
       temp_in[j] = out[j * 8 + i];
     ht.cols(temp_in, temp_out, bd);
-    for (j = 0; j < 8; ++j)
-      dest[j * stride + i] = clip_pixel_bd_high(
+    for (j = 0; j < 8; ++j) {
+      dest[j * stride + i] = highbd_clip_pixel_add(
           dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd);
+    }
   }
 }
 
-void vp9_high_idct8x8_10_add_c(const tran_low_t *input, uint8_t *dest8,
-                               int stride, int bd) {
+void vp9_highbd_idct8x8_10_add_c(const tran_low_t *input, uint8_t *dest8,
+                                 int stride, int bd) {
   tran_low_t out[8 * 8] = { 0 };
   tran_low_t *outptr = out;
   int i, j;
@@ -1882,7 +1899,7 @@
   // First transform rows.
   // Only the first 4 rows have non-zero coefs.
   for (i = 0; i < 4; ++i) {
-    high_idct8(input, outptr, bd);
+    highbd_idct8(input, outptr, bd);
     input += 8;
     outptr += 8;
   }
@@ -1890,14 +1907,15 @@
   for (i = 0; i < 8; ++i) {
     for (j = 0; j < 8; ++j)
       temp_in[j] = out[j * 8 + i];
-    high_idct8(temp_in, temp_out, bd);
-    for (j = 0; j < 8; ++j)
-      dest[j * stride + i] = clip_pixel_bd_high(
+    highbd_idct8(temp_in, temp_out, bd);
+    for (j = 0; j < 8; ++j) {
+      dest[j * stride + i] = highbd_clip_pixel_add(
           dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd);
+    }
   }
 }
 
-static void high_idct16(const tran_low_t *input, tran_low_t *output, int bd) {
+static void highbd_idct16(const tran_low_t *input, tran_low_t *output, int bd) {
   tran_low_t step1[16], step2[16];
   tran_high_t temp1, temp2;
   (void) bd;
@@ -1932,23 +1950,23 @@
 
   temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
   temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
-  step2[8] = WRAPLOW(dct_const_round_shift(temp1));
-  step2[15] = WRAPLOW(dct_const_round_shift(temp2));
+  step2[8] = WRAPLOW(dct_const_round_shift(temp1), bd);
+  step2[15] = WRAPLOW(dct_const_round_shift(temp2), bd);
 
   temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
   temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
-  step2[9] = WRAPLOW(dct_const_round_shift(temp1));
-  step2[14] = WRAPLOW(dct_const_round_shift(temp2));
+  step2[9] = WRAPLOW(dct_const_round_shift(temp1), bd);
+  step2[14] = WRAPLOW(dct_const_round_shift(temp2), bd);
 
   temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
   temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
-  step2[10] = WRAPLOW(dct_const_round_shift(temp1));
-  step2[13] = WRAPLOW(dct_const_round_shift(temp2));
+  step2[10] = WRAPLOW(dct_const_round_shift(temp1), bd);
+  step2[13] = WRAPLOW(dct_const_round_shift(temp2), bd);
 
   temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
   temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
-  step2[11] = WRAPLOW(dct_const_round_shift(temp1));
-  step2[12] = WRAPLOW(dct_const_round_shift(temp2));
+  step2[11] = WRAPLOW(dct_const_round_shift(temp1), bd);
+  step2[12] = WRAPLOW(dct_const_round_shift(temp2), bd);
 
   // stage 3
   step1[0] = step2[0];
@@ -1958,113 +1976,113 @@
 
   temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
   temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
-  step1[4] = WRAPLOW(dct_const_round_shift(temp1));
-  step1[7] = WRAPLOW(dct_const_round_shift(temp2));
+  step1[4] = WRAPLOW(dct_const_round_shift(temp1), bd);
+  step1[7] = WRAPLOW(dct_const_round_shift(temp2), bd);
   temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
   temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
-  step1[5] = WRAPLOW(dct_const_round_shift(temp1));
-  step1[6] = WRAPLOW(dct_const_round_shift(temp2));
+  step1[5] = WRAPLOW(dct_const_round_shift(temp1), bd);
+  step1[6] = WRAPLOW(dct_const_round_shift(temp2), bd);
 
-  step1[8] = WRAPLOW(step2[8] + step2[9]);
-  step1[9] = WRAPLOW(step2[8] - step2[9]);
-  step1[10] = WRAPLOW(-step2[10] + step2[11]);
-  step1[11] = WRAPLOW(step2[10] + step2[11]);
-  step1[12] = WRAPLOW(step2[12] + step2[13]);
-  step1[13] = WRAPLOW(step2[12] - step2[13]);
-  step1[14] = WRAPLOW(-step2[14] + step2[15]);
-  step1[15] = WRAPLOW(step2[14] + step2[15]);
+  step1[8] = WRAPLOW(step2[8] + step2[9], bd);
+  step1[9] = WRAPLOW(step2[8] - step2[9], bd);
+  step1[10] = WRAPLOW(-step2[10] + step2[11], bd);
+  step1[11] = WRAPLOW(step2[10] + step2[11], bd);
+  step1[12] = WRAPLOW(step2[12] + step2[13], bd);
+  step1[13] = WRAPLOW(step2[12] - step2[13], bd);
+  step1[14] = WRAPLOW(-step2[14] + step2[15], bd);
+  step1[15] = WRAPLOW(step2[14] + step2[15], bd);
 
   // stage 4
   temp1 = (step1[0] + step1[1]) * cospi_16_64;
   temp2 = (step1[0] - step1[1]) * cospi_16_64;
-  step2[0] = WRAPLOW(dct_const_round_shift(temp1));
-  step2[1] = WRAPLOW(dct_const_round_shift(temp2));
+  step2[0] = WRAPLOW(dct_const_round_shift(temp1), bd);
+  step2[1] = WRAPLOW(dct_const_round_shift(temp2), bd);
   temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
   temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
-  step2[2] = WRAPLOW(dct_const_round_shift(temp1));
-  step2[3] = WRAPLOW(dct_const_round_shift(temp2));
-  step2[4] = WRAPLOW(step1[4] + step1[5]);
-  step2[5] = WRAPLOW(step1[4] - step1[5]);
-  step2[6] = WRAPLOW(-step1[6] + step1[7]);
-  step2[7] = WRAPLOW(step1[6] + step1[7]);
+  step2[2] = WRAPLOW(dct_const_round_shift(temp1), bd);
+  step2[3] = WRAPLOW(dct_const_round_shift(temp2), bd);
+  step2[4] = WRAPLOW(step1[4] + step1[5], bd);
+  step2[5] = WRAPLOW(step1[4] - step1[5], bd);
+  step2[6] = WRAPLOW(-step1[6] + step1[7], bd);
+  step2[7] = WRAPLOW(step1[6] + step1[7], bd);
 
   step2[8] = step1[8];
   step2[15] = step1[15];
   temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
   temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
-  step2[9] = WRAPLOW(dct_const_round_shift(temp1));
-  step2[14] = WRAPLOW(dct_const_round_shift(temp2));
+  step2[9] = WRAPLOW(dct_const_round_shift(temp1), bd);
+  step2[14] = WRAPLOW(dct_const_round_shift(temp2), bd);
   temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
   temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
-  step2[10] = WRAPLOW(dct_const_round_shift(temp1));
-  step2[13] = WRAPLOW(dct_const_round_shift(temp2));
+  step2[10] = WRAPLOW(dct_const_round_shift(temp1), bd);
+  step2[13] = WRAPLOW(dct_const_round_shift(temp2), bd);
   step2[11] = step1[11];
   step2[12] = step1[12];
 
   // stage 5
-  step1[0] = WRAPLOW(step2[0] + step2[3]);
-  step1[1] = WRAPLOW(step2[1] + step2[2]);
-  step1[2] = WRAPLOW(step2[1] - step2[2]);
-  step1[3] = WRAPLOW(step2[0] - step2[3]);
+  step1[0] = WRAPLOW(step2[0] + step2[3], bd);
+  step1[1] = WRAPLOW(step2[1] + step2[2], bd);
+  step1[2] = WRAPLOW(step2[1] - step2[2], bd);
+  step1[3] = WRAPLOW(step2[0] - step2[3], bd);
   step1[4] = step2[4];
   temp1 = (step2[6] - step2[5]) * cospi_16_64;
   temp2 = (step2[5] + step2[6]) * cospi_16_64;
-  step1[5] = WRAPLOW(dct_const_round_shift(temp1));
-  step1[6] = WRAPLOW(dct_const_round_shift(temp2));
+  step1[5] = WRAPLOW(dct_const_round_shift(temp1), bd);
+  step1[6] = WRAPLOW(dct_const_round_shift(temp2), bd);
   step1[7] = step2[7];
 
-  step1[8] = WRAPLOW(step2[8] + step2[11]);
-  step1[9] = WRAPLOW(step2[9] + step2[10]);
-  step1[10] = WRAPLOW(step2[9] - step2[10]);
-  step1[11] = WRAPLOW(step2[8] - step2[11]);
-  step1[12] = WRAPLOW(-step2[12] + step2[15]);
-  step1[13] = WRAPLOW(-step2[13] + step2[14]);
-  step1[14] = WRAPLOW(step2[13] + step2[14]);
-  step1[15] = WRAPLOW(step2[12] + step2[15]);
+  step1[8] = WRAPLOW(step2[8] + step2[11], bd);
+  step1[9] = WRAPLOW(step2[9] + step2[10], bd);
+  step1[10] = WRAPLOW(step2[9] - step2[10], bd);
+  step1[11] = WRAPLOW(step2[8] - step2[11], bd);
+  step1[12] = WRAPLOW(-step2[12] + step2[15], bd);
+  step1[13] = WRAPLOW(-step2[13] + step2[14], bd);
+  step1[14] = WRAPLOW(step2[13] + step2[14], bd);
+  step1[15] = WRAPLOW(step2[12] + step2[15], bd);
 
   // stage 6
-  step2[0] = WRAPLOW(step1[0] + step1[7]);
-  step2[1] = WRAPLOW(step1[1] + step1[6]);
-  step2[2] = WRAPLOW(step1[2] + step1[5]);
-  step2[3] = WRAPLOW(step1[3] + step1[4]);
-  step2[4] = WRAPLOW(step1[3] - step1[4]);
-  step2[5] = WRAPLOW(step1[2] - step1[5]);
-  step2[6] = WRAPLOW(step1[1] - step1[6]);
-  step2[7] = WRAPLOW(step1[0] - step1[7]);
+  step2[0] = WRAPLOW(step1[0] + step1[7], bd);
+  step2[1] = WRAPLOW(step1[1] + step1[6], bd);
+  step2[2] = WRAPLOW(step1[2] + step1[5], bd);
+  step2[3] = WRAPLOW(step1[3] + step1[4], bd);
+  step2[4] = WRAPLOW(step1[3] - step1[4], bd);
+  step2[5] = WRAPLOW(step1[2] - step1[5], bd);
+  step2[6] = WRAPLOW(step1[1] - step1[6], bd);
+  step2[7] = WRAPLOW(step1[0] - step1[7], bd);
   step2[8] = step1[8];
   step2[9] = step1[9];
   temp1 = (-step1[10] + step1[13]) * cospi_16_64;
   temp2 = (step1[10] + step1[13]) * cospi_16_64;
-  step2[10] = WRAPLOW(dct_const_round_shift(temp1));
-  step2[13] = WRAPLOW(dct_const_round_shift(temp2));
+  step2[10] = WRAPLOW(dct_const_round_shift(temp1), bd);
+  step2[13] = WRAPLOW(dct_const_round_shift(temp2), bd);
   temp1 = (-step1[11] + step1[12]) * cospi_16_64;
   temp2 = (step1[11] + step1[12]) * cospi_16_64;
-  step2[11] = WRAPLOW(dct_const_round_shift(temp1));
-  step2[12] = WRAPLOW(dct_const_round_shift(temp2));
+  step2[11] = WRAPLOW(dct_const_round_shift(temp1), bd);
+  step2[12] = WRAPLOW(dct_const_round_shift(temp2), bd);
   step2[14] = step1[14];
   step2[15] = step1[15];
 
   // stage 7
-  output[0] = WRAPLOW(step2[0] + step2[15]);
-  output[1] = WRAPLOW(step2[1] + step2[14]);
-  output[2] = WRAPLOW(step2[2] + step2[13]);
-  output[3] = WRAPLOW(step2[3] + step2[12]);
-  output[4] = WRAPLOW(step2[4] + step2[11]);
-  output[5] = WRAPLOW(step2[5] + step2[10]);
-  output[6] = WRAPLOW(step2[6] + step2[9]);
-  output[7] = WRAPLOW(step2[7] + step2[8]);
-  output[8] = WRAPLOW(step2[7] - step2[8]);
-  output[9] = WRAPLOW(step2[6] - step2[9]);
-  output[10] = WRAPLOW(step2[5] - step2[10]);
-  output[11] = WRAPLOW(step2[4] - step2[11]);
-  output[12] = WRAPLOW(step2[3] - step2[12]);
-  output[13] = WRAPLOW(step2[2] - step2[13]);
-  output[14] = WRAPLOW(step2[1] - step2[14]);
-  output[15] = WRAPLOW(step2[0] - step2[15]);
+  output[0] = WRAPLOW(step2[0] + step2[15], bd);
+  output[1] = WRAPLOW(step2[1] + step2[14], bd);
+  output[2] = WRAPLOW(step2[2] + step2[13], bd);
+  output[3] = WRAPLOW(step2[3] + step2[12], bd);
+  output[4] = WRAPLOW(step2[4] + step2[11], bd);
+  output[5] = WRAPLOW(step2[5] + step2[10], bd);
+  output[6] = WRAPLOW(step2[6] + step2[9], bd);
+  output[7] = WRAPLOW(step2[7] + step2[8], bd);
+  output[8] = WRAPLOW(step2[7] - step2[8], bd);
+  output[9] = WRAPLOW(step2[6] - step2[9], bd);
+  output[10] = WRAPLOW(step2[5] - step2[10], bd);
+  output[11] = WRAPLOW(step2[4] - step2[11], bd);
+  output[12] = WRAPLOW(step2[3] - step2[12], bd);
+  output[13] = WRAPLOW(step2[2] - step2[13], bd);
+  output[14] = WRAPLOW(step2[1] - step2[14], bd);
+  output[15] = WRAPLOW(step2[0] - step2[15], bd);
 }
 
-void vp9_high_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest8,
-                                  int stride, int bd) {
+void vp9_highbd_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest8,
+                                    int stride, int bd) {
   tran_low_t out[16 * 16];
   tran_low_t *outptr = out;
   int i, j;
@@ -2073,7 +2091,7 @@
 
   // First transform rows.
   for (i = 0; i < 16; ++i) {
-    high_idct16(input, outptr, bd);
+    highbd_idct16(input, outptr, bd);
     input += 16;
     outptr += 16;
   }
@@ -2082,14 +2100,16 @@
   for (i = 0; i < 16; ++i) {
     for (j = 0; j < 16; ++j)
       temp_in[j] = out[j * 16 + i];
-    high_idct16(temp_in, temp_out, bd);
-    for (j = 0; j < 16; ++j)
-      dest[j * stride + i] = clip_pixel_bd_high(
+    highbd_idct16(temp_in, temp_out, bd);
+    for (j = 0; j < 16; ++j) {
+      dest[j * stride + i] = highbd_clip_pixel_add(
           dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
+    }
   }
 }
 
-static void high_iadst16(const tran_low_t *input, tran_low_t *output, int bd) {
+static void highbd_iadst16(const tran_low_t *input, tran_low_t *output,
+                           int bd) {
   tran_high_t s0, s1, s2, s3, s4, s5, s6, s7, s8;
   tran_high_t s9, s10, s11, s12, s13, s14, s15;
 
@@ -2135,22 +2155,22 @@
   s14 = x14 * cospi_29_64 + x15 * cospi_3_64;
   s15 = x14 * cospi_3_64  - x15 * cospi_29_64;
 
-  x0 = WRAPLOW(dct_const_round_shift(s0 + s8));
-  x1 = WRAPLOW(dct_const_round_shift(s1 + s9));
-  x2 = WRAPLOW(dct_const_round_shift(s2 + s10));
-  x3 = WRAPLOW(dct_const_round_shift(s3 + s11));
-  x4 = WRAPLOW(dct_const_round_shift(s4 + s12));
-  x5 = WRAPLOW(dct_const_round_shift(s5 + s13));
-  x6 = WRAPLOW(dct_const_round_shift(s6 + s14));
-  x7 = WRAPLOW(dct_const_round_shift(s7 + s15));
-  x8  = WRAPLOW(dct_const_round_shift(s0 - s8));
-  x9  = WRAPLOW(dct_const_round_shift(s1 - s9));
-  x10 = WRAPLOW(dct_const_round_shift(s2 - s10));
-  x11 = WRAPLOW(dct_const_round_shift(s3 - s11));
-  x12 = WRAPLOW(dct_const_round_shift(s4 - s12));
-  x13 = WRAPLOW(dct_const_round_shift(s5 - s13));
-  x14 = WRAPLOW(dct_const_round_shift(s6 - s14));
-  x15 = WRAPLOW(dct_const_round_shift(s7 - s15));
+  x0 = WRAPLOW(dct_const_round_shift(s0 + s8), bd);
+  x1 = WRAPLOW(dct_const_round_shift(s1 + s9), bd);
+  x2 = WRAPLOW(dct_const_round_shift(s2 + s10), bd);
+  x3 = WRAPLOW(dct_const_round_shift(s3 + s11), bd);
+  x4 = WRAPLOW(dct_const_round_shift(s4 + s12), bd);
+  x5 = WRAPLOW(dct_const_round_shift(s5 + s13), bd);
+  x6 = WRAPLOW(dct_const_round_shift(s6 + s14), bd);
+  x7 = WRAPLOW(dct_const_round_shift(s7 + s15), bd);
+  x8  = WRAPLOW(dct_const_round_shift(s0 - s8), bd);
+  x9  = WRAPLOW(dct_const_round_shift(s1 - s9), bd);
+  x10 = WRAPLOW(dct_const_round_shift(s2 - s10), bd);
+  x11 = WRAPLOW(dct_const_round_shift(s3 - s11), bd);
+  x12 = WRAPLOW(dct_const_round_shift(s4 - s12), bd);
+  x13 = WRAPLOW(dct_const_round_shift(s5 - s13), bd);
+  x14 = WRAPLOW(dct_const_round_shift(s6 - s14), bd);
+  x15 = WRAPLOW(dct_const_round_shift(s7 - s15), bd);
 
   // stage 2
   s0 = x0;
@@ -2170,22 +2190,22 @@
   s14 = -x14 * cospi_12_64 + x15 * cospi_20_64;
   s15 = x14 * cospi_20_64 + x15 * cospi_12_64;
 
-  x0 = WRAPLOW(s0 + s4);
-  x1 = WRAPLOW(s1 + s5);
-  x2 = WRAPLOW(s2 + s6);
-  x3 = WRAPLOW(s3 + s7);
-  x4 = WRAPLOW(s0 - s4);
-  x5 = WRAPLOW(s1 - s5);
-  x6 = WRAPLOW(s2 - s6);
-  x7 = WRAPLOW(s3 - s7);
-  x8 = WRAPLOW(dct_const_round_shift(s8 + s12));
-  x9 = WRAPLOW(dct_const_round_shift(s9 + s13));
-  x10 = WRAPLOW(dct_const_round_shift(s10 + s14));
-  x11 = WRAPLOW(dct_const_round_shift(s11 + s15));
-  x12 = WRAPLOW(dct_const_round_shift(s8 - s12));
-  x13 = WRAPLOW(dct_const_round_shift(s9 - s13));
-  x14 = WRAPLOW(dct_const_round_shift(s10 - s14));
-  x15 = WRAPLOW(dct_const_round_shift(s11 - s15));
+  x0 = WRAPLOW(s0 + s4, bd);
+  x1 = WRAPLOW(s1 + s5, bd);
+  x2 = WRAPLOW(s2 + s6, bd);
+  x3 = WRAPLOW(s3 + s7, bd);
+  x4 = WRAPLOW(s0 - s4, bd);
+  x5 = WRAPLOW(s1 - s5, bd);
+  x6 = WRAPLOW(s2 - s6, bd);
+  x7 = WRAPLOW(s3 - s7, bd);
+  x8 = WRAPLOW(dct_const_round_shift(s8 + s12), bd);
+  x9 = WRAPLOW(dct_const_round_shift(s9 + s13), bd);
+  x10 = WRAPLOW(dct_const_round_shift(s10 + s14), bd);
+  x11 = WRAPLOW(dct_const_round_shift(s11 + s15), bd);
+  x12 = WRAPLOW(dct_const_round_shift(s8 - s12), bd);
+  x13 = WRAPLOW(dct_const_round_shift(s9 - s13), bd);
+  x14 = WRAPLOW(dct_const_round_shift(s10 - s14), bd);
+  x15 = WRAPLOW(dct_const_round_shift(s11 - s15), bd);
 
   // stage 3
   s0 = x0;
@@ -2205,22 +2225,22 @@
   s14 = -x14 * cospi_24_64 + x15 * cospi_8_64;
   s15 = x14 * cospi_8_64 + x15 * cospi_24_64;
 
-  x0 = WRAPLOW(s0 + s2);
-  x1 = WRAPLOW(s1 + s3);
-  x2 = WRAPLOW(s0 - s2);
-  x3 = WRAPLOW(s1 - s3);
-  x4 = WRAPLOW(dct_const_round_shift(s4 + s6));
-  x5 = WRAPLOW(dct_const_round_shift(s5 + s7));
-  x6 = WRAPLOW(dct_const_round_shift(s4 - s6));
-  x7 = WRAPLOW(dct_const_round_shift(s5 - s7));
-  x8 = WRAPLOW(s8 + s10);
-  x9 = WRAPLOW(s9 + s11);
-  x10 = WRAPLOW(s8 - s10);
-  x11 = WRAPLOW(s9 - s11);
-  x12 = WRAPLOW(dct_const_round_shift(s12 + s14));
-  x13 = WRAPLOW(dct_const_round_shift(s13 + s15));
-  x14 = WRAPLOW(dct_const_round_shift(s12 - s14));
-  x15 = WRAPLOW(dct_const_round_shift(s13 - s15));
+  x0 = WRAPLOW(s0 + s2, bd);
+  x1 = WRAPLOW(s1 + s3, bd);
+  x2 = WRAPLOW(s0 - s2, bd);
+  x3 = WRAPLOW(s1 - s3, bd);
+  x4 = WRAPLOW(dct_const_round_shift(s4 + s6), bd);
+  x5 = WRAPLOW(dct_const_round_shift(s5 + s7), bd);
+  x6 = WRAPLOW(dct_const_round_shift(s4 - s6), bd);
+  x7 = WRAPLOW(dct_const_round_shift(s5 - s7), bd);
+  x8 = WRAPLOW(s8 + s10, bd);
+  x9 = WRAPLOW(s9 + s11, bd);
+  x10 = WRAPLOW(s8 - s10, bd);
+  x11 = WRAPLOW(s9 - s11, bd);
+  x12 = WRAPLOW(dct_const_round_shift(s12 + s14), bd);
+  x13 = WRAPLOW(dct_const_round_shift(s13 + s15), bd);
+  x14 = WRAPLOW(dct_const_round_shift(s12 - s14), bd);
+  x15 = WRAPLOW(dct_const_round_shift(s13 - s15), bd);
 
   // stage 4
   s2 = (- cospi_16_64) * (x2 + x3);
@@ -2232,47 +2252,47 @@
   s14 = (- cospi_16_64) * (x14 + x15);
   s15 = cospi_16_64 * (x14 - x15);
 
-  x2 = WRAPLOW(dct_const_round_shift(s2));
-  x3 = WRAPLOW(dct_const_round_shift(s3));
-  x6 = WRAPLOW(dct_const_round_shift(s6));
-  x7 = WRAPLOW(dct_const_round_shift(s7));
-  x10 = WRAPLOW(dct_const_round_shift(s10));
-  x11 = WRAPLOW(dct_const_round_shift(s11));
-  x14 = WRAPLOW(dct_const_round_shift(s14));
-  x15 = WRAPLOW(dct_const_round_shift(s15));
+  x2 = WRAPLOW(dct_const_round_shift(s2), bd);
+  x3 = WRAPLOW(dct_const_round_shift(s3), bd);
+  x6 = WRAPLOW(dct_const_round_shift(s6), bd);
+  x7 = WRAPLOW(dct_const_round_shift(s7), bd);
+  x10 = WRAPLOW(dct_const_round_shift(s10), bd);
+  x11 = WRAPLOW(dct_const_round_shift(s11), bd);
+  x14 = WRAPLOW(dct_const_round_shift(s14), bd);
+  x15 = WRAPLOW(dct_const_round_shift(s15), bd);
 
-  output[0] = WRAPLOW(x0);
-  output[1] = WRAPLOW(-x8);
-  output[2] = WRAPLOW(x12);
-  output[3] = WRAPLOW(-x4);
-  output[4] = WRAPLOW(x6);
-  output[5] = WRAPLOW(x14);
-  output[6] = WRAPLOW(x10);
-  output[7] = WRAPLOW(x2);
-  output[8] = WRAPLOW(x3);
-  output[9] = WRAPLOW(x11);
-  output[10] = WRAPLOW(x15);
-  output[11] = WRAPLOW(x7);
-  output[12] = WRAPLOW(x5);
-  output[13] = WRAPLOW(-x13);
-  output[14] = WRAPLOW(x9);
-  output[15] = WRAPLOW(-x1);
+  output[0] = WRAPLOW(x0, bd);
+  output[1] = WRAPLOW(-x8, bd);
+  output[2] = WRAPLOW(x12, bd);
+  output[3] = WRAPLOW(-x4, bd);
+  output[4] = WRAPLOW(x6, bd);
+  output[5] = WRAPLOW(x14, bd);
+  output[6] = WRAPLOW(x10, bd);
+  output[7] = WRAPLOW(x2, bd);
+  output[8] = WRAPLOW(x3, bd);
+  output[9] = WRAPLOW(x11, bd);
+  output[10] = WRAPLOW(x15, bd);
+  output[11] = WRAPLOW(x7, bd);
+  output[12] = WRAPLOW(x5, bd);
+  output[13] = WRAPLOW(-x13, bd);
+  output[14] = WRAPLOW(x9, bd);
+  output[15] = WRAPLOW(-x1, bd);
 }
 
-static const high_transform_2d HIGH_IHT_16[] = {
-  { high_idct16,  high_idct16  },  // DCT_DCT  = 0
-  { high_iadst16, high_idct16  },  // ADST_DCT = 1
-  { high_idct16,  high_iadst16 },  // DCT_ADST = 2
-  { high_iadst16, high_iadst16 }   // ADST_ADST = 3
+static const highbd_transform_2d HIGH_IHT_16[] = {
+  { highbd_idct16,  highbd_idct16  },  // DCT_DCT  = 0
+  { highbd_iadst16, highbd_idct16  },  // ADST_DCT = 1
+  { highbd_idct16,  highbd_iadst16 },  // DCT_ADST = 2
+  { highbd_iadst16, highbd_iadst16 }   // ADST_ADST = 3
 };
 
-void vp9_high_iht16x16_256_add_c(const tran_low_t *input, uint8_t *dest8,
-                                 int stride, int tx_type, int bd) {
+void vp9_highbd_iht16x16_256_add_c(const tran_low_t *input, uint8_t *dest8,
+                                   int stride, int tx_type, int bd) {
   int i, j;
   tran_low_t out[16 * 16];
   tran_low_t *outptr = out;
   tran_low_t temp_in[16], temp_out[16];
-  const high_transform_2d ht = HIGH_IHT_16[tx_type];
+  const highbd_transform_2d ht = HIGH_IHT_16[tx_type];
   uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
 
   // Rows
@@ -2287,14 +2307,15 @@
     for (j = 0; j < 16; ++j)
       temp_in[j] = out[j * 16 + i];
     ht.cols(temp_in, temp_out, bd);
-    for (j = 0; j < 16; ++j)
-      dest[j * stride + i] = clip_pixel_bd_high(
+    for (j = 0; j < 16; ++j) {
+      dest[j * stride + i] = highbd_clip_pixel_add(
           dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
+    }
   }
 }
 
-void vp9_high_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest8,
-                                 int stride, int bd) {
+void vp9_highbd_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest8,
+                                   int stride, int bd) {
   tran_low_t out[16 * 16] = { 0 };
   tran_low_t *outptr = out;
   int i, j;
@@ -2304,7 +2325,7 @@
   // First transform rows. Since all non-zero dct coefficients are in the
   // upper-left 4x4 area, we only need to calculate the first 4 rows here.
   for (i = 0; i < 4; ++i) {
-    high_idct16(input, outptr, bd);
+    highbd_idct16(input, outptr, bd);
     input += 16;
     outptr += 16;
   }
@@ -2313,30 +2334,31 @@
   for (i = 0; i < 16; ++i) {
     for (j = 0; j < 16; ++j)
       temp_in[j] = out[j*16 + i];
-    high_idct16(temp_in, temp_out, bd);
-    for (j = 0; j < 16; ++j)
-      dest[j * stride + i] = clip_pixel_bd_high(
+    highbd_idct16(temp_in, temp_out, bd);
+    for (j = 0; j < 16; ++j) {
+      dest[j * stride + i] = highbd_clip_pixel_add(
           dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
+    }
   }
 }
 
-void vp9_high_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest8,
-                                int stride, int bd) {
+void vp9_highbd_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest8,
+                                  int stride, int bd) {
   int i, j;
   tran_high_t a1;
-  tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64));
+  tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd);
   uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
 
-  out = WRAPLOW(dct_const_round_shift(out * cospi_16_64));
+  out = WRAPLOW(dct_const_round_shift(out * cospi_16_64), bd);
   a1 = ROUND_POWER_OF_TWO(out, 6);
   for (j = 0; j < 16; ++j) {
     for (i = 0; i < 16; ++i)
-      dest[i] = clip_pixel_bd_high(dest[i], a1, bd);
+      dest[i] = highbd_clip_pixel_add(dest[i], a1, bd);
     dest += stride;
   }
 }
 
-static void high_idct32(const tran_low_t *input, tran_low_t *output, int bd) {
+static void highbd_idct32(const tran_low_t *input, tran_low_t *output, int bd) {
   tran_low_t step1[32], step2[32];
   tran_high_t temp1, temp2;
   (void) bd;
@@ -2361,43 +2383,43 @@
 
   temp1 = input[1] * cospi_31_64 - input[31] * cospi_1_64;
   temp2 = input[1] * cospi_1_64 + input[31] * cospi_31_64;
-  step1[16] = WRAPLOW(dct_const_round_shift(temp1));
-  step1[31] = WRAPLOW(dct_const_round_shift(temp2));
+  step1[16] = WRAPLOW(dct_const_round_shift(temp1), bd);
+  step1[31] = WRAPLOW(dct_const_round_shift(temp2), bd);
 
   temp1 = input[17] * cospi_15_64 - input[15] * cospi_17_64;
   temp2 = input[17] * cospi_17_64 + input[15] * cospi_15_64;
-  step1[17] = WRAPLOW(dct_const_round_shift(temp1));
-  step1[30] = WRAPLOW(dct_const_round_shift(temp2));
+  step1[17] = WRAPLOW(dct_const_round_shift(temp1), bd);
+  step1[30] = WRAPLOW(dct_const_round_shift(temp2), bd);
 
   temp1 = input[9] * cospi_23_64 - input[23] * cospi_9_64;
   temp2 = input[9] * cospi_9_64 + input[23] * cospi_23_64;
-  step1[18] = WRAPLOW(dct_const_round_shift(temp1));
-  step1[29] = WRAPLOW(dct_const_round_shift(temp2));
+  step1[18] = WRAPLOW(dct_const_round_shift(temp1), bd);
+  step1[29] = WRAPLOW(dct_const_round_shift(temp2), bd);
 
   temp1 = input[25] * cospi_7_64 - input[7] * cospi_25_64;
   temp2 = input[25] * cospi_25_64 + input[7] * cospi_7_64;
-  step1[19] = WRAPLOW(dct_const_round_shift(temp1));
-  step1[28] = WRAPLOW(dct_const_round_shift(temp2));
+  step1[19] = WRAPLOW(dct_const_round_shift(temp1), bd);
+  step1[28] = WRAPLOW(dct_const_round_shift(temp2), bd);
 
   temp1 = input[5] * cospi_27_64 - input[27] * cospi_5_64;
   temp2 = input[5] * cospi_5_64 + input[27] * cospi_27_64;
-  step1[20] = WRAPLOW(dct_const_round_shift(temp1));
-  step1[27] = WRAPLOW(dct_const_round_shift(temp2));
+  step1[20] = WRAPLOW(dct_const_round_shift(temp1), bd);
+  step1[27] = WRAPLOW(dct_const_round_shift(temp2), bd);
 
   temp1 = input[21] * cospi_11_64 - input[11] * cospi_21_64;
   temp2 = input[21] * cospi_21_64 + input[11] * cospi_11_64;
-  step1[21] = WRAPLOW(dct_const_round_shift(temp1));
-  step1[26] = WRAPLOW(dct_const_round_shift(temp2));
+  step1[21] = WRAPLOW(dct_const_round_shift(temp1), bd);
+  step1[26] = WRAPLOW(dct_const_round_shift(temp2), bd);
 
   temp1 = input[13] * cospi_19_64 - input[19] * cospi_13_64;
   temp2 = input[13] * cospi_13_64 + input[19] * cospi_19_64;
-  step1[22] = WRAPLOW(dct_const_round_shift(temp1));
-  step1[25] = WRAPLOW(dct_const_round_shift(temp2));
+  step1[22] = WRAPLOW(dct_const_round_shift(temp1), bd);
+  step1[25] = WRAPLOW(dct_const_round_shift(temp2), bd);
 
   temp1 = input[29] * cospi_3_64 - input[3] * cospi_29_64;
   temp2 = input[29] * cospi_29_64 + input[3] * cospi_3_64;
-  step1[23] = WRAPLOW(dct_const_round_shift(temp1));
-  step1[24] = WRAPLOW(dct_const_round_shift(temp2));
+  step1[23] = WRAPLOW(dct_const_round_shift(temp1), bd);
+  step1[24] = WRAPLOW(dct_const_round_shift(temp2), bd);
 
   // stage 2
   step2[0] = step1[0];
@@ -2411,40 +2433,40 @@
 
   temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
   temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
-  step2[8] = WRAPLOW(dct_const_round_shift(temp1));
-  step2[15] = WRAPLOW(dct_const_round_shift(temp2));
+  step2[8] = WRAPLOW(dct_const_round_shift(temp1), bd);
+  step2[15] = WRAPLOW(dct_const_round_shift(temp2), bd);
 
   temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
   temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
-  step2[9] = WRAPLOW(dct_const_round_shift(temp1));
-  step2[14] = WRAPLOW(dct_const_round_shift(temp2));
+  step2[9] = WRAPLOW(dct_const_round_shift(temp1), bd);
+  step2[14] = WRAPLOW(dct_const_round_shift(temp2), bd);
 
   temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
   temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
-  step2[10] = WRAPLOW(dct_const_round_shift(temp1));
-  step2[13] = WRAPLOW(dct_const_round_shift(temp2));
+  step2[10] = WRAPLOW(dct_const_round_shift(temp1), bd);
+  step2[13] = WRAPLOW(dct_const_round_shift(temp2), bd);
 
   temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
   temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
-  step2[11] = WRAPLOW(dct_const_round_shift(temp1));
-  step2[12] = WRAPLOW(dct_const_round_shift(temp2));
+  step2[11] = WRAPLOW(dct_const_round_shift(temp1), bd);
+  step2[12] = WRAPLOW(dct_const_round_shift(temp2), bd);
 
-  step2[16] = WRAPLOW(step1[16] + step1[17]);
-  step2[17] = WRAPLOW(step1[16] - step1[17]);
-  step2[18] = WRAPLOW(-step1[18] + step1[19]);
-  step2[19] = WRAPLOW(step1[18] + step1[19]);
-  step2[20] = WRAPLOW(step1[20] + step1[21]);
-  step2[21] = WRAPLOW(step1[20] - step1[21]);
-  step2[22] = WRAPLOW(-step1[22] + step1[23]);
-  step2[23] = WRAPLOW(step1[22] + step1[23]);
-  step2[24] = WRAPLOW(step1[24] + step1[25]);
-  step2[25] = WRAPLOW(step1[24] - step1[25]);
-  step2[26] = WRAPLOW(-step1[26] + step1[27]);
-  step2[27] = WRAPLOW(step1[26] + step1[27]);
-  step2[28] = WRAPLOW(step1[28] + step1[29]);
-  step2[29] = WRAPLOW(step1[28] - step1[29]);
-  step2[30] = WRAPLOW(-step1[30] + step1[31]);
-  step2[31] = WRAPLOW(step1[30] + step1[31]);
+  step2[16] = WRAPLOW(step1[16] + step1[17], bd);
+  step2[17] = WRAPLOW(step1[16] - step1[17], bd);
+  step2[18] = WRAPLOW(-step1[18] + step1[19], bd);
+  step2[19] = WRAPLOW(step1[18] + step1[19], bd);
+  step2[20] = WRAPLOW(step1[20] + step1[21], bd);
+  step2[21] = WRAPLOW(step1[20] - step1[21], bd);
+  step2[22] = WRAPLOW(-step1[22] + step1[23], bd);
+  step2[23] = WRAPLOW(step1[22] + step1[23], bd);
+  step2[24] = WRAPLOW(step1[24] + step1[25], bd);
+  step2[25] = WRAPLOW(step1[24] - step1[25], bd);
+  step2[26] = WRAPLOW(-step1[26] + step1[27], bd);
+  step2[27] = WRAPLOW(step1[26] + step1[27], bd);
+  step2[28] = WRAPLOW(step1[28] + step1[29], bd);
+  step2[29] = WRAPLOW(step1[28] - step1[29], bd);
+  step2[30] = WRAPLOW(-step1[30] + step1[31], bd);
+  step2[31] = WRAPLOW(step1[30] + step1[31], bd);
 
   // stage 3
   step1[0] = step2[0];
@@ -2454,42 +2476,42 @@
 
   temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
   temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
-  step1[4] = WRAPLOW(dct_const_round_shift(temp1));
-  step1[7] = WRAPLOW(dct_const_round_shift(temp2));
+  step1[4] = WRAPLOW(dct_const_round_shift(temp1), bd);
+  step1[7] = WRAPLOW(dct_const_round_shift(temp2), bd);
   temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
   temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
-  step1[5] = WRAPLOW(dct_const_round_shift(temp1));
-  step1[6] = WRAPLOW(dct_const_round_shift(temp2));
+  step1[5] = WRAPLOW(dct_const_round_shift(temp1), bd);
+  step1[6] = WRAPLOW(dct_const_round_shift(temp2), bd);
 
-  step1[8] = WRAPLOW(step2[8] + step2[9]);
-  step1[9] = WRAPLOW(step2[8] - step2[9]);
-  step1[10] = WRAPLOW(-step2[10] + step2[11]);
-  step1[11] = WRAPLOW(step2[10] + step2[11]);
-  step1[12] = WRAPLOW(step2[12] + step2[13]);
-  step1[13] = WRAPLOW(step2[12] - step2[13]);
-  step1[14] = WRAPLOW(-step2[14] + step2[15]);
-  step1[15] = WRAPLOW(step2[14] + step2[15]);
+  step1[8] = WRAPLOW(step2[8] + step2[9], bd);
+  step1[9] = WRAPLOW(step2[8] - step2[9], bd);
+  step1[10] = WRAPLOW(-step2[10] + step2[11], bd);
+  step1[11] = WRAPLOW(step2[10] + step2[11], bd);
+  step1[12] = WRAPLOW(step2[12] + step2[13], bd);
+  step1[13] = WRAPLOW(step2[12] - step2[13], bd);
+  step1[14] = WRAPLOW(-step2[14] + step2[15], bd);
+  step1[15] = WRAPLOW(step2[14] + step2[15], bd);
 
   step1[16] = step2[16];
   step1[31] = step2[31];
   temp1 = -step2[17] * cospi_4_64 + step2[30] * cospi_28_64;
   temp2 = step2[17] * cospi_28_64 + step2[30] * cospi_4_64;
-  step1[17] = WRAPLOW(dct_const_round_shift(temp1));
-  step1[30] = WRAPLOW(dct_const_round_shift(temp2));
+  step1[17] = WRAPLOW(dct_const_round_shift(temp1), bd);
+  step1[30] = WRAPLOW(dct_const_round_shift(temp2), bd);
   temp1 = -step2[18] * cospi_28_64 - step2[29] * cospi_4_64;
   temp2 = -step2[18] * cospi_4_64 + step2[29] * cospi_28_64;
-  step1[18] = WRAPLOW(dct_const_round_shift(temp1));
-  step1[29] = WRAPLOW(dct_const_round_shift(temp2));
+  step1[18] = WRAPLOW(dct_const_round_shift(temp1), bd);
+  step1[29] = WRAPLOW(dct_const_round_shift(temp2), bd);
   step1[19] = step2[19];
   step1[20] = step2[20];
   temp1 = -step2[21] * cospi_20_64 + step2[26] * cospi_12_64;
   temp2 = step2[21] * cospi_12_64 + step2[26] * cospi_20_64;
-  step1[21] = WRAPLOW(dct_const_round_shift(temp1));
-  step1[26] = WRAPLOW(dct_const_round_shift(temp2));
+  step1[21] = WRAPLOW(dct_const_round_shift(temp1), bd);
+  step1[26] = WRAPLOW(dct_const_round_shift(temp2), bd);
   temp1 = -step2[22] * cospi_12_64 - step2[25] * cospi_20_64;
   temp2 = -step2[22] * cospi_20_64 + step2[25] * cospi_12_64;
-  step1[22] = WRAPLOW(dct_const_round_shift(temp1));
-  step1[25] = WRAPLOW(dct_const_round_shift(temp2));
+  step1[22] = WRAPLOW(dct_const_round_shift(temp1), bd);
+  step1[25] = WRAPLOW(dct_const_round_shift(temp2), bd);
   step1[23] = step2[23];
   step1[24] = step2[24];
   step1[27] = step2[27];
@@ -2498,87 +2520,87 @@
   // stage 4
   temp1 = (step1[0] + step1[1]) * cospi_16_64;
   temp2 = (step1[0] - step1[1]) * cospi_16_64;
-  step2[0] = WRAPLOW(dct_const_round_shift(temp1));
-  step2[1] = WRAPLOW(dct_const_round_shift(temp2));
+  step2[0] = WRAPLOW(dct_const_round_shift(temp1), bd);
+  step2[1] = WRAPLOW(dct_const_round_shift(temp2), bd);
   temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
   temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
-  step2[2] = WRAPLOW(dct_const_round_shift(temp1));
-  step2[3] = WRAPLOW(dct_const_round_shift(temp2));
-  step2[4] = WRAPLOW(step1[4] + step1[5]);
-  step2[5] = WRAPLOW(step1[4] - step1[5]);
-  step2[6] = WRAPLOW(-step1[6] + step1[7]);
-  step2[7] = WRAPLOW(step1[6] + step1[7]);
+  step2[2] = WRAPLOW(dct_const_round_shift(temp1), bd);
+  step2[3] = WRAPLOW(dct_const_round_shift(temp2), bd);
+  step2[4] = WRAPLOW(step1[4] + step1[5], bd);
+  step2[5] = WRAPLOW(step1[4] - step1[5], bd);
+  step2[6] = WRAPLOW(-step1[6] + step1[7], bd);
+  step2[7] = WRAPLOW(step1[6] + step1[7], bd);
 
   step2[8] = step1[8];
   step2[15] = step1[15];
   temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
   temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
-  step2[9] = WRAPLOW(dct_const_round_shift(temp1));
-  step2[14] = WRAPLOW(dct_const_round_shift(temp2));
+  step2[9] = WRAPLOW(dct_const_round_shift(temp1), bd);
+  step2[14] = WRAPLOW(dct_const_round_shift(temp2), bd);
   temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
   temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
-  step2[10] = WRAPLOW(dct_const_round_shift(temp1));
-  step2[13] = WRAPLOW(dct_const_round_shift(temp2));
+  step2[10] = WRAPLOW(dct_const_round_shift(temp1), bd);
+  step2[13] = WRAPLOW(dct_const_round_shift(temp2), bd);
   step2[11] = step1[11];
   step2[12] = step1[12];
 
-  step2[16] = WRAPLOW(step1[16] + step1[19]);
-  step2[17] = WRAPLOW(step1[17] + step1[18]);
-  step2[18] = WRAPLOW(step1[17] - step1[18]);
-  step2[19] = WRAPLOW(step1[16] - step1[19]);
-  step2[20] = WRAPLOW(-step1[20] + step1[23]);
-  step2[21] = WRAPLOW(-step1[21] + step1[22]);
-  step2[22] = WRAPLOW(step1[21] + step1[22]);
-  step2[23] = WRAPLOW(step1[20] + step1[23]);
+  step2[16] = WRAPLOW(step1[16] + step1[19], bd);
+  step2[17] = WRAPLOW(step1[17] + step1[18], bd);
+  step2[18] = WRAPLOW(step1[17] - step1[18], bd);
+  step2[19] = WRAPLOW(step1[16] - step1[19], bd);
+  step2[20] = WRAPLOW(-step1[20] + step1[23], bd);
+  step2[21] = WRAPLOW(-step1[21] + step1[22], bd);
+  step2[22] = WRAPLOW(step1[21] + step1[22], bd);
+  step2[23] = WRAPLOW(step1[20] + step1[23], bd);
 
-  step2[24] = WRAPLOW(step1[24] + step1[27]);
-  step2[25] = WRAPLOW(step1[25] + step1[26]);
-  step2[26] = WRAPLOW(step1[25] - step1[26]);
-  step2[27] = WRAPLOW(step1[24] - step1[27]);
-  step2[28] = WRAPLOW(-step1[28] + step1[31]);
-  step2[29] = WRAPLOW(-step1[29] + step1[30]);
-  step2[30] = WRAPLOW(step1[29] + step1[30]);
-  step2[31] = WRAPLOW(step1[28] + step1[31]);
+  step2[24] = WRAPLOW(step1[24] + step1[27], bd);
+  step2[25] = WRAPLOW(step1[25] + step1[26], bd);
+  step2[26] = WRAPLOW(step1[25] - step1[26], bd);
+  step2[27] = WRAPLOW(step1[24] - step1[27], bd);
+  step2[28] = WRAPLOW(-step1[28] + step1[31], bd);
+  step2[29] = WRAPLOW(-step1[29] + step1[30], bd);
+  step2[30] = WRAPLOW(step1[29] + step1[30], bd);
+  step2[31] = WRAPLOW(step1[28] + step1[31], bd);
 
   // stage 5
-  step1[0] = WRAPLOW(step2[0] + step2[3]);
-  step1[1] = WRAPLOW(step2[1] + step2[2]);
-  step1[2] = WRAPLOW(step2[1] - step2[2]);
-  step1[3] = WRAPLOW(step2[0] - step2[3]);
+  step1[0] = WRAPLOW(step2[0] + step2[3], bd);
+  step1[1] = WRAPLOW(step2[1] + step2[2], bd);
+  step1[2] = WRAPLOW(step2[1] - step2[2], bd);
+  step1[3] = WRAPLOW(step2[0] - step2[3], bd);
   step1[4] = step2[4];
   temp1 = (step2[6] - step2[5]) * cospi_16_64;
   temp2 = (step2[5] + step2[6]) * cospi_16_64;
-  step1[5] = WRAPLOW(dct_const_round_shift(temp1));
-  step1[6] = WRAPLOW(dct_const_round_shift(temp2));
+  step1[5] = WRAPLOW(dct_const_round_shift(temp1), bd);
+  step1[6] = WRAPLOW(dct_const_round_shift(temp2), bd);
   step1[7] = step2[7];
 
-  step1[8] = WRAPLOW(step2[8] + step2[11]);
-  step1[9] = WRAPLOW(step2[9] + step2[10]);
-  step1[10] = WRAPLOW(step2[9] - step2[10]);
-  step1[11] = WRAPLOW(step2[8] - step2[11]);
-  step1[12] = WRAPLOW(-step2[12] + step2[15]);
-  step1[13] = WRAPLOW(-step2[13] + step2[14]);
-  step1[14] = WRAPLOW(step2[13] + step2[14]);
-  step1[15] = WRAPLOW(step2[12] + step2[15]);
+  step1[8] = WRAPLOW(step2[8] + step2[11], bd);
+  step1[9] = WRAPLOW(step2[9] + step2[10], bd);
+  step1[10] = WRAPLOW(step2[9] - step2[10], bd);
+  step1[11] = WRAPLOW(step2[8] - step2[11], bd);
+  step1[12] = WRAPLOW(-step2[12] + step2[15], bd);
+  step1[13] = WRAPLOW(-step2[13] + step2[14], bd);
+  step1[14] = WRAPLOW(step2[13] + step2[14], bd);
+  step1[15] = WRAPLOW(step2[12] + step2[15], bd);
 
   step1[16] = step2[16];
   step1[17] = step2[17];
   temp1 = -step2[18] * cospi_8_64 + step2[29] * cospi_24_64;
   temp2 = step2[18] * cospi_24_64 + step2[29] * cospi_8_64;
-  step1[18] = WRAPLOW(dct_const_round_shift(temp1));
-  step1[29] = WRAPLOW(dct_const_round_shift(temp2));
+  step1[18] = WRAPLOW(dct_const_round_shift(temp1), bd);
+  step1[29] = WRAPLOW(dct_const_round_shift(temp2), bd);
   temp1 = -step2[19] * cospi_8_64 + step2[28] * cospi_24_64;
   temp2 = step2[19] * cospi_24_64 + step2[28] * cospi_8_64;
-  step1[19] = WRAPLOW(dct_const_round_shift(temp1));
-  step1[28] = WRAPLOW(dct_const_round_shift(temp2));
+  step1[19] = WRAPLOW(dct_const_round_shift(temp1), bd);
+  step1[28] = WRAPLOW(dct_const_round_shift(temp2), bd);
   temp1 = -step2[20] * cospi_24_64 - step2[27] * cospi_8_64;
   temp2 = -step2[20] * cospi_8_64 + step2[27] * cospi_24_64;
-  step1[20] = WRAPLOW(dct_const_round_shift(temp1));
-  step1[27] = WRAPLOW(dct_const_round_shift(temp2));
+  step1[20] = WRAPLOW(dct_const_round_shift(temp1), bd);
+  step1[27] = WRAPLOW(dct_const_round_shift(temp2), bd);
   temp1 = -step2[21] * cospi_24_64 - step2[26] * cospi_8_64;
   temp2 = -step2[21] * cospi_8_64 + step2[26] * cospi_24_64;
-  step1[21] = WRAPLOW(dct_const_round_shift(temp1));
-  step1[26] = WRAPLOW(dct_const_round_shift(temp2));
+  step1[21] = WRAPLOW(dct_const_round_shift(temp1), bd);
+  step1[26] = WRAPLOW(dct_const_round_shift(temp2), bd);
   step1[22] = step2[22];
   step1[23] = step2[23];
   step1[24] = step2[24];
@@ -2587,62 +2609,62 @@
   step1[31] = step2[31];
 
   // stage 6
-  step2[0] = WRAPLOW(step1[0] + step1[7]);
-  step2[1] = WRAPLOW(step1[1] + step1[6]);
-  step2[2] = WRAPLOW(step1[2] + step1[5]);
-  step2[3] = WRAPLOW(step1[3] + step1[4]);
-  step2[4] = WRAPLOW(step1[3] - step1[4]);
-  step2[5] = WRAPLOW(step1[2] - step1[5]);
-  step2[6] = WRAPLOW(step1[1] - step1[6]);
-  step2[7] = WRAPLOW(step1[0] - step1[7]);
+  step2[0] = WRAPLOW(step1[0] + step1[7], bd);
+  step2[1] = WRAPLOW(step1[1] + step1[6], bd);
+  step2[2] = WRAPLOW(step1[2] + step1[5], bd);
+  step2[3] = WRAPLOW(step1[3] + step1[4], bd);
+  step2[4] = WRAPLOW(step1[3] - step1[4], bd);
+  step2[5] = WRAPLOW(step1[2] - step1[5], bd);
+  step2[6] = WRAPLOW(step1[1] - step1[6], bd);
+  step2[7] = WRAPLOW(step1[0] - step1[7], bd);
   step2[8] = step1[8];
   step2[9] = step1[9];
   temp1 = (-step1[10] + step1[13]) * cospi_16_64;
   temp2 = (step1[10] + step1[13]) * cospi_16_64;
-  step2[10] = WRAPLOW(dct_const_round_shift(temp1));
-  step2[13] = WRAPLOW(dct_const_round_shift(temp2));
+  step2[10] = WRAPLOW(dct_const_round_shift(temp1), bd);
+  step2[13] = WRAPLOW(dct_const_round_shift(temp2), bd);
   temp1 = (-step1[11] + step1[12]) * cospi_16_64;
   temp2 = (step1[11] + step1[12]) * cospi_16_64;
-  step2[11] = WRAPLOW(dct_const_round_shift(temp1));
-  step2[12] = WRAPLOW(dct_const_round_shift(temp2));
-  step2[14] = WRAPLOW(step1[14]);
-  step2[15] = WRAPLOW(step1[15]);
+  step2[11] = WRAPLOW(dct_const_round_shift(temp1), bd);
+  step2[12] = WRAPLOW(dct_const_round_shift(temp2), bd);
+  step2[14] = step1[14];
+  step2[15] = step1[15];
 
-  step2[16] = WRAPLOW(step1[16] + step1[23]);
-  step2[17] = WRAPLOW(step1[17] + step1[22]);
-  step2[18] = WRAPLOW(step1[18] + step1[21]);
-  step2[19] = WRAPLOW(step1[19] + step1[20]);
-  step2[20] = WRAPLOW(step1[19] - step1[20]);
-  step2[21] = WRAPLOW(step1[18] - step1[21]);
-  step2[22] = WRAPLOW(step1[17] - step1[22]);
-  step2[23] = WRAPLOW(step1[16] - step1[23]);
+  step2[16] = WRAPLOW(step1[16] + step1[23], bd);
+  step2[17] = WRAPLOW(step1[17] + step1[22], bd);
+  step2[18] = WRAPLOW(step1[18] + step1[21], bd);
+  step2[19] = WRAPLOW(step1[19] + step1[20], bd);
+  step2[20] = WRAPLOW(step1[19] - step1[20], bd);
+  step2[21] = WRAPLOW(step1[18] - step1[21], bd);
+  step2[22] = WRAPLOW(step1[17] - step1[22], bd);
+  step2[23] = WRAPLOW(step1[16] - step1[23], bd);
 
-  step2[24] = WRAPLOW(-step1[24] + step1[31]);
-  step2[25] = WRAPLOW(-step1[25] + step1[30]);
-  step2[26] = WRAPLOW(-step1[26] + step1[29]);
-  step2[27] = WRAPLOW(-step1[27] + step1[28]);
-  step2[28] = WRAPLOW(step1[27] + step1[28]);
-  step2[29] = WRAPLOW(step1[26] + step1[29]);
-  step2[30] = WRAPLOW(step1[25] + step1[30]);
-  step2[31] = WRAPLOW(step1[24] + step1[31]);
+  step2[24] = WRAPLOW(-step1[24] + step1[31], bd);
+  step2[25] = WRAPLOW(-step1[25] + step1[30], bd);
+  step2[26] = WRAPLOW(-step1[26] + step1[29], bd);
+  step2[27] = WRAPLOW(-step1[27] + step1[28], bd);
+  step2[28] = WRAPLOW(step1[27] + step1[28], bd);
+  step2[29] = WRAPLOW(step1[26] + step1[29], bd);
+  step2[30] = WRAPLOW(step1[25] + step1[30], bd);
+  step2[31] = WRAPLOW(step1[24] + step1[31], bd);
 
   // stage 7
-  step1[0] = WRAPLOW(step2[0] + step2[15]);
-  step1[1] = WRAPLOW(step2[1] + step2[14]);
-  step1[2] = WRAPLOW(step2[2] + step2[13]);
-  step1[3] = WRAPLOW(step2[3] + step2[12]);
-  step1[4] = WRAPLOW(step2[4] + step2[11]);
-  step1[5] = WRAPLOW(step2[5] + step2[10]);
-  step1[6] = WRAPLOW(step2[6] + step2[9]);
-  step1[7] = WRAPLOW(step2[7] + step2[8]);
-  step1[8] = WRAPLOW(step2[7] - step2[8]);
-  step1[9] = WRAPLOW(step2[6] - step2[9]);
-  step1[10] = WRAPLOW(step2[5] - step2[10]);
-  step1[11] = WRAPLOW(step2[4] - step2[11]);
-  step1[12] = WRAPLOW(step2[3] - step2[12]);
-  step1[13] = WRAPLOW(step2[2] - step2[13]);
-  step1[14] = WRAPLOW(step2[1] - step2[14]);
-  step1[15] = WRAPLOW(step2[0] - step2[15]);
+  step1[0] = WRAPLOW(step2[0] + step2[15], bd);
+  step1[1] = WRAPLOW(step2[1] + step2[14], bd);
+  step1[2] = WRAPLOW(step2[2] + step2[13], bd);
+  step1[3] = WRAPLOW(step2[3] + step2[12], bd);
+  step1[4] = WRAPLOW(step2[4] + step2[11], bd);
+  step1[5] = WRAPLOW(step2[5] + step2[10], bd);
+  step1[6] = WRAPLOW(step2[6] + step2[9], bd);
+  step1[7] = WRAPLOW(step2[7] + step2[8], bd);
+  step1[8] = WRAPLOW(step2[7] - step2[8], bd);
+  step1[9] = WRAPLOW(step2[6] - step2[9], bd);
+  step1[10] = WRAPLOW(step2[5] - step2[10], bd);
+  step1[11] = WRAPLOW(step2[4] - step2[11], bd);
+  step1[12] = WRAPLOW(step2[3] - step2[12], bd);
+  step1[13] = WRAPLOW(step2[2] - step2[13], bd);
+  step1[14] = WRAPLOW(step2[1] - step2[14], bd);
+  step1[15] = WRAPLOW(step2[0] - step2[15], bd);
 
   step1[16] = step2[16];
   step1[17] = step2[17];
@@ -2650,62 +2672,62 @@
   step1[19] = step2[19];
   temp1 = (-step2[20] + step2[27]) * cospi_16_64;
   temp2 = (step2[20] + step2[27]) * cospi_16_64;
-  step1[20] = WRAPLOW(dct_const_round_shift(temp1));
-  step1[27] = WRAPLOW(dct_const_round_shift(temp2));
+  step1[20] = WRAPLOW(dct_const_round_shift(temp1), bd);
+  step1[27] = WRAPLOW(dct_const_round_shift(temp2), bd);
   temp1 = (-step2[21] + step2[26]) * cospi_16_64;
   temp2 = (step2[21] + step2[26]) * cospi_16_64;
-  step1[21] = WRAPLOW(dct_const_round_shift(temp1));
-  step1[26] = WRAPLOW(dct_const_round_shift(temp2));
+  step1[21] = WRAPLOW(dct_const_round_shift(temp1), bd);
+  step1[26] = WRAPLOW(dct_const_round_shift(temp2), bd);
   temp1 = (-step2[22] + step2[25]) * cospi_16_64;
   temp2 = (step2[22] + step2[25]) * cospi_16_64;
-  step1[22] = WRAPLOW(dct_const_round_shift(temp1));
-  step1[25] = WRAPLOW(dct_const_round_shift(temp2));
+  step1[22] = WRAPLOW(dct_const_round_shift(temp1), bd);
+  step1[25] = WRAPLOW(dct_const_round_shift(temp2), bd);
   temp1 = (-step2[23] + step2[24]) * cospi_16_64;
   temp2 = (step2[23] + step2[24]) * cospi_16_64;
-  step1[23] = WRAPLOW(dct_const_round_shift(temp1));
-  step1[24] = WRAPLOW(dct_const_round_shift(temp2));
+  step1[23] = WRAPLOW(dct_const_round_shift(temp1), bd);
+  step1[24] = WRAPLOW(dct_const_round_shift(temp2), bd);
   step1[28] = step2[28];
   step1[29] = step2[29];
   step1[30] = step2[30];
   step1[31] = step2[31];
 
   // final stage
-  output[0] = WRAPLOW(step1[0] + step1[31]);
-  output[1] = WRAPLOW(step1[1] + step1[30]);
-  output[2] = WRAPLOW(step1[2] + step1[29]);
-  output[3] = WRAPLOW(step1[3] + step1[28]);
-  output[4] = WRAPLOW(step1[4] + step1[27]);
-  output[5] = WRAPLOW(step1[5] + step1[26]);
-  output[6] = WRAPLOW(step1[6] + step1[25]);
-  output[7] = WRAPLOW(step1[7] + step1[24]);
-  output[8] = WRAPLOW(step1[8] + step1[23]);
-  output[9] = WRAPLOW(step1[9] + step1[22]);
-  output[10] = WRAPLOW(step1[10] + step1[21]);
-  output[11] = WRAPLOW(step1[11] + step1[20]);
-  output[12] = WRAPLOW(step1[12] + step1[19]);
-  output[13] = WRAPLOW(step1[13] + step1[18]);
-  output[14] = WRAPLOW(step1[14] + step1[17]);
-  output[15] = WRAPLOW(step1[15] + step1[16]);
-  output[16] = WRAPLOW(step1[15] - step1[16]);
-  output[17] = WRAPLOW(step1[14] - step1[17]);
-  output[18] = WRAPLOW(step1[13] - step1[18]);
-  output[19] = WRAPLOW(step1[12] - step1[19]);
-  output[20] = WRAPLOW(step1[11] - step1[20]);
-  output[21] = WRAPLOW(step1[10] - step1[21]);
-  output[22] = WRAPLOW(step1[9] - step1[22]);
-  output[23] = WRAPLOW(step1[8] - step1[23]);
-  output[24] = WRAPLOW(step1[7] - step1[24]);
-  output[25] = WRAPLOW(step1[6] - step1[25]);
-  output[26] = WRAPLOW(step1[5] - step1[26]);
-  output[27] = WRAPLOW(step1[4] - step1[27]);
-  output[28] = WRAPLOW(step1[3] - step1[28]);
-  output[29] = WRAPLOW(step1[2] - step1[29]);
-  output[30] = WRAPLOW(step1[1] - step1[30]);
-  output[31] = WRAPLOW(step1[0] - step1[31]);
+  output[0] = WRAPLOW(step1[0] + step1[31], bd);
+  output[1] = WRAPLOW(step1[1] + step1[30], bd);
+  output[2] = WRAPLOW(step1[2] + step1[29], bd);
+  output[3] = WRAPLOW(step1[3] + step1[28], bd);
+  output[4] = WRAPLOW(step1[4] + step1[27], bd);
+  output[5] = WRAPLOW(step1[5] + step1[26], bd);
+  output[6] = WRAPLOW(step1[6] + step1[25], bd);
+  output[7] = WRAPLOW(step1[7] + step1[24], bd);
+  output[8] = WRAPLOW(step1[8] + step1[23], bd);
+  output[9] = WRAPLOW(step1[9] + step1[22], bd);
+  output[10] = WRAPLOW(step1[10] + step1[21], bd);
+  output[11] = WRAPLOW(step1[11] + step1[20], bd);
+  output[12] = WRAPLOW(step1[12] + step1[19], bd);
+  output[13] = WRAPLOW(step1[13] + step1[18], bd);
+  output[14] = WRAPLOW(step1[14] + step1[17], bd);
+  output[15] = WRAPLOW(step1[15] + step1[16], bd);
+  output[16] = WRAPLOW(step1[15] - step1[16], bd);
+  output[17] = WRAPLOW(step1[14] - step1[17], bd);
+  output[18] = WRAPLOW(step1[13] - step1[18], bd);
+  output[19] = WRAPLOW(step1[12] - step1[19], bd);
+  output[20] = WRAPLOW(step1[11] - step1[20], bd);
+  output[21] = WRAPLOW(step1[10] - step1[21], bd);
+  output[22] = WRAPLOW(step1[9] - step1[22], bd);
+  output[23] = WRAPLOW(step1[8] - step1[23], bd);
+  output[24] = WRAPLOW(step1[7] - step1[24], bd);
+  output[25] = WRAPLOW(step1[6] - step1[25], bd);
+  output[26] = WRAPLOW(step1[5] - step1[26], bd);
+  output[27] = WRAPLOW(step1[4] - step1[27], bd);
+  output[28] = WRAPLOW(step1[3] - step1[28], bd);
+  output[29] = WRAPLOW(step1[2] - step1[29], bd);
+  output[30] = WRAPLOW(step1[1] - step1[30], bd);
+  output[31] = WRAPLOW(step1[0] - step1[31], bd);
 }
 
-void vp9_high_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest8,
-                                   int stride, int bd) {
+void vp9_highbd_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest8,
+                                     int stride, int bd) {
   tran_low_t out[32 * 32];
   tran_low_t *outptr = out;
   int i, j;
@@ -2725,7 +2747,7 @@
       zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
 
     if (zero_coeff[0] | zero_coeff[1])
-      high_idct32(input, outptr, bd);
+      highbd_idct32(input, outptr, bd);
     else
       vpx_memset(outptr, 0, sizeof(tran_low_t) * 32);
     input += 32;
@@ -2736,15 +2758,16 @@
   for (i = 0; i < 32; ++i) {
     for (j = 0; j < 32; ++j)
       temp_in[j] = out[j * 32 + i];
-    high_idct32(temp_in, temp_out, bd);
-    for (j = 0; j < 32; ++j)
-      dest[j * stride + i] = clip_pixel_bd_high(
+    highbd_idct32(temp_in, temp_out, bd);
+    for (j = 0; j < 32; ++j) {
+      dest[j * stride + i] = highbd_clip_pixel_add(
           dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
+    }
   }
 }
 
-void vp9_high_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest8,
-                                 int stride, int bd) {
+void vp9_highbd_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest8,
+                                   int stride, int bd) {
   tran_low_t out[32 * 32] = {0};
   tran_low_t *outptr = out;
   int i, j;
@@ -2754,7 +2777,7 @@
   // Rows
   // Only upper-left 8x8 has non-zero coeff.
   for (i = 0; i < 8; ++i) {
-    high_idct32(input, outptr, bd);
+    highbd_idct32(input, outptr, bd);
     input += 32;
     outptr += 32;
   }
@@ -2762,50 +2785,51 @@
   for (i = 0; i < 32; ++i) {
     for (j = 0; j < 32; ++j)
       temp_in[j] = out[j * 32 + i];
-    high_idct32(temp_in, temp_out, bd);
-    for (j = 0; j < 32; ++j)
-      dest[j * stride + i] = clip_pixel_bd_high(
+    highbd_idct32(temp_in, temp_out, bd);
+    for (j = 0; j < 32; ++j) {
+      dest[j * stride + i] = highbd_clip_pixel_add(
           dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
+    }
   }
 }
 
-void vp9_high_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest8,
-                                int stride, int bd) {
+void vp9_highbd_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest8,
+                                  int stride, int bd) {
   int i, j;
   int a1;
   uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
 
-  tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64));
-  out = WRAPLOW(dct_const_round_shift(out * cospi_16_64));
+  tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd);
+  out = WRAPLOW(dct_const_round_shift(out * cospi_16_64), bd);
   a1 = ROUND_POWER_OF_TWO(out, 6);
 
   for (j = 0; j < 32; ++j) {
     for (i = 0; i < 32; ++i)
-      dest[i] = clip_pixel_bd_high(dest[i], a1, bd);
+      dest[i] = highbd_clip_pixel_add(dest[i], a1, bd);
     dest += stride;
   }
 }
 
 // idct
-void vp9_high_idct4x4_add(const tran_low_t *input, uint8_t *dest, int stride,
-                          int eob, int bd) {
+void vp9_highbd_idct4x4_add(const tran_low_t *input, uint8_t *dest, int stride,
+                            int eob, int bd) {
   if (eob > 1)
-    vp9_high_idct4x4_16_add(input, dest, stride, bd);
+    vp9_highbd_idct4x4_16_add(input, dest, stride, bd);
   else
-    vp9_high_idct4x4_1_add(input, dest, stride, bd);
+    vp9_highbd_idct4x4_1_add(input, dest, stride, bd);
 }
 
 
-void vp9_high_iwht4x4_add(const tran_low_t *input, uint8_t *dest, int stride,
-                          int eob, int bd) {
+void vp9_highbd_iwht4x4_add(const tran_low_t *input, uint8_t *dest, int stride,
+                            int eob, int bd) {
   if (eob > 1)
-    vp9_high_iwht4x4_16_add(input, dest, stride, bd);
+    vp9_highbd_iwht4x4_16_add(input, dest, stride, bd);
   else
-    vp9_high_iwht4x4_1_add(input, dest, stride, bd);
+    vp9_highbd_iwht4x4_1_add(input, dest, stride, bd);
 }
 
-void vp9_high_idct8x8_add(const tran_low_t *input, uint8_t *dest, int stride,
-                          int eob, int bd) {
+void vp9_highbd_idct8x8_add(const tran_low_t *input, uint8_t *dest, int stride,
+                            int eob, int bd) {
   // If dc is 1, then input[0] is the reconstructed value and does not need
   // dequantization. Also, when dc is 1, dc is counted in eobs, namely eobs >= 1.
 
@@ -2815,64 +2839,64 @@
   // Combine that with code here.
   // DC only DCT coefficient
   if (eob == 1) {
-    vp9_high_idct8x8_1_add(input, dest, stride, bd);
+    vp9_highbd_idct8x8_1_add(input, dest, stride, bd);
   } else if (eob <= 10) {
-    vp9_high_idct8x8_10_add(input, dest, stride, bd);
+    vp9_highbd_idct8x8_10_add(input, dest, stride, bd);
   } else {
-    vp9_high_idct8x8_64_add(input, dest, stride, bd);
+    vp9_highbd_idct8x8_64_add(input, dest, stride, bd);
   }
 }
 
-void vp9_high_idct16x16_add(const tran_low_t *input, uint8_t *dest, int stride,
-                       int eob, int bd) {
+void vp9_highbd_idct16x16_add(const tran_low_t *input, uint8_t *dest,
+                              int stride, int eob, int bd) {
   // The calculation can be simplified if there are not many non-zero dct
   // coefficients. Use eobs to separate different cases.
   // DC only DCT coefficient.
   if (eob == 1) {
-    vp9_high_idct16x16_1_add(input, dest, stride, bd);
+    vp9_highbd_idct16x16_1_add(input, dest, stride, bd);
   } else if (eob <= 10) {
-    vp9_high_idct16x16_10_add(input, dest, stride, bd);
+    vp9_highbd_idct16x16_10_add(input, dest, stride, bd);
   } else {
-    vp9_high_idct16x16_256_add(input, dest, stride, bd);
+    vp9_highbd_idct16x16_256_add(input, dest, stride, bd);
   }
 }
 
-void vp9_high_idct32x32_add(const tran_low_t *input, uint8_t *dest, int stride,
-                       int eob, int bd) {
+void vp9_highbd_idct32x32_add(const tran_low_t *input, uint8_t *dest,
+                              int stride, int eob, int bd) {
   // Non-zero coeff only in upper-left 8x8
   if (eob == 1) {
-    vp9_high_idct32x32_1_add(input, dest, stride, bd);
+    vp9_highbd_idct32x32_1_add(input, dest, stride, bd);
   } else if (eob <= 34) {
-    vp9_high_idct32x32_34_add(input, dest, stride, bd);
+    vp9_highbd_idct32x32_34_add(input, dest, stride, bd);
   } else {
-    vp9_high_idct32x32_1024_add(input, dest, stride, bd);
+    vp9_highbd_idct32x32_1024_add(input, dest, stride, bd);
   }
 }
 
 // iht
-void vp9_high_iht4x4_add(TX_TYPE tx_type, const tran_low_t *input,
-                         uint8_t *dest, int stride, int eob, int bd) {
+void vp9_highbd_iht4x4_add(TX_TYPE tx_type, const tran_low_t *input,
+                           uint8_t *dest, int stride, int eob, int bd) {
   if (tx_type == DCT_DCT)
-    vp9_high_idct4x4_add(input, dest, stride, eob, bd);
+    vp9_highbd_idct4x4_add(input, dest, stride, eob, bd);
   else
-    vp9_high_iht4x4_16_add(input, dest, stride, tx_type, bd);
+    vp9_highbd_iht4x4_16_add(input, dest, stride, tx_type, bd);
 }
 
-void vp9_high_iht8x8_add(TX_TYPE tx_type, const tran_low_t *input,
-                         uint8_t *dest, int stride, int eob, int bd) {
+void vp9_highbd_iht8x8_add(TX_TYPE tx_type, const tran_low_t *input,
+                           uint8_t *dest, int stride, int eob, int bd) {
   if (tx_type == DCT_DCT) {
-    vp9_high_idct8x8_add(input, dest, stride, eob, bd);
+    vp9_highbd_idct8x8_add(input, dest, stride, eob, bd);
   } else {
-    vp9_high_iht8x8_64_add(input, dest, stride, tx_type, bd);
+    vp9_highbd_iht8x8_64_add(input, dest, stride, tx_type, bd);
   }
 }
 
-void vp9_high_iht16x16_add(TX_TYPE tx_type, const tran_low_t *input,
+void vp9_highbd_iht16x16_add(TX_TYPE tx_type, const tran_low_t *input,
                            uint8_t *dest, int stride, int eob, int bd) {
   if (tx_type == DCT_DCT) {
-    vp9_high_idct16x16_add(input, dest, stride, eob, bd);
+    vp9_highbd_idct16x16_add(input, dest, stride, eob, bd);
   } else {
-    vp9_high_iht16x16_256_add(input, dest, stride, tx_type, bd);
+    vp9_highbd_iht16x16_256_add(input, dest, stride, tx_type, bd);
   }
 }
 #endif  // CONFIG_VP9_HIGHBITDEPTH
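
The mechanical change through this file is WRAPLOW gaining a bd argument
alongside the high_ -> highbd_ renames. WRAPLOW's definition is not part
of this diff; a plausible expansion, consistent with the sign-extension
idiom libvpx uses elsewhere (an assumption, not the verbatim macro), wraps
an intermediate value to a (bd + 8)-bit signed range:

    #include <assert.h>
    #include <stdint.h>

    /* Hypothetical stand-in for WRAPLOW(x, bd); in non-highbitdepth
     * builds the macro presumably ignores bd. */
    static int32_t wraplow_sketch(int32_t x, int bd) {
      const int shift = 24 - bd;
      return ((int32_t)((uint32_t)x << shift)) >> shift;
    }

    int main(void) {
      assert(wraplow_sketch(32768, 8) == -32768);  /* wraps like int16_t */
      assert(wraplow_sketch(-32769, 8) == 32767);
      assert(wraplow_sketch(32768, 10) == 32768);  /* 18-bit range at bd=10 */
      return 0;
    }
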
diff --git a/source/libvpx/vp9/common/vp9_idct.h b/source/libvpx/vp9/common/vp9_idct.h
index 694be3c..12569b9 100644
--- a/source/libvpx/vp9/common/vp9_idct.h
+++ b/source/libvpx/vp9/common/vp9_idct.h
@@ -14,7 +14,6 @@
 #include <assert.h>
 
 #include "./vpx_config.h"
-#include "vpx/vpx_integer.h"
 #include "vp9/common/vp9_common.h"
 #include "vp9/common/vp9_enums.h"
 
@@ -22,7 +21,6 @@
 extern "C" {
 #endif
 
-
 // Constants and Macros used by all idct/dct functions
 #define DCT_CONST_BITS 14
 #define DCT_CONST_ROUNDING  (1 << (DCT_CONST_BITS - 1))
@@ -36,17 +34,6 @@
 #define dual_set_epi16(a, b) \
   _mm_set_epi16(b, b, b, b, a, a, a, a)
 
-// Note:
-// tran_low_t  is the datatype used for final transform coefficients.
-// tran_high_t is the datatype used for intermediate transform stages.
-#if CONFIG_VP9_HIGHBITDEPTH
-typedef int64_t tran_high_t;
-typedef int32_t tran_low_t;
-#else
-typedef int32_t tran_high_t;
-typedef int16_t tran_low_t;
-#endif
-
 // Constants:
 //  for (int i = 1; i< 32; ++i)
 //    printf("static const int cospi_%d_64 = %.0f;\n", i,
@@ -90,8 +77,7 @@
 static const tran_high_t sinpi_3_9 = 13377;
 static const tran_high_t sinpi_4_9 = 15212;
 
-static INLINE tran_low_t dct_const_round_shift(tran_high_t input) {
-  tran_high_t rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS);
+static INLINE tran_low_t check_range(tran_high_t input) {
 #if CONFIG_VP9_HIGHBITDEPTH
   // For valid highbitdepth VP9 streams, intermediate stage coefficients will
   // stay within the ranges:
@@ -105,10 +91,15 @@
   // this range for every intermediate coefficient can be burdensome for a decoder,
   // therefore the following assertion is only enabled when configured with
   // --enable-coefficient-range-checking.
-  assert(INT16_MIN <= rv);
-  assert(rv <= INT16_MAX);
+  assert(INT16_MIN <= input);
+  assert(input <= INT16_MAX);
 #endif
-  return (tran_low_t)rv;
+  return (tran_low_t)input;
+}
+
+static INLINE tran_low_t dct_const_round_shift(tran_high_t input) {
+  tran_high_t rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS);
+  return check_range(rv);
 }
 
 typedef void (*transform_1d)(const tran_low_t*, tran_low_t*);
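
The refactor above splits the optional range assertion into check_range()
while the rounding stays ROUND_POWER_OF_TWO(input, DCT_CONST_BITS). A
quick worked example of what dct_const_round_shift() computes, using the
cospi_16_64 constant (11585, i.e. cos(pi/4) in Q14):

    #include <assert.h>
    #include <stdint.h>

    #define DCT_CONST_BITS 14
    /* libvpx's rounding macro, reproduced here for the example. */
    #define ROUND_POWER_OF_TWO(value, n) (((value) + (1 << ((n)-1))) >> (n))

    int main(void) {
      /* 64 * cos(pi/4) = 45.25; the Q14 multiply-and-round gives 45. */
      const int64_t product = (int64_t)64 * 11585;
      assert(ROUND_POWER_OF_TWO(product, DCT_CONST_BITS) == 45);
      return 0;
    }
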
@@ -118,11 +109,11 @@
 } transform_2d;
 
 #if CONFIG_VP9_HIGHBITDEPTH
-typedef void (*high_transform_1d)(const tran_low_t*, tran_low_t*, int bd);
+typedef void (*highbd_transform_1d)(const tran_low_t*, tran_low_t*, int bd);
 
 typedef struct {
-  high_transform_1d cols, rows;  // vertical and horizontal
-} high_transform_2d;
+  highbd_transform_1d cols, rows;  // vertical and horizontal
+} highbd_transform_2d;
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 
 void vp9_iwht4x4_add(const tran_low_t *input, uint8_t *dest, int stride,
@@ -144,22 +135,22 @@
                       int stride, int eob);
 
 #if CONFIG_VP9_HIGHBITDEPTH
-void vp9_high_iwht4x4_add(const tran_low_t *input, uint8_t *dest, int stride,
-                          int eob, int bd);
-void vp9_high_idct4x4_add(const tran_low_t *input, uint8_t *dest, int stride,
-                          int eob, int bd);
-void vp9_high_idct8x8_add(const tran_low_t *input, uint8_t *dest, int stride,
-                          int eob, int bd);
-void vp9_high_idct16x16_add(const tran_low_t *input, uint8_t *dest, int stride,
+void vp9_highbd_iwht4x4_add(const tran_low_t *input, uint8_t *dest, int stride,
                             int eob, int bd);
-void vp9_high_idct32x32_add(const tran_low_t *input, uint8_t *dest, int stride,
+void vp9_highbd_idct4x4_add(const tran_low_t *input, uint8_t *dest, int stride,
                             int eob, int bd);
-void vp9_high_iht4x4_add(TX_TYPE tx_type, const tran_low_t *input,
-                         uint8_t *dest, int stride, int eob, int bd);
-void vp9_high_iht8x8_add(TX_TYPE tx_type, const tran_low_t *input,
-                         uint8_t *dest, int stride, int eob, int bd);
-void vp9_high_iht16x16_add(TX_TYPE tx_type, const tran_low_t *input,
+void vp9_highbd_idct8x8_add(const tran_low_t *input, uint8_t *dest, int stride,
+                            int eob, int bd);
+void vp9_highbd_idct16x16_add(const tran_low_t *input, uint8_t *dest,
+                              int stride, int eob, int bd);
+void vp9_highbd_idct32x32_add(const tran_low_t *input, uint8_t *dest,
+                              int stride, int eob, int bd);
+void vp9_highbd_iht4x4_add(TX_TYPE tx_type, const tran_low_t *input,
                            uint8_t *dest, int stride, int eob, int bd);
+void vp9_highbd_iht8x8_add(TX_TYPE tx_type, const tran_low_t *input,
+                           uint8_t *dest, int stride, int eob, int bd);
+void vp9_highbd_iht16x16_add(TX_TYPE tx_type, const tran_low_t *input,
+                             uint8_t *dest, int stride, int eob, int bd);
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 #ifdef __cplusplus
 }  // extern "C"
diff --git a/source/libvpx/vp9/common/vp9_loopfilter.c b/source/libvpx/vp9/common/vp9_loopfilter.c
index 102eb71..aca8d7b 100644
--- a/source/libvpx/vp9/common/vp9_loopfilter.c
+++ b/source/libvpx/vp9/common/vp9_loopfilter.c
@@ -392,6 +392,107 @@
   }
 }
 
+#if CONFIG_VP9_HIGHBITDEPTH
+static void highbd_filter_selectively_vert_row2(PLANE_TYPE plane_type,
+                                                uint16_t *s, int pitch,
+                                                unsigned int mask_16x16_l,
+                                                unsigned int mask_8x8_l,
+                                                unsigned int mask_4x4_l,
+                                                unsigned int mask_4x4_int_l,
+                                                const loop_filter_info_n *lfi_n,
+                                                const uint8_t *lfl, int bd) {
+  const int mask_shift = plane_type ? 4 : 8;
+  const int mask_cutoff = plane_type ? 0xf : 0xff;
+  const int lfl_forward = plane_type ? 4 : 8;
+
+  unsigned int mask_16x16_0 = mask_16x16_l & mask_cutoff;
+  unsigned int mask_8x8_0 = mask_8x8_l & mask_cutoff;
+  unsigned int mask_4x4_0 = mask_4x4_l & mask_cutoff;
+  unsigned int mask_4x4_int_0 = mask_4x4_int_l & mask_cutoff;
+  unsigned int mask_16x16_1 = (mask_16x16_l >> mask_shift) & mask_cutoff;
+  unsigned int mask_8x8_1 = (mask_8x8_l >> mask_shift) & mask_cutoff;
+  unsigned int mask_4x4_1 = (mask_4x4_l >> mask_shift) & mask_cutoff;
+  unsigned int mask_4x4_int_1 = (mask_4x4_int_l >> mask_shift) & mask_cutoff;
+  unsigned int mask;
+
+  for (mask = mask_16x16_0 | mask_8x8_0 | mask_4x4_0 | mask_4x4_int_0 |
+       mask_16x16_1 | mask_8x8_1 | mask_4x4_1 | mask_4x4_int_1;
+       mask; mask >>= 1) {
+    const loop_filter_thresh *lfi0 = lfi_n->lfthr + *lfl;
+    const loop_filter_thresh *lfi1 = lfi_n->lfthr + *(lfl + lfl_forward);
+
+    // TODO(yunqingwang): count in loopfilter functions should be removed.
+    if (mask & 1) {
+      if ((mask_16x16_0 | mask_16x16_1) & 1) {
+        if ((mask_16x16_0 & mask_16x16_1) & 1) {
+          vp9_highbd_lpf_vertical_16_dual(s, pitch, lfi0->mblim, lfi0->lim,
+                                          lfi0->hev_thr, bd);
+        } else if (mask_16x16_0 & 1) {
+          vp9_highbd_lpf_vertical_16(s, pitch, lfi0->mblim, lfi0->lim,
+                                     lfi0->hev_thr, bd);
+        } else {
+          vp9_highbd_lpf_vertical_16(s + 8 * pitch, pitch, lfi1->mblim,
+                                     lfi1->lim, lfi1->hev_thr, bd);
+        }
+      }
+
+      if ((mask_8x8_0 | mask_8x8_1) & 1) {
+        if ((mask_8x8_0 & mask_8x8_1) & 1) {
+          vp9_highbd_lpf_vertical_8_dual(s, pitch, lfi0->mblim, lfi0->lim,
+                                         lfi0->hev_thr, lfi1->mblim, lfi1->lim,
+                                         lfi1->hev_thr, bd);
+        } else if (mask_8x8_0 & 1) {
+          vp9_highbd_lpf_vertical_8(s, pitch, lfi0->mblim, lfi0->lim,
+                                    lfi0->hev_thr, 1, bd);
+        } else {
+          vp9_highbd_lpf_vertical_8(s + 8 * pitch, pitch, lfi1->mblim,
+                                    lfi1->lim, lfi1->hev_thr, 1, bd);
+        }
+      }
+
+      if ((mask_4x4_0 | mask_4x4_1) & 1) {
+        if ((mask_4x4_0 & mask_4x4_1) & 1) {
+          vp9_highbd_lpf_vertical_4_dual(s, pitch, lfi0->mblim, lfi0->lim,
+                                         lfi0->hev_thr, lfi1->mblim, lfi1->lim,
+                                         lfi1->hev_thr, bd);
+        } else if (mask_4x4_0 & 1) {
+          vp9_highbd_lpf_vertical_4(s, pitch, lfi0->mblim, lfi0->lim,
+                                    lfi0->hev_thr, 1, bd);
+        } else {
+          vp9_highbd_lpf_vertical_4(s + 8 * pitch, pitch, lfi1->mblim,
+                                    lfi1->lim, lfi1->hev_thr, 1, bd);
+        }
+      }
+
+      if ((mask_4x4_int_0 | mask_4x4_int_1) & 1) {
+        if ((mask_4x4_int_0 & mask_4x4_int_1) & 1) {
+          vp9_highbd_lpf_vertical_4_dual(s + 4, pitch, lfi0->mblim, lfi0->lim,
+                                         lfi0->hev_thr, lfi1->mblim, lfi1->lim,
+                                         lfi1->hev_thr, bd);
+        } else if (mask_4x4_int_0 & 1) {
+          vp9_highbd_lpf_vertical_4(s + 4, pitch, lfi0->mblim, lfi0->lim,
+                                    lfi0->hev_thr, 1, bd);
+        } else {
+          vp9_highbd_lpf_vertical_4(s + 8 * pitch + 4, pitch, lfi1->mblim,
+                                    lfi1->lim, lfi1->hev_thr, 1, bd);
+        }
+      }
+    }
+
+    s += 8;
+    lfl += 1;
+    mask_16x16_0 >>= 1;
+    mask_8x8_0 >>= 1;
+    mask_4x4_0 >>= 1;
+    mask_4x4_int_0 >>= 1;
+    mask_16x16_1 >>= 1;
+    mask_8x8_1 >>= 1;
+    mask_4x4_1 >>= 1;
+    mask_4x4_int_1 >>= 1;
+  }
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
 static void filter_selectively_horiz(uint8_t *s, int pitch,
                                      unsigned int mask_16x16,
                                      unsigned int mask_8x8,
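
In the new highbd_filter_selectively_vert_row2() above, the two 8-row
bands' masks arrive packed in one word and are unpacked with mask_shift
and mask_cutoff (8/0xff for luma, 4/0xf for chroma). A standalone
illustration with made-up mask values:

    #include <assert.h>

    int main(void) {
      /* Hypothetical packed luma mask: row 0 in bits 0-7, row 1 above. */
      const unsigned int mask_16x16_l = 0x0302;
      const int mask_shift = 8;               /* 4 for a chroma plane */
      const unsigned int mask_cutoff = 0xff;  /* 0xf for chroma */

      assert((mask_16x16_l & mask_cutoff) == 0x02);                 /* row 0 */
      assert(((mask_16x16_l >> mask_shift) & mask_cutoff) == 0x03); /* row 1 */
      return 0;
    }
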
@@ -419,7 +520,7 @@
         }
       } else if (mask_8x8 & 1) {
         if ((mask_8x8 & 3) == 3) {
-          // Next block's thresholds
+          // Next block's thresholds.
           const loop_filter_thresh *lfin = lfi_n->lfthr + *(lfl + 1);
 
           vp9_lpf_horizontal_8_dual(s, pitch, lfi->mblim, lfi->lim,
@@ -448,7 +549,7 @@
         }
       } else if (mask_4x4 & 1) {
         if ((mask_4x4 & 3) == 3) {
-          // Next block's thresholds
+          // Next block's thresholds.
           const loop_filter_thresh *lfin = lfi_n->lfthr + *(lfl + 1);
 
           vp9_lpf_horizontal_4_dual(s, pitch, lfi->mblim, lfi->lim,
@@ -488,6 +589,112 @@
   }
 }
 
+#if CONFIG_VP9_HIGHBITDEPTH
+static void highbd_filter_selectively_horiz(uint16_t *s, int pitch,
+                                            unsigned int mask_16x16,
+                                            unsigned int mask_8x8,
+                                            unsigned int mask_4x4,
+                                            unsigned int mask_4x4_int,
+                                            const loop_filter_info_n *lfi_n,
+                                            const uint8_t *lfl, int bd) {
+  unsigned int mask;
+  int count;
+
+  for (mask = mask_16x16 | mask_8x8 | mask_4x4 | mask_4x4_int;
+       mask; mask >>= count) {
+    const loop_filter_thresh *lfi = lfi_n->lfthr + *lfl;
+
+    count = 1;
+    if (mask & 1) {
+      if (mask_16x16 & 1) {
+        if ((mask_16x16 & 3) == 3) {
+          vp9_highbd_lpf_horizontal_16(s, pitch, lfi->mblim, lfi->lim,
+                                       lfi->hev_thr, 2, bd);
+          count = 2;
+        } else {
+          vp9_highbd_lpf_horizontal_16(s, pitch, lfi->mblim, lfi->lim,
+                                       lfi->hev_thr, 1, bd);
+        }
+      } else if (mask_8x8 & 1) {
+        if ((mask_8x8 & 3) == 3) {
+          // Next block's thresholds.
+          const loop_filter_thresh *lfin = lfi_n->lfthr + *(lfl + 1);
+
+          vp9_highbd_lpf_horizontal_8_dual(s, pitch, lfi->mblim, lfi->lim,
+                                           lfi->hev_thr, lfin->mblim, lfin->lim,
+                                           lfin->hev_thr, bd);
+
+          if ((mask_4x4_int & 3) == 3) {
+            vp9_highbd_lpf_horizontal_4_dual(s + 4 * pitch, pitch, lfi->mblim,
+                                             lfi->lim, lfi->hev_thr,
+                                             lfin->mblim, lfin->lim,
+                                             lfin->hev_thr, bd);
+          } else {
+            if (mask_4x4_int & 1) {
+              vp9_highbd_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim,
+                                          lfi->lim, lfi->hev_thr, 1, bd);
+            } else if (mask_4x4_int & 2) {
+              vp9_highbd_lpf_horizontal_4(s + 8 + 4 * pitch, pitch, lfin->mblim,
+                                          lfin->lim, lfin->hev_thr, 1, bd);
+            }
+          }
+          count = 2;
+        } else {
+          vp9_highbd_lpf_horizontal_8(s, pitch, lfi->mblim, lfi->lim,
+                                      lfi->hev_thr, 1, bd);
+
+          if (mask_4x4_int & 1) {
+            vp9_highbd_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim,
+                                        lfi->lim, lfi->hev_thr, 1, bd);
+          }
+        }
+      } else if (mask_4x4 & 1) {
+        if ((mask_4x4 & 3) == 3) {
+          // Next block's thresholds.
+          const loop_filter_thresh *lfin = lfi_n->lfthr + *(lfl + 1);
+
+          vp9_highbd_lpf_horizontal_4_dual(s, pitch, lfi->mblim, lfi->lim,
+                                           lfi->hev_thr, lfin->mblim, lfin->lim,
+                                           lfin->hev_thr, bd);
+          if ((mask_4x4_int & 3) == 3) {
+            vp9_highbd_lpf_horizontal_4_dual(s + 4 * pitch, pitch, lfi->mblim,
+                                             lfi->lim, lfi->hev_thr,
+                                             lfin->mblim, lfin->lim,
+                                             lfin->hev_thr, bd);
+          } else {
+            if (mask_4x4_int & 1) {
+              vp9_highbd_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim,
+                                          lfi->lim, lfi->hev_thr, 1, bd);
+            } else if (mask_4x4_int & 2) {
+              vp9_highbd_lpf_horizontal_4(s + 8 + 4 * pitch, pitch, lfin->mblim,
+                                          lfin->lim, lfin->hev_thr, 1, bd);
+            }
+          }
+          count = 2;
+        } else {
+          vp9_highbd_lpf_horizontal_4(s, pitch, lfi->mblim, lfi->lim,
+                                      lfi->hev_thr, 1, bd);
+
+          if (mask_4x4_int & 1) {
+            vp9_highbd_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim,
+                                        lfi->lim, lfi->hev_thr, 1, bd);
+          }
+        }
+      } else if (mask_4x4_int & 1) {
+        vp9_highbd_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim, lfi->lim,
+                                    lfi->hev_thr, 1, bd);
+      }
+    }
+    s += 8 * count;
+    lfl += count;
+    mask_16x16 >>= count;
+    mask_8x8 >>= count;
+    mask_4x4 >>= count;
+    mask_4x4_int >>= count;
+  }
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
 // This function ORs into the current lfm structure where to do loop
 // filters for the specific mi we are looking at. It uses information
 // including the block_size_type (32x16, 32x32, etc.), the transform size,
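
In highbd_filter_selectively_horiz() above, the loop stride is variable:
when two adjacent 8-pixel columns both have an edge ((mask & 3) == 3), a
_dual function filters 16 pixels in one call and count becomes 2,
consuming two mask bits and two lfl entries. Standalone illustration:

    #include <assert.h>

    int main(void) {
      unsigned int mask_8x8 = 0x3;  /* hypothetical: columns 0 and 1 set */
      int count = 1;
      if ((mask_8x8 & 1) && (mask_8x8 & 3) == 3)
        count = 2;                  /* the _dual variant covered both */
      assert(count == 2);
      mask_8x8 >>= count;           /* consume both bits at once */
      assert(mask_8x8 == 0);
      return 0;
    }
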
@@ -903,14 +1110,53 @@
   }
 }
 
+#if CONFIG_VP9_HIGHBITDEPTH
+static void highbd_filter_selectively_vert(uint16_t *s, int pitch,
+                                           unsigned int mask_16x16,
+                                           unsigned int mask_8x8,
+                                           unsigned int mask_4x4,
+                                           unsigned int mask_4x4_int,
+                                           const loop_filter_info_n *lfi_n,
+                                           const uint8_t *lfl, int bd) {
+  unsigned int mask;
+
+  for (mask = mask_16x16 | mask_8x8 | mask_4x4 | mask_4x4_int;
+       mask; mask >>= 1) {
+    const loop_filter_thresh *lfi = lfi_n->lfthr + *lfl;
+
+    if (mask & 1) {
+      if (mask_16x16 & 1) {
+        vp9_highbd_lpf_vertical_16(s, pitch, lfi->mblim, lfi->lim,
+                                   lfi->hev_thr, bd);
+      } else if (mask_8x8 & 1) {
+        vp9_highbd_lpf_vertical_8(s, pitch, lfi->mblim, lfi->lim,
+                                  lfi->hev_thr, 1, bd);
+      } else if (mask_4x4 & 1) {
+        vp9_highbd_lpf_vertical_4(s, pitch, lfi->mblim, lfi->lim,
+                                  lfi->hev_thr, 1, bd);
+      }
+    }
+    if (mask_4x4_int & 1)
+      vp9_highbd_lpf_vertical_4(s + 4, pitch, lfi->mblim, lfi->lim,
+                                lfi->hev_thr, 1, bd);
+    s += 8;
+    lfl += 1;
+    mask_16x16 >>= 1;
+    mask_8x8 >>= 1;
+    mask_4x4 >>= 1;
+    mask_4x4_int >>= 1;
+  }
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
 static void filter_block_plane_non420(VP9_COMMON *cm,
                                       struct macroblockd_plane *plane,
                                       MODE_INFO *mi_8x8,
                                       int mi_row, int mi_col) {
   const int ss_x = plane->subsampling_x;
   const int ss_y = plane->subsampling_y;
-  const int row_step = 1 << ss_x;
-  const int col_step = 1 << ss_y;
+  const int row_step = 1 << ss_y;
+  const int col_step = 1 << ss_x;
   const int row_step_stride = cm->mi_stride * row_step;
   struct buf_2d *const dst = &plane->dst;
   uint8_t* const dst0 = dst->buf;
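
The row_step/col_step change just above is a genuine bug fix rather than
part of the highbitdepth work: row stepping must follow vertical
subsampling (ss_y) and column stepping horizontal subsampling (ss_x). The
swap was masked whenever ss_x == ss_y (4:2:0, 4:4:4) and only mattered
for asymmetric subsampling, e.g. 4:2:2:

    #include <assert.h>

    int main(void) {
      const int ss_x = 1, ss_y = 0;  /* 4:2:2: halved horizontally only */
      assert((1 << ss_y) == 1);      /* row_step: every mi row */
      assert((1 << ss_x) == 2);      /* col_step: every other column */
      return 0;
    }
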
@@ -1001,12 +1247,32 @@
 
     // Disable filtering on the leftmost column
     border_mask = ~(mi_col == 0);
+#if CONFIG_VP9_HIGHBITDEPTH
+    if (cm->use_highbitdepth) {
+      highbd_filter_selectively_vert(CONVERT_TO_SHORTPTR(dst->buf),
+                                     dst->stride,
+                                     mask_16x16_c & border_mask,
+                                     mask_8x8_c & border_mask,
+                                     mask_4x4_c & border_mask,
+                                     mask_4x4_int[r],
+                                     &cm->lf_info, &lfl[r << 3],
+                                     (int)cm->bit_depth);
+    } else {
+      filter_selectively_vert(dst->buf, dst->stride,
+                              mask_16x16_c & border_mask,
+                              mask_8x8_c & border_mask,
+                              mask_4x4_c & border_mask,
+                              mask_4x4_int[r],
+                              &cm->lf_info, &lfl[r << 3]);
+    }
+#else
     filter_selectively_vert(dst->buf, dst->stride,
                             mask_16x16_c & border_mask,
                             mask_8x8_c & border_mask,
                             mask_4x4_c & border_mask,
                             mask_4x4_int[r],
                             &cm->lf_info, &lfl[r << 3]);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
     dst->buf += 8 * dst->stride;
     mi_8x8 += row_step_stride;
   }
@@ -1030,13 +1296,32 @@
       mask_8x8_r = mask_8x8[r];
       mask_4x4_r = mask_4x4[r];
     }
-
+#if CONFIG_VP9_HIGHBITDEPTH
+    if (cm->use_highbitdepth) {
+      highbd_filter_selectively_horiz(CONVERT_TO_SHORTPTR(dst->buf),
+                                      dst->stride,
+                                      mask_16x16_r,
+                                      mask_8x8_r,
+                                      mask_4x4_r,
+                                      mask_4x4_int_r,
+                                      &cm->lf_info, &lfl[r << 3],
+                                      (int)cm->bit_depth);
+    } else {
+      filter_selectively_horiz(dst->buf, dst->stride,
+                               mask_16x16_r,
+                               mask_8x8_r,
+                               mask_4x4_r,
+                               mask_4x4_int_r,
+                               &cm->lf_info, &lfl[r << 3]);
+    }
+#else
     filter_selectively_horiz(dst->buf, dst->stride,
                              mask_16x16_r,
                              mask_8x8_r,
                              mask_4x4_r,
                              mask_4x4_int_r,
                              &cm->lf_info, &lfl[r << 3]);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
     dst->buf += 8 * dst->stride;
   }
 }
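
Each call site now follows the same dispatch idiom: a compile-time
CONFIG_VP9_HIGHBITDEPTH gate, then a runtime cm->use_highbitdepth check
that converts the buffer pointer with CONVERT_TO_SHORTPTR. That macro
(recalled from vp9_common.h at this revision; treat the exact definition
as an assumption) smuggles uint16_t buffers through uint8_t* interfaces
by halving the pointer value:

    #include <assert.h>
    #include <stdint.h>

    #define CONVERT_TO_SHORTPTR(x) ((uint16_t *)(((uintptr_t)(x)) << 1))
    #define CONVERT_TO_BYTEPTR(x) ((uint8_t *)(((uintptr_t)(x)) >> 1))

    int main(void) {
      static uint16_t buf[4] = {0, 1, 2, 3};
      /* uint16_t storage is 2-aligned, so the halved value round-trips. */
      uint8_t *disguised = CONVERT_TO_BYTEPTR(buf);
      assert(CONVERT_TO_SHORTPTR(disguised) == buf);
      return 0;
    }
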
@@ -1062,7 +1347,29 @@
       unsigned int mask_4x4_l = mask_4x4 & 0xffff;
       unsigned int mask_4x4_int_l = mask_4x4_int & 0xffff;
 
-      // Disable filtering on the leftmost column
+      // Disable filtering on the leftmost column.
+#if CONFIG_VP9_HIGHBITDEPTH
+      if (cm->use_highbitdepth) {
+        highbd_filter_selectively_vert_row2(plane->plane_type,
+                                            CONVERT_TO_SHORTPTR(dst->buf),
+                                            dst->stride,
+                                            mask_16x16_l,
+                                            mask_8x8_l,
+                                            mask_4x4_l,
+                                            mask_4x4_int_l,
+                                            &cm->lf_info, &lfm->lfl_y[r << 3],
+                                            (int)cm->bit_depth);
+      } else {
+        filter_selectively_vert_row2(plane->plane_type,
+                                     dst->buf, dst->stride,
+                                     mask_16x16_l,
+                                     mask_8x8_l,
+                                     mask_4x4_l,
+                                     mask_4x4_int_l,
+                                     &cm->lf_info,
+                                     &lfm->lfl_y[r << 3]);
+      }
+#else
       filter_selectively_vert_row2(plane->plane_type,
                                    dst->buf, dst->stride,
                                    mask_16x16_l,
@@ -1070,7 +1377,7 @@
                                    mask_4x4_l,
                                    mask_4x4_int_l,
                                    &cm->lf_info, &lfm->lfl_y[r << 3]);
-
+#endif  // CONFIG_VP9_HIGHBITDEPTH
       dst->buf += 16 * dst->stride;
       mask_16x16 >>= 16;
       mask_8x8 >>= 16;
@@ -1100,12 +1407,35 @@
         mask_4x4_r = mask_4x4 & 0xff;
       }
 
+#if CONFIG_VP9_HIGHBITDEPTH
+      if (cm->use_highbitdepth) {
+        highbd_filter_selectively_horiz(CONVERT_TO_SHORTPTR(dst->buf),
+                                        dst->stride,
+                                        mask_16x16_r,
+                                        mask_8x8_r,
+                                        mask_4x4_r,
+                                        mask_4x4_int & 0xff,
+                                        &cm->lf_info,
+                                        &lfm->lfl_y[r << 3],
+                                        (int)cm->bit_depth);
+      } else {
+        filter_selectively_horiz(dst->buf, dst->stride,
+                                 mask_16x16_r,
+                                 mask_8x8_r,
+                                 mask_4x4_r,
+                                 mask_4x4_int & 0xff,
+                                 &cm->lf_info,
+                                 &lfm->lfl_y[r << 3]);
+      }
+#else
       filter_selectively_horiz(dst->buf, dst->stride,
                                mask_16x16_r,
                                mask_8x8_r,
                                mask_4x4_r,
                                mask_4x4_int & 0xff,
-                               &cm->lf_info, &lfm->lfl_y[r << 3]);
+                               &cm->lf_info,
+                               &lfm->lfl_y[r << 3]);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
 
       dst->buf += 8 * dst->stride;
       mask_16x16 >>= 8;
@@ -1135,14 +1465,39 @@
         unsigned int mask_4x4_l = mask_4x4 & 0xff;
         unsigned int mask_4x4_int_l = mask_4x4_int & 0xff;
 
-        // Disable filtering on the leftmost column
+        // Disable filtering on the leftmost column.
+#if CONFIG_VP9_HIGHBITDEPTH
+        if (cm->use_highbitdepth) {
+          highbd_filter_selectively_vert_row2(plane->plane_type,
+                                              CONVERT_TO_SHORTPTR(dst->buf),
+                                              dst->stride,
+                                              mask_16x16_l,
+                                              mask_8x8_l,
+                                              mask_4x4_l,
+                                              mask_4x4_int_l,
+                                              &cm->lf_info,
+                                              &lfm->lfl_uv[r << 1],
+                                              (int)cm->bit_depth);
+        } else {
+          filter_selectively_vert_row2(plane->plane_type,
+                                       dst->buf, dst->stride,
+                                       mask_16x16_l,
+                                       mask_8x8_l,
+                                       mask_4x4_l,
+                                       mask_4x4_int_l,
+                                       &cm->lf_info,
+                                       &lfm->lfl_uv[r << 1]);
+        }
+#else
         filter_selectively_vert_row2(plane->plane_type,
                                      dst->buf, dst->stride,
                                      mask_16x16_l,
                                      mask_8x8_l,
                                      mask_4x4_l,
                                      mask_4x4_int_l,
-                                     &cm->lf_info, &lfm->lfl_uv[r << 1]);
+                                     &cm->lf_info,
+                                     &lfm->lfl_uv[r << 1]);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
 
         dst->buf += 16 * dst->stride;
         mask_16x16 >>= 8;
@@ -1177,12 +1532,35 @@
         mask_4x4_r = mask_4x4 & 0xf;
       }
 
+#if CONFIG_VP9_HIGHBITDEPTH
+      if (cm->use_highbitdepth) {
+        highbd_filter_selectively_horiz(CONVERT_TO_SHORTPTR(dst->buf),
+                                        dst->stride,
+                                        mask_16x16_r,
+                                        mask_8x8_r,
+                                        mask_4x4_r,
+                                        mask_4x4_int_r,
+                                        &cm->lf_info,
+                                        &lfm->lfl_uv[r << 1],
+                                        (int)cm->bit_depth);
+      } else {
+        filter_selectively_horiz(dst->buf, dst->stride,
+                                 mask_16x16_r,
+                                 mask_8x8_r,
+                                 mask_4x4_r,
+                                 mask_4x4_int_r,
+                                 &cm->lf_info,
+                                 &lfm->lfl_uv[r << 1]);
+      }
+#else
       filter_selectively_horiz(dst->buf, dst->stride,
                                mask_16x16_r,
                                mask_8x8_r,
                                mask_4x4_r,
                                mask_4x4_int_r,
-                               &cm->lf_info, &lfm->lfl_uv[r << 1]);
+                               &cm->lf_info,
+                               &lfm->lfl_uv[r << 1]);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
 
       dst->buf += 8 * dst->stride;
       mask_16x16 >>= 4;
diff --git a/source/libvpx/vp9/common/vp9_loopfilter_filters.c b/source/libvpx/vp9/common/vp9_loopfilter_filters.c
index 25d3311..2e32c40 100644
--- a/source/libvpx/vp9/common/vp9_loopfilter_filters.c
+++ b/source/libvpx/vp9/common/vp9_loopfilter_filters.c
@@ -17,6 +17,20 @@
   return (int8_t)clamp(t, -128, 127);
 }
 
+#if CONFIG_VP9_HIGHBITDEPTH
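+// Clamp to the signed range implied by the bit depth: e.g. for bd == 10 the
+// re-centered samples live in [-512, 511], for bd == 12 in [-2048, 2047].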
+static INLINE int16_t signed_char_clamp_high(int t, int bd) {
+  switch (bd) {
+    case 10:
+      return (int16_t)clamp(t, -128 * 4, 128 * 4 - 1);
+    case 12:
+      return (int16_t)clamp(t, -128 * 16, 128 * 16 - 1);
+    case 8:
+    default:
+      return (int16_t)clamp(t, -128, 128 - 1);
+  }
+}
+#endif
+
 // should we apply any filter at all: 11111111 yes, 00000000 no
 static INLINE int8_t filter_mask(uint8_t limit, uint8_t blimit,
                                  uint8_t p3, uint8_t p2,
@@ -337,3 +351,394 @@
                                 const uint8_t *limit, const uint8_t *thresh) {
   mb_lpf_vertical_edge_w(s, p, blimit, limit, thresh, 16);
 }
+
+#if CONFIG_VP9_HIGHBITDEPTH
+// Should we apply any filter at all: 11111111 yes, 00000000 no?
+static INLINE int8_t highbd_filter_mask(uint8_t limit, uint8_t blimit,
+                                        uint16_t p3, uint16_t p2,
+                                        uint16_t p1, uint16_t p0,
+                                        uint16_t q0, uint16_t q1,
+                                        uint16_t q2, uint16_t q3, int bd) {
+  int8_t mask = 0;
+  int16_t limit16 = (uint16_t)limit << (bd - 8);
+  int16_t blimit16 = (uint16_t)blimit << (bd - 8);
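+  // The 8-bit limits are scaled to the working bit depth. Each
+  // (condition) * -1 below is 0xff when a threshold is exceeded, so
+  // ~mask is all ones (apply the filter) only if every check passes.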
+  mask |= (abs(p3 - p2) > limit16) * -1;
+  mask |= (abs(p2 - p1) > limit16) * -1;
+  mask |= (abs(p1 - p0) > limit16) * -1;
+  mask |= (abs(q1 - q0) > limit16) * -1;
+  mask |= (abs(q2 - q1) > limit16) * -1;
+  mask |= (abs(q3 - q2) > limit16) * -1;
+  mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit16) * -1;
+  return ~mask;
+}
+
+static INLINE int8_t highbd_flat_mask4(uint8_t thresh,
+                                       uint16_t p3, uint16_t p2,
+                                       uint16_t p1, uint16_t p0,
+                                       uint16_t q0, uint16_t q1,
+                                       uint16_t q2, uint16_t q3, int bd) {
+  int8_t mask = 0;
+  int16_t thresh16 = (uint16_t)thresh << (bd - 8);
+  mask |= (abs(p1 - p0) > thresh16) * -1;
+  mask |= (abs(q1 - q0) > thresh16) * -1;
+  mask |= (abs(p2 - p0) > thresh16) * -1;
+  mask |= (abs(q2 - q0) > thresh16) * -1;
+  mask |= (abs(p3 - p0) > thresh16) * -1;
+  mask |= (abs(q3 - q0) > thresh16) * -1;
+  return ~mask;
+}
+
+static INLINE int8_t highbd_flat_mask5(uint8_t thresh,
+                                       uint16_t p4, uint16_t p3,
+                                       uint16_t p2, uint16_t p1,
+                                       uint16_t p0, uint16_t q0,
+                                       uint16_t q1, uint16_t q2,
+                                       uint16_t q3, uint16_t q4, int bd) {
+  int8_t mask = ~highbd_flat_mask4(thresh, p3, p2, p1, p0, q0, q1, q2, q3, bd);
+  int16_t thresh16 = (uint16_t)thresh << (bd - 8);
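+  // Start from the inverse of the 4-sample flat mask, fold in the outermost
+  // taps, and invert back: flat only if p1..p4 stay close to p0 and
+  // q1..q4 stay close to q0.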
+  mask |= (abs(p4 - p0) > thresh16) * -1;
+  mask |= (abs(q4 - q0) > thresh16) * -1;
+  return ~mask;
+}
+
+// Is there high edge variance at the internal edge:
+// 11111111_11111111 yes, 00000000_00000000 no?
+static INLINE int16_t highbd_hev_mask(uint8_t thresh, uint16_t p1, uint16_t p0,
+                                      uint16_t q0, uint16_t q1, int bd) {
+  int16_t hev = 0;
+  int16_t thresh16 = (uint16_t)thresh << (bd - 8);
+  hev |= (abs(p1 - p0) > thresh16) * -1;
+  hev |= (abs(q1 - q0) > thresh16) * -1;
+  return hev;
+}
+
+static INLINE void highbd_filter4(int8_t mask, uint8_t thresh, uint16_t *op1,
+                                  uint16_t *op0, uint16_t *oq0, uint16_t *oq1,
+                                  int bd) {
+  int16_t filter1, filter2;
+  // Subtracting 0x80 << shift re-centers the samples around zero (the
+  // high-bitdepth analogue of ^0x80 in the 8-bit filter), turning
+  // [0, 2^bd) into [-2^(bd - 1), 2^(bd - 1)).
+  int shift = bd - 8;
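+  // For example, with bd == 10 the bias 0x80 << shift is 0x200, so the
+  // sample range [0, 1024) maps onto [-512, 512).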
+  const int16_t ps1 = (int16_t)*op1 - (0x80 << shift);
+  const int16_t ps0 = (int16_t)*op0 - (0x80 << shift);
+  const int16_t qs0 = (int16_t)*oq0 - (0x80 << shift);
+  const int16_t qs1 = (int16_t)*oq1 - (0x80 << shift);
+  const uint16_t hev = highbd_hev_mask(thresh, *op1, *op0, *oq0, *oq1, bd);
+
+  // Add outer taps if we have high edge variance.
+  int16_t filter = signed_char_clamp_high(ps1 - qs1, bd) & hev;
+
+  // Inner taps.
+  filter = signed_char_clamp_high(filter + 3 * (qs0 - ps0), bd) & mask;
+
+  // Save bottom 3 bits so that we round one side +4 and the other +3.
+  // If it equals 4 we'll adjust by -1 later to account for the fact
+  // that we'd have rounded 3 the other way.
+  filter1 = signed_char_clamp_high(filter + 4, bd) >> 3;
+  filter2 = signed_char_clamp_high(filter + 3, bd) >> 3;
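+  // For example, filter == 5 gives filter1 == (5 + 4) >> 3 == 1 and
+  // filter2 == (5 + 3) >> 3 == 1: q0 steps down and p0 steps up by 1.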
+
+  *oq0 = signed_char_clamp_high(qs0 - filter1, bd) + (0x80 << shift);
+  *op0 = signed_char_clamp_high(ps0 + filter2, bd) + (0x80 << shift);
+
+  // Outer tap adjustments.
+  filter = ROUND_POWER_OF_TWO(filter1, 1) & ~hev;
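+  // The outer taps move by half of filter1 (rounded), and only where the
+  // edge variance is low (~hev).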
+
+  *oq1 = signed_char_clamp_high(qs1 - filter, bd) + (0x80 << shift);
+  *op1 = signed_char_clamp_high(ps1 + filter, bd) + (0x80 << shift);
+}
+
+void vp9_highbd_lpf_horizontal_4_c(uint16_t *s, int p /* pitch */,
+                                   const uint8_t *blimit, const uint8_t *limit,
+                                   const uint8_t *thresh, int count, int bd) {
+  int i;
+
+  // Per-sample C loop filter; mirrors the 8-bit version, but operates on
+  // 16-bit (high bitdepth) samples.
+  for (i = 0; i < 8 * count; ++i) {
+    const uint16_t p3 = s[-4 * p];
+    const uint16_t p2 = s[-3 * p];
+    const uint16_t p1 = s[-2 * p];
+    const uint16_t p0 = s[-p];
+    const uint16_t q0 = s[0 * p];
+    const uint16_t q1 = s[1 * p];
+    const uint16_t q2 = s[2 * p];
+    const uint16_t q3 = s[3 * p];
+    const int8_t mask = highbd_filter_mask(*limit, *blimit,
+                                           p3, p2, p1, p0, q0, q1, q2, q3, bd);
+    highbd_filter4(mask, *thresh, s - 2 * p, s - 1 * p, s, s + 1 * p, bd);
+    ++s;
+  }
+}
+
+void vp9_highbd_lpf_horizontal_4_dual_c(uint16_t *s, int p,
+                                        const uint8_t *blimit0,
+                                        const uint8_t *limit0,
+                                        const uint8_t *thresh0,
+                                        const uint8_t *blimit1,
+                                        const uint8_t *limit1,
+                                        const uint8_t *thresh1,
+                                        int bd) {
+  vp9_highbd_lpf_horizontal_4_c(s, p, blimit0, limit0, thresh0, 1, bd);
+  vp9_highbd_lpf_horizontal_4_c(s + 8, p, blimit1, limit1, thresh1, 1, bd);
+}
+
+void vp9_highbd_lpf_vertical_4_c(uint16_t *s, int pitch, const uint8_t *blimit,
+                                 const uint8_t *limit, const uint8_t *thresh,
+                                 int count, int bd) {
+  int i;
+
+  // Per-sample C loop filter; mirrors the 8-bit version, but operates on
+  // 16-bit (high bitdepth) samples.
+  for (i = 0; i < 8 * count; ++i) {
+    const uint16_t p3 = s[-4], p2 = s[-3], p1 = s[-2], p0 = s[-1];
+    const uint16_t q0 = s[0],  q1 = s[1],  q2 = s[2],  q3 = s[3];
+    const int8_t mask = highbd_filter_mask(*limit, *blimit,
+                                           p3, p2, p1, p0, q0, q1, q2, q3, bd);
+    highbd_filter4(mask, *thresh, s - 2, s - 1, s, s + 1, bd);
+    s += pitch;
+  }
+}
+
+void vp9_highbd_lpf_vertical_4_dual_c(uint16_t *s, int pitch,
+                                      const uint8_t *blimit0,
+                                      const uint8_t *limit0,
+                                      const uint8_t *thresh0,
+                                      const uint8_t *blimit1,
+                                      const uint8_t *limit1,
+                                      const uint8_t *thresh1,
+                                      int bd) {
+  vp9_highbd_lpf_vertical_4_c(s, pitch, blimit0, limit0, thresh0, 1, bd);
+  vp9_highbd_lpf_vertical_4_c(s + 8 * pitch, pitch, blimit1, limit1,
+                              thresh1, 1, bd);
+}
+
+static INLINE void highbd_filter8(int8_t mask, uint8_t thresh, uint8_t flat,
+                                  uint16_t *op3, uint16_t *op2,
+                                  uint16_t *op1, uint16_t *op0,
+                                  uint16_t *oq0, uint16_t *oq1,
+                                  uint16_t *oq2, uint16_t *oq3, int bd) {
+  if (flat && mask) {
+    const uint16_t p3 = *op3, p2 = *op2, p1 = *op1, p0 = *op0;
+    const uint16_t q0 = *oq0, q1 = *oq1, q2 = *oq2, q3 = *oq3;
+
+    // 7-tap filter [1, 1, 1, 2, 1, 1, 1]
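+    // Samples beyond p3 / q3 are synthesized by replicating p3 / q3, which
+    // is why those taps carry extra weight at the ends of each sum.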
+    *op2 = ROUND_POWER_OF_TWO(p3 + p3 + p3 + 2 * p2 + p1 + p0 + q0, 3);
+    *op1 = ROUND_POWER_OF_TWO(p3 + p3 + p2 + 2 * p1 + p0 + q0 + q1, 3);
+    *op0 = ROUND_POWER_OF_TWO(p3 + p2 + p1 + 2 * p0 + q0 + q1 + q2, 3);
+    *oq0 = ROUND_POWER_OF_TWO(p2 + p1 + p0 + 2 * q0 + q1 + q2 + q3, 3);
+    *oq1 = ROUND_POWER_OF_TWO(p1 + p0 + q0 + 2 * q1 + q2 + q3 + q3, 3);
+    *oq2 = ROUND_POWER_OF_TWO(p0 + q0 + q1 + 2 * q2 + q3 + q3 + q3, 3);
+  } else {
+    highbd_filter4(mask, thresh, op1, op0, oq0, oq1, bd);
+  }
+}
+
+void vp9_highbd_lpf_horizontal_8_c(uint16_t *s, int p, const uint8_t *blimit,
+                                   const uint8_t *limit, const uint8_t *thresh,
+                                   int count, int bd) {
+  int i;
+
+  // Per-sample C loop filter; mirrors the 8-bit version, but operates on
+  // 16-bit (high bitdepth) samples.
+  for (i = 0; i < 8 * count; ++i) {
+    const uint16_t p3 = s[-4 * p], p2 = s[-3 * p], p1 = s[-2 * p], p0 = s[-p];
+    const uint16_t q0 = s[0 * p], q1 = s[1 * p], q2 = s[2 * p], q3 = s[3 * p];
+
+    const int8_t mask = highbd_filter_mask(*limit, *blimit,
+                                           p3, p2, p1, p0, q0, q1, q2, q3, bd);
+    const int8_t flat = highbd_flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3,
+                                          bd);
+    highbd_filter8(mask, *thresh, flat,
+                   s - 4 * p, s - 3 * p, s - 2 * p, s - 1 * p,
+                   s, s + 1 * p, s + 2 * p, s + 3 * p, bd);
+    ++s;
+  }
+}
+
+void vp9_highbd_lpf_horizontal_8_dual_c(uint16_t *s, int p,
+                                        const uint8_t *blimit0,
+                                        const uint8_t *limit0,
+                                        const uint8_t *thresh0,
+                                        const uint8_t *blimit1,
+                                        const uint8_t *limit1,
+                                        const uint8_t *thresh1,
+                                        int bd) {
+  vp9_highbd_lpf_horizontal_8_c(s, p, blimit0, limit0, thresh0, 1, bd);
+  vp9_highbd_lpf_horizontal_8_c(s + 8, p, blimit1, limit1, thresh1, 1, bd);
+}
+
+void vp9_highbd_lpf_vertical_8_c(uint16_t *s, int pitch, const uint8_t *blimit,
+                                 const uint8_t *limit, const uint8_t *thresh,
+                                 int count, int bd) {
+  int i;
+
+  for (i = 0; i < 8 * count; ++i) {
+    const uint16_t p3 = s[-4], p2 = s[-3], p1 = s[-2], p0 = s[-1];
+    const uint16_t q0 = s[0], q1 = s[1], q2 = s[2], q3 = s[3];
+    const int8_t mask = highbd_filter_mask(*limit, *blimit,
+                                           p3, p2, p1, p0, q0, q1, q2, q3, bd);
+    const int8_t flat = highbd_flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3,
+                                          bd);
+    highbd_filter8(mask, *thresh, flat,
+                   s - 4, s - 3, s - 2, s - 1,
+                   s, s + 1, s + 2, s + 3,
+                   bd);
+    s += pitch;
+  }
+}
+
+void vp9_highbd_lpf_vertical_8_dual_c(uint16_t *s, int pitch,
+                                      const uint8_t *blimit0,
+                                      const uint8_t *limit0,
+                                      const uint8_t *thresh0,
+                                      const uint8_t *blimit1,
+                                      const uint8_t *limit1,
+                                      const uint8_t *thresh1,
+                                      int bd) {
+  vp9_highbd_lpf_vertical_8_c(s, pitch, blimit0, limit0, thresh0, 1, bd);
+  vp9_highbd_lpf_vertical_8_c(s + 8 * pitch, pitch, blimit1, limit1,
+                              thresh1, 1, bd);
+}
+
+static INLINE void highbd_filter16(int8_t mask, uint8_t thresh,
+                                   uint8_t flat, uint8_t flat2,
+                                   uint16_t *op7, uint16_t *op6,
+                                   uint16_t *op5, uint16_t *op4,
+                                   uint16_t *op3, uint16_t *op2,
+                                   uint16_t *op1, uint16_t *op0,
+                                   uint16_t *oq0, uint16_t *oq1,
+                                   uint16_t *oq2, uint16_t *oq3,
+                                   uint16_t *oq4, uint16_t *oq5,
+                                   uint16_t *oq6, uint16_t *oq7, int bd) {
+  if (flat2 && flat && mask) {
+    const uint16_t p7 = *op7;
+    const uint16_t p6 = *op6;
+    const uint16_t p5 = *op5;
+    const uint16_t p4 = *op4;
+    const uint16_t p3 = *op3;
+    const uint16_t p2 = *op2;
+    const uint16_t p1 = *op1;
+    const uint16_t p0 = *op0;
+    const uint16_t q0 = *oq0;
+    const uint16_t q1 = *oq1;
+    const uint16_t q2 = *oq2;
+    const uint16_t q3 = *oq3;
+    const uint16_t q4 = *oq4;
+    const uint16_t q5 = *oq5;
+    const uint16_t q6 = *oq6;
+    const uint16_t q7 = *oq7;
+
+    // 15-tap filter [1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1]
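+    // As in highbd_filter8, samples beyond p7 / q7 are synthesized by
+    // replication, hence the p7 * 7 and q7 * 7 terms at the ends.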
+    *op6 = ROUND_POWER_OF_TWO(p7 * 7 + p6 * 2 + p5 + p4 + p3 + p2 + p1 + p0 +
+                              q0, 4);
+    *op5 = ROUND_POWER_OF_TWO(p7 * 6 + p6 + p5 * 2 + p4 + p3 + p2 + p1 + p0 +
+                              q0 + q1, 4);
+    *op4 = ROUND_POWER_OF_TWO(p7 * 5 + p6 + p5 + p4 * 2 + p3 + p2 + p1 + p0 +
+                              q0 + q1 + q2, 4);
+    *op3 = ROUND_POWER_OF_TWO(p7 * 4 + p6 + p5 + p4 + p3 * 2 + p2 + p1 + p0 +
+                              q0 + q1 + q2 + q3, 4);
+    *op2 = ROUND_POWER_OF_TWO(p7 * 3 + p6 + p5 + p4 + p3 + p2 * 2 + p1 + p0 +
+                              q0 + q1 + q2 + q3 + q4, 4);
+    *op1 = ROUND_POWER_OF_TWO(p7 * 2 + p6 + p5 + p4 + p3 + p2 + p1 * 2 + p0 +
+                              q0 + q1 + q2 + q3 + q4 + q5, 4);
+    *op0 = ROUND_POWER_OF_TWO(p7 + p6 + p5 + p4 + p3 + p2 + p1 + p0 * 2 +
+                              q0 + q1 + q2 + q3 + q4 + q5 + q6, 4);
+    *oq0 = ROUND_POWER_OF_TWO(p6 + p5 + p4 + p3 + p2 + p1 + p0 +
+                              q0 * 2 + q1 + q2 + q3 + q4 + q5 + q6 + q7, 4);
+    *oq1 = ROUND_POWER_OF_TWO(p5 + p4 + p3 + p2 + p1 + p0 +
+                              q0 + q1 * 2 + q2 + q3 + q4 + q5 + q6 + q7 * 2, 4);
+    *oq2 = ROUND_POWER_OF_TWO(p4 + p3 + p2 + p1 + p0 +
+                              q0 + q1 + q2 * 2 + q3 + q4 + q5 + q6 + q7 * 3, 4);
+    *oq3 = ROUND_POWER_OF_TWO(p3 + p2 + p1 + p0 +
+                              q0 + q1 + q2 + q3 * 2 + q4 + q5 + q6 + q7 * 4, 4);
+    *oq4 = ROUND_POWER_OF_TWO(p2 + p1 + p0 +
+                              q0 + q1 + q2 + q3 + q4 * 2 + q5 + q6 + q7 * 5, 4);
+    *oq5 = ROUND_POWER_OF_TWO(p1 + p0 +
+                              q0 + q1 + q2 + q3 + q4 + q5 * 2 + q6 + q7 * 6, 4);
+    *oq6 = ROUND_POWER_OF_TWO(p0 +
+                              q0 + q1 + q2 + q3 + q4 + q5 + q6 * 2 + q7 * 7, 4);
+  } else {
+    highbd_filter8(mask, thresh, flat, op3, op2, op1, op0, oq0, oq1, oq2, oq3,
+                   bd);
+  }
+}
+
+void vp9_highbd_lpf_horizontal_16_c(uint16_t *s, int p, const uint8_t *blimit,
+                                    const uint8_t *limit, const uint8_t *thresh,
+                                    int count, int bd) {
+  int i;
+
+  // Per-sample C loop filter; mirrors the 8-bit version, but operates on
+  // 16-bit (high bitdepth) samples.
+  for (i = 0; i < 8 * count; ++i) {
+    const uint16_t p3 = s[-4 * p];
+    const uint16_t p2 = s[-3 * p];
+    const uint16_t p1 = s[-2 * p];
+    const uint16_t p0 = s[-p];
+    const uint16_t q0 = s[0 * p];
+    const uint16_t q1 = s[1 * p];
+    const uint16_t q2 = s[2 * p];
+    const uint16_t q3 = s[3 * p];
+    const int8_t mask = highbd_filter_mask(*limit, *blimit,
+                                           p3, p2, p1, p0, q0, q1, q2, q3, bd);
+    const int8_t flat = highbd_flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3,
+                                          bd);
+    const int8_t flat2 = highbd_flat_mask5(
+        1, s[-8 * p], s[-7 * p], s[-6 * p], s[-5 * p], p0,
+        q0, s[4 * p], s[5 * p], s[6 * p], s[7 * p], bd);
+
+    highbd_filter16(mask, *thresh, flat, flat2,
+                    s - 8 * p, s - 7 * p, s - 6 * p, s - 5 * p,
+                    s - 4 * p, s - 3 * p, s - 2 * p, s - 1 * p,
+                    s, s + 1 * p, s + 2 * p, s + 3 * p,
+                    s + 4 * p, s + 5 * p, s + 6 * p, s + 7 * p,
+                    bd);
+    ++s;
+  }
+}
+
+static void highbd_mb_lpf_vertical_edge_w(uint16_t *s, int p,
+                                          const uint8_t *blimit,
+                                          const uint8_t *limit,
+                                          const uint8_t *thresh,
+                                          int count, int bd) {
+  int i;
+
+  for (i = 0; i < count; ++i) {
+    const uint16_t p3 = s[-4];
+    const uint16_t p2 = s[-3];
+    const uint16_t p1 = s[-2];
+    const uint16_t p0 = s[-1];
+    const uint16_t q0 = s[0];
+    const uint16_t q1 = s[1];
+    const uint16_t q2 = s[2];
+    const uint16_t q3 = s[3];
+    const int8_t mask = highbd_filter_mask(*limit, *blimit,
+                                           p3, p2, p1, p0, q0, q1, q2, q3, bd);
+    const int8_t flat = highbd_flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3,
+                                          bd);
+    const int8_t flat2 = highbd_flat_mask5(1, s[-8], s[-7], s[-6], s[-5], p0,
+                                           q0, s[4], s[5], s[6], s[7], bd);
+
+    highbd_filter16(mask, *thresh, flat, flat2,
+                    s - 8, s - 7, s - 6, s - 5, s - 4, s - 3, s - 2, s - 1,
+                    s, s + 1, s + 2, s + 3, s + 4, s + 5, s + 6, s + 7,
+                    bd);
+    s += p;
+  }
+}
+
+void vp9_highbd_lpf_vertical_16_c(uint16_t *s, int p, const uint8_t *blimit,
+                                  const uint8_t *limit, const uint8_t *thresh,
+                                  int bd) {
+  highbd_mb_lpf_vertical_edge_w(s, p, blimit, limit, thresh, 8, bd);
+}
+
+void vp9_highbd_lpf_vertical_16_dual_c(uint16_t *s, int p,
+                                       const uint8_t *blimit,
+                                       const uint8_t *limit,
+                                       const uint8_t *thresh,
+                                       int bd) {
+  highbd_mb_lpf_vertical_edge_w(s, p, blimit, limit, thresh, 16, bd);
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
diff --git a/source/libvpx/vp9/common/vp9_mvref_common.c b/source/libvpx/vp9/common/vp9_mvref_common.c
index a09afff..3b34050 100644
--- a/source/libvpx/vp9/common/vp9_mvref_common.c
+++ b/source/libvpx/vp9/common/vp9_mvref_common.c
@@ -24,10 +24,7 @@
         ? cm->prev_mi[mi_row * xd->mi_stride + mi_col].src_mi
         : NULL;
   const MB_MODE_INFO *const prev_mbmi = prev_mi ? &prev_mi->src_mi->mbmi : NULL;
-
-
   const POSITION *const mv_ref_search = mv_ref_blocks[mi->mbmi.sb_type];
-
   int different_ref_found = 0;
   int context_counter = 0;
 
@@ -109,10 +106,10 @@
 }
 
 void vp9_find_mv_refs(const VP9_COMMON *cm, const MACROBLOCKD *xd,
-                                    const TileInfo *const tile,
-                                    MODE_INFO *mi, MV_REFERENCE_FRAME ref_frame,
-                                    int_mv *mv_ref_list,
-                                    int mi_row, int mi_col) {
+                      const TileInfo *const tile,
+                      MODE_INFO *mi, MV_REFERENCE_FRAME ref_frame,
+                      int_mv *mv_ref_list,
+                      int mi_row, int mi_col) {
   find_mv_refs_idx(cm, xd, tile, mi, ref_frame, mv_ref_list, -1,
                    mi_row, mi_col);
 }
@@ -127,7 +124,6 @@
   }
 }
 
-
 void vp9_find_best_ref_mvs(MACROBLOCKD *xd, int allow_hp,
                            int_mv *mvlist, int_mv *nearest, int_mv *near) {
   int i;
diff --git a/source/libvpx/vp9/common/vp9_onyxc_int.h b/source/libvpx/vp9/common/vp9_onyxc_int.h
index 792e9d9..f1eda91 100644
--- a/source/libvpx/vp9/common/vp9_onyxc_int.h
+++ b/source/libvpx/vp9/common/vp9_onyxc_int.h
@@ -143,7 +143,6 @@
   int prev_mi_idx;
   int mi_alloc_size;
   MODE_INFO *mip_array[2];
-  MODE_INFO **mi_grid_base_array[2];
 
   MODE_INFO *mip; /* Base of allocated array */
   MODE_INFO *mi;  /* Corresponds to upper left visible macroblock */
@@ -180,6 +179,7 @@
 
   // VPX_BITS_8 in profile 0 or 1, VPX_BITS_10 or VPX_BITS_12 in profile 2 or 3.
   vpx_bit_depth_t bit_depth;
+  vpx_bit_depth_t dequant_bit_depth;  // Bit depth of the current dequantizer.
 
 #if CONFIG_VP9_POSTPROC
   struct postproc_state  postproc_state;
@@ -328,11 +328,11 @@
   const PARTITION_CONTEXT *above_ctx = xd->above_seg_context + mi_col;
   const PARTITION_CONTEXT *left_ctx = xd->left_seg_context + (mi_row & MI_MASK);
 
-  const int bsl = mi_width_log2(bsize);
+  const int bsl = mi_width_log2_lookup[bsize];
   const int bs = 1 << bsl;
   int above = 0, left = 0, i;
 
-  assert(b_width_log2(bsize) == b_height_log2(bsize));
+  assert(b_width_log2_lookup[bsize] == b_height_log2_lookup[bsize]);
   assert(bsl >= 0);
 
   for (i = 0; i < bs; i++) {
diff --git a/source/libvpx/vp9/common/vp9_postproc.c b/source/libvpx/vp9/common/vp9_postproc.c
index e4e6ce7..575ffbc 100644
--- a/source/libvpx/vp9/common/vp9_postproc.c
+++ b/source/libvpx/vp9/common/vp9_postproc.c
@@ -19,6 +19,9 @@
 #include "vpx_scale/vpx_scale.h"
 #include "vpx_scale/yv12config.h"
 
+#if CONFIG_VP9_HIGHBITDEPTH
+#include "vp9/common/vp9_common.h"
+#endif
 #include "vp9/common/vp9_onyxc_int.h"
 #include "vp9/common/vp9_postproc.h"
 #include "vp9/common/vp9_systemdependent.h"
@@ -152,6 +155,84 @@
   }
 }
 
+#if CONFIG_VP9_HIGHBITDEPTH
+void vp9_highbd_post_proc_down_and_across_c(const uint16_t *src_ptr,
+                                            uint16_t *dst_ptr,
+                                            int src_pixels_per_line,
+                                            int dst_pixels_per_line,
+                                            int rows,
+                                            int cols,
+                                            int flimit) {
+  uint16_t const *p_src;
+  uint16_t *p_dst;
+  int row;
+  int col;
+  int i;
+  int v;
+  int pitch = src_pixels_per_line;
+  uint16_t d[8];
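+  // d[] is a small ring buffer that delays the writes of the across pass,
+  // so already-filtered pixels are not re-read as filter input.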
+
+  for (row = 0; row < rows; row++) {
+    // post_proc_down for one row.
+    p_src = src_ptr;
+    p_dst = dst_ptr;
+
+    for (col = 0; col < cols; col++) {
+      int kernel = 4;
+      v = p_src[col];
+
+      for (i = -2; i <= 2; i++) {
+        if (abs(v - p_src[col + i * pitch]) > flimit)
+          goto down_skip_convolve;
+
+        kernel += kernel5[2 + i] * p_src[col + i * pitch];
+      }
+
+      v = (kernel >> 3);
+
+    down_skip_convolve:
+      p_dst[col] = v;
+    }
+
+    // Now post_proc_across for the same row.
+    p_src = dst_ptr;
+    p_dst = dst_ptr;
+
+    for (i = 0; i < 8; i++)
+      d[i] = p_src[i];
+
+    for (col = 0; col < cols; col++) {
+      int kernel = 4;
+      v = p_src[col];
+
+      d[col & 7] = v;
+
+      for (i = -2; i <= 2; i++) {
+        if (abs(v - p_src[col + i]) > flimit)
+          goto across_skip_convolve;
+
+        kernel += kernel5[2 + i] * p_src[col + i];
+      }
+
+      d[col & 7] = (kernel >> 3);
+
+    across_skip_convolve:
+      if (col >= 2)
+        p_dst[col - 2] = d[(col - 2) & 7];
+    }
+
+    // Handle the last two pixels.
+    p_dst[col - 2] = d[(col - 2) & 7];
+    p_dst[col - 1] = d[(col - 1) & 7];
+
+    // Next row.
+    src_ptr += pitch;
+    dst_ptr += dst_pixels_per_line;
+  }
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
 static int q2mbl(int x) {
   if (x < 20) x = 20;
 
@@ -162,10 +243,46 @@
 void vp9_mbpost_proc_across_ip_c(uint8_t *src, int pitch,
                                  int rows, int cols, int flimit) {
   int r, c, i;
-
   uint8_t *s = src;
   uint8_t d[16];
 
+  for (r = 0; r < rows; r++) {
+    int sumsq = 0;
+    int sum = 0;
+
+    for (i = -8; i <= 6; i++) {
+      sumsq += s[i] * s[i];
+      sum += s[i];
+      d[i + 8] = 0;
+    }
+
+    for (c = 0; c < cols + 8; c++) {
+      int x = s[c + 7] - s[c - 8];
+      int y = s[c + 7] + s[c - 8];
+
+      sum += x;
+      sumsq += x * y;
+
+      d[c & 15] = s[c];
+
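+      // sumsq * 15 - sum * sum is proportional to the variance of the
+      // sliding 15-sample window (x * y above expands to the difference of
+      // squares s[c + 7]^2 - s[c - 8]^2); only smooth spans are averaged.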
+      if (sumsq * 15 - sum * sum < flimit) {
+        d[c & 15] = (8 + sum + s[c]) >> 4;
+      }
+
+      s[c - 8] = d[(c - 8) & 15];
+    }
+    s += pitch;
+  }
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+void vp9_highbd_mbpost_proc_across_ip_c(uint16_t *src, int pitch,
+                                        int rows, int cols, int flimit) {
+  int r, c, i;
+
+  uint16_t *s = src;
+  uint16_t d[16];
 
   for (r = 0; r < rows; r++) {
     int sumsq = 0;
@@ -196,6 +313,7 @@
     s += pitch;
   }
 }
+#endif  // CONFIG_VP9_HIGHBITDEPTH
 
 void vp9_mbpost_proc_down_c(uint8_t *dst, int pitch,
                             int rows, int cols, int flimit) {
@@ -229,6 +347,40 @@
   }
 }
 
+#if CONFIG_VP9_HIGHBITDEPTH
+void vp9_highbd_mbpost_proc_down_c(uint16_t *dst, int pitch,
+                                   int rows, int cols, int flimit) {
+  int r, c, i;
+  const int16_t *rv3 = &vp9_rv[63 & rand()];  // NOLINT
+
+  for (c = 0; c < cols; c++) {
+    uint16_t *s = &dst[c];
+    int sumsq = 0;
+    int sum = 0;
+    uint16_t d[16];
+    const int16_t *rv2 = rv3 + ((c * 17) & 127);
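+    // rv2 walks the vp9_rv table of pseudo-random values, which supplies a
+    // dithered rounding term for the filtered output below.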
+
+    for (i = -8; i <= 6; i++) {
+      sumsq += s[i * pitch] * s[i * pitch];
+      sum += s[i * pitch];
+    }
+
+    for (r = 0; r < rows + 8; r++) {
+      sumsq += s[7 * pitch] * s[7 * pitch] - s[-8 * pitch] * s[-8 * pitch];
+      sum += s[7 * pitch] - s[-8 * pitch];
+      d[r & 15] = s[0];
+
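+      // Same smoothness test as the horizontal pass: sumsq * 15 - sum * sum
+      // is proportional to the variance of the 15-sample column window.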
+      if (sumsq * 15 - sum * sum < flimit) {
+        d[r & 15] = (rv2[r & 127] + sum + s[0]) >> 4;
+      }
+
+      s[-8 * pitch] = d[(r - 8) & 15];
+      s += pitch;
+    }
+  }
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
 static void deblock_and_de_macro_block(YV12_BUFFER_CONFIG   *source,
                                        YV12_BUFFER_CONFIG   *post,
                                        int                   q,
@@ -239,6 +391,51 @@
   (void) low_var_thresh;
   (void) flag;
 
+#if CONFIG_VP9_HIGHBITDEPTH
+  if (source->flags & YV12_FLAG_HIGHBITDEPTH) {
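+    // High-bitdepth planes are addressed through CONVERT_TO_SHORTPTR and
+    // filtered as 16-bit samples.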
+    vp9_highbd_post_proc_down_and_across(CONVERT_TO_SHORTPTR(source->y_buffer),
+                                         CONVERT_TO_SHORTPTR(post->y_buffer),
+                                         source->y_stride, post->y_stride,
+                                         source->y_height, source->y_width,
+                                         ppl);
+
+    vp9_highbd_mbpost_proc_across_ip(CONVERT_TO_SHORTPTR(post->y_buffer),
+                                     post->y_stride, post->y_height,
+                                     post->y_width, q2mbl(q));
+
+    vp9_highbd_mbpost_proc_down(CONVERT_TO_SHORTPTR(post->y_buffer),
+                                post->y_stride, post->y_height,
+                                post->y_width, q2mbl(q));
+
+    vp9_highbd_post_proc_down_and_across(CONVERT_TO_SHORTPTR(source->u_buffer),
+                                         CONVERT_TO_SHORTPTR(post->u_buffer),
+                                         source->uv_stride, post->uv_stride,
+                                         source->uv_height, source->uv_width,
+                                         ppl);
+    vp9_highbd_post_proc_down_and_across(CONVERT_TO_SHORTPTR(source->v_buffer),
+                                         CONVERT_TO_SHORTPTR(post->v_buffer),
+                                         source->uv_stride, post->uv_stride,
+                                         source->uv_height, source->uv_width,
+                                         ppl);
+  } else {
+    vp9_post_proc_down_and_across(source->y_buffer, post->y_buffer,
+                                  source->y_stride, post->y_stride,
+                                  source->y_height, source->y_width, ppl);
+
+    vp9_mbpost_proc_across_ip(post->y_buffer, post->y_stride, post->y_height,
+                              post->y_width, q2mbl(q));
+
+    vp9_mbpost_proc_down(post->y_buffer, post->y_stride, post->y_height,
+                         post->y_width, q2mbl(q));
+
+    vp9_post_proc_down_and_across(source->u_buffer, post->u_buffer,
+                                  source->uv_stride, post->uv_stride,
+                                  source->uv_height, source->uv_width, ppl);
+    vp9_post_proc_down_and_across(source->v_buffer, post->v_buffer,
+                                  source->uv_stride, post->uv_stride,
+                                  source->uv_height, source->uv_width, ppl);
+  }
+#else
   vp9_post_proc_down_and_across(source->y_buffer, post->y_buffer,
                                 source->y_stride, post->y_stride,
                                 source->y_height, source->y_width, ppl);
@@ -255,6 +452,7 @@
   vp9_post_proc_down_and_across(source->v_buffer, post->v_buffer,
                                 source->uv_stride, post->uv_stride,
                                 source->uv_height, source->uv_width, ppl);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
 }
 
 void vp9_deblock(const YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *dst,
@@ -271,10 +469,26 @@
   uint8_t *const dsts[3] = {dst->y_buffer, dst->u_buffer, dst->v_buffer};
   const int dst_strides[3] = {dst->y_stride, dst->uv_stride, dst->uv_stride};
 
-  for (i = 0; i < MAX_MB_PLANE; ++i)
+  for (i = 0; i < MAX_MB_PLANE; ++i) {
+#if CONFIG_VP9_HIGHBITDEPTH
+    assert((src->flags & YV12_FLAG_HIGHBITDEPTH) ==
+           (dst->flags & YV12_FLAG_HIGHBITDEPTH));
+    if (src->flags & YV12_FLAG_HIGHBITDEPTH) {
+      vp9_highbd_post_proc_down_and_across(CONVERT_TO_SHORTPTR(srcs[i]),
+                                           CONVERT_TO_SHORTPTR(dsts[i]),
+                                           src_strides[i], dst_strides[i],
+                                           src_heights[i], src_widths[i], ppl);
+    } else {
+      vp9_post_proc_down_and_across(srcs[i], dsts[i],
+                                    src_strides[i], dst_strides[i],
+                                    src_heights[i], src_widths[i], ppl);
+    }
+#else
     vp9_post_proc_down_and_across(srcs[i], dsts[i],
                                   src_strides[i], dst_strides[i],
                                   src_heights[i], src_widths[i], ppl);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+  }
 }
 
 void vp9_denoise(const YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *dst,
@@ -293,15 +507,32 @@
 
   for (i = 0; i < MAX_MB_PLANE; ++i) {
     const int src_stride = src_strides[i];
-    const uint8_t *const src = srcs[i] + 2 * src_stride + 2;
     const int src_width = src_widths[i] - 4;
     const int src_height = src_heights[i] - 4;
-
     const int dst_stride = dst_strides[i];
-    uint8_t *const dst = dsts[i] + 2 * dst_stride + 2;
 
+#if CONFIG_VP9_HIGHBITDEPTH
+    assert((src->flags & YV12_FLAG_HIGHBITDEPTH) ==
+           (dst->flags & YV12_FLAG_HIGHBITDEPTH));
+    if (src->flags & YV12_FLAG_HIGHBITDEPTH) {
+      const uint16_t *const src =
+          CONVERT_TO_SHORTPTR(srcs[i] + 2 * src_stride + 2);
+      uint16_t *const dst = CONVERT_TO_SHORTPTR(dsts[i] + 2 * dst_stride + 2);
+      vp9_highbd_post_proc_down_and_across(src, dst, src_stride, dst_stride,
+                                           src_height, src_width, ppl);
+    } else {
+      const uint8_t *const src = srcs[i] + 2 * src_stride + 2;
+      uint8_t *const dst = dsts[i] + 2 * dst_stride + 2;
+
+      vp9_post_proc_down_and_across(src, dst, src_stride, dst_stride,
+                                    src_height, src_width, ppl);
+    }
+#else
+    const uint8_t *const src = srcs[i] + 2 * src_stride + 2;
+    uint8_t *const dst = dsts[i] + 2 * dst_stride + 2;
     vp9_post_proc_down_and_across(src, dst, src_stride, dst_stride,
                                   src_height, src_width, ppl);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
   }
 }
 
@@ -405,6 +636,9 @@
 #if CONFIG_VP9_POSTPROC || CONFIG_INTERNAL_STATS
   if (vp9_realloc_frame_buffer(&cm->post_proc_buffer, cm->width, cm->height,
                                cm->subsampling_x, cm->subsampling_y,
+#if CONFIG_VP9_HIGHBITDEPTH
+                               cm->use_highbitdepth,
+#endif
                                VP9_DEC_BORDER_IN_PIXELS, NULL, NULL, NULL) < 0)
     vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
                        "Failed to allocate post-processing buffer");
diff --git a/source/libvpx/vp9/common/vp9_prob.h b/source/libvpx/vp9/common/vp9_prob.h
index fa0e36d..bc1511a 100644
--- a/source/libvpx/vp9/common/vp9_prob.h
+++ b/source/libvpx/vp9/common/vp9_prob.h
@@ -14,7 +14,6 @@
 #include "./vpx_config.h"
 
 #include "vpx_ports/mem.h"
-#include "vpx/vpx_integer.h"
 
 #include "vp9/common/vp9_common.h"
 
diff --git a/source/libvpx/vp9/common/vp9_quant_common.h b/source/libvpx/vp9/common/vp9_quant_common.h
index b626605..4bae4a8 100644
--- a/source/libvpx/vp9/common/vp9_quant_common.h
+++ b/source/libvpx/vp9/common/vp9_quant_common.h
@@ -12,7 +12,7 @@
 #define VP9_COMMON_VP9_QUANT_COMMON_H_
 
 #include "vpx/vpx_codec.h"
-#include "vp9/common/vp9_blockd.h"
+#include "vp9/common/vp9_seg_common.h"
 
 #ifdef __cplusplus
 extern "C" {
diff --git a/source/libvpx/vp9/common/vp9_reconinter.c b/source/libvpx/vp9/common/vp9_reconinter.c
index b49f130..3492a23 100644
--- a/source/libvpx/vp9/common/vp9_reconinter.c
+++ b/source/libvpx/vp9/common/vp9_reconinter.c
@@ -153,19 +153,19 @@
                                  int w, int h, int ref,
                                  const InterpKernel *kernel,
                                  int xs, int ys, int bd) {
-  sf->high_predict[subpel_x != 0][subpel_y != 0][ref](
+  sf->highbd_predict[subpel_x != 0][subpel_y != 0][ref](
       src, src_stride, dst, dst_stride,
       kernel[subpel_x], xs, kernel[subpel_y], ys, w, h, bd);
 }
 
-void vp9_high_build_inter_predictor(const uint8_t *src, int src_stride,
-                                    uint8_t *dst, int dst_stride,
-                                    const MV *src_mv,
-                                    const struct scale_factors *sf,
-                                    int w, int h, int ref,
-                                    const InterpKernel *kernel,
-                                    enum mv_precision precision,
-                                    int x, int y, int bd) {
+void vp9_highbd_build_inter_predictor(const uint8_t *src, int src_stride,
+                                      uint8_t *dst, int dst_stride,
+                                      const MV *src_mv,
+                                      const struct scale_factors *sf,
+                                      int w, int h, int ref,
+                                      const InterpKernel *kernel,
+                                      enum mv_precision precision,
+                                      int x, int y, int bd) {
   const int is_q4 = precision == MV_PRECISION_Q4;
   const MV mv_q4 = { is_q4 ? src_mv->row : src_mv->row * 2,
                      is_q4 ? src_mv->col : src_mv->col * 2 };
@@ -288,8 +288,9 @@
     uint8_t *pre;
     MV32 scaled_mv;
     int xs, ys, subpel_x, subpel_y;
+    const int is_scaled = vp9_is_scaled(sf);
 
-    if (vp9_is_scaled(sf)) {
+    if (is_scaled) {
       pre = pre_buf->buf + scaled_buffer_offset(x, y, pre_buf->stride, sf);
       scaled_mv = vp9_scale_mv(&mv_q4, mi_x + x, mi_y + y, sf);
       xs = sf->x_step_q4;
@@ -385,12 +386,6 @@
                ? average_split_mvs(pd, mi, ref, block)
                : mi->mbmi.mv[ref].as_mv;
 
-
-    // TODO(jkoleszar): This clamping is done in the incorrect place for the
-    // scaling case. It needs to be done on the scaled MV, not the pre-scaling
-    // MV. Note however that it performs the subsampling aware scaling so
-    // that the result is always q4.
-    // mv_precision precision is MV_PRECISION_Q4.
     const MV mv_q4 = clamp_mv_to_umv_border_sb(xd, &mv, bw, bh,
                                                pd->subsampling_x,
                                                pd->subsampling_y);
@@ -400,6 +395,7 @@
         subpel_x, subpel_y;
     uint8_t *ref_frame, *buf_ptr;
     const YV12_BUFFER_CONFIG *ref_buf = xd->block_refs[ref]->buf;
+    const int is_scaled = vp9_is_scaled(sf);
 
     // Get reference frame pointer, width and height.
     if (plane == 0) {
@@ -412,7 +408,7 @@
       ref_frame = plane == 1 ? ref_buf->u_buffer : ref_buf->v_buffer;
     }
 
-    if (vp9_is_scaled(sf)) {
+    if (is_scaled) {
       // Co-ordinate of containing block to pixel precision.
       int x_start = (-xd->mb_to_left_edge >> (3 + pd->subsampling_x));
       int y_start = (-xd->mb_to_top_edge >> (3 + pd->subsampling_y));
@@ -451,7 +447,8 @@
     subpel_x = scaled_mv.col & SUBPEL_MASK;
     subpel_y = scaled_mv.row & SUBPEL_MASK;
 
-    // Calculate the top left corner of the best matching block in the reference frame.
+    // Calculate the top left corner of the best matching block in the
+    // reference frame.
     x0 += scaled_mv.col >> SUBPEL_BITS;
     y0 += scaled_mv.row >> SUBPEL_BITS;
     x0_16 += scaled_mv.col;
@@ -463,20 +460,20 @@
 
-    // Do border extension if there is motion or the
-    // width/height is not a multiple of 8 pixels.
+    // Do border extension if the reference is scaled, there is motion, or
+    // the width/height is not a multiple of 8 pixels.
-    if (scaled_mv.col || scaled_mv.row ||
+    if (is_scaled || scaled_mv.col || scaled_mv.row ||
         (frame_width & 0x7) || (frame_height & 0x7)) {
       // Get reference block bottom right coordinate.
       int x1 = ((x0_16 + (w - 1) * xs) >> SUBPEL_BITS) + 1;
       int y1 = ((y0_16 + (h - 1) * ys) >> SUBPEL_BITS) + 1;
       int x_pad = 0, y_pad = 0;
 
-      if (subpel_x || (sf->x_step_q4 & SUBPEL_MASK)) {
+      if (subpel_x || (sf->x_step_q4 != SUBPEL_SHIFTS)) {
         x0 -= VP9_INTERP_EXTEND - 1;
         x1 += VP9_INTERP_EXTEND;
         x_pad = 1;
       }
 
-      if (subpel_y || (sf->y_step_q4 & SUBPEL_MASK)) {
+      if (subpel_y || (sf->y_step_q4 != SUBPEL_SHIFTS)) {
         y0 -= VP9_INTERP_EXTEND - 1;
         y1 += VP9_INTERP_EXTEND;
         y_pad = 1;
diff --git a/source/libvpx/vp9/common/vp9_reconinter.h b/source/libvpx/vp9/common/vp9_reconinter.h
index e70cc4c..3eaf07c 100644
--- a/source/libvpx/vp9/common/vp9_reconinter.h
+++ b/source/libvpx/vp9/common/vp9_reconinter.h
@@ -40,14 +40,14 @@
                                int x, int y);
 
 #if CONFIG_VP9_HIGHBITDEPTH
-void vp9_high_build_inter_predictor(const uint8_t *src, int src_stride,
-                                    uint8_t *dst, int dst_stride,
-                                    const MV *mv_q3,
-                                    const struct scale_factors *sf,
-                                    int w, int h, int do_avg,
-                                    const InterpKernel *kernel,
-                                    enum mv_precision precision,
-                                    int x, int y, int bd);
+void vp9_highbd_build_inter_predictor(const uint8_t *src, int src_stride,
+                                      uint8_t *dst, int dst_stride,
+                                      const MV *mv_q3,
+                                      const struct scale_factors *sf,
+                                      int w, int h, int do_avg,
+                                      const InterpKernel *kernel,
+                                      enum mv_precision precision,
+                                      int x, int y, int bd);
 #endif
 
 static INLINE int scaled_buffer_offset(int x_offset, int y_offset, int stride,
diff --git a/source/libvpx/vp9/common/vp9_reconintra.c b/source/libvpx/vp9/common/vp9_reconintra.c
index 7ebd2ea..720bb44 100644
--- a/source/libvpx/vp9/common/vp9_reconintra.c
+++ b/source/libvpx/vp9/common/vp9_reconintra.c
@@ -41,11 +41,11 @@
   }
 
 #if CONFIG_VP9_HIGHBITDEPTH
-#define intra_pred_high_sized(type, size) \
-  void vp9_high_##type##_predictor_##size##x##size##_c( \
+#define intra_pred_highbd_sized(type, size) \
+  void vp9_highbd_##type##_predictor_##size##x##size##_c( \
       uint16_t *dst, ptrdiff_t stride, const uint16_t *above, \
       const uint16_t *left, int bd) { \
-    high_##type##_predictor(dst, stride, size, above, left, bd); \
+    highbd_##type##_predictor(dst, stride, size, above, left, bd); \
   }
 
 #define intra_pred_allsizes(type) \
@@ -53,10 +53,10 @@
   intra_pred_sized(type, 8) \
   intra_pred_sized(type, 16) \
   intra_pred_sized(type, 32) \
-  intra_pred_high_sized(type, 4) \
-  intra_pred_high_sized(type, 8) \
-  intra_pred_high_sized(type, 16) \
-  intra_pred_high_sized(type, 32)
+  intra_pred_highbd_sized(type, 4) \
+  intra_pred_highbd_sized(type, 8) \
+  intra_pred_highbd_sized(type, 16) \
+  intra_pred_highbd_sized(type, 32)
 
 #else
 
@@ -68,9 +68,9 @@
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 
 #if CONFIG_VP9_HIGHBITDEPTH
-static INLINE void high_d207_predictor(uint16_t *dst, ptrdiff_t stride, int bs,
-                                       const uint16_t *above,
-                                       const uint16_t *left, int bd) {
+static INLINE void highbd_d207_predictor(uint16_t *dst, ptrdiff_t stride,
+                                         int bs, const uint16_t *above,
+                                         const uint16_t *left, int bd) {
   int r, c;
   (void) above;
   (void) bd;
@@ -102,9 +102,9 @@
   }
 }
 
-static INLINE void high_d63_predictor(uint16_t *dst, ptrdiff_t stride, int bs,
-                                      const uint16_t *above,
-                                      const uint16_t *left, int bd) {
+static INLINE void highbd_d63_predictor(uint16_t *dst, ptrdiff_t stride,
+                                        int bs, const uint16_t *above,
+                                        const uint16_t *left, int bd) {
   int r, c;
   (void) left;
   (void) bd;
@@ -120,9 +120,9 @@
   }
 }
 
-static INLINE void high_d45_predictor(uint16_t *dst, ptrdiff_t stride, int bs,
-                                      const uint16_t *above,
-                                      const uint16_t *left, int bd) {
+static INLINE void highbd_d45_predictor(uint16_t *dst, ptrdiff_t stride, int bs,
+                                        const uint16_t *above,
+                                        const uint16_t *left, int bd) {
   int r, c;
   (void) left;
   (void) bd;
@@ -137,9 +137,9 @@
   }
 }
 
-static INLINE void high_d117_predictor(uint16_t *dst, ptrdiff_t stride,
-                                       int bs, const uint16_t *above,
-                                       const uint16_t *left, int bd) {
+static INLINE void highbd_d117_predictor(uint16_t *dst, ptrdiff_t stride,
+                                         int bs, const uint16_t *above,
+                                         const uint16_t *left, int bd) {
   int r, c;
   (void) bd;
 
@@ -168,9 +168,9 @@
   }
 }
 
-static INLINE void high_d135_predictor(uint16_t *dst, ptrdiff_t stride, int bs,
-                                       const uint16_t *above,
-                                       const uint16_t *left, int bd) {
+static INLINE void highbd_d135_predictor(uint16_t *dst, ptrdiff_t stride,
+                                         int bs, const uint16_t *above,
+                                         const uint16_t *left, int bd) {
   int r, c;
   (void) bd;
   dst[0] = ROUND_POWER_OF_TWO(left[0] + above[-1] * 2 + above[0], 2);
@@ -190,9 +190,9 @@
   }
 }
 
-static INLINE void high_d153_predictor(uint16_t *dst, ptrdiff_t stride, int bs,
-                                       const uint16_t *above,
-                                       const uint16_t *left, int bd) {
+static INLINE void highbd_d153_predictor(uint16_t *dst, ptrdiff_t stride,
+                                         int bs, const uint16_t *above,
+                                         const uint16_t *left, int bd) {
   int r, c;
   (void) bd;
   dst[0] = ROUND_POWER_OF_TWO(above[-1] + left[0], 1);
@@ -218,9 +218,9 @@
   }
 }
 
-static INLINE void high_v_predictor(uint16_t *dst, ptrdiff_t stride, int bs,
-                                    const uint16_t *above,
-                                    const uint16_t *left, int bd) {
+static INLINE void highbd_v_predictor(uint16_t *dst, ptrdiff_t stride,
+                                      int bs, const uint16_t *above,
+                                      const uint16_t *left, int bd) {
   int r;
   (void) left;
   (void) bd;
@@ -230,9 +230,9 @@
   }
 }
 
-static INLINE void high_h_predictor(uint16_t *dst, ptrdiff_t stride, int bs,
-                                    const uint16_t *above, const uint16_t *left,
-                                    int bd) {
+static INLINE void highbd_h_predictor(uint16_t *dst, ptrdiff_t stride,
+                                      int bs, const uint16_t *above,
+                                      const uint16_t *left, int bd) {
   int r;
   (void) above;
   (void) bd;
@@ -242,23 +242,23 @@
   }
 }
 
-static INLINE void high_tm_predictor(uint16_t *dst, ptrdiff_t stride, int bs,
-                                     const uint16_t *above,
-                                     const uint16_t *left, int bd) {
+static INLINE void highbd_tm_predictor(uint16_t *dst, ptrdiff_t stride,
+                                       int bs, const uint16_t *above,
+                                       const uint16_t *left, int bd) {
   int r, c;
   int ytop_left = above[-1];
   (void) bd;
 
   for (r = 0; r < bs; r++) {
     for (c = 0; c < bs; c++)
-      dst[c] = clip_pixel_high(left[r] + above[c] - ytop_left, bd);
+      dst[c] = clip_pixel_highbd(left[r] + above[c] - ytop_left, bd);
     dst += stride;
   }
 }
 
-static INLINE void high_dc_128_predictor(uint16_t *dst, ptrdiff_t stride,
-                                         int bs, const uint16_t *above,
-                                         const uint16_t *left, int bd) {
+static INLINE void highbd_dc_128_predictor(uint16_t *dst, ptrdiff_t stride,
+                                           int bs, const uint16_t *above,
+                                           const uint16_t *left, int bd) {
   int r;
   (void) above;
   (void) left;
@@ -269,9 +269,9 @@
   }
 }
 
-static INLINE void high_dc_left_predictor(uint16_t *dst, ptrdiff_t stride,
-                                          int bs, const uint16_t *above,
-                                          const uint16_t *left, int bd) {
+static INLINE void highbd_dc_left_predictor(uint16_t *dst, ptrdiff_t stride,
+                                            int bs, const uint16_t *above,
+                                            const uint16_t *left, int bd) {
   int i, r, expected_dc, sum = 0;
   (void) above;
   (void) bd;
@@ -286,9 +286,9 @@
   }
 }
 
-static INLINE void high_dc_top_predictor(uint16_t *dst, ptrdiff_t stride,
-                                         int bs, const uint16_t *above,
-                                         const uint16_t *left, int bd) {
+static INLINE void highbd_dc_top_predictor(uint16_t *dst, ptrdiff_t stride,
+                                           int bs, const uint16_t *above,
+                                           const uint16_t *left, int bd) {
   int i, r, expected_dc, sum = 0;
   (void) left;
   (void) bd;
@@ -303,9 +303,9 @@
   }
 }
 
-static INLINE void high_dc_predictor(uint16_t *dst, ptrdiff_t stride,
-                                     int bs, const uint16_t *above,
-                                     const uint16_t *left, int bd) {
+static INLINE void highbd_dc_predictor(uint16_t *dst, ptrdiff_t stride,
+                                       int bs, const uint16_t *above,
+                                       const uint16_t *left, int bd) {
   int i, r, expected_dc, sum = 0;
   const int count = 2 * bs;
   (void) bd;
@@ -602,20 +602,20 @@
   INIT_ALL_SIZES(dc_pred[1][1], dc);
 
 #if CONFIG_VP9_HIGHBITDEPTH
-  INIT_ALL_SIZES(pred_high[V_PRED], high_v);
-  INIT_ALL_SIZES(pred_high[H_PRED], high_h);
-  INIT_ALL_SIZES(pred_high[D207_PRED], high_d207);
-  INIT_ALL_SIZES(pred_high[D45_PRED], high_d45);
-  INIT_ALL_SIZES(pred_high[D63_PRED], high_d63);
-  INIT_ALL_SIZES(pred_high[D117_PRED], high_d117);
-  INIT_ALL_SIZES(pred_high[D135_PRED], high_d135);
-  INIT_ALL_SIZES(pred_high[D153_PRED], high_d153);
-  INIT_ALL_SIZES(pred_high[TM_PRED], high_tm);
+  INIT_ALL_SIZES(pred_high[V_PRED], highbd_v);
+  INIT_ALL_SIZES(pred_high[H_PRED], highbd_h);
+  INIT_ALL_SIZES(pred_high[D207_PRED], highbd_d207);
+  INIT_ALL_SIZES(pred_high[D45_PRED], highbd_d45);
+  INIT_ALL_SIZES(pred_high[D63_PRED], highbd_d63);
+  INIT_ALL_SIZES(pred_high[D117_PRED], highbd_d117);
+  INIT_ALL_SIZES(pred_high[D135_PRED], highbd_d135);
+  INIT_ALL_SIZES(pred_high[D153_PRED], highbd_d153);
+  INIT_ALL_SIZES(pred_high[TM_PRED], highbd_tm);
 
-  INIT_ALL_SIZES(dc_pred_high[0][0], high_dc_128);
-  INIT_ALL_SIZES(dc_pred_high[0][1], high_dc_top);
-  INIT_ALL_SIZES(dc_pred_high[1][0], high_dc_left);
-  INIT_ALL_SIZES(dc_pred_high[1][1], high_dc);
+  INIT_ALL_SIZES(dc_pred_high[0][0], highbd_dc_128);
+  INIT_ALL_SIZES(dc_pred_high[0][1], highbd_dc_top);
+  INIT_ALL_SIZES(dc_pred_high[1][0], highbd_dc_left);
+  INIT_ALL_SIZES(dc_pred_high[1][1], highbd_dc);
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 
 #undef intra_pred_allsizes
diff --git a/source/libvpx/vp9/common/vp9_rtcd_defs.pl b/source/libvpx/vp9/common/vp9_rtcd_defs.pl
index 0f52ae1..0530f3a 100644
--- a/source/libvpx/vp9/common/vp9_rtcd_defs.pl
+++ b/source/libvpx/vp9/common/vp9_rtcd_defs.pl
@@ -5,8 +5,8 @@
  */
 
 #include "vpx/vpx_integer.h"
+#include "vp9/common/vp9_common.h"
 #include "vp9/common/vp9_enums.h"
-#include "vp9/common/vp9_idct.h"
 
 struct macroblockd;
 
@@ -331,6 +331,8 @@
 # dct
 #
 if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
+  # Note: as optimized versions of these functions are added, we need to add
+  # a check to ensure that when CONFIG_EMULATE_HARDWARE is on, they default
+  # to the C versions only.
   add_proto qw/void vp9_idct4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
   specialize qw/vp9_idct4x4_1_add/;
 
@@ -380,69 +382,123 @@
 
   add_proto qw/void vp9_iwht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
   specialize qw/vp9_iwht4x4_16_add/;
+
 } else {
-  add_proto qw/void vp9_idct4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
-  specialize qw/vp9_idct4x4_1_add sse2 neon_asm dspr2/;
-  $vp9_idct4x4_1_add_neon_asm=vp9_idct4x4_1_add_neon;
+  # Force C versions if CONFIG_EMULATE_HARDWARE is 1
+  if (vpx_config("CONFIG_EMULATE_HARDWARE") eq "yes") {
+    add_proto qw/void vp9_idct4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+    specialize qw/vp9_idct4x4_1_add/;
 
-  add_proto qw/void vp9_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
-  specialize qw/vp9_idct4x4_16_add sse2 neon_asm dspr2/;
-  $vp9_idct4x4_16_add_neon_asm=vp9_idct4x4_16_add_neon;
+    add_proto qw/void vp9_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+    specialize qw/vp9_idct4x4_16_add/;
 
-  add_proto qw/void vp9_idct8x8_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
-  specialize qw/vp9_idct8x8_1_add sse2 neon_asm dspr2/;
-  $vp9_idct8x8_1_add_neon_asm=vp9_idct8x8_1_add_neon;
+    add_proto qw/void vp9_idct8x8_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+    specialize qw/vp9_idct8x8_1_add/;
 
-  add_proto qw/void vp9_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
-  specialize qw/vp9_idct8x8_64_add sse2 neon_asm dspr2/, "$ssse3_x86_64";
-  $vp9_idct8x8_64_add_neon_asm=vp9_idct8x8_64_add_neon;
+    add_proto qw/void vp9_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+    specialize qw/vp9_idct8x8_64_add/;
 
-  add_proto qw/void vp9_idct8x8_12_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
-  specialize qw/vp9_idct8x8_12_add sse2 neon_asm dspr2/, "$ssse3_x86_64";
-  $vp9_idct8x8_12_add_neon_asm=vp9_idct8x8_12_add_neon;
+    add_proto qw/void vp9_idct8x8_12_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+    specialize qw/vp9_idct8x8_12_add/;
 
-  add_proto qw/void vp9_idct16x16_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
-  specialize qw/vp9_idct16x16_1_add sse2 neon_asm dspr2/;
-  $vp9_idct16x16_1_add_neon_asm=vp9_idct16x16_1_add_neon;
+    add_proto qw/void vp9_idct16x16_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+    specialize qw/vp9_idct16x16_1_add/;
 
-  add_proto qw/void vp9_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
-  specialize qw/vp9_idct16x16_256_add sse2 ssse3 neon_asm dspr2/;
-  $vp9_idct16x16_256_add_neon_asm=vp9_idct16x16_256_add_neon;
+    add_proto qw/void vp9_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+    specialize qw/vp9_idct16x16_256_add/;
 
-  add_proto qw/void vp9_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
-  specialize qw/vp9_idct16x16_10_add sse2 ssse3 neon_asm dspr2/;
-  $vp9_idct16x16_10_add_neon_asm=vp9_idct16x16_10_add_neon;
+    add_proto qw/void vp9_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+    specialize qw/vp9_idct16x16_10_add/;
 
-  add_proto qw/void vp9_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
-  specialize qw/vp9_idct32x32_1024_add sse2 neon_asm dspr2/;
-  $vp9_idct32x32_1024_add_neon_asm=vp9_idct32x32_1024_add_neon;
+    add_proto qw/void vp9_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+    specialize qw/vp9_idct32x32_1024_add/;
 
-  add_proto qw/void vp9_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
-  specialize qw/vp9_idct32x32_34_add sse2 neon_asm dspr2/;
-  $vp9_idct32x32_34_add_neon_asm=vp9_idct32x32_1024_add_neon;
+    add_proto qw/void vp9_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+    specialize qw/vp9_idct32x32_34_add/;
 
-  add_proto qw/void vp9_idct32x32_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
-  specialize qw/vp9_idct32x32_1_add sse2 neon_asm dspr2/;
-  $vp9_idct32x32_1_add_neon_asm=vp9_idct32x32_1_add_neon;
+    add_proto qw/void vp9_idct32x32_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+    specialize qw/vp9_idct32x32_1_add/;
 
-  add_proto qw/void vp9_iht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
-  specialize qw/vp9_iht4x4_16_add sse2 neon_asm dspr2/;
-  $vp9_iht4x4_16_add_neon_asm=vp9_iht4x4_16_add_neon;
+    add_proto qw/void vp9_iht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
+    specialize qw/vp9_iht4x4_16_add/;
 
-  add_proto qw/void vp9_iht8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
-  specialize qw/vp9_iht8x8_64_add sse2 neon_asm dspr2/;
-  $vp9_iht8x8_64_add_neon_asm=vp9_iht8x8_64_add_neon;
+    add_proto qw/void vp9_iht8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
+    specialize qw/vp9_iht8x8_64_add/;
 
-  add_proto qw/void vp9_iht16x16_256_add/, "const tran_low_t *input, uint8_t *output, int pitch, int tx_type";
-  specialize qw/vp9_iht16x16_256_add sse2 dspr2/;
+    add_proto qw/void vp9_iht16x16_256_add/, "const tran_low_t *input, uint8_t *output, int pitch, int tx_type";
+    specialize qw/vp9_iht16x16_256_add/;
 
-  # dct and add
+    # dct and add
 
-  add_proto qw/void vp9_iwht4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
-  specialize qw/vp9_iwht4x4_1_add/;
+    add_proto qw/void vp9_iwht4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+    specialize qw/vp9_iwht4x4_1_add/;
 
-  add_proto qw/void vp9_iwht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
-  specialize qw/vp9_iwht4x4_16_add/;
+    add_proto qw/void vp9_iwht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+    specialize qw/vp9_iwht4x4_16_add/;
+  } else {
+    add_proto qw/void vp9_idct4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+    specialize qw/vp9_idct4x4_1_add sse2 neon_asm dspr2/;
+    $vp9_idct4x4_1_add_neon_asm=vp9_idct4x4_1_add_neon;
+
+    add_proto qw/void vp9_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+    specialize qw/vp9_idct4x4_16_add sse2 neon_asm dspr2/;
+    $vp9_idct4x4_16_add_neon_asm=vp9_idct4x4_16_add_neon;
+
+    add_proto qw/void vp9_idct8x8_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+    specialize qw/vp9_idct8x8_1_add sse2 neon_asm dspr2/;
+    $vp9_idct8x8_1_add_neon_asm=vp9_idct8x8_1_add_neon;
+
+    add_proto qw/void vp9_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+    specialize qw/vp9_idct8x8_64_add sse2 neon_asm dspr2/, "$ssse3_x86_64";
+    $vp9_idct8x8_64_add_neon_asm=vp9_idct8x8_64_add_neon;
+
+    add_proto qw/void vp9_idct8x8_12_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+    specialize qw/vp9_idct8x8_12_add sse2 neon_asm dspr2/, "$ssse3_x86_64";
+    $vp9_idct8x8_12_add_neon_asm=vp9_idct8x8_12_add_neon;
+
+    add_proto qw/void vp9_idct16x16_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+    specialize qw/vp9_idct16x16_1_add sse2 neon_asm dspr2/;
+    $vp9_idct16x16_1_add_neon_asm=vp9_idct16x16_1_add_neon;
+
+    add_proto qw/void vp9_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+    specialize qw/vp9_idct16x16_256_add sse2 ssse3 neon_asm dspr2/;
+    $vp9_idct16x16_256_add_neon_asm=vp9_idct16x16_256_add_neon;
+
+    add_proto qw/void vp9_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+    specialize qw/vp9_idct16x16_10_add sse2 ssse3 neon_asm dspr2/;
+    $vp9_idct16x16_10_add_neon_asm=vp9_idct16x16_10_add_neon;
+
+    add_proto qw/void vp9_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+    specialize qw/vp9_idct32x32_1024_add sse2 neon_asm dspr2/;
+    $vp9_idct32x32_1024_add_neon_asm=vp9_idct32x32_1024_add_neon;
+
+    add_proto qw/void vp9_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+    specialize qw/vp9_idct32x32_34_add sse2 neon_asm dspr2/;
+    $vp9_idct32x32_34_add_neon_asm=vp9_idct32x32_1024_add_neon;
+
+    add_proto qw/void vp9_idct32x32_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+    specialize qw/vp9_idct32x32_1_add sse2 neon_asm dspr2/;
+    $vp9_idct32x32_1_add_neon_asm=vp9_idct32x32_1_add_neon;
+
+    add_proto qw/void vp9_iht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
+    specialize qw/vp9_iht4x4_16_add sse2 neon_asm dspr2/;
+    $vp9_iht4x4_16_add_neon_asm=vp9_iht4x4_16_add_neon;
+
+    add_proto qw/void vp9_iht8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
+    specialize qw/vp9_iht8x8_64_add sse2 neon_asm dspr2/;
+    $vp9_iht8x8_64_add_neon_asm=vp9_iht8x8_64_add_neon;
+
+    add_proto qw/void vp9_iht16x16_256_add/, "const tran_low_t *input, uint8_t *output, int pitch, int tx_type";
+    specialize qw/vp9_iht16x16_256_add sse2 dspr2/;
+
+    # dct and add
+
+    add_proto qw/void vp9_iwht4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+    specialize qw/vp9_iwht4x4_1_add/;
+
+    add_proto qw/void vp9_iwht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+    specialize qw/vp9_iwht4x4_16_add/;
+  }
 }
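
For orientation, each add_proto/specialize pair in this Perl spec expands, in the generated vp9_rtcd.h, into one C prototype per candidate implementation plus a runtime-dispatched entry point; a specialize line with no ISA tokens, as in the first branch of the hunk above, leaves only the C fallback registered, which is what an emulate-hardware style build wants. A minimal sketch of the generated shape (names follow the rtcd suffix convention; this is an assumption about the generator's output, not copied from it):

    #include <stdint.h>

    /* Assumption for the sketch: tran_low_t is int16_t in regular builds
     * (int32_t when CONFIG_VP9_HIGHBITDEPTH is on). */
    typedef int16_t tran_low_t;

    /* C fallback (stub body here; the real one lives in the vp9 idct code). */
    static void vp9_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest,
                                     int dest_stride) {
      (void)input; (void)dest; (void)dest_stride;
    }

    /* The generated header exposes a dispatched entry point; rtcd setup
     * assigns the best ISA candidate the CPU supports, or the C fallback
     * when `specialize` lists no ISA tokens, as in the first branch above. */
    static void (*vp9_idct8x8_64_add)(const tran_low_t *, uint8_t *, int) =
        vp9_idct8x8_64_add_c;
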
 
 # High bitdepth functions
@@ -450,241 +506,296 @@
   #
   # Intra prediction
   #
-  add_proto qw/void vp9_high_d207_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps";
-  specialize qw/vp9_high_d207_predictor_4x4/;
+  add_proto qw/void vp9_highbd_d207_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  specialize qw/vp9_highbd_d207_predictor_4x4/;
 
-  add_proto qw/void vp9_high_d45_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps";
-  specialize qw/vp9_high_d45_predictor_4x4/;
+  add_proto qw/void vp9_highbd_d45_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  specialize qw/vp9_highbd_d45_predictor_4x4/;
 
-  add_proto qw/void vp9_high_d63_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps";
-  specialize qw/vp9_high_d63_predictor_4x4/;
+  add_proto qw/void vp9_highbd_d63_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  specialize qw/vp9_highbd_d63_predictor_4x4/;
 
-  add_proto qw/void vp9_high_h_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps";
-  specialize qw/vp9_high_h_predictor_4x4/;
+  add_proto qw/void vp9_highbd_h_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  specialize qw/vp9_highbd_h_predictor_4x4/;
 
-  add_proto qw/void vp9_high_d117_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps";
-  specialize qw/vp9_high_d117_predictor_4x4/;
+  add_proto qw/void vp9_highbd_d117_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  specialize qw/vp9_highbd_d117_predictor_4x4/;
 
-  add_proto qw/void vp9_high_d135_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps";
-  specialize qw/vp9_high_d135_predictor_4x4/;
+  add_proto qw/void vp9_highbd_d135_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  specialize qw/vp9_highbd_d135_predictor_4x4/;
 
-  add_proto qw/void vp9_high_d153_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps";
-  specialize qw/vp9_high_d153_predictor_4x4/;
+  add_proto qw/void vp9_highbd_d153_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  specialize qw/vp9_highbd_d153_predictor_4x4/;
 
-  add_proto qw/void vp9_high_v_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps";
-  specialize qw/vp9_high_v_predictor_4x4 neon/, "$sse_x86inc";
+  add_proto qw/void vp9_highbd_v_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  specialize qw/vp9_highbd_v_predictor_4x4 neon/, "$sse_x86inc";
 
-  add_proto qw/void vp9_high_tm_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps";
-  specialize qw/vp9_high_tm_predictor_4x4/, "$sse_x86inc";
+  add_proto qw/void vp9_highbd_tm_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  specialize qw/vp9_highbd_tm_predictor_4x4/, "$sse_x86inc";
 
-  add_proto qw/void vp9_high_dc_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps";
-  specialize qw/vp9_high_dc_predictor_4x4/, "$sse_x86inc";
+  add_proto qw/void vp9_highbd_dc_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  specialize qw/vp9_highbd_dc_predictor_4x4/, "$sse_x86inc";
 
-  add_proto qw/void vp9_high_dc_top_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps";
-  specialize qw/vp9_high_dc_top_predictor_4x4/;
+  add_proto qw/void vp9_highbd_dc_top_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  specialize qw/vp9_highbd_dc_top_predictor_4x4/;
 
-  add_proto qw/void vp9_high_dc_left_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps";
-  specialize qw/vp9_high_dc_left_predictor_4x4/;
+  add_proto qw/void vp9_highbd_dc_left_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  specialize qw/vp9_highbd_dc_left_predictor_4x4/;
 
-  add_proto qw/void vp9_high_dc_128_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps";
-  specialize qw/vp9_high_dc_128_predictor_4x4/;
+  add_proto qw/void vp9_highbd_dc_128_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  specialize qw/vp9_highbd_dc_128_predictor_4x4/;
 
-  add_proto qw/void vp9_high_d207_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps";
-  specialize qw/vp9_high_d207_predictor_8x8/;
+  add_proto qw/void vp9_highbd_d207_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  specialize qw/vp9_highbd_d207_predictor_8x8/;
 
-  add_proto qw/void vp9_high_d45_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps";
-  specialize qw/vp9_high_d45_predictor_8x8/;
+  add_proto qw/void vp9_highbd_d45_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  specialize qw/vp9_highbd_d45_predictor_8x8/;
 
-  add_proto qw/void vp9_high_d63_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps";
-  specialize qw/vp9_high_d63_predictor_8x8/;
+  add_proto qw/void vp9_highbd_d63_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  specialize qw/vp9_highbd_d63_predictor_8x8/;
 
-  add_proto qw/void vp9_high_h_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps";
-  specialize qw/vp9_high_h_predictor_8x8/;
+  add_proto qw/void vp9_highbd_h_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  specialize qw/vp9_highbd_h_predictor_8x8/;
 
-  add_proto qw/void vp9_high_d117_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps";
-  specialize qw/vp9_high_d117_predictor_8x8/;
+  add_proto qw/void vp9_highbd_d117_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  specialize qw/vp9_highbd_d117_predictor_8x8/;
 
-  add_proto qw/void vp9_high_d135_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps";
-  specialize qw/vp9_high_d135_predictor_8x8/;
+  add_proto qw/void vp9_highbd_d135_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  specialize qw/vp9_highbd_d135_predictor_8x8/;
 
-  add_proto qw/void vp9_high_d153_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps";
-  specialize qw/vp9_high_d153_predictor_8x8/;
+  add_proto qw/void vp9_highbd_d153_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  specialize qw/vp9_highbd_d153_predictor_8x8/;
 
-  add_proto qw/void vp9_high_v_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps";
-  specialize qw/vp9_high_v_predictor_8x8/, "$sse2_x86inc";
+  add_proto qw/void vp9_highbd_v_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  specialize qw/vp9_highbd_v_predictor_8x8/, "$sse2_x86inc";
 
-  add_proto qw/void vp9_high_tm_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps";
-  specialize qw/vp9_high_tm_predictor_8x8/, "$sse2_x86inc";
+  add_proto qw/void vp9_highbd_tm_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  specialize qw/vp9_highbd_tm_predictor_8x8/, "$sse2_x86inc";
 
-  add_proto qw/void vp9_high_dc_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps";
-  specialize qw/vp9_high_dc_predictor_8x8/, "$sse2_x86inc";;
+  add_proto qw/void vp9_highbd_dc_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  specialize qw/vp9_highbd_dc_predictor_8x8/, "$sse2_x86inc";
 
-  add_proto qw/void vp9_high_dc_top_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps";
-  specialize qw/vp9_high_dc_top_predictor_8x8/;
+  add_proto qw/void vp9_highbd_dc_top_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  specialize qw/vp9_highbd_dc_top_predictor_8x8/;
 
-  add_proto qw/void vp9_high_dc_left_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps";
-  specialize qw/vp9_high_dc_left_predictor_8x8/;
+  add_proto qw/void vp9_highbd_dc_left_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  specialize qw/vp9_highbd_dc_left_predictor_8x8/;
 
-  add_proto qw/void vp9_high_dc_128_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps";
-  specialize qw/vp9_high_dc_128_predictor_8x8/;
+  add_proto qw/void vp9_highbd_dc_128_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  specialize qw/vp9_highbd_dc_128_predictor_8x8/;
 
-  add_proto qw/void vp9_high_d207_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps";
-  specialize qw/vp9_high_d207_predictor_16x16/;
+  add_proto qw/void vp9_highbd_d207_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  specialize qw/vp9_highbd_d207_predictor_16x16/;
 
-  add_proto qw/void vp9_high_d45_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps";
-  specialize qw/vp9_high_d45_predictor_16x16/;
+  add_proto qw/void vp9_highbd_d45_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  specialize qw/vp9_highbd_d45_predictor_16x16/;
 
-  add_proto qw/void vp9_high_d63_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps";
-  specialize qw/vp9_high_d63_predictor_16x16/;
+  add_proto qw/void vp9_highbd_d63_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  specialize qw/vp9_highbd_d63_predictor_16x16/;
 
-  add_proto qw/void vp9_high_h_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps";
-  specialize qw/vp9_high_h_predictor_16x16/;
+  add_proto qw/void vp9_highbd_h_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  specialize qw/vp9_highbd_h_predictor_16x16/;
 
-  add_proto qw/void vp9_high_d117_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps";
-  specialize qw/vp9_high_d117_predictor_16x16/;
+  add_proto qw/void vp9_highbd_d117_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  specialize qw/vp9_highbd_d117_predictor_16x16/;
 
-  add_proto qw/void vp9_high_d135_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps";
-  specialize qw/vp9_high_d135_predictor_16x16/;
+  add_proto qw/void vp9_highbd_d135_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  specialize qw/vp9_highbd_d135_predictor_16x16/;
 
-  add_proto qw/void vp9_high_d153_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps";
-  specialize qw/vp9_high_d153_predictor_16x16/;
+  add_proto qw/void vp9_highbd_d153_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  specialize qw/vp9_highbd_d153_predictor_16x16/;
 
-  add_proto qw/void vp9_high_v_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps";
-  specialize qw/vp9_high_v_predictor_16x16 neon/, "$sse2_x86inc";
+  add_proto qw/void vp9_highbd_v_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  specialize qw/vp9_highbd_v_predictor_16x16 neon/, "$sse2_x86inc";
 
-  add_proto qw/void vp9_high_tm_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps";
-  specialize qw/vp9_high_tm_predictor_16x16/, "$sse2_x86_64";
+  add_proto qw/void vp9_highbd_tm_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  specialize qw/vp9_highbd_tm_predictor_16x16/, "$sse2_x86_64";
 
-  add_proto qw/void vp9_high_dc_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps";
-  specialize qw/vp9_high_dc_predictor_16x16/, "$sse2_x86inc";
+  add_proto qw/void vp9_highbd_dc_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  specialize qw/vp9_highbd_dc_predictor_16x16/, "$sse2_x86inc";
 
-  add_proto qw/void vp9_high_dc_top_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps";
-  specialize qw/vp9_high_dc_top_predictor_16x16/;
+  add_proto qw/void vp9_highbd_dc_top_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  specialize qw/vp9_highbd_dc_top_predictor_16x16/;
 
-  add_proto qw/void vp9_high_dc_left_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps";
-  specialize qw/vp9_high_dc_left_predictor_16x16/;
+  add_proto qw/void vp9_highbd_dc_left_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  specialize qw/vp9_highbd_dc_left_predictor_16x16/;
 
-  add_proto qw/void vp9_high_dc_128_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps";
-  specialize qw/vp9_high_dc_128_predictor_16x16/;
+  add_proto qw/void vp9_highbd_dc_128_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  specialize qw/vp9_highbd_dc_128_predictor_16x16/;
 
-  add_proto qw/void vp9_high_d207_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps";
-  specialize qw/vp9_high_d207_predictor_32x32/;
+  add_proto qw/void vp9_highbd_d207_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  specialize qw/vp9_highbd_d207_predictor_32x32/;
 
-  add_proto qw/void vp9_high_d45_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps";
-  specialize qw/vp9_high_d45_predictor_32x32/;
+  add_proto qw/void vp9_highbd_d45_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  specialize qw/vp9_highbd_d45_predictor_32x32/;
 
-  add_proto qw/void vp9_high_d63_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps";
-  specialize qw/vp9_high_d63_predictor_32x32/;
+  add_proto qw/void vp9_highbd_d63_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  specialize qw/vp9_highbd_d63_predictor_32x32/;
 
-  add_proto qw/void vp9_high_h_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps";
-  specialize qw/vp9_high_h_predictor_32x32/;
+  add_proto qw/void vp9_highbd_h_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  specialize qw/vp9_highbd_h_predictor_32x32/;
 
-  add_proto qw/void vp9_high_d117_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps";
-  specialize qw/vp9_high_d117_predictor_32x32/;
+  add_proto qw/void vp9_highbd_d117_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  specialize qw/vp9_highbd_d117_predictor_32x32/;
 
-  add_proto qw/void vp9_high_d135_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps";
-  specialize qw/vp9_high_d135_predictor_32x32/;
+  add_proto qw/void vp9_highbd_d135_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  specialize qw/vp9_highbd_d135_predictor_32x32/;
 
-  add_proto qw/void vp9_high_d153_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps";
-  specialize qw/vp9_high_d153_predictor_32x32/;
+  add_proto qw/void vp9_highbd_d153_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  specialize qw/vp9_highbd_d153_predictor_32x32/;
 
-  add_proto qw/void vp9_high_v_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps";
-  specialize qw/vp9_high_v_predictor_32x32/, "$sse2_x86inc";
+  add_proto qw/void vp9_highbd_v_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  specialize qw/vp9_highbd_v_predictor_32x32/, "$sse2_x86inc";
 
-  add_proto qw/void vp9_high_tm_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps";
-  specialize qw/vp9_high_tm_predictor_32x32/, "$sse2_x86_64";
+  add_proto qw/void vp9_highbd_tm_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  specialize qw/vp9_highbd_tm_predictor_32x32/, "$sse2_x86_64";
 
-  add_proto qw/void vp9_high_dc_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps";
-  specialize qw/vp9_high_dc_predictor_32x32/, "$sse2_x86_64";
+  add_proto qw/void vp9_highbd_dc_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  specialize qw/vp9_highbd_dc_predictor_32x32/, "$sse2_x86_64";
 
-  add_proto qw/void vp9_high_dc_top_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps";
-  specialize qw/vp9_high_dc_top_predictor_32x32/;
+  add_proto qw/void vp9_highbd_dc_top_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  specialize qw/vp9_highbd_dc_top_predictor_32x32/;
 
-  add_proto qw/void vp9_high_dc_left_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps";
-  specialize qw/vp9_high_dc_left_predictor_32x32/;
+  add_proto qw/void vp9_highbd_dc_left_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  specialize qw/vp9_highbd_dc_left_predictor_32x32/;
 
-  add_proto qw/void vp9_high_dc_128_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps";
-  specialize qw/vp9_high_dc_128_predictor_32x32/;
+  add_proto qw/void vp9_highbd_dc_128_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  specialize qw/vp9_highbd_dc_128_predictor_32x32/;
 
   #
   # Sub Pixel Filters
   #
-  add_proto qw/void vp9_high_convolve_copy/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
-  specialize qw/vp9_high_convolve_copy/;
+  add_proto qw/void vp9_highbd_convolve_copy/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
+  specialize qw/vp9_highbd_convolve_copy/;
 
-  add_proto qw/void vp9_high_convolve_avg/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
-  specialize qw/vp9_high_convolve_avg/;
+  add_proto qw/void vp9_highbd_convolve_avg/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
+  specialize qw/vp9_highbd_convolve_avg/;
 
-  add_proto qw/void vp9_high_convolve8/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
-  specialize qw/vp9_high_convolve8/, "$sse2_x86_64";
+  add_proto qw/void vp9_highbd_convolve8/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
+  specialize qw/vp9_highbd_convolve8/, "$sse2_x86_64";
 
-  add_proto qw/void vp9_high_convolve8_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
-  specialize qw/vp9_high_convolve8_horiz/, "$sse2_x86_64";
+  add_proto qw/void vp9_highbd_convolve8_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
+  specialize qw/vp9_highbd_convolve8_horiz/, "$sse2_x86_64";
 
-  add_proto qw/void vp9_high_convolve8_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
-  specialize qw/vp9_high_convolve8_vert/, "$sse2_x86_64";
+  add_proto qw/void vp9_highbd_convolve8_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
+  specialize qw/vp9_highbd_convolve8_vert/, "$sse2_x86_64";
 
-  add_proto qw/void vp9_high_convolve8_avg/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
-  specialize qw/vp9_high_convolve8_avg/, "$sse2_x86_64";
+  add_proto qw/void vp9_highbd_convolve8_avg/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
+  specialize qw/vp9_highbd_convolve8_avg/, "$sse2_x86_64";
 
-  add_proto qw/void vp9_high_convolve8_avg_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
-  specialize qw/vp9_high_convolve8_avg_horiz/, "$sse2_x86_64";
+  add_proto qw/void vp9_highbd_convolve8_avg_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
+  specialize qw/vp9_highbd_convolve8_avg_horiz/, "$sse2_x86_64";
 
-  add_proto qw/void vp9_high_convolve8_avg_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
-  specialize qw/vp9_high_convolve8_avg_vert/, "$sse2_x86_64";
+  add_proto qw/void vp9_highbd_convolve8_avg_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
+  specialize qw/vp9_highbd_convolve8_avg_vert/, "$sse2_x86_64";
+
+  #
+  # Loopfilter
+  #
+  add_proto qw/void vp9_highbd_lpf_vertical_16/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd";
+  specialize qw/vp9_highbd_lpf_vertical_16 sse2/;
+
+  add_proto qw/void vp9_highbd_lpf_vertical_16_dual/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd";
+  specialize qw/vp9_highbd_lpf_vertical_16_dual sse2/;
+
+  add_proto qw/void vp9_highbd_lpf_vertical_8/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count, int bd";
+  specialize qw/vp9_highbd_lpf_vertical_8 sse2/;
+
+  add_proto qw/void vp9_highbd_lpf_vertical_8_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd";
+  specialize qw/vp9_highbd_lpf_vertical_8_dual sse2/;
+
+  add_proto qw/void vp9_highbd_lpf_vertical_4/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count, int bd";
+  specialize qw/vp9_highbd_lpf_vertical_4 sse2/;
+
+  add_proto qw/void vp9_highbd_lpf_vertical_4_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd";
+  specialize qw/vp9_highbd_lpf_vertical_4_dual sse2/;
+
+  add_proto qw/void vp9_highbd_lpf_horizontal_16/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count, int bd";
+  specialize qw/vp9_highbd_lpf_horizontal_16 sse2/;
+
+  add_proto qw/void vp9_highbd_lpf_horizontal_8/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count, int bd";
+  specialize qw/vp9_highbd_lpf_horizontal_8 sse2/;
+
+  add_proto qw/void vp9_highbd_lpf_horizontal_8_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd";
+  specialize qw/vp9_highbd_lpf_horizontal_8_dual sse2/;
+
+  add_proto qw/void vp9_highbd_lpf_horizontal_4/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count, int bd";
+  specialize qw/vp9_highbd_lpf_horizontal_4 sse2/;
+
+  add_proto qw/void vp9_highbd_lpf_horizontal_4_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd";
+  specialize qw/vp9_highbd_lpf_horizontal_4_dual sse2/;
+
+  #
+  # post proc
+  #
+  if (vpx_config("CONFIG_VP9_POSTPROC") eq "yes") {
+    add_proto qw/void vp9_highbd_mbpost_proc_down/, "uint16_t *dst, int pitch, int rows, int cols, int flimit";
+    specialize qw/vp9_highbd_mbpost_proc_down/;
+
+    add_proto qw/void vp9_highbd_mbpost_proc_across_ip/, "uint16_t *src, int pitch, int rows, int cols, int flimit";
+    specialize qw/vp9_highbd_mbpost_proc_across_ip/;
+
+    add_proto qw/void vp9_highbd_post_proc_down_and_across/, "const uint16_t *src_ptr, uint16_t *dst_ptr, int src_pixels_per_line, int dst_pixels_per_line, int rows, int cols, int flimit";
+    specialize qw/vp9_highbd_post_proc_down_and_across/;
+
+    add_proto qw/void vp9_highbd_plane_add_noise/, "uint8_t *Start, char *noise, char blackclamp[16], char whiteclamp[16], char bothclamp[16], unsigned int Width, unsigned int Height, int Pitch";
+    specialize qw/vp9_highbd_plane_add_noise/;
+  }
 
   #
   # dct
   #
-  add_proto qw/void vp9_high_idct4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
-  specialize qw/vp9_high_idct4x4_1_add/;
+  # Note: as optimized versions of these functions are added, we will need a
+  # check to ensure that, when CONFIG_EMULATE_HARDWARE is on, they default to the C versions only.
+  add_proto qw/void vp9_highbd_idct4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
+  specialize qw/vp9_highbd_idct4x4_1_add/;
 
-  add_proto qw/void vp9_high_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
-  specialize qw/vp9_high_idct4x4_16_add/;
+  add_proto qw/void vp9_highbd_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
+  specialize qw/vp9_highbd_idct4x4_16_add/;
 
-  add_proto qw/void vp9_high_idct8x8_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
-  specialize qw/vp9_high_idct8x8_1_add/;
+  add_proto qw/void vp9_highbd_idct8x8_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
+  specialize qw/vp9_highbd_idct8x8_1_add/;
 
-  add_proto qw/void vp9_high_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
-  specialize qw/vp9_high_idct8x8_64_add/;
+  add_proto qw/void vp9_highbd_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
+  specialize qw/vp9_highbd_idct8x8_64_add/;
 
-  add_proto qw/void vp9_high_idct8x8_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
-  specialize qw/vp9_high_idct8x8_10_add/;
+  add_proto qw/void vp9_highbd_idct8x8_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
+  specialize qw/vp9_highbd_idct8x8_10_add/;
 
-  add_proto qw/void vp9_high_idct16x16_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
-  specialize qw/vp9_high_idct16x16_1_add/;
+  add_proto qw/void vp9_highbd_idct16x16_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
+  specialize qw/vp9_highbd_idct16x16_1_add/;
 
-  add_proto qw/void vp9_high_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
-  specialize qw/vp9_high_idct16x16_256_add/;
+  add_proto qw/void vp9_highbd_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
+  specialize qw/vp9_highbd_idct16x16_256_add/;
 
-  add_proto qw/void vp9_high_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
-  specialize qw/vp9_high_idct16x16_10_add/;
+  add_proto qw/void vp9_highbd_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
+  specialize qw/vp9_highbd_idct16x16_10_add/;
 
-  add_proto qw/void vp9_high_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
-  specialize qw/vp9_high_idct32x32_1024_add/;
+  add_proto qw/void vp9_highbd_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
+  specialize qw/vp9_highbd_idct32x32_1024_add/;
 
-  add_proto qw/void vp9_high_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
-  specialize qw/vp9_high_idct32x32_34_add/;
+  add_proto qw/void vp9_highbd_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
+  specialize qw/vp9_highbd_idct32x32_34_add/;
 
-  add_proto qw/void vp9_high_idct32x32_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
-  specialize qw/vp9_high_idct32x32_1_add/;
+  add_proto qw/void vp9_highbd_idct32x32_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
+  specialize qw/vp9_highbd_idct32x32_1_add/;
 
-  add_proto qw/void vp9_high_iht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type, int bd";
-  specialize qw/vp9_high_iht4x4_16_add/;
+  add_proto qw/void vp9_highbd_iht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type, int bd";
+  specialize qw/vp9_highbd_iht4x4_16_add/;
 
-  add_proto qw/void vp9_high_iht8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type, int bd";
-  specialize qw/vp9_high_iht8x8_64_add/;
+  add_proto qw/void vp9_highbd_iht8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type, int bd";
+  specialize qw/vp9_highbd_iht8x8_64_add/;
 
-  add_proto qw/void vp9_high_iht16x16_256_add/, "const tran_low_t *input, uint8_t *output, int pitch, int tx_type, int bd";
-  specialize qw/vp9_high_iht16x16_256_add/;
+  add_proto qw/void vp9_highbd_iht16x16_256_add/, "const tran_low_t *input, uint8_t *output, int pitch, int tx_type, int bd";
+  specialize qw/vp9_highbd_iht16x16_256_add/;
 
   # dct and add
 
-  add_proto qw/void vp9_high_iwht4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
-  specialize qw/vp9_high_iwht4x4_1_add/;
+  add_proto qw/void vp9_highbd_iwht4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
+  specialize qw/vp9_highbd_iwht4x4_1_add/;
 
-  add_proto qw/void vp9_high_iwht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
-  specialize qw/vp9_high_iwht4x4_16_add/;
+  add_proto qw/void vp9_highbd_iwht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
+  specialize qw/vp9_highbd_iwht4x4_16_add/;
 }
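
The hunk above is a mechanical rename of the high-bitdepth helpers from vp9_high_* to vp9_highbd_*, with the trailing bit-depth argument of the intra predictors renamed from `bps` to `bd` (the convolve entries keep `bps` at this point). To make the `bd` parameter concrete, here is a hypothetical minimal high-bitdepth DC predictor, not the libvpx implementation: samples live in uint16_t and valid values are bounded by (1 << bd) - 1.

    #include <stdint.h>
    #include <stddef.h>

    /* Illustrative only. 'bd' is the bit depth (8, 10 or 12). */
    static void highbd_dc_predictor_4x4(uint16_t *dst, ptrdiff_t stride,
                                        const uint16_t *above,
                                        const uint16_t *left, int bd) {
      int i, sum = 0;
      (void)bd;  /* the mean of in-range samples already fits in bd bits */
      for (i = 0; i < 4; ++i) sum += above[i] + left[i];
      const uint16_t dc = (uint16_t)((sum + 4) >> 3);  /* round(sum / 8) */
      for (i = 0; i < 4; ++i, dst += stride) {
        for (int j = 0; j < 4; ++j) dst[j] = dc;
      }
    }
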
 
 #
@@ -820,22 +931,22 @@
 specialize qw/vp9_sub_pixel_avg_variance4x4/, "$sse_x86inc", "$ssse3_x86inc";
 
 add_proto qw/unsigned int vp9_sad64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride";
-specialize qw/vp9_sad64x64 neon/, "$sse2_x86inc";
+specialize qw/vp9_sad64x64 neon avx2/, "$sse2_x86inc";
 
 add_proto qw/unsigned int vp9_sad32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride";
-specialize qw/vp9_sad32x64/, "$sse2_x86inc";
+specialize qw/vp9_sad32x64 avx2/, "$sse2_x86inc";
 
 add_proto qw/unsigned int vp9_sad64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride";
-specialize qw/vp9_sad64x32/, "$sse2_x86inc";
+specialize qw/vp9_sad64x32 avx2/, "$sse2_x86inc";
 
 add_proto qw/unsigned int vp9_sad32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride";
-specialize qw/vp9_sad32x16/, "$sse2_x86inc";
+specialize qw/vp9_sad32x16 avx2/, "$sse2_x86inc";
 
 add_proto qw/unsigned int vp9_sad16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride";
 specialize qw/vp9_sad16x32/, "$sse2_x86inc";
 
 add_proto qw/unsigned int vp9_sad32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride";
-specialize qw/vp9_sad32x32 neon/, "$sse2_x86inc";
+specialize qw/vp9_sad32x32 neon avx2/, "$sse2_x86inc";
 
 add_proto qw/unsigned int vp9_sad16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride";
 specialize qw/vp9_sad16x16 neon/, "$sse2_x86inc";
@@ -859,22 +970,22 @@
 specialize qw/vp9_sad4x4/, "$sse_x86inc";
 
 add_proto qw/unsigned int vp9_sad64x64_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, const uint8_t *second_pred";
-specialize qw/vp9_sad64x64_avg/, "$sse2_x86inc";
+specialize qw/vp9_sad64x64_avg avx2/, "$sse2_x86inc";
 
 add_proto qw/unsigned int vp9_sad32x64_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
-specialize qw/vp9_sad32x64_avg/, "$sse2_x86inc";
+specialize qw/vp9_sad32x64_avg avx2/, "$sse2_x86inc";
 
 add_proto qw/unsigned int vp9_sad64x32_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
-specialize qw/vp9_sad64x32_avg/, "$sse2_x86inc";
+specialize qw/vp9_sad64x32_avg avx2/, "$sse2_x86inc";
 
 add_proto qw/unsigned int vp9_sad32x16_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
-specialize qw/vp9_sad32x16_avg/, "$sse2_x86inc";
+specialize qw/vp9_sad32x16_avg avx2/, "$sse2_x86inc";
 
 add_proto qw/unsigned int vp9_sad16x32_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
 specialize qw/vp9_sad16x32_avg/, "$sse2_x86inc";
 
 add_proto qw/unsigned int vp9_sad32x32_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, const uint8_t *second_pred";
-specialize qw/vp9_sad32x32_avg/, "$sse2_x86inc";
+specialize qw/vp9_sad32x32_avg avx2/, "$sse2_x86inc";
 
 add_proto qw/unsigned int vp9_sad16x16_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, const uint8_t *second_pred";
 specialize qw/vp9_sad16x16_avg/, "$sse2_x86inc";
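
The avx2 tokens added above register AVX2 SAD kernels (matching the "SAD32xh and SAD64xh for AVX2" change in the pulled log) alongside the existing sse2/neon candidates; rtcd then selects among them at runtime. For reference, the quantity every candidate must agree on is the plain sum of absolute differences, sketched here in scalar C:

    #include <stdint.h>
    #include <stdlib.h>

    /* Scalar reference SAD, for orientation only; the avx2 token above
     * registers a vectorized variant that computes the same sum. */
    static unsigned int sad(const uint8_t *src, int src_stride,
                            const uint8_t *ref, int ref_stride,
                            int width, int height) {
      unsigned int s = 0;
      for (int y = 0; y < height; ++y) {
        for (int x = 0; x < width; ++x) s += abs(src[x] - ref[x]);
        src += src_stride;
        ref += ref_stride;
      }
      return s;
    }
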
@@ -999,11 +1110,28 @@
 
 add_proto qw/unsigned int vp9_get_mb_ss/, "const int16_t *";
 specialize qw/vp9_get_mb_ss/, "$sse2_x86inc";
+
+add_proto qw/unsigned int vp9_avg_8x8/, "const uint8_t *, int p";
+specialize qw/vp9_avg_8x8 sse2/;
+
+if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
+  add_proto qw/unsigned int vp9_highbd_avg_8x8/, "const uint8_t *, int p";
+  specialize qw/vp9_highbd_avg_8x8/;
+}
+
 # ENCODEMB INVOKE
 
 add_proto qw/void vp9_subtract_block/, "int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride";
 specialize qw/vp9_subtract_block neon/, "$sse2_x86inc";
 
+#
+# Denoiser
+#
+if (vpx_config("CONFIG_VP9_TEMPORAL_DENOISING") eq "yes") {
+  add_proto qw/int vp9_denoiser_filter/, "const uint8_t *sig, int sig_stride, const uint8_t *mc_avg, int mc_avg_stride, uint8_t *avg, int avg_stride, int increase_denoising, BLOCK_SIZE bs, int motion_magnitude";
+  specialize qw/vp9_denoiser_filter sse2/;
+}
+
 if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
 # the transform coefficients are held in 32-bit
 # values, so the assembler code for  vp9_block_error can no longer be used.
@@ -1032,7 +1160,7 @@
   specialize qw/vp9_quantize_fp_32x32/, "$ssse3_x86_64";
 
   add_proto qw/void vp9_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
-  specialize qw/vp9_quantize_b/, "$ssse3_x86_64";
+  specialize qw/vp9_quantize_b sse2/, "$ssse3_x86_64";
 
   add_proto qw/void vp9_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
   specialize qw/vp9_quantize_b_32x32/, "$ssse3_x86_64";
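
vp9_avg_8x8, newly wired up above with an sse2 specialization, returns the rounded mean of an 8x8 pixel block and feeds the variance-based partition logic. A sketch of the expected semantics, assumed from the C fallback rather than quoted from it ('p' is the row stride):

    #include <stdint.h>

    /* Rounded average of an 8x8 block of 8-bit pixels. */
    static unsigned int avg_8x8(const uint8_t *s, int p) {
      unsigned int sum = 0;
      for (int i = 0; i < 8; ++i, s += p)
        for (int j = 0; j < 8; ++j) sum += s[j];
      return (sum + 32) >> 6;  /* round(sum / 64) */
    }
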
@@ -1154,647 +1282,644 @@
 if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
 
   # variance
-  add_proto qw/unsigned int vp9_high_variance32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_high_variance32x16/;
+  add_proto qw/unsigned int vp9_highbd_variance32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vp9_highbd_variance32x16/;
 
-  add_proto qw/unsigned int vp9_high_variance16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_high_variance16x32/;
+  add_proto qw/unsigned int vp9_highbd_variance16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vp9_highbd_variance16x32/;
 
-  add_proto qw/unsigned int vp9_high_variance64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_high_variance64x32/;
+  add_proto qw/unsigned int vp9_highbd_variance64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vp9_highbd_variance64x32/;
 
-  add_proto qw/unsigned int vp9_high_variance32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_high_variance32x64/;
+  add_proto qw/unsigned int vp9_highbd_variance32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vp9_highbd_variance32x64/;
 
-  add_proto qw/unsigned int vp9_high_variance32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_high_variance32x32/;
+  add_proto qw/unsigned int vp9_highbd_variance32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vp9_highbd_variance32x32/;
 
-  add_proto qw/unsigned int vp9_high_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_high_variance64x64/;
+  add_proto qw/unsigned int vp9_highbd_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vp9_highbd_variance64x64/;
 
-  add_proto qw/unsigned int vp9_high_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_high_variance16x16/;
+  add_proto qw/unsigned int vp9_highbd_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vp9_highbd_variance16x16/;
 
-  add_proto qw/unsigned int vp9_high_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_high_variance16x8/;
+  add_proto qw/unsigned int vp9_highbd_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vp9_highbd_variance16x8/;
 
-  add_proto qw/unsigned int vp9_high_variance8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_high_variance8x16/;
+  add_proto qw/unsigned int vp9_highbd_variance8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vp9_highbd_variance8x16/;
 
-  add_proto qw/unsigned int vp9_high_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_high_variance8x8/;
+  add_proto qw/unsigned int vp9_highbd_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vp9_highbd_variance8x8/;
 
-  add_proto qw/unsigned int vp9_high_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_high_variance8x4/;
+  add_proto qw/unsigned int vp9_highbd_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vp9_highbd_variance8x4/;
 
-  add_proto qw/unsigned int vp9_high_variance4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_high_variance4x8/;
+  add_proto qw/unsigned int vp9_highbd_variance4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vp9_highbd_variance4x8/;
 
-  add_proto qw/unsigned int vp9_high_variance4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_high_variance4x4/;
+  add_proto qw/unsigned int vp9_highbd_variance4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vp9_highbd_variance4x4/;
 
-  add_proto qw/void vp9_high_get8x8var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
-  specialize qw/vp9_high_get8x8var/;
+  add_proto qw/void vp9_highbd_get8x8var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
+  specialize qw/vp9_highbd_get8x8var/;
 
-  add_proto qw/void vp9_high_get16x16var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
-  specialize qw/vp9_high_get16x16var/;
+  add_proto qw/void vp9_highbd_get16x16var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
+  specialize qw/vp9_highbd_get16x16var/;
 
-  add_proto qw/unsigned int vp9_high_10_variance32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_high_10_variance32x16/;
+  add_proto qw/unsigned int vp9_highbd_10_variance32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vp9_highbd_10_variance32x16/;
 
-  add_proto qw/unsigned int vp9_high_10_variance16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_high_10_variance16x32/;
+  add_proto qw/unsigned int vp9_highbd_10_variance16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vp9_highbd_10_variance16x32/;
 
-  add_proto qw/unsigned int vp9_high_10_variance64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_high_10_variance64x32/;
+  add_proto qw/unsigned int vp9_highbd_10_variance64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vp9_highbd_10_variance64x32/;
 
-  add_proto qw/unsigned int vp9_high_10_variance32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_high_10_variance32x64/;
+  add_proto qw/unsigned int vp9_highbd_10_variance32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vp9_highbd_10_variance32x64/;
 
-  add_proto qw/unsigned int vp9_high_10_variance32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_high_10_variance32x32/;
+  add_proto qw/unsigned int vp9_highbd_10_variance32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vp9_highbd_10_variance32x32/;
 
-  add_proto qw/unsigned int vp9_high_10_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_high_10_variance64x64/;
+  add_proto qw/unsigned int vp9_highbd_10_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vp9_highbd_10_variance64x64/;
 
-  add_proto qw/unsigned int vp9_high_10_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_high_10_variance16x16/;
+  add_proto qw/unsigned int vp9_highbd_10_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vp9_highbd_10_variance16x16/;
 
-  add_proto qw/unsigned int vp9_high_10_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_high_10_variance16x8/;
+  add_proto qw/unsigned int vp9_highbd_10_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vp9_highbd_10_variance16x8/;
 
-  add_proto qw/unsigned int vp9_high_10_variance8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_high_10_variance8x16/;
+  add_proto qw/unsigned int vp9_highbd_10_variance8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vp9_highbd_10_variance8x16/;
 
-  add_proto qw/unsigned int vp9_high_10_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_high_10_variance8x8/;
+  add_proto qw/unsigned int vp9_highbd_10_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vp9_highbd_10_variance8x8/;
 
-  add_proto qw/unsigned int vp9_high_10_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_high_10_variance8x4/;
+  add_proto qw/unsigned int vp9_highbd_10_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vp9_highbd_10_variance8x4/;
 
-  add_proto qw/unsigned int vp9_high_10_variance4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_high_10_variance4x8/;
+  add_proto qw/unsigned int vp9_highbd_10_variance4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vp9_highbd_10_variance4x8/;
 
-  add_proto qw/unsigned int vp9_high_10_variance4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_high_10_variance4x4/;
+  add_proto qw/unsigned int vp9_highbd_10_variance4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vp9_highbd_10_variance4x4/;
 
-  add_proto qw/void vp9_high_10_get8x8var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
-  specialize qw/vp9_high_10_get8x8var/;
+  add_proto qw/void vp9_highbd_10_get8x8var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
+  specialize qw/vp9_highbd_10_get8x8var/;
 
-  add_proto qw/void vp9_high_10_get16x16var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
-  specialize qw/vp9_high_10_get16x16var/;
+  add_proto qw/void vp9_highbd_10_get16x16var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
+  specialize qw/vp9_highbd_10_get16x16var/;
 
-  add_proto qw/unsigned int vp9_high_12_variance32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_high_12_variance32x16/;
+  add_proto qw/unsigned int vp9_highbd_12_variance32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vp9_highbd_12_variance32x16/;
 
-  add_proto qw/unsigned int vp9_high_12_variance16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_high_12_variance16x32/;
+  add_proto qw/unsigned int vp9_highbd_12_variance16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vp9_highbd_12_variance16x32/;
 
-  add_proto qw/unsigned int vp9_high_12_variance64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_high_12_variance64x32/;
+  add_proto qw/unsigned int vp9_highbd_12_variance64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vp9_highbd_12_variance64x32/;
 
-  add_proto qw/unsigned int vp9_high_12_variance32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_high_12_variance32x64/;
+  add_proto qw/unsigned int vp9_highbd_12_variance32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vp9_highbd_12_variance32x64/;
 
-  add_proto qw/unsigned int vp9_high_12_variance32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_high_12_variance32x32/;
+  add_proto qw/unsigned int vp9_highbd_12_variance32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vp9_highbd_12_variance32x32/;
 
-  add_proto qw/unsigned int vp9_high_12_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_high_12_variance64x64/;
+  add_proto qw/unsigned int vp9_highbd_12_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vp9_highbd_12_variance64x64/;
 
-  add_proto qw/unsigned int vp9_high_12_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_high_12_variance16x16/;
+  add_proto qw/unsigned int vp9_highbd_12_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vp9_highbd_12_variance16x16/;
 
-  add_proto qw/unsigned int vp9_high_12_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_high_12_variance16x8/;
+  add_proto qw/unsigned int vp9_highbd_12_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vp9_highbd_12_variance16x8/;
 
-  add_proto qw/unsigned int vp9_high_12_variance8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_high_12_variance8x16/;
+  add_proto qw/unsigned int vp9_highbd_12_variance8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vp9_highbd_12_variance8x16/;
 
-  add_proto qw/unsigned int vp9_high_12_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_high_12_variance8x8/;
+  add_proto qw/unsigned int vp9_highbd_12_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vp9_highbd_12_variance8x8/;
 
-  add_proto qw/unsigned int vp9_high_12_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_high_12_variance8x4/;
+  add_proto qw/unsigned int vp9_highbd_12_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vp9_highbd_12_variance8x4/;
 
-  add_proto qw/unsigned int vp9_high_12_variance4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_high_12_variance4x8/;
+  add_proto qw/unsigned int vp9_highbd_12_variance4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vp9_highbd_12_variance4x8/;
 
-  add_proto qw/unsigned int vp9_high_12_variance4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_high_12_variance4x4/;
+  add_proto qw/unsigned int vp9_highbd_12_variance4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vp9_highbd_12_variance4x4/;
 
-  add_proto qw/void vp9_high_12_get8x8var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
-  specialize qw/vp9_high_12_get8x8var/;
+  add_proto qw/void vp9_highbd_12_get8x8var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
+  specialize qw/vp9_highbd_12_get8x8var/;
 
-  add_proto qw/void vp9_high_12_get16x16var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
-  specialize qw/vp9_high_12_get16x16var/;
+  add_proto qw/void vp9_highbd_12_get16x16var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
+  specialize qw/vp9_highbd_12_get16x16var/;
 
-  add_proto qw/unsigned int vp9_high_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_high_sub_pixel_variance64x64/;
+  add_proto qw/unsigned int vp9_highbd_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vp9_highbd_sub_pixel_variance64x64/;
 
-  add_proto qw/unsigned int vp9_high_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
-  specialize qw/vp9_high_sub_pixel_avg_variance64x64/;
+  add_proto qw/unsigned int vp9_highbd_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
+  specialize qw/vp9_highbd_sub_pixel_avg_variance64x64/;
 
-  add_proto qw/unsigned int vp9_high_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_high_sub_pixel_variance32x64/;
+  add_proto qw/unsigned int vp9_highbd_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vp9_highbd_sub_pixel_variance32x64/;
 
-  add_proto qw/unsigned int vp9_high_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
-  specialize qw/vp9_high_sub_pixel_avg_variance32x64/;
+  add_proto qw/unsigned int vp9_highbd_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
+  specialize qw/vp9_highbd_sub_pixel_avg_variance32x64/;
 
-  add_proto qw/unsigned int vp9_high_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_high_sub_pixel_variance64x32/;
+  add_proto qw/unsigned int vp9_highbd_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vp9_highbd_sub_pixel_variance64x32/;
 
-  add_proto qw/unsigned int vp9_high_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
-  specialize qw/vp9_high_sub_pixel_avg_variance64x32/;
+  add_proto qw/unsigned int vp9_highbd_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
+  specialize qw/vp9_highbd_sub_pixel_avg_variance64x32/;
 
-  add_proto qw/unsigned int vp9_high_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_high_sub_pixel_variance32x16/;
+  add_proto qw/unsigned int vp9_highbd_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vp9_highbd_sub_pixel_variance32x16/;
 
-  add_proto qw/unsigned int vp9_high_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
-  specialize qw/vp9_high_sub_pixel_avg_variance32x16/;
+  add_proto qw/unsigned int vp9_highbd_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
+  specialize qw/vp9_highbd_sub_pixel_avg_variance32x16/;
 
-  add_proto qw/unsigned int vp9_high_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_high_sub_pixel_variance16x32/;
+  add_proto qw/unsigned int vp9_highbd_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vp9_highbd_sub_pixel_variance16x32/;
 
-  add_proto qw/unsigned int vp9_high_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
-  specialize qw/vp9_high_sub_pixel_avg_variance16x32/;
+  add_proto qw/unsigned int vp9_highbd_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
+  specialize qw/vp9_highbd_sub_pixel_avg_variance16x32/;
 
-  add_proto qw/unsigned int vp9_high_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_high_sub_pixel_variance32x32/;
+  add_proto qw/unsigned int vp9_highbd_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vp9_highbd_sub_pixel_variance32x32/;
 
-  add_proto qw/unsigned int vp9_high_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
-  specialize qw/vp9_high_sub_pixel_avg_variance32x32/;
+  add_proto qw/unsigned int vp9_highbd_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
+  specialize qw/vp9_highbd_sub_pixel_avg_variance32x32/;
 
-  add_proto qw/unsigned int vp9_high_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_high_sub_pixel_variance16x16/;
+  add_proto qw/unsigned int vp9_highbd_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vp9_highbd_sub_pixel_variance16x16/;
 
-  add_proto qw/unsigned int vp9_high_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
-  specialize qw/vp9_high_sub_pixel_avg_variance16x16/;
+  add_proto qw/unsigned int vp9_highbd_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
+  specialize qw/vp9_highbd_sub_pixel_avg_variance16x16/;
 
-  add_proto qw/unsigned int vp9_high_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_high_sub_pixel_variance8x16/;
+  add_proto qw/unsigned int vp9_highbd_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vp9_highbd_sub_pixel_variance8x16/;
 
-  add_proto qw/unsigned int vp9_high_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
-  specialize qw/vp9_high_sub_pixel_avg_variance8x16/;
+  add_proto qw/unsigned int vp9_highbd_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
+  specialize qw/vp9_highbd_sub_pixel_avg_variance8x16/;
 
-  add_proto qw/unsigned int vp9_high_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_high_sub_pixel_variance16x8/;
+  add_proto qw/unsigned int vp9_highbd_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vp9_highbd_sub_pixel_variance16x8/;
 
-  add_proto qw/unsigned int vp9_high_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
-  specialize qw/vp9_high_sub_pixel_avg_variance16x8/;
+  add_proto qw/unsigned int vp9_highbd_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
+  specialize qw/vp9_highbd_sub_pixel_avg_variance16x8/;
 
-  add_proto qw/unsigned int vp9_high_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_high_sub_pixel_variance8x8/;
+  add_proto qw/unsigned int vp9_highbd_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vp9_highbd_sub_pixel_variance8x8/;
 
-  add_proto qw/unsigned int vp9_high_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
-  specialize qw/vp9_high_sub_pixel_avg_variance8x8/;
+  add_proto qw/unsigned int vp9_highbd_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
+  specialize qw/vp9_highbd_sub_pixel_avg_variance8x8/;
 
-  add_proto qw/unsigned int vp9_high_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_high_sub_pixel_variance8x4/;
+  add_proto qw/unsigned int vp9_highbd_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vp9_highbd_sub_pixel_variance8x4/;
 
-  add_proto qw/unsigned int vp9_high_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
-  specialize qw/vp9_high_sub_pixel_avg_variance8x4/;
+  add_proto qw/unsigned int vp9_highbd_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
+  specialize qw/vp9_highbd_sub_pixel_avg_variance8x4/;
 
-  add_proto qw/unsigned int vp9_high_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_high_sub_pixel_variance4x8/;
+  add_proto qw/unsigned int vp9_highbd_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vp9_highbd_sub_pixel_variance4x8/;
 
-  add_proto qw/unsigned int vp9_high_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
-  specialize qw/vp9_high_sub_pixel_avg_variance4x8/;
+  add_proto qw/unsigned int vp9_highbd_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
+  specialize qw/vp9_highbd_sub_pixel_avg_variance4x8/;
 
-  add_proto qw/unsigned int vp9_high_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_high_sub_pixel_variance4x4/;
+  add_proto qw/unsigned int vp9_highbd_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vp9_highbd_sub_pixel_variance4x4/;
 
-  add_proto qw/unsigned int vp9_high_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
-  specialize qw/vp9_high_sub_pixel_avg_variance4x4/;
+  add_proto qw/unsigned int vp9_highbd_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
+  specialize qw/vp9_highbd_sub_pixel_avg_variance4x4/;
 
-  add_proto qw/unsigned int vp9_high_10_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_high_10_sub_pixel_variance64x64/;
+  add_proto qw/unsigned int vp9_highbd_10_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vp9_highbd_10_sub_pixel_variance64x64/;
 
-  add_proto qw/unsigned int vp9_high_10_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
-  specialize qw/vp9_high_10_sub_pixel_avg_variance64x64/;
+  add_proto qw/unsigned int vp9_highbd_10_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
+  specialize qw/vp9_highbd_10_sub_pixel_avg_variance64x64/;
 
-  add_proto qw/unsigned int vp9_high_10_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_high_10_sub_pixel_variance32x64/;
+  add_proto qw/unsigned int vp9_highbd_10_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vp9_highbd_10_sub_pixel_variance32x64/;
 
-  add_proto qw/unsigned int vp9_high_10_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
-  specialize qw/vp9_high_10_sub_pixel_avg_variance32x64/;
+  add_proto qw/unsigned int vp9_highbd_10_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
+  specialize qw/vp9_highbd_10_sub_pixel_avg_variance32x64/;
 
-  add_proto qw/unsigned int vp9_high_10_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_high_10_sub_pixel_variance64x32/;
+  add_proto qw/unsigned int vp9_highbd_10_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vp9_highbd_10_sub_pixel_variance64x32/;
 
-  add_proto qw/unsigned int vp9_high_10_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
-  specialize qw/vp9_high_10_sub_pixel_avg_variance64x32/;
+  add_proto qw/unsigned int vp9_highbd_10_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
+  specialize qw/vp9_highbd_10_sub_pixel_avg_variance64x32/;
 
-  add_proto qw/unsigned int vp9_high_10_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_high_10_sub_pixel_variance32x16/;
+  add_proto qw/unsigned int vp9_highbd_10_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vp9_highbd_10_sub_pixel_variance32x16/;
 
-  add_proto qw/unsigned int vp9_high_10_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
-  specialize qw/vp9_high_10_sub_pixel_avg_variance32x16/;
+  add_proto qw/unsigned int vp9_highbd_10_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
+  specialize qw/vp9_highbd_10_sub_pixel_avg_variance32x16/;
 
-  add_proto qw/unsigned int vp9_high_10_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_high_10_sub_pixel_variance16x32/;
+  add_proto qw/unsigned int vp9_highbd_10_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vp9_highbd_10_sub_pixel_variance16x32/;
 
-  add_proto qw/unsigned int vp9_high_10_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
-  specialize qw/vp9_high_10_sub_pixel_avg_variance16x32/;
+  add_proto qw/unsigned int vp9_highbd_10_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
+  specialize qw/vp9_highbd_10_sub_pixel_avg_variance16x32/;
 
-  add_proto qw/unsigned int vp9_high_10_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_high_10_sub_pixel_variance32x32/;
+  add_proto qw/unsigned int vp9_highbd_10_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vp9_highbd_10_sub_pixel_variance32x32/;
 
-  add_proto qw/unsigned int vp9_high_10_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
-  specialize qw/vp9_high_10_sub_pixel_avg_variance32x32/;
+  add_proto qw/unsigned int vp9_highbd_10_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
+  specialize qw/vp9_highbd_10_sub_pixel_avg_variance32x32/;
 
-  add_proto qw/unsigned int vp9_high_10_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_high_10_sub_pixel_variance16x16/;
+  add_proto qw/unsigned int vp9_highbd_10_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vp9_highbd_10_sub_pixel_variance16x16/;
 
-  add_proto qw/unsigned int vp9_high_10_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
-  specialize qw/vp9_high_10_sub_pixel_avg_variance16x16/;
+  add_proto qw/unsigned int vp9_highbd_10_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
+  specialize qw/vp9_highbd_10_sub_pixel_avg_variance16x16/;
 
-  add_proto qw/unsigned int vp9_high_10_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_high_10_sub_pixel_variance8x16/;
+  add_proto qw/unsigned int vp9_highbd_10_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vp9_highbd_10_sub_pixel_variance8x16/;
 
-  add_proto qw/unsigned int vp9_high_10_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
-  specialize qw/vp9_high_10_sub_pixel_avg_variance8x16/;
+  add_proto qw/unsigned int vp9_highbd_10_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
+  specialize qw/vp9_highbd_10_sub_pixel_avg_variance8x16/;
 
-  add_proto qw/unsigned int vp9_high_10_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_high_10_sub_pixel_variance16x8/;
+  add_proto qw/unsigned int vp9_highbd_10_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vp9_highbd_10_sub_pixel_variance16x8/;
 
-  add_proto qw/unsigned int vp9_high_10_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
-  specialize qw/vp9_high_10_sub_pixel_avg_variance16x8/;
+  add_proto qw/unsigned int vp9_highbd_10_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
+  specialize qw/vp9_highbd_10_sub_pixel_avg_variance16x8/;
 
-  add_proto qw/unsigned int vp9_high_10_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_high_10_sub_pixel_variance8x8/;
+  add_proto qw/unsigned int vp9_highbd_10_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vp9_highbd_10_sub_pixel_variance8x8/;
 
-  add_proto qw/unsigned int vp9_high_10_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
-  specialize qw/vp9_high_10_sub_pixel_avg_variance8x8/;
+  add_proto qw/unsigned int vp9_highbd_10_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
+  specialize qw/vp9_highbd_10_sub_pixel_avg_variance8x8/;
 
-  add_proto qw/unsigned int vp9_high_10_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_high_10_sub_pixel_variance8x4/;
+  add_proto qw/unsigned int vp9_highbd_10_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vp9_highbd_10_sub_pixel_variance8x4/;
 
-  add_proto qw/unsigned int vp9_high_10_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
-  specialize qw/vp9_high_10_sub_pixel_avg_variance8x4/;
+  add_proto qw/unsigned int vp9_highbd_10_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
+  specialize qw/vp9_highbd_10_sub_pixel_avg_variance8x4/;
 
-  add_proto qw/unsigned int vp9_high_10_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_high_10_sub_pixel_variance4x8/;
+  add_proto qw/unsigned int vp9_highbd_10_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vp9_highbd_10_sub_pixel_variance4x8/;
 
-  add_proto qw/unsigned int vp9_high_10_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
-  specialize qw/vp9_high_10_sub_pixel_avg_variance4x8/;
+  add_proto qw/unsigned int vp9_highbd_10_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
+  specialize qw/vp9_highbd_10_sub_pixel_avg_variance4x8/;
 
-  add_proto qw/unsigned int vp9_high_10_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_high_10_sub_pixel_variance4x4/;
+  add_proto qw/unsigned int vp9_highbd_10_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vp9_highbd_10_sub_pixel_variance4x4/;
 
-  add_proto qw/unsigned int vp9_high_10_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
-  specialize qw/vp9_high_10_sub_pixel_avg_variance4x4/;
+  add_proto qw/unsigned int vp9_highbd_10_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
+  specialize qw/vp9_highbd_10_sub_pixel_avg_variance4x4/;
 
-  add_proto qw/unsigned int vp9_high_12_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_high_12_sub_pixel_variance64x64/;
+  add_proto qw/unsigned int vp9_highbd_12_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vp9_highbd_12_sub_pixel_variance64x64/;
 
-  add_proto qw/unsigned int vp9_high_12_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
-  specialize qw/vp9_high_12_sub_pixel_avg_variance64x64/;
+  add_proto qw/unsigned int vp9_highbd_12_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
+  specialize qw/vp9_highbd_12_sub_pixel_avg_variance64x64/;
 
-  add_proto qw/unsigned int vp9_high_12_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_high_12_sub_pixel_variance32x64/;
+  add_proto qw/unsigned int vp9_highbd_12_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vp9_highbd_12_sub_pixel_variance32x64/;
 
-  add_proto qw/unsigned int vp9_high_12_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
-  specialize qw/vp9_high_12_sub_pixel_avg_variance32x64/;
+  add_proto qw/unsigned int vp9_highbd_12_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
+  specialize qw/vp9_highbd_12_sub_pixel_avg_variance32x64/;
 
-  add_proto qw/unsigned int vp9_high_12_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_high_12_sub_pixel_variance64x32/;
+  add_proto qw/unsigned int vp9_highbd_12_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vp9_highbd_12_sub_pixel_variance64x32/;
 
-  add_proto qw/unsigned int vp9_high_12_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
-  specialize qw/vp9_high_12_sub_pixel_avg_variance64x32/;
+  add_proto qw/unsigned int vp9_highbd_12_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
+  specialize qw/vp9_highbd_12_sub_pixel_avg_variance64x32/;
 
-  add_proto qw/unsigned int vp9_high_12_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_high_12_sub_pixel_variance32x16/;
+  add_proto qw/unsigned int vp9_highbd_12_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vp9_highbd_12_sub_pixel_variance32x16/;
 
-  add_proto qw/unsigned int vp9_high_12_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
-  specialize qw/vp9_high_12_sub_pixel_avg_variance32x16/;
+  add_proto qw/unsigned int vp9_highbd_12_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
+  specialize qw/vp9_highbd_12_sub_pixel_avg_variance32x16/;
 
-  add_proto qw/unsigned int vp9_high_12_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_high_12_sub_pixel_variance16x32/;
+  add_proto qw/unsigned int vp9_highbd_12_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vp9_highbd_12_sub_pixel_variance16x32/;
 
-  add_proto qw/unsigned int vp9_high_12_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
-  specialize qw/vp9_high_12_sub_pixel_avg_variance16x32/;
+  add_proto qw/unsigned int vp9_highbd_12_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
+  specialize qw/vp9_highbd_12_sub_pixel_avg_variance16x32/;
 
-  add_proto qw/unsigned int vp9_high_12_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_high_12_sub_pixel_variance32x32/;
+  add_proto qw/unsigned int vp9_highbd_12_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vp9_highbd_12_sub_pixel_variance32x32/;
 
-  add_proto qw/unsigned int vp9_high_12_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
-  specialize qw/vp9_high_12_sub_pixel_avg_variance32x32/;
+  add_proto qw/unsigned int vp9_highbd_12_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
+  specialize qw/vp9_highbd_12_sub_pixel_avg_variance32x32/;
 
-  add_proto qw/unsigned int vp9_high_12_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_high_12_sub_pixel_variance16x16/;
+  add_proto qw/unsigned int vp9_highbd_12_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vp9_highbd_12_sub_pixel_variance16x16/;
 
-  add_proto qw/unsigned int vp9_high_12_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
-  specialize qw/vp9_high_12_sub_pixel_avg_variance16x16/;
+  add_proto qw/unsigned int vp9_highbd_12_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
+  specialize qw/vp9_highbd_12_sub_pixel_avg_variance16x16/;
 
-  add_proto qw/unsigned int vp9_high_12_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_high_12_sub_pixel_variance8x16/;
+  add_proto qw/unsigned int vp9_highbd_12_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vp9_highbd_12_sub_pixel_variance8x16/;
 
-  add_proto qw/unsigned int vp9_high_12_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
-  specialize qw/vp9_high_12_sub_pixel_avg_variance8x16/;
+  add_proto qw/unsigned int vp9_highbd_12_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
+  specialize qw/vp9_highbd_12_sub_pixel_avg_variance8x16/;
 
-  add_proto qw/unsigned int vp9_high_12_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_high_12_sub_pixel_variance16x8/;
+  add_proto qw/unsigned int vp9_highbd_12_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vp9_highbd_12_sub_pixel_variance16x8/;
 
-  add_proto qw/unsigned int vp9_high_12_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
-  specialize qw/vp9_high_12_sub_pixel_avg_variance16x8/;
+  add_proto qw/unsigned int vp9_highbd_12_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
+  specialize qw/vp9_highbd_12_sub_pixel_avg_variance16x8/;
 
-  add_proto qw/unsigned int vp9_high_12_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_high_12_sub_pixel_variance8x8/;
+  add_proto qw/unsigned int vp9_highbd_12_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vp9_highbd_12_sub_pixel_variance8x8/;
 
-  add_proto qw/unsigned int vp9_high_12_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
-  specialize qw/vp9_high_12_sub_pixel_avg_variance8x8/;
+  add_proto qw/unsigned int vp9_highbd_12_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
+  specialize qw/vp9_highbd_12_sub_pixel_avg_variance8x8/;
 
-  add_proto qw/unsigned int vp9_high_12_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_high_12_sub_pixel_variance8x4/;
+  add_proto qw/unsigned int vp9_highbd_12_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vp9_highbd_12_sub_pixel_variance8x4/;
 
-  add_proto qw/unsigned int vp9_high_12_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
-  specialize qw/vp9_high_12_sub_pixel_avg_variance8x4/;
+  add_proto qw/unsigned int vp9_highbd_12_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
+  specialize qw/vp9_highbd_12_sub_pixel_avg_variance8x4/;
 
-  add_proto qw/unsigned int vp9_high_12_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_high_12_sub_pixel_variance4x8/;
+  add_proto qw/unsigned int vp9_highbd_12_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vp9_highbd_12_sub_pixel_variance4x8/;
 
-  add_proto qw/unsigned int vp9_high_12_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
-  specialize qw/vp9_high_12_sub_pixel_avg_variance4x8/;
+  add_proto qw/unsigned int vp9_highbd_12_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
+  specialize qw/vp9_highbd_12_sub_pixel_avg_variance4x8/;
 
-  add_proto qw/unsigned int vp9_high_12_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_high_12_sub_pixel_variance4x4/;
+  add_proto qw/unsigned int vp9_highbd_12_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vp9_highbd_12_sub_pixel_variance4x4/;
 
-  add_proto qw/unsigned int vp9_high_12_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
-  specialize qw/vp9_high_12_sub_pixel_avg_variance4x4/;
+  add_proto qw/unsigned int vp9_highbd_12_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
+  specialize qw/vp9_highbd_12_sub_pixel_avg_variance4x4/;
 
-  add_proto qw/unsigned int vp9_high_sad64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride";
-  specialize qw/vp9_high_sad64x64/;
+  add_proto qw/unsigned int vp9_highbd_sad64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride";
+  specialize qw/vp9_highbd_sad64x64/;
 
-  add_proto qw/unsigned int vp9_high_sad32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride";
-  specialize qw/vp9_high_sad32x64/;
+  add_proto qw/unsigned int vp9_highbd_sad32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride";
+  specialize qw/vp9_highbd_sad32x64/;
 
-  add_proto qw/unsigned int vp9_high_sad64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride";
-  specialize qw/vp9_high_sad64x32/;
+  add_proto qw/unsigned int vp9_highbd_sad64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride";
+  specialize qw/vp9_highbd_sad64x32/;
 
-  add_proto qw/unsigned int vp9_high_sad32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride";
-  specialize qw/vp9_high_sad32x16/;
+  add_proto qw/unsigned int vp9_highbd_sad32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride";
+  specialize qw/vp9_highbd_sad32x16/;
 
-  add_proto qw/unsigned int vp9_high_sad16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride";
-  specialize qw/vp9_high_sad16x32/;
+  add_proto qw/unsigned int vp9_highbd_sad16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride";
+  specialize qw/vp9_highbd_sad16x32/;
 
-  add_proto qw/unsigned int vp9_high_sad32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride";
-  specialize qw/vp9_high_sad32x32/;
+  add_proto qw/unsigned int vp9_highbd_sad32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride";
+  specialize qw/vp9_highbd_sad32x32/;
 
-  add_proto qw/unsigned int vp9_high_sad16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride";
-  specialize qw/vp9_high_sad16x16/;
+  add_proto qw/unsigned int vp9_highbd_sad16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride";
+  specialize qw/vp9_highbd_sad16x16/;
 
-  add_proto qw/unsigned int vp9_high_sad16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride";
-  specialize qw/vp9_high_sad16x8/;
+  add_proto qw/unsigned int vp9_highbd_sad16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride";
+  specialize qw/vp9_highbd_sad16x8/;
 
-  add_proto qw/unsigned int vp9_high_sad8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride";
-  specialize qw/vp9_high_sad8x16/;
+  add_proto qw/unsigned int vp9_highbd_sad8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride";
+  specialize qw/vp9_highbd_sad8x16/;
 
-  add_proto qw/unsigned int vp9_high_sad8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride";
-  specialize qw/vp9_high_sad8x8/;
+  add_proto qw/unsigned int vp9_highbd_sad8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride";
+  specialize qw/vp9_highbd_sad8x8/;
 
-  add_proto qw/unsigned int vp9_high_sad8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride";
-  specialize qw/vp9_high_sad8x4/;
+  add_proto qw/unsigned int vp9_highbd_sad8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride";
+  specialize qw/vp9_highbd_sad8x4/;
 
-  add_proto qw/unsigned int vp9_high_sad4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride";
-  specialize qw/vp9_high_sad4x8/;
+  add_proto qw/unsigned int vp9_highbd_sad4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride";
+  specialize qw/vp9_highbd_sad4x8/;
 
-  add_proto qw/unsigned int vp9_high_sad4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride";
-  specialize qw/vp9_high_sad4x4/;
+  add_proto qw/unsigned int vp9_highbd_sad4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride";
+  specialize qw/vp9_highbd_sad4x4/;
 
-  add_proto qw/unsigned int vp9_high_sad64x64_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, const uint8_t *second_pred";
-  specialize qw/vp9_high_sad64x64_avg/;
+  add_proto qw/unsigned int vp9_highbd_sad64x64_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, const uint8_t *second_pred";
+  specialize qw/vp9_highbd_sad64x64_avg/;
 
-  add_proto qw/unsigned int vp9_high_sad32x64_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
-  specialize qw/vp9_high_sad32x64_avg/;
+  add_proto qw/unsigned int vp9_highbd_sad32x64_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
+  specialize qw/vp9_highbd_sad32x64_avg/;
 
-  add_proto qw/unsigned int vp9_high_sad64x32_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
-  specialize qw/vp9_high_sad64x32_avg/;
+  add_proto qw/unsigned int vp9_highbd_sad64x32_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
+  specialize qw/vp9_highbd_sad64x32_avg/;
 
-  add_proto qw/unsigned int vp9_high_sad32x16_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
-  specialize qw/vp9_high_sad32x16_avg/;
+  add_proto qw/unsigned int vp9_highbd_sad32x16_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
+  specialize qw/vp9_highbd_sad32x16_avg/;
 
-  add_proto qw/unsigned int vp9_high_sad16x32_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
-  specialize qw/vp9_high_sad16x32_avg/;
+  add_proto qw/unsigned int vp9_highbd_sad16x32_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
+  specialize qw/vp9_highbd_sad16x32_avg/;
 
-  add_proto qw/unsigned int vp9_high_sad32x32_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, const uint8_t *second_pred";
-  specialize qw/vp9_high_sad32x32_avg/;
+  add_proto qw/unsigned int vp9_highbd_sad32x32_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, const uint8_t *second_pred";
+  specialize qw/vp9_highbd_sad32x32_avg/;
 
-  add_proto qw/unsigned int vp9_high_sad16x16_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, const uint8_t *second_pred";
-  specialize qw/vp9_high_sad16x16_avg/;
+  add_proto qw/unsigned int vp9_highbd_sad16x16_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, const uint8_t *second_pred";
+  specialize qw/vp9_highbd_sad16x16_avg/;
 
-  add_proto qw/unsigned int vp9_high_sad16x8_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, const uint8_t *second_pred";
-  specialize qw/vp9_high_sad16x8_avg/;
+  add_proto qw/unsigned int vp9_highbd_sad16x8_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, const uint8_t *second_pred";
+  specialize qw/vp9_highbd_sad16x8_avg/;
 
-  add_proto qw/unsigned int vp9_high_sad8x16_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, const uint8_t *second_pred";
-  specialize qw/vp9_high_sad8x16_avg/;
+  add_proto qw/unsigned int vp9_highbd_sad8x16_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, const uint8_t *second_pred";
+  specialize qw/vp9_highbd_sad8x16_avg/;
 
-  add_proto qw/unsigned int vp9_high_sad8x8_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, const uint8_t *second_pred";
-  specialize qw/vp9_high_sad8x8_avg/;
+  add_proto qw/unsigned int vp9_highbd_sad8x8_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, const uint8_t *second_pred";
+  specialize qw/vp9_highbd_sad8x8_avg/;
 
-  add_proto qw/unsigned int vp9_high_sad8x4_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
-  specialize qw/vp9_high_sad8x4_avg/;
+  add_proto qw/unsigned int vp9_highbd_sad8x4_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
+  specialize qw/vp9_highbd_sad8x4_avg/;
 
-  add_proto qw/unsigned int vp9_high_sad4x8_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
-  specialize qw/vp9_high_sad4x8_avg/;
+  add_proto qw/unsigned int vp9_highbd_sad4x8_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
+  specialize qw/vp9_highbd_sad4x8_avg/;
 
-  add_proto qw/unsigned int vp9_high_sad4x4_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, const uint8_t *second_pred";
-  specialize qw/vp9_high_sad4x4_avg/;
+  add_proto qw/unsigned int vp9_highbd_sad4x4_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, const uint8_t *second_pred";
+  specialize qw/vp9_highbd_sad4x4_avg/;
 
-  add_proto qw/void vp9_high_sad64x64x3/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, unsigned int *sad_array";
-  specialize qw/vp9_high_sad64x64x3/;
+  add_proto qw/void vp9_highbd_sad64x64x3/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, unsigned int *sad_array";
+  specialize qw/vp9_highbd_sad64x64x3/;
 
-  add_proto qw/void vp9_high_sad32x32x3/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, unsigned int *sad_array";
-  specialize qw/vp9_high_sad32x32x3/;
+  add_proto qw/void vp9_highbd_sad32x32x3/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, unsigned int *sad_array";
+  specialize qw/vp9_highbd_sad32x32x3/;
 
-  add_proto qw/void vp9_high_sad16x16x3/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, unsigned int *sad_array";
-  specialize qw/vp9_high_sad16x16x3/;
+  add_proto qw/void vp9_highbd_sad16x16x3/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, unsigned int *sad_array";
+  specialize qw/vp9_highbd_sad16x16x3/;
 
-  add_proto qw/void vp9_high_sad16x8x3/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, unsigned int *sad_array";
-  specialize qw/vp9_high_sad16x8x3/;
+  add_proto qw/void vp9_highbd_sad16x8x3/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, unsigned int *sad_array";
+  specialize qw/vp9_highbd_sad16x8x3/;
 
-  add_proto qw/void vp9_high_sad8x16x3/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, unsigned int *sad_array";
-  specialize qw/vp9_high_sad8x16x3/;
+  add_proto qw/void vp9_highbd_sad8x16x3/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, unsigned int *sad_array";
+  specialize qw/vp9_highbd_sad8x16x3/;
 
-  add_proto qw/void vp9_high_sad8x8x3/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, unsigned int *sad_array";
-  specialize qw/vp9_high_sad8x8x3/;
+  add_proto qw/void vp9_highbd_sad8x8x3/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, unsigned int *sad_array";
+  specialize qw/vp9_highbd_sad8x8x3/;
 
-  add_proto qw/void vp9_high_sad4x4x3/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, unsigned int *sad_array";
-  specialize qw/vp9_high_sad4x4x3/;
+  add_proto qw/void vp9_highbd_sad4x4x3/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, unsigned int *sad_array";
+  specialize qw/vp9_highbd_sad4x4x3/;
 
-  add_proto qw/void vp9_high_sad64x64x8/, "const uint8_t *src_ptr, int  src_stride, const uint8_t *ref_ptr, int  ref_stride, uint32_t *sad_array";
-  specialize qw/vp9_high_sad64x64x8/;
+  add_proto qw/void vp9_highbd_sad64x64x8/, "const uint8_t *src_ptr, int  src_stride, const uint8_t *ref_ptr, int  ref_stride, uint32_t *sad_array";
+  specialize qw/vp9_highbd_sad64x64x8/;
 
-  add_proto qw/void vp9_high_sad32x32x8/, "const uint8_t *src_ptr, int  src_stride, const uint8_t *ref_ptr, int  ref_stride, uint32_t *sad_array";
-  specialize qw/vp9_high_sad32x32x8/;
+  add_proto qw/void vp9_highbd_sad32x32x8/, "const uint8_t *src_ptr, int  src_stride, const uint8_t *ref_ptr, int  ref_stride, uint32_t *sad_array";
+  specialize qw/vp9_highbd_sad32x32x8/;
 
-  add_proto qw/void vp9_high_sad16x16x8/, "const uint8_t *src_ptr, int  src_stride, const uint8_t *ref_ptr, int  ref_stride, uint32_t *sad_array";
-  specialize qw/vp9_high_sad16x16x8/;
+  add_proto qw/void vp9_highbd_sad16x16x8/, "const uint8_t *src_ptr, int  src_stride, const uint8_t *ref_ptr, int  ref_stride, uint32_t *sad_array";
+  specialize qw/vp9_highbd_sad16x16x8/;
 
-  add_proto qw/void vp9_high_sad16x8x8/, "const uint8_t *src_ptr, int  src_stride, const uint8_t *ref_ptr, int  ref_stride, uint32_t *sad_array";
-  specialize qw/vp9_high_sad16x8x8/;
+  add_proto qw/void vp9_highbd_sad16x8x8/, "const uint8_t *src_ptr, int  src_stride, const uint8_t *ref_ptr, int  ref_stride, uint32_t *sad_array";
+  specialize qw/vp9_highbd_sad16x8x8/;
 
-  add_proto qw/void vp9_high_sad8x16x8/, "const uint8_t *src_ptr, int  src_stride, const uint8_t *ref_ptr, int  ref_stride, uint32_t *sad_array";
-  specialize qw/vp9_high_sad8x16x8/;
+  add_proto qw/void vp9_highbd_sad8x16x8/, "const uint8_t *src_ptr, int  src_stride, const uint8_t *ref_ptr, int  ref_stride, uint32_t *sad_array";
+  specialize qw/vp9_highbd_sad8x16x8/;
 
-  add_proto qw/void vp9_high_sad8x8x8/, "const uint8_t *src_ptr, int  src_stride, const uint8_t *ref_ptr, int  ref_stride, uint32_t *sad_array";
-  specialize qw/vp9_high_sad8x8x8/;
+  add_proto qw/void vp9_highbd_sad8x8x8/, "const uint8_t *src_ptr, int  src_stride, const uint8_t *ref_ptr, int  ref_stride, uint32_t *sad_array";
+  specialize qw/vp9_highbd_sad8x8x8/;
 
-  add_proto qw/void vp9_high_sad8x4x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
-  specialize qw/vp9_high_sad8x4x8/;
+  add_proto qw/void vp9_highbd_sad8x4x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
+  specialize qw/vp9_highbd_sad8x4x8/;
 
-  add_proto qw/void vp9_high_sad4x8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
-  specialize qw/vp9_high_sad4x8x8/;
+  add_proto qw/void vp9_highbd_sad4x8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
+  specialize qw/vp9_highbd_sad4x8x8/;
 
-  add_proto qw/void vp9_high_sad4x4x8/, "const uint8_t *src_ptr, int  src_stride, const uint8_t *ref_ptr, int  ref_stride, uint32_t *sad_array";
-  specialize qw/vp9_high_sad4x4x8/;
+  add_proto qw/void vp9_highbd_sad4x4x8/, "const uint8_t *src_ptr, int  src_stride, const uint8_t *ref_ptr, int  ref_stride, uint32_t *sad_array";
+  specialize qw/vp9_highbd_sad4x4x8/;
 
-  add_proto qw/void vp9_high_sad64x64x4d/, "const uint8_t *src_ptr, int  src_stride, const uint8_t* const ref_ptr[], int  ref_stride, unsigned int *sad_array";
-  specialize qw/vp9_high_sad64x64x4d/;
+  add_proto qw/void vp9_highbd_sad64x64x4d/, "const uint8_t *src_ptr, int  src_stride, const uint8_t* const ref_ptr[], int  ref_stride, unsigned int *sad_array";
+  specialize qw/vp9_highbd_sad64x64x4d/;
 
-  add_proto qw/void vp9_high_sad32x64x4d/, "const uint8_t *src_ptr, int  src_stride, const uint8_t* const ref_ptr[], int  ref_stride, unsigned int *sad_array";
-  specialize qw/vp9_high_sad32x64x4d/;
+  add_proto qw/void vp9_highbd_sad32x64x4d/, "const uint8_t *src_ptr, int  src_stride, const uint8_t* const ref_ptr[], int  ref_stride, unsigned int *sad_array";
+  specialize qw/vp9_highbd_sad32x64x4d/;
 
-  add_proto qw/void vp9_high_sad64x32x4d/, "const uint8_t *src_ptr, int  src_stride, const uint8_t* const ref_ptr[], int  ref_stride, unsigned int *sad_array";
-  specialize qw/vp9_high_sad64x32x4d/;
+  add_proto qw/void vp9_highbd_sad64x32x4d/, "const uint8_t *src_ptr, int  src_stride, const uint8_t* const ref_ptr[], int  ref_stride, unsigned int *sad_array";
+  specialize qw/vp9_highbd_sad64x32x4d/;
 
-  add_proto qw/void vp9_high_sad32x16x4d/, "const uint8_t *src_ptr, int  src_stride, const uint8_t* const ref_ptr[], int  ref_stride, unsigned int *sad_array";
-  specialize qw/vp9_high_sad32x16x4d/;
+  add_proto qw/void vp9_highbd_sad32x16x4d/, "const uint8_t *src_ptr, int  src_stride, const uint8_t* const ref_ptr[], int  ref_stride, unsigned int *sad_array";
+  specialize qw/vp9_highbd_sad32x16x4d/;
 
-  add_proto qw/void vp9_high_sad16x32x4d/, "const uint8_t *src_ptr, int  src_stride, const uint8_t* const ref_ptr[], int  ref_stride, unsigned int *sad_array";
-  specialize qw/vp9_high_sad16x32x4d/;
+  add_proto qw/void vp9_highbd_sad16x32x4d/, "const uint8_t *src_ptr, int  src_stride, const uint8_t* const ref_ptr[], int  ref_stride, unsigned int *sad_array";
+  specialize qw/vp9_highbd_sad16x32x4d/;
 
-  add_proto qw/void vp9_high_sad32x32x4d/, "const uint8_t *src_ptr, int  src_stride, const uint8_t* const ref_ptr[], int  ref_stride, unsigned int *sad_array";
-  specialize qw/vp9_high_sad32x32x4d/;
+  add_proto qw/void vp9_highbd_sad32x32x4d/, "const uint8_t *src_ptr, int  src_stride, const uint8_t* const ref_ptr[], int  ref_stride, unsigned int *sad_array";
+  specialize qw/vp9_highbd_sad32x32x4d/;
 
-  add_proto qw/void vp9_high_sad16x16x4d/, "const uint8_t *src_ptr, int  src_stride, const uint8_t* const ref_ptr[], int  ref_stride, unsigned int *sad_array";
-  specialize qw/vp9_high_sad16x16x4d/;
+  add_proto qw/void vp9_highbd_sad16x16x4d/, "const uint8_t *src_ptr, int  src_stride, const uint8_t* const ref_ptr[], int  ref_stride, unsigned int *sad_array";
+  specialize qw/vp9_highbd_sad16x16x4d/;
 
-  add_proto qw/void vp9_high_sad16x8x4d/, "const uint8_t *src_ptr, int  src_stride, const uint8_t* const ref_ptr[], int  ref_stride, unsigned int *sad_array";
-  specialize qw/vp9_high_sad16x8x4d/;
+  add_proto qw/void vp9_highbd_sad16x8x4d/, "const uint8_t *src_ptr, int  src_stride, const uint8_t* const ref_ptr[], int  ref_stride, unsigned int *sad_array";
+  specialize qw/vp9_highbd_sad16x8x4d/;
 
-  add_proto qw/void vp9_high_sad8x16x4d/, "const uint8_t *src_ptr, int  src_stride, const uint8_t* const ref_ptr[], int  ref_stride, unsigned int *sad_array";
-  specialize qw/vp9_high_sad8x16x4d/;
+  add_proto qw/void vp9_highbd_sad8x16x4d/, "const uint8_t *src_ptr, int  src_stride, const uint8_t* const ref_ptr[], int  ref_stride, unsigned int *sad_array";
+  specialize qw/vp9_highbd_sad8x16x4d/;
 
-  add_proto qw/void vp9_high_sad8x8x4d/, "const uint8_t *src_ptr, int  src_stride, const uint8_t* const ref_ptr[], int  ref_stride, unsigned int *sad_array";
-  specialize qw/vp9_high_sad8x8x4d/;
+  add_proto qw/void vp9_highbd_sad8x8x4d/, "const uint8_t *src_ptr, int  src_stride, const uint8_t* const ref_ptr[], int  ref_stride, unsigned int *sad_array";
+  specialize qw/vp9_highbd_sad8x8x4d/;
 
   # TODO(jingning): need to convert these 4x8/8x4 functions into sse2 form
-  add_proto qw/void vp9_high_sad8x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array";
-  specialize qw/vp9_high_sad8x4x4d/;
+  add_proto qw/void vp9_highbd_sad8x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array";
+  specialize qw/vp9_highbd_sad8x4x4d/;
 
-  add_proto qw/void vp9_high_sad4x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array";
-  specialize qw/vp9_high_sad4x8x4d/;
+  add_proto qw/void vp9_highbd_sad4x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array";
+  specialize qw/vp9_highbd_sad4x8x4d/;
 
-  add_proto qw/void vp9_high_sad4x4x4d/, "const uint8_t *src_ptr, int  src_stride, const uint8_t* const ref_ptr[], int  ref_stride, unsigned int *sad_array";
-  specialize qw/vp9_high_sad4x4x4d/;
+  add_proto qw/void vp9_highbd_sad4x4x4d/, "const uint8_t *src_ptr, int  src_stride, const uint8_t* const ref_ptr[], int  ref_stride, unsigned int *sad_array";
+  specialize qw/vp9_highbd_sad4x4x4d/;
 
-  add_proto qw/unsigned int vp9_high_mse16x16/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
-  specialize qw/vp9_high_mse16x16/;
+  add_proto qw/unsigned int vp9_highbd_mse16x16/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
+  specialize qw/vp9_highbd_mse16x16/;
 
-  add_proto qw/unsigned int vp9_high_mse8x16/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
-  specialize qw/vp9_high_mse8x16/;
+  add_proto qw/unsigned int vp9_highbd_mse8x16/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
+  specialize qw/vp9_highbd_mse8x16/;
 
-  add_proto qw/unsigned int vp9_high_mse16x8/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
-  specialize qw/vp9_high_mse16x8/;
+  add_proto qw/unsigned int vp9_highbd_mse16x8/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
+  specialize qw/vp9_highbd_mse16x8/;
 
-  add_proto qw/unsigned int vp9_high_mse8x8/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
-  specialize qw/vp9_high_mse8x8/;
+  add_proto qw/unsigned int vp9_highbd_mse8x8/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
+  specialize qw/vp9_highbd_mse8x8/;
 
-  add_proto qw/unsigned int vp9_high_10_mse16x16/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
-  specialize qw/vp9_high_10_mse16x16/;
+  add_proto qw/unsigned int vp9_highbd_10_mse16x16/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
+  specialize qw/vp9_highbd_10_mse16x16/;
 
-  add_proto qw/unsigned int vp9_high_10_mse8x16/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
-  specialize qw/vp9_high_10_mse8x16/;
+  add_proto qw/unsigned int vp9_highbd_10_mse8x16/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
+  specialize qw/vp9_highbd_10_mse8x16/;
 
-  add_proto qw/unsigned int vp9_high_10_mse16x8/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
-  specialize qw/vp9_high_10_mse16x8/;
+  add_proto qw/unsigned int vp9_highbd_10_mse16x8/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
+  specialize qw/vp9_highbd_10_mse16x8/;
 
-  add_proto qw/unsigned int vp9_high_10_mse8x8/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
-  specialize qw/vp9_high_10_mse8x8/;
+  add_proto qw/unsigned int vp9_highbd_10_mse8x8/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
+  specialize qw/vp9_highbd_10_mse8x8/;
 
-  add_proto qw/unsigned int vp9_high_12_mse16x16/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
-  specialize qw/vp9_high_12_mse16x16/;
+  add_proto qw/unsigned int vp9_highbd_12_mse16x16/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
+  specialize qw/vp9_highbd_12_mse16x16/;
 
-  add_proto qw/unsigned int vp9_high_12_mse8x16/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
-  specialize qw/vp9_high_12_mse8x16/;
+  add_proto qw/unsigned int vp9_highbd_12_mse8x16/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
+  specialize qw/vp9_highbd_12_mse8x16/;
 
-  add_proto qw/unsigned int vp9_high_12_mse16x8/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
-  specialize qw/vp9_high_12_mse16x8/;
+  add_proto qw/unsigned int vp9_highbd_12_mse16x8/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
+  specialize qw/vp9_highbd_12_mse16x8/;
 
-  add_proto qw/unsigned int vp9_high_12_mse8x8/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
-  specialize qw/vp9_high_12_mse8x8/;
+  add_proto qw/unsigned int vp9_highbd_12_mse8x8/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
+  specialize qw/vp9_highbd_12_mse8x8/;
 
   # ENCODEMB INVOKE
 
-  add_proto qw/int64_t vp9_high_block_error/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz, int bd";
-  specialize qw/vp9_high_block_error/;
+  add_proto qw/int64_t vp9_highbd_block_error/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz, int bd";
+  specialize qw/vp9_highbd_block_error/;
 
-  add_proto qw/void vp9_high_subtract_block/, "int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride, int bd";
-  specialize qw/vp9_high_subtract_block/;
+  add_proto qw/void vp9_highbd_subtract_block/, "int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride, int bd";
+  specialize qw/vp9_highbd_subtract_block/;
 
-  add_proto qw/void vp9_high_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
-  specialize qw/vp9_high_quantize_fp/;
+  add_proto qw/void vp9_highbd_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+  specialize qw/vp9_highbd_quantize_fp/;
 
-  add_proto qw/void vp9_high_quantize_fp_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
-  specialize qw/vp9_high_quantize_fp_32x32/;
+  add_proto qw/void vp9_highbd_quantize_fp_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+  specialize qw/vp9_highbd_quantize_fp_32x32/;
 
-  add_proto qw/void vp9_high_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
-  specialize qw/vp9_high_quantize_b/;
+  add_proto qw/void vp9_highbd_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+  specialize qw/vp9_highbd_quantize_b/;
 
-  add_proto qw/void vp9_high_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
-  specialize qw/vp9_high_quantize_b_32x32/;
+  add_proto qw/void vp9_highbd_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+  specialize qw/vp9_highbd_quantize_b_32x32/;
 
   #
   # Structured Similarity (SSIM)
   #
   if (vpx_config("CONFIG_INTERNAL_STATS") eq "yes") {
-    add_proto qw/void vp9_high_ssim_parms_8x8/, "uint16_t *s, int sp, uint16_t *r, int rp, uint32_t *sum_s, uint32_t *sum_r, uint32_t *sum_sq_s, uint32_t *sum_sq_r, uint32_t *sum_sxr";
-    specialize qw/vp9_high_ssim_parms_8x8/;
-
-    add_proto qw/void vp9_high_ssim_parms_8x8_shift/, "uint16_t *s, int sp, uint16_t *r, int rp, uint32_t *sum_s, uint32_t *sum_r, uint32_t *sum_sq_s, uint32_t *sum_sq_r, uint32_t *sum_sxr, unsigned int bd, unsigned int shift";
-    specialize qw/vp9_high_ssim_parms_8x8_shift/;
+    add_proto qw/void vp9_highbd_ssim_parms_8x8/, "uint16_t *s, int sp, uint16_t *r, int rp, uint32_t *sum_s, uint32_t *sum_r, uint32_t *sum_sq_s, uint32_t *sum_sq_r, uint32_t *sum_sxr";
+    specialize qw/vp9_highbd_ssim_parms_8x8/;
   }
 
   # fdct functions
-  add_proto qw/void vp9_high_fht4x4/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
-  specialize qw/vp9_high_fht4x4/;
+  add_proto qw/void vp9_highbd_fht4x4/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
+  specialize qw/vp9_highbd_fht4x4/;
 
-  add_proto qw/void vp9_high_fht8x8/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
-  specialize qw/vp9_high_fht8x8/;
+  add_proto qw/void vp9_highbd_fht8x8/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
+  specialize qw/vp9_highbd_fht8x8/;
 
-  add_proto qw/void vp9_high_fht16x16/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
-  specialize qw/vp9_high_fht16x16/;
+  add_proto qw/void vp9_highbd_fht16x16/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
+  specialize qw/vp9_highbd_fht16x16/;
 
-  add_proto qw/void vp9_high_fwht4x4/, "const int16_t *input, tran_low_t *output, int stride";
-  specialize qw/vp9_high_fwht4x4/;
+  add_proto qw/void vp9_highbd_fwht4x4/, "const int16_t *input, tran_low_t *output, int stride";
+  specialize qw/vp9_highbd_fwht4x4/;
 
-  add_proto qw/void vp9_high_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride";
-  specialize qw/vp9_high_fdct4x4/;
+  add_proto qw/void vp9_highbd_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride";
+  specialize qw/vp9_highbd_fdct4x4/;
 
-  add_proto qw/void vp9_high_fdct8x8_1/, "const int16_t *input, tran_low_t *output, int stride";
-  specialize qw/vp9_high_fdct8x8_1/;
+  add_proto qw/void vp9_highbd_fdct8x8_1/, "const int16_t *input, tran_low_t *output, int stride";
+  specialize qw/vp9_highbd_fdct8x8_1/;
 
-  add_proto qw/void vp9_high_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride";
-  specialize qw/vp9_high_fdct8x8/;
+  add_proto qw/void vp9_highbd_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride";
+  specialize qw/vp9_highbd_fdct8x8/;
 
-  add_proto qw/void vp9_high_fdct16x16_1/, "const int16_t *input, tran_low_t *output, int stride";
-  specialize qw/vp9_high_fdct16x16_1/;
+  add_proto qw/void vp9_highbd_fdct16x16_1/, "const int16_t *input, tran_low_t *output, int stride";
+  specialize qw/vp9_highbd_fdct16x16_1/;
 
-  add_proto qw/void vp9_high_fdct16x16/, "const int16_t *input, tran_low_t *output, int stride";
-  specialize qw/vp9_high_fdct16x16/;
+  add_proto qw/void vp9_highbd_fdct16x16/, "const int16_t *input, tran_low_t *output, int stride";
+  specialize qw/vp9_highbd_fdct16x16/;
 
-  add_proto qw/void vp9_high_fdct32x32_1/, "const int16_t *input, tran_low_t *output, int stride";
-  specialize qw/vp9_high_fdct32x32_1/;
+  add_proto qw/void vp9_highbd_fdct32x32_1/, "const int16_t *input, tran_low_t *output, int stride";
+  specialize qw/vp9_highbd_fdct32x32_1/;
 
-  add_proto qw/void vp9_high_fdct32x32/, "const int16_t *input, tran_low_t *output, int stride";
-  specialize qw/vp9_high_fdct32x32/;
+  add_proto qw/void vp9_highbd_fdct32x32/, "const int16_t *input, tran_low_t *output, int stride";
+  specialize qw/vp9_highbd_fdct32x32/;
 
-  add_proto qw/void vp9_high_fdct32x32_rd/, "const int16_t *input, tran_low_t *output, int stride";
-  specialize qw/vp9_high_fdct32x32_rd/;
+  add_proto qw/void vp9_highbd_fdct32x32_rd/, "const int16_t *input, tran_low_t *output, int stride";
+  specialize qw/vp9_highbd_fdct32x32_rd/;
 
-  add_proto qw/void vp9_high_temporal_filter_apply/, "uint8_t *frame1, unsigned int stride, uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count";
-  specialize qw/vp9_high_temporal_filter_apply/;
+  add_proto qw/void vp9_highbd_temporal_filter_apply/, "uint8_t *frame1, unsigned int stride, uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count";
+  specialize qw/vp9_highbd_temporal_filter_apply/;
 
 }
 # End vp9_high encoder functions
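
[Editor's note, not part of the patch: each add_proto/specialize pair above feeds libvpx's RTCD generator, which emits the dispatch header vp9_rtcd.h at build time. Since none of the renamed vp9_highbd_* functions declare SIMD variants yet, specialize() with no extra arguments binds each symbol straight to its C implementation. A minimal sketch of the generated output for one of the prototypes above, assuming the usual rtcd.pl behavior — the real header is produced by the build, not written by hand:

unsigned int vp9_highbd_sad8x8_avg_c(const uint8_t *src_ptr,
                                     int source_stride,
                                     const uint8_t *ref_ptr,
                                     int ref_stride,
                                     const uint8_t *second_pred);
/* No SIMD specializations were listed, so the RTCD macro resolves
 * directly to the C version; no runtime function pointer is involved. */
#define vp9_highbd_sad8x8_avg vp9_highbd_sad8x8_avg_c
]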
diff --git a/source/libvpx/vp9/common/vp9_scale.c b/source/libvpx/vp9/common/vp9_scale.c
index 63e2b53..6db8f9c 100644
--- a/source/libvpx/vp9/common/vp9_scale.c
+++ b/source/libvpx/vp9/common/vp9_scale.c
@@ -47,7 +47,7 @@
 void vp9_setup_scale_factors_for_frame(struct scale_factors *sf,
                                        int other_w, int other_h,
                                        int this_w, int this_h,
-                                       int use_high) {
+                                       int use_highbd) {
 #else
 void vp9_setup_scale_factors_for_frame(struct scale_factors *sf,
                                        int other_w, int other_h,
@@ -119,47 +119,47 @@
   sf->predict[1][1][0] = vp9_convolve8;
   sf->predict[1][1][1] = vp9_convolve8_avg;
 #if CONFIG_VP9_HIGHBITDEPTH
-  if (use_high) {
+  if (use_highbd) {
     if (sf->x_step_q4 == 16) {
       if (sf->y_step_q4 == 16) {
         // No scaling in either direction.
-        sf->high_predict[0][0][0] = vp9_high_convolve_copy;
-        sf->high_predict[0][0][1] = vp9_high_convolve_avg;
-        sf->high_predict[0][1][0] = vp9_high_convolve8_vert;
-        sf->high_predict[0][1][1] = vp9_high_convolve8_avg_vert;
-        sf->high_predict[1][0][0] = vp9_high_convolve8_horiz;
-        sf->high_predict[1][0][1] = vp9_high_convolve8_avg_horiz;
+        sf->highbd_predict[0][0][0] = vp9_highbd_convolve_copy;
+        sf->highbd_predict[0][0][1] = vp9_highbd_convolve_avg;
+        sf->highbd_predict[0][1][0] = vp9_highbd_convolve8_vert;
+        sf->highbd_predict[0][1][1] = vp9_highbd_convolve8_avg_vert;
+        sf->highbd_predict[1][0][0] = vp9_highbd_convolve8_horiz;
+        sf->highbd_predict[1][0][1] = vp9_highbd_convolve8_avg_horiz;
       } else {
         // No scaling in x direction. Must always scale in the y direction.
-        sf->high_predict[0][0][0] = vp9_high_convolve8_vert;
-        sf->high_predict[0][0][1] = vp9_high_convolve8_avg_vert;
-        sf->high_predict[0][1][0] = vp9_high_convolve8_vert;
-        sf->high_predict[0][1][1] = vp9_high_convolve8_avg_vert;
-        sf->high_predict[1][0][0] = vp9_high_convolve8;
-        sf->high_predict[1][0][1] = vp9_high_convolve8_avg;
+        sf->highbd_predict[0][0][0] = vp9_highbd_convolve8_vert;
+        sf->highbd_predict[0][0][1] = vp9_highbd_convolve8_avg_vert;
+        sf->highbd_predict[0][1][0] = vp9_highbd_convolve8_vert;
+        sf->highbd_predict[0][1][1] = vp9_highbd_convolve8_avg_vert;
+        sf->highbd_predict[1][0][0] = vp9_highbd_convolve8;
+        sf->highbd_predict[1][0][1] = vp9_highbd_convolve8_avg;
       }
     } else {
       if (sf->y_step_q4 == 16) {
         // No scaling in the y direction. Must always scale in the x direction.
-        sf->high_predict[0][0][0] = vp9_high_convolve8_horiz;
-        sf->high_predict[0][0][1] = vp9_high_convolve8_avg_horiz;
-        sf->high_predict[0][1][0] = vp9_high_convolve8;
-        sf->high_predict[0][1][1] = vp9_high_convolve8_avg;
-        sf->high_predict[1][0][0] = vp9_high_convolve8_horiz;
-        sf->high_predict[1][0][1] = vp9_high_convolve8_avg_horiz;
+        sf->highbd_predict[0][0][0] = vp9_highbd_convolve8_horiz;
+        sf->highbd_predict[0][0][1] = vp9_highbd_convolve8_avg_horiz;
+        sf->highbd_predict[0][1][0] = vp9_highbd_convolve8;
+        sf->highbd_predict[0][1][1] = vp9_highbd_convolve8_avg;
+        sf->highbd_predict[1][0][0] = vp9_highbd_convolve8_horiz;
+        sf->highbd_predict[1][0][1] = vp9_highbd_convolve8_avg_horiz;
       } else {
         // Must always scale in both directions.
-        sf->high_predict[0][0][0] = vp9_high_convolve8;
-        sf->high_predict[0][0][1] = vp9_high_convolve8_avg;
-        sf->high_predict[0][1][0] = vp9_high_convolve8;
-        sf->high_predict[0][1][1] = vp9_high_convolve8_avg;
-        sf->high_predict[1][0][0] = vp9_high_convolve8;
-        sf->high_predict[1][0][1] = vp9_high_convolve8_avg;
+        sf->highbd_predict[0][0][0] = vp9_highbd_convolve8;
+        sf->highbd_predict[0][0][1] = vp9_highbd_convolve8_avg;
+        sf->highbd_predict[0][1][0] = vp9_highbd_convolve8;
+        sf->highbd_predict[0][1][1] = vp9_highbd_convolve8_avg;
+        sf->highbd_predict[1][0][0] = vp9_highbd_convolve8;
+        sf->highbd_predict[1][0][1] = vp9_highbd_convolve8_avg;
       }
     }
     // 2D subpel motion always gets filtered in both directions.
-    sf->high_predict[1][1][0] = vp9_high_convolve8;
-    sf->high_predict[1][1][1] = vp9_high_convolve8_avg;
+    sf->highbd_predict[1][1][0] = vp9_highbd_convolve8;
+    sf->highbd_predict[1][1][1] = vp9_highbd_convolve8_avg;
   }
 #endif
 }
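
[Editor's note: the three indices of highbd_predict are (x needs filtering, y needs filtering, average-into-destination), mirroring the existing 8-bit predict table. The assignments above already fold in whether the frame is scaled (x_step_q4/y_step_q4 != 16), so a caller only indexes by its subpel bits and the avg flag. A minimal sketch of that lookup, assuming the vp9_scale.h declarations shown below; pick_highbd_predictor is a hypothetical helper, not code from this patch:

#include "vp9/common/vp9_scale.h"

/* Select the high-bitdepth convolve routine for a block.  The table has
 * already encoded the scaling decision, so only the subpel components and
 * the avg flag matter at the call site. */
static highbd_convolve_fn_t
pick_highbd_predictor(const struct scale_factors *sf,
                      int subpel_x, int subpel_y, int avg) {
  return sf->highbd_predict[subpel_x != 0][subpel_y != 0][avg];
}
]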
diff --git a/source/libvpx/vp9/common/vp9_scale.h b/source/libvpx/vp9/common/vp9_scale.h
index 2e923db..a1601a7 100644
--- a/source/libvpx/vp9/common/vp9_scale.h
+++ b/source/libvpx/vp9/common/vp9_scale.h
@@ -33,7 +33,7 @@
 
   convolve_fn_t predict[2][2][2];  // horiz, vert, avg
 #if CONFIG_VP9_HIGHBITDEPTH
-  high_convolve_fn_t high_predict[2][2][2];  // horiz, vert, avg
+  highbd_convolve_fn_t highbd_predict[2][2][2];  // horiz, vert, avg
 #endif
 };
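
[Editor's note on the vp9_scan.c hunk that follows: it converts the scan-neighbor and inverse-scan tables from writable arrays that were filled in at runtime into const arrays with precomputed initializers, so the data lives in read-only storage and needs no initialization pass. Each scan position stores MAX_NEIGHBORS (two) previously coded neighbor positions, from which the coefficient token context is derived. A sketch of that consumer, modeled on the existing get_coef_context() in vp9_scan.h (the shape is taken from that header, not from this patch):

#include "vp9/common/vp9_scan.h"

/* Average the token counts of the two stored neighbors of scan position c
 * to form the entropy-coding context for the next coefficient. */
static INLINE int coef_context_sketch(const int16_t *neighbors,
                                      const uint8_t *token_cache, int c) {
  return (1 + token_cache[neighbors[MAX_NEIGHBORS * c + 0]] +
          token_cache[neighbors[MAX_NEIGHBORS * c + 1]]) >> 1;
}
]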
 
diff --git a/source/libvpx/vp9/common/vp9_scan.c b/source/libvpx/vp9/common/vp9_scan.c
index 1ec5a0c..d6fb8b2 100644
--- a/source/libvpx/vp9/common/vp9_scan.c
+++ b/source/libvpx/vp9/common/vp9_scan.c
@@ -233,37 +233,467 @@
 // in {top, left, topleft, topright, bottomleft} order
 // for each position in raster scan order.
 // -1 indicates the neighbor does not exist.
-DECLARE_ALIGNED(16, static int16_t,
-                default_scan_4x4_neighbors[17 * MAX_NEIGHBORS]);
-DECLARE_ALIGNED(16, static int16_t,
-                col_scan_4x4_neighbors[17 * MAX_NEIGHBORS]);
-DECLARE_ALIGNED(16, static int16_t,
-                row_scan_4x4_neighbors[17 * MAX_NEIGHBORS]);
-DECLARE_ALIGNED(16, static int16_t,
-                col_scan_8x8_neighbors[65 * MAX_NEIGHBORS]);
-DECLARE_ALIGNED(16, static int16_t,
-                row_scan_8x8_neighbors[65 * MAX_NEIGHBORS]);
-DECLARE_ALIGNED(16, static int16_t,
-                default_scan_8x8_neighbors[65 * MAX_NEIGHBORS]);
-DECLARE_ALIGNED(16, static int16_t,
-                col_scan_16x16_neighbors[257 * MAX_NEIGHBORS]);
-DECLARE_ALIGNED(16, static int16_t,
-                row_scan_16x16_neighbors[257 * MAX_NEIGHBORS]);
-DECLARE_ALIGNED(16, static int16_t,
-                default_scan_16x16_neighbors[257 * MAX_NEIGHBORS]);
-DECLARE_ALIGNED(16, static int16_t,
-                default_scan_32x32_neighbors[1025 * MAX_NEIGHBORS]);
+DECLARE_ALIGNED(16, static const int16_t,
+                default_scan_4x4_neighbors[17 * MAX_NEIGHBORS]) = {
+  0, 0, 0, 0, 0, 0, 1, 4, 4, 4, 1, 1, 8, 8, 5, 8, 2, 2, 2, 5, 9, 12, 6, 9,
+  3, 6, 10, 13, 7, 10, 11, 14, 0, 0,
+};
 
-DECLARE_ALIGNED(16, static int16_t, vp9_default_iscan_4x4[16]);
-DECLARE_ALIGNED(16, static int16_t, vp9_col_iscan_4x4[16]);
-DECLARE_ALIGNED(16, static int16_t, vp9_row_iscan_4x4[16]);
-DECLARE_ALIGNED(16, static int16_t, vp9_col_iscan_8x8[64]);
-DECLARE_ALIGNED(16, static int16_t, vp9_row_iscan_8x8[64]);
-DECLARE_ALIGNED(16, static int16_t, vp9_default_iscan_8x8[64]);
-DECLARE_ALIGNED(16, static int16_t, vp9_col_iscan_16x16[256]);
-DECLARE_ALIGNED(16, static int16_t, vp9_row_iscan_16x16[256]);
-DECLARE_ALIGNED(16, static  int16_t, vp9_default_iscan_16x16[256]);
-DECLARE_ALIGNED(16, static int16_t, vp9_default_iscan_32x32[1024]);
+DECLARE_ALIGNED(16, static const int16_t,
+                col_scan_4x4_neighbors[17 * MAX_NEIGHBORS]) = {
+  0, 0, 0, 0, 4, 4, 0, 0, 8, 8, 1, 1, 5, 5, 1, 1, 9, 9, 2, 2, 6, 6, 2, 2, 3,
+  3, 10, 10, 7, 7, 11, 11, 0, 0,
+};
+
+DECLARE_ALIGNED(16, static const int16_t,
+                row_scan_4x4_neighbors[17 * MAX_NEIGHBORS]) = {
+  0, 0, 0, 0, 0, 0, 1, 1, 4, 4, 2, 2, 5, 5, 4, 4, 8, 8, 6, 6, 8, 8, 9, 9, 12,
+  12, 10, 10, 13, 13, 14, 14, 0, 0,
+};
+
+DECLARE_ALIGNED(16, static const int16_t,
+                col_scan_8x8_neighbors[65 * MAX_NEIGHBORS]) = {
+  0, 0, 0, 0, 8, 8, 0, 0, 16, 16, 1, 1, 24, 24, 9, 9, 1, 1, 32, 32, 17, 17, 2,
+  2, 25, 25, 10, 10, 40, 40, 2, 2, 18, 18, 33, 33, 3, 3, 48, 48, 11, 11, 26,
+  26, 3, 3, 41, 41, 19, 19, 34, 34, 4, 4, 27, 27, 12, 12, 49, 49, 42, 42, 20,
+  20, 4, 4, 35, 35, 5, 5, 28, 28, 50, 50, 43, 43, 13, 13, 36, 36, 5, 5, 21, 21,
+  51, 51, 29, 29, 6, 6, 44, 44, 14, 14, 6, 6, 37, 37, 52, 52, 22, 22, 7, 7, 30,
+  30, 45, 45, 15, 15, 38, 38, 23, 23, 53, 53, 31, 31, 46, 46, 39, 39, 54, 54,
+  47, 47, 55, 55, 0, 0,
+};
+
+DECLARE_ALIGNED(16, static const int16_t,
+                row_scan_8x8_neighbors[65 * MAX_NEIGHBORS]) = {
+  0, 0, 0, 0, 1, 1, 0, 0, 8, 8, 2, 2, 8, 8, 9, 9, 3, 3, 16, 16, 10, 10, 16, 16,
+  4, 4, 17, 17, 24, 24, 11, 11, 18, 18, 25, 25, 24, 24, 5, 5, 12, 12, 19, 19,
+  32, 32, 26, 26, 6, 6, 33, 33, 32, 32, 20, 20, 27, 27, 40, 40, 13, 13, 34, 34,
+  40, 40, 41, 41, 28, 28, 35, 35, 48, 48, 21, 21, 42, 42, 14, 14, 48, 48, 36,
+  36, 49, 49, 43, 43, 29, 29, 56, 56, 22, 22, 50, 50, 57, 57, 44, 44, 37, 37,
+  51, 51, 30, 30, 58, 58, 52, 52, 45, 45, 59, 59, 38, 38, 60, 60, 46, 46, 53,
+  53, 54, 54, 61, 61, 62, 62, 0, 0,
+};
+
+DECLARE_ALIGNED(16, static const int16_t,
+                default_scan_8x8_neighbors[65 * MAX_NEIGHBORS]) = {
+  0, 0, 0, 0, 0, 0, 8, 8, 1, 8, 1, 1, 9, 16, 16, 16, 2, 9, 2, 2, 10, 17, 17,
+  24, 24, 24, 3, 10, 3, 3, 18, 25, 25, 32, 11, 18, 32, 32, 4, 11, 26, 33, 19,
+  26, 4, 4, 33, 40, 12, 19, 40, 40, 5, 12, 27, 34, 34, 41, 20, 27, 13, 20, 5,
+  5, 41, 48, 48, 48, 28, 35, 35, 42, 21, 28, 6, 6, 6, 13, 42, 49, 49, 56, 36,
+  43, 14, 21, 29, 36, 7, 14, 43, 50, 50, 57, 22, 29, 37, 44, 15, 22, 44, 51,
+  51, 58, 30, 37, 23, 30, 52, 59, 45, 52, 38, 45, 31, 38, 53, 60, 46, 53, 39,
+  46, 54, 61, 47, 54, 55, 62, 0, 0,
+};
+
+DECLARE_ALIGNED(16, static const int16_t,
+                col_scan_16x16_neighbors[257 * MAX_NEIGHBORS]) = {
+  0, 0, 0, 0, 16, 16, 32, 32, 0, 0, 48, 48, 1, 1, 64, 64,
+  17, 17, 80, 80, 33, 33, 1, 1, 49, 49, 96, 96, 2, 2, 65, 65,
+  18, 18, 112, 112, 34, 34, 81, 81, 2, 2, 50, 50, 128, 128, 3, 3,
+  97, 97, 19, 19, 66, 66, 144, 144, 82, 82, 35, 35, 113, 113, 3, 3,
+  51, 51, 160, 160, 4, 4, 98, 98, 129, 129, 67, 67, 20, 20, 83, 83,
+  114, 114, 36, 36, 176, 176, 4, 4, 145, 145, 52, 52, 99, 99, 5, 5,
+  130, 130, 68, 68, 192, 192, 161, 161, 21, 21, 115, 115, 84, 84, 37, 37,
+  146, 146, 208, 208, 53, 53, 5, 5, 100, 100, 177, 177, 131, 131, 69, 69,
+  6, 6, 224, 224, 116, 116, 22, 22, 162, 162, 85, 85, 147, 147, 38, 38,
+  193, 193, 101, 101, 54, 54, 6, 6, 132, 132, 178, 178, 70, 70, 163, 163,
+  209, 209, 7, 7, 117, 117, 23, 23, 148, 148, 7, 7, 86, 86, 194, 194,
+  225, 225, 39, 39, 179, 179, 102, 102, 133, 133, 55, 55, 164, 164, 8, 8,
+  71, 71, 210, 210, 118, 118, 149, 149, 195, 195, 24, 24, 87, 87, 40, 40,
+  56, 56, 134, 134, 180, 180, 226, 226, 103, 103, 8, 8, 165, 165, 211, 211,
+  72, 72, 150, 150, 9, 9, 119, 119, 25, 25, 88, 88, 196, 196, 41, 41,
+  135, 135, 181, 181, 104, 104, 57, 57, 227, 227, 166, 166, 120, 120, 151, 151,
+  197, 197, 73, 73, 9, 9, 212, 212, 89, 89, 136, 136, 182, 182, 10, 10,
+  26, 26, 105, 105, 167, 167, 228, 228, 152, 152, 42, 42, 121, 121, 213, 213,
+  58, 58, 198, 198, 74, 74, 137, 137, 183, 183, 168, 168, 10, 10, 90, 90,
+  229, 229, 11, 11, 106, 106, 214, 214, 153, 153, 27, 27, 199, 199, 43, 43,
+  184, 184, 122, 122, 169, 169, 230, 230, 59, 59, 11, 11, 75, 75, 138, 138,
+  200, 200, 215, 215, 91, 91, 12, 12, 28, 28, 185, 185, 107, 107, 154, 154,
+  44, 44, 231, 231, 216, 216, 60, 60, 123, 123, 12, 12, 76, 76, 201, 201,
+  170, 170, 232, 232, 139, 139, 92, 92, 13, 13, 108, 108, 29, 29, 186, 186,
+  217, 217, 155, 155, 45, 45, 13, 13, 61, 61, 124, 124, 14, 14, 233, 233,
+  77, 77, 14, 14, 171, 171, 140, 140, 202, 202, 30, 30, 93, 93, 109, 109,
+  46, 46, 156, 156, 62, 62, 187, 187, 15, 15, 125, 125, 218, 218, 78, 78,
+  31, 31, 172, 172, 47, 47, 141, 141, 94, 94, 234, 234, 203, 203, 63, 63,
+  110, 110, 188, 188, 157, 157, 126, 126, 79, 79, 173, 173, 95, 95, 219, 219,
+  142, 142, 204, 204, 235, 235, 111, 111, 158, 158, 127, 127, 189, 189, 220,
+  220, 143, 143, 174, 174, 205, 205, 236, 236, 159, 159, 190, 190, 221, 221,
+  175, 175, 237, 237, 206, 206, 222, 222, 191, 191, 238, 238, 207, 207, 223,
+  223, 239, 239, 0, 0,
+};
+
+DECLARE_ALIGNED(16, static const int16_t,
+                row_scan_16x16_neighbors[257 * MAX_NEIGHBORS]) = {
+  0, 0, 0, 0, 1, 1, 0, 0, 2, 2, 16, 16, 3, 3, 17, 17,
+  16, 16, 4, 4, 32, 32, 18, 18, 5, 5, 33, 33, 32, 32, 19, 19,
+  48, 48, 6, 6, 34, 34, 20, 20, 49, 49, 48, 48, 7, 7, 35, 35,
+  64, 64, 21, 21, 50, 50, 36, 36, 64, 64, 8, 8, 65, 65, 51, 51,
+  22, 22, 37, 37, 80, 80, 66, 66, 9, 9, 52, 52, 23, 23, 81, 81,
+  67, 67, 80, 80, 38, 38, 10, 10, 53, 53, 82, 82, 96, 96, 68, 68,
+  24, 24, 97, 97, 83, 83, 39, 39, 96, 96, 54, 54, 11, 11, 69, 69,
+  98, 98, 112, 112, 84, 84, 25, 25, 40, 40, 55, 55, 113, 113, 99, 99,
+  12, 12, 70, 70, 112, 112, 85, 85, 26, 26, 114, 114, 100, 100, 128, 128,
+  41, 41, 56, 56, 71, 71, 115, 115, 13, 13, 86, 86, 129, 129, 101, 101,
+  128, 128, 72, 72, 130, 130, 116, 116, 27, 27, 57, 57, 14, 14, 87, 87,
+  42, 42, 144, 144, 102, 102, 131, 131, 145, 145, 117, 117, 73, 73, 144, 144,
+  88, 88, 132, 132, 103, 103, 28, 28, 58, 58, 146, 146, 118, 118, 43, 43,
+  160, 160, 147, 147, 89, 89, 104, 104, 133, 133, 161, 161, 119, 119, 160, 160,
+  74, 74, 134, 134, 148, 148, 29, 29, 59, 59, 162, 162, 176, 176, 44, 44,
+  120, 120, 90, 90, 105, 105, 163, 163, 177, 177, 149, 149, 176, 176, 135, 135,
+  164, 164, 178, 178, 30, 30, 150, 150, 192, 192, 75, 75, 121, 121, 60, 60,
+  136, 136, 193, 193, 106, 106, 151, 151, 179, 179, 192, 192, 45, 45, 165, 165,
+  166, 166, 194, 194, 91, 91, 180, 180, 137, 137, 208, 208, 122, 122, 152, 152,
+  208, 208, 195, 195, 76, 76, 167, 167, 209, 209, 181, 181, 224, 224, 107, 107,
+  196, 196, 61, 61, 153, 153, 224, 224, 182, 182, 168, 168, 210, 210, 46, 46,
+  138, 138, 92, 92, 183, 183, 225, 225, 211, 211, 240, 240, 197, 197, 169, 169,
+  123, 123, 154, 154, 198, 198, 77, 77, 212, 212, 184, 184, 108, 108, 226, 226,
+  199, 199, 62, 62, 227, 227, 241, 241, 139, 139, 213, 213, 170, 170, 185, 185,
+  155, 155, 228, 228, 242, 242, 124, 124, 93, 93, 200, 200, 243, 243, 214, 214,
+  215, 215, 229, 229, 140, 140, 186, 186, 201, 201, 78, 78, 171, 171, 109, 109,
+  156, 156, 244, 244, 216, 216, 230, 230, 94, 94, 245, 245, 231, 231, 125, 125,
+  202, 202, 246, 246, 232, 232, 172, 172, 217, 217, 141, 141, 110, 110, 157,
+  157, 187, 187, 247, 247, 126, 126, 233, 233, 218, 218, 248, 248, 188, 188,
+  203, 203, 142, 142, 173, 173, 158, 158, 249, 249, 234, 234, 204, 204, 219,
+  219, 174, 174, 189, 189, 250, 250, 220, 220, 190, 190, 205, 205, 235, 235,
+  206, 206, 236, 236, 251, 251, 221, 221, 252, 252, 222, 222, 237, 237, 238,
+  238, 253, 253, 254, 254, 0, 0,
+};
+
+DECLARE_ALIGNED(16, static const int16_t,
+                default_scan_16x16_neighbors[257 * MAX_NEIGHBORS]) = {
+  0, 0, 0, 0, 0, 0, 16, 16, 1, 16, 1, 1, 32, 32, 17, 32,
+  2, 17, 2, 2, 48, 48, 18, 33, 33, 48, 3, 18, 49, 64, 64, 64,
+  34, 49, 3, 3, 19, 34, 50, 65, 4, 19, 65, 80, 80, 80, 35, 50,
+  4, 4, 20, 35, 66, 81, 81, 96, 51, 66, 96, 96, 5, 20, 36, 51,
+  82, 97, 21, 36, 67, 82, 97, 112, 5, 5, 52, 67, 112, 112, 37, 52,
+  6, 21, 83, 98, 98, 113, 68, 83, 6, 6, 113, 128, 22, 37, 53, 68,
+  84, 99, 99, 114, 128, 128, 114, 129, 69, 84, 38, 53, 7, 22, 7, 7,
+  129, 144, 23, 38, 54, 69, 100, 115, 85, 100, 115, 130, 144, 144, 130, 145,
+  39, 54, 70, 85, 8, 23, 55, 70, 116, 131, 101, 116, 145, 160, 24, 39,
+  8, 8, 86, 101, 131, 146, 160, 160, 146, 161, 71, 86, 40, 55, 9, 24,
+  117, 132, 102, 117, 161, 176, 132, 147, 56, 71, 87, 102, 25, 40, 147, 162,
+  9, 9, 176, 176, 162, 177, 72, 87, 41, 56, 118, 133, 133, 148, 103, 118,
+  10, 25, 148, 163, 57, 72, 88, 103, 177, 192, 26, 41, 163, 178, 192, 192,
+  10, 10, 119, 134, 73, 88, 149, 164, 104, 119, 134, 149, 42, 57, 178, 193,
+  164, 179, 11, 26, 58, 73, 193, 208, 89, 104, 135, 150, 120, 135, 27, 42,
+  74, 89, 208, 208, 150, 165, 179, 194, 165, 180, 105, 120, 194, 209, 43, 58,
+  11, 11, 136, 151, 90, 105, 151, 166, 180, 195, 59, 74, 121, 136, 209, 224,
+  195, 210, 224, 224, 166, 181, 106, 121, 75, 90, 12, 27, 181, 196, 12, 12,
+  210, 225, 152, 167, 167, 182, 137, 152, 28, 43, 196, 211, 122, 137, 91, 106,
+  225, 240, 44, 59, 13, 28, 107, 122, 182, 197, 168, 183, 211, 226, 153, 168,
+  226, 241, 60, 75, 197, 212, 138, 153, 29, 44, 76, 91, 13, 13, 183, 198,
+  123, 138, 45, 60, 212, 227, 198, 213, 154, 169, 169, 184, 227, 242, 92, 107,
+  61, 76, 139, 154, 14, 29, 14, 14, 184, 199, 213, 228, 108, 123, 199, 214,
+  228, 243, 77, 92, 30, 45, 170, 185, 155, 170, 185, 200, 93, 108, 124, 139,
+  214, 229, 46, 61, 200, 215, 229, 244, 15, 30, 109, 124, 62, 77, 140, 155,
+  215, 230, 31, 46, 171, 186, 186, 201, 201, 216, 78, 93, 230, 245, 125, 140,
+  47, 62, 216, 231, 156, 171, 94, 109, 231, 246, 141, 156, 63, 78, 202, 217,
+  187, 202, 110, 125, 217, 232, 172, 187, 232, 247, 79, 94, 157, 172, 126, 141,
+  203, 218, 95, 110, 233, 248, 218, 233, 142, 157, 111, 126, 173, 188, 188, 203,
+  234, 249, 219, 234, 127, 142, 158, 173, 204, 219, 189, 204, 143, 158, 235,
+  250, 174, 189, 205, 220, 159, 174, 220, 235, 221, 236, 175, 190, 190, 205,
+  236, 251, 206, 221, 237, 252, 191, 206, 222, 237, 207, 222, 238, 253, 223,
+  238, 239, 254, 0, 0,
+};
+
+DECLARE_ALIGNED(16, static const int16_t,
+                default_scan_32x32_neighbors[1025 * MAX_NEIGHBORS]) = {
+  0, 0, 0, 0, 0, 0, 32, 32, 1, 32, 1, 1, 64, 64, 33, 64,
+  2, 33, 96, 96, 2, 2, 65, 96, 34, 65, 128, 128, 97, 128, 3, 34,
+  66, 97, 3, 3, 35, 66, 98, 129, 129, 160, 160, 160, 4, 35, 67, 98,
+  192, 192, 4, 4, 130, 161, 161, 192, 36, 67, 99, 130, 5, 36, 68, 99,
+  193, 224, 162, 193, 224, 224, 131, 162, 37, 68, 100, 131, 5, 5, 194, 225,
+  225, 256, 256, 256, 163, 194, 69, 100, 132, 163, 6, 37, 226, 257, 6, 6,
+  195, 226, 257, 288, 101, 132, 288, 288, 38, 69, 164, 195, 133, 164, 258, 289,
+  227, 258, 196, 227, 7, 38, 289, 320, 70, 101, 320, 320, 7, 7, 165, 196,
+  39, 70, 102, 133, 290, 321, 259, 290, 228, 259, 321, 352, 352, 352, 197, 228,
+  134, 165, 71, 102, 8, 39, 322, 353, 291, 322, 260, 291, 103, 134, 353, 384,
+  166, 197, 229, 260, 40, 71, 8, 8, 384, 384, 135, 166, 354, 385, 323, 354,
+  198, 229, 292, 323, 72, 103, 261, 292, 9, 40, 385, 416, 167, 198, 104, 135,
+  230, 261, 355, 386, 416, 416, 293, 324, 324, 355, 9, 9, 41, 72, 386, 417,
+  199, 230, 136, 167, 417, 448, 262, 293, 356, 387, 73, 104, 387, 418, 231, 262,
+  10, 41, 168, 199, 325, 356, 418, 449, 105, 136, 448, 448, 42, 73, 294, 325,
+  200, 231, 10, 10, 357, 388, 137, 168, 263, 294, 388, 419, 74, 105, 419, 450,
+  449, 480, 326, 357, 232, 263, 295, 326, 169, 200, 11, 42, 106, 137, 480, 480,
+  450, 481, 358, 389, 264, 295, 201, 232, 138, 169, 389, 420, 43, 74, 420, 451,
+  327, 358, 11, 11, 481, 512, 233, 264, 451, 482, 296, 327, 75, 106, 170, 201,
+  482, 513, 512, 512, 390, 421, 359, 390, 421, 452, 107, 138, 12, 43, 202, 233,
+  452, 483, 265, 296, 328, 359, 139, 170, 44, 75, 483, 514, 513, 544, 234, 265,
+  297, 328, 422, 453, 12, 12, 391, 422, 171, 202, 76, 107, 514, 545, 453, 484,
+  544, 544, 266, 297, 203, 234, 108, 139, 329, 360, 298, 329, 140, 171, 515,
+  546, 13, 44, 423, 454, 235, 266, 545, 576, 454, 485, 45, 76, 172, 203, 330,
+  361, 576, 576, 13, 13, 267, 298, 546, 577, 77, 108, 204, 235, 455, 486, 577,
+  608, 299, 330, 109, 140, 547, 578, 14, 45, 14, 14, 141, 172, 578, 609, 331,
+  362, 46, 77, 173, 204, 15, 15, 78, 109, 205, 236, 579, 610, 110, 141, 15, 46,
+  142, 173, 47, 78, 174, 205, 16, 16, 79, 110, 206, 237, 16, 47, 111, 142,
+  48, 79, 143, 174, 80, 111, 175, 206, 17, 48, 17, 17, 207, 238, 49, 80,
+  81, 112, 18, 18, 18, 49, 50, 81, 82, 113, 19, 50, 51, 82, 83, 114, 608, 608,
+  484, 515, 360, 391, 236, 267, 112, 143, 19, 19, 640, 640, 609, 640, 516, 547,
+  485, 516, 392, 423, 361, 392, 268, 299, 237, 268, 144, 175, 113, 144, 20, 51,
+  20, 20, 672, 672, 641, 672, 610, 641, 548, 579, 517, 548, 486, 517, 424, 455,
+  393, 424, 362, 393, 300, 331, 269, 300, 238, 269, 176, 207, 145, 176, 114,
+  145, 52, 83, 21, 52, 21, 21, 704, 704, 673, 704, 642, 673, 611, 642, 580,
+  611, 549, 580, 518, 549, 487, 518, 456, 487, 425, 456, 394, 425, 363, 394,
+  332, 363, 301, 332, 270, 301, 239, 270, 208, 239, 177, 208, 146, 177, 115,
+  146, 84, 115, 53, 84, 22, 53, 22, 22, 705, 736, 674, 705, 643, 674, 581, 612,
+  550, 581, 519, 550, 457, 488, 426, 457, 395, 426, 333, 364, 302, 333, 271,
+  302, 209, 240, 178, 209, 147, 178, 85, 116, 54, 85, 23, 54, 706, 737, 675,
+  706, 582, 613, 551, 582, 458, 489, 427, 458, 334, 365, 303, 334, 210, 241,
+  179, 210, 86, 117, 55, 86, 707, 738, 583, 614, 459, 490, 335, 366, 211, 242,
+  87, 118, 736, 736, 612, 643, 488, 519, 364, 395, 240, 271, 116, 147, 23, 23,
+  768, 768, 737, 768, 644, 675, 613, 644, 520, 551, 489, 520, 396, 427, 365,
+  396, 272, 303, 241, 272, 148, 179, 117, 148, 24, 55, 24, 24, 800, 800, 769,
+  800, 738, 769, 676, 707, 645, 676, 614, 645, 552, 583, 521, 552, 490, 521,
+  428, 459, 397, 428, 366, 397, 304, 335, 273, 304, 242, 273, 180, 211, 149,
+  180, 118, 149, 56, 87, 25, 56, 25, 25, 832, 832, 801, 832, 770, 801, 739,
+  770, 708, 739, 677, 708, 646, 677, 615, 646, 584, 615, 553, 584, 522, 553,
+  491, 522, 460, 491, 429, 460, 398, 429, 367, 398, 336, 367, 305, 336, 274,
+  305, 243, 274, 212, 243, 181, 212, 150, 181, 119, 150, 88, 119, 57, 88, 26,
+  57, 26, 26, 833, 864, 802, 833, 771, 802, 709, 740, 678, 709, 647, 678, 585,
+  616, 554, 585, 523, 554, 461, 492, 430, 461, 399, 430, 337, 368, 306, 337,
+  275, 306, 213, 244, 182, 213, 151, 182, 89, 120, 58, 89, 27, 58, 834, 865,
+  803, 834, 710, 741, 679, 710, 586, 617, 555, 586, 462, 493, 431, 462, 338,
+  369, 307, 338, 214, 245, 183, 214, 90, 121, 59, 90, 835, 866, 711, 742, 587,
+  618, 463, 494, 339, 370, 215, 246, 91, 122, 864, 864, 740, 771, 616, 647,
+  492, 523, 368, 399, 244, 275, 120, 151, 27, 27, 896, 896, 865, 896, 772, 803,
+  741, 772, 648, 679, 617, 648, 524, 555, 493, 524, 400, 431, 369, 400, 276,
+  307, 245, 276, 152, 183, 121, 152, 28, 59, 28, 28, 928, 928, 897, 928, 866,
+  897, 804, 835, 773, 804, 742, 773, 680, 711, 649, 680, 618, 649, 556, 587,
+  525, 556, 494, 525, 432, 463, 401, 432, 370, 401, 308, 339, 277, 308, 246,
+  277, 184, 215, 153, 184, 122, 153, 60, 91, 29, 60, 29, 29, 960, 960, 929,
+  960, 898, 929, 867, 898, 836, 867, 805, 836, 774, 805, 743, 774, 712, 743,
+  681, 712, 650, 681, 619, 650, 588, 619, 557, 588, 526, 557, 495, 526, 464,
+  495, 433, 464, 402, 433, 371, 402, 340, 371, 309, 340, 278, 309, 247, 278,
+  216, 247, 185, 216, 154, 185, 123, 154, 92, 123, 61, 92, 30, 61, 30, 30,
+  961, 992, 930, 961, 899, 930, 837, 868, 806, 837, 775, 806, 713, 744, 682,
+  713, 651, 682, 589, 620, 558, 589, 527, 558, 465, 496, 434, 465, 403, 434,
+  341, 372, 310, 341, 279, 310, 217, 248, 186, 217, 155, 186, 93, 124, 62, 93,
+  31, 62, 962, 993, 931, 962, 838, 869, 807, 838, 714, 745, 683, 714, 590, 621,
+  559, 590, 466, 497, 435, 466, 342, 373, 311, 342, 218, 249, 187, 218, 94,
+  125, 63, 94, 963, 994, 839, 870, 715, 746, 591, 622, 467, 498, 343, 374, 219,
+  250, 95, 126, 868, 899, 744, 775, 620, 651, 496, 527, 372, 403, 248, 279,
+  124, 155, 900, 931, 869, 900, 776, 807, 745, 776, 652, 683, 621, 652, 528,
+  559, 497, 528, 404, 435, 373, 404, 280, 311, 249, 280, 156, 187, 125, 156,
+  932, 963, 901, 932, 870, 901, 808, 839, 777, 808, 746, 777, 684, 715, 653,
+  684, 622, 653, 560, 591, 529, 560, 498, 529, 436, 467, 405, 436, 374, 405,
+  312, 343, 281, 312, 250, 281, 188, 219, 157, 188, 126, 157, 964, 995, 933,
+  964, 902, 933, 871, 902, 840, 871, 809, 840, 778, 809, 747, 778, 716, 747,
+  685, 716, 654, 685, 623, 654, 592, 623, 561, 592, 530, 561, 499, 530, 468,
+  499, 437, 468, 406, 437, 375, 406, 344, 375, 313, 344, 282, 313, 251, 282,
+  220, 251, 189, 220, 158, 189, 127, 158, 965, 996, 934, 965, 903, 934, 841,
+  872, 810, 841, 779, 810, 717, 748, 686, 717, 655, 686, 593, 624, 562, 593,
+  531, 562, 469, 500, 438, 469, 407, 438, 345, 376, 314, 345, 283, 314, 221,
+  252, 190, 221, 159, 190, 966, 997, 935, 966, 842, 873, 811, 842, 718, 749,
+  687, 718, 594, 625, 563, 594, 470, 501, 439, 470, 346, 377, 315, 346, 222,
+  253, 191, 222, 967, 998, 843, 874, 719, 750, 595, 626, 471, 502, 347, 378,
+  223, 254, 872, 903, 748, 779, 624, 655, 500, 531, 376, 407, 252, 283, 904,
+  935, 873, 904, 780, 811, 749, 780, 656, 687, 625, 656, 532, 563, 501, 532,
+  408, 439, 377, 408, 284, 315, 253, 284, 936, 967, 905, 936, 874, 905, 812,
+  843, 781, 812, 750, 781, 688, 719, 657, 688, 626, 657, 564, 595, 533, 564,
+  502, 533, 440, 471, 409, 440, 378, 409, 316, 347, 285, 316, 254, 285, 968,
+  999, 937, 968, 906, 937, 875, 906, 844, 875, 813, 844, 782, 813, 751, 782,
+  720, 751, 689, 720, 658, 689, 627, 658, 596, 627, 565, 596, 534, 565, 503,
+  534, 472, 503, 441, 472, 410, 441, 379, 410, 348, 379, 317, 348, 286, 317,
+  255, 286, 969, 1000, 938, 969, 907, 938, 845, 876, 814, 845, 783, 814, 721,
+  752, 690, 721, 659, 690, 597, 628, 566, 597, 535, 566, 473, 504, 442, 473,
+  411, 442, 349, 380, 318, 349, 287, 318, 970, 1001, 939, 970, 846, 877, 815,
+  846, 722, 753, 691, 722, 598, 629, 567, 598, 474, 505, 443, 474, 350, 381,
+  319, 350, 971, 1002, 847, 878, 723, 754, 599, 630, 475, 506, 351, 382, 876,
+  907, 752, 783, 628, 659, 504, 535, 380, 411, 908, 939, 877, 908, 784, 815,
+  753, 784, 660, 691, 629, 660, 536, 567, 505, 536, 412, 443, 381, 412, 940,
+  971, 909, 940, 878, 909, 816, 847, 785, 816, 754, 785, 692, 723, 661, 692,
+  630, 661, 568, 599, 537, 568, 506, 537, 444, 475, 413, 444, 382, 413, 972,
+  1003, 941, 972, 910, 941, 879, 910, 848, 879, 817, 848, 786, 817, 755, 786,
+  724, 755, 693, 724, 662, 693, 631, 662, 600, 631, 569, 600, 538, 569, 507,
+  538, 476, 507, 445, 476, 414, 445, 383, 414, 973, 1004, 942, 973, 911, 942,
+  849, 880, 818, 849, 787, 818, 725, 756, 694, 725, 663, 694, 601, 632, 570,
+  601, 539, 570, 477, 508, 446, 477, 415, 446, 974, 1005, 943, 974, 850, 881,
+  819, 850, 726, 757, 695, 726, 602, 633, 571, 602, 478, 509, 447, 478, 975,
+  1006, 851, 882, 727, 758, 603, 634, 479, 510, 880, 911, 756, 787, 632, 663,
+  508, 539, 912, 943, 881, 912, 788, 819, 757, 788, 664, 695, 633, 664, 540,
+  571, 509, 540, 944, 975, 913, 944, 882, 913, 820, 851, 789, 820, 758, 789,
+  696, 727, 665, 696, 634, 665, 572, 603, 541, 572, 510, 541, 976, 1007, 945,
+  976, 914, 945, 883, 914, 852, 883, 821, 852, 790, 821, 759, 790, 728, 759,
+  697, 728, 666, 697, 635, 666, 604, 635, 573, 604, 542, 573, 511, 542, 977,
+  1008, 946, 977, 915, 946, 853, 884, 822, 853, 791, 822, 729, 760, 698, 729,
+  667, 698, 605, 636, 574, 605, 543, 574, 978, 1009, 947, 978, 854, 885, 823,
+  854, 730, 761, 699, 730, 606, 637, 575, 606, 979, 1010, 855, 886, 731, 762,
+  607, 638, 884, 915, 760, 791, 636, 667, 916, 947, 885, 916, 792, 823, 761,
+  792, 668, 699, 637, 668, 948, 979, 917, 948, 886, 917, 824, 855, 793, 824,
+  762, 793, 700, 731, 669, 700, 638, 669, 980, 1011, 949, 980, 918, 949, 887,
+  918, 856, 887, 825, 856, 794, 825, 763, 794, 732, 763, 701, 732, 670, 701,
+  639, 670, 981, 1012, 950, 981, 919, 950, 857, 888, 826, 857, 795, 826, 733,
+  764, 702, 733, 671, 702, 982, 1013, 951, 982, 858, 889, 827, 858, 734, 765,
+  703, 734, 983, 1014, 859, 890, 735, 766, 888, 919, 764, 795, 920, 951, 889,
+  920, 796, 827, 765, 796, 952, 983, 921, 952, 890, 921, 828, 859, 797, 828,
+  766, 797, 984, 1015, 953, 984, 922, 953, 891, 922, 860, 891, 829, 860, 798,
+  829, 767, 798, 985, 1016, 954, 985, 923, 954, 861, 892, 830, 861, 799, 830,
+  986, 1017, 955, 986, 862, 893, 831, 862, 987, 1018, 863, 894, 892, 923, 924,
+  955, 893, 924, 956, 987, 925, 956, 894, 925, 988, 1019, 957, 988, 926, 957,
+  895, 926, 989, 1020, 958, 989, 927, 958, 990, 1021, 959, 990, 991, 1022, 0, 0,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, vp9_default_iscan_4x4[16]) = {
+  0, 2, 5, 8, 1, 3, 9, 12, 4, 7, 11, 14, 6, 10, 13, 15,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, vp9_col_iscan_4x4[16]) = {
+  0, 3, 7, 11, 1, 5, 9, 12, 2, 6, 10, 14, 4, 8, 13, 15,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, vp9_row_iscan_4x4[16]) = {
+  0, 1, 3, 5, 2, 4, 6, 9, 7, 8, 11, 13, 10, 12, 14, 15,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, vp9_col_iscan_8x8[64]) = {
+  0, 3, 8, 15, 22, 32, 40, 47, 1, 5, 11, 18, 26, 34, 44, 51,
+  2, 7, 13, 20, 28, 38, 46, 54, 4, 10, 16, 24, 31, 41, 50, 56,
+  6, 12, 21, 27, 35, 43, 52, 58, 9, 17, 25, 33, 39, 48, 55, 60,
+  14, 23, 30, 37, 45, 53, 59, 62, 19, 29, 36, 42, 49, 57, 61, 63,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, vp9_row_iscan_8x8[64]) = {
+  0, 1, 2, 5, 8, 12, 19, 24, 3, 4, 7, 10, 15, 20, 30, 39,
+  6, 9, 13, 16, 21, 27, 37, 46, 11, 14, 17, 23, 28, 34, 44, 52,
+  18, 22, 25, 31, 35, 41, 50, 57, 26, 29, 33, 38, 43, 49, 55, 59,
+  32, 36, 42, 47, 51, 54, 60, 61, 40, 45, 48, 53, 56, 58, 62, 63,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, vp9_default_iscan_8x8[64]) = {
+  0, 2, 5, 9, 14, 22, 31, 37, 1, 4, 8, 13, 19, 26, 38, 44,
+  3, 6, 10, 17, 24, 30, 42, 49, 7, 11, 15, 21, 29, 36, 47, 53,
+  12, 16, 20, 27, 34, 43, 52, 57, 18, 23, 28, 35, 41, 48, 56, 60,
+  25, 32, 39, 45, 50, 55, 59, 62, 33, 40, 46, 51, 54, 58, 61, 63,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, vp9_col_iscan_16x16[256]) = {
+  0, 4, 11, 20, 31, 43, 59, 75, 85, 109, 130, 150, 165, 181, 195, 198,
+  1, 6, 14, 23, 34, 47, 64, 81, 95, 114, 135, 153, 171, 188, 201, 212,
+  2, 8, 16, 25, 38, 52, 67, 83, 101, 116, 136, 157, 172, 190, 205, 216,
+  3, 10, 18, 29, 41, 55, 71, 89, 103, 119, 141, 159, 176, 194, 208, 218,
+  5, 12, 21, 32, 45, 58, 74, 93, 104, 123, 144, 164, 179, 196, 210, 223,
+  7, 15, 26, 37, 49, 63, 78, 96, 112, 129, 146, 166, 182, 200, 215, 228,
+  9, 19, 28, 39, 54, 69, 86, 102, 117, 132, 151, 170, 187, 206, 220, 230,
+  13, 24, 35, 46, 60, 73, 91, 108, 122, 137, 154, 174, 189, 207, 224, 235,
+  17, 30, 40, 53, 66, 82, 98, 115, 126, 142, 161, 180, 197, 213, 227, 237,
+  22, 36, 48, 62, 76, 92, 105, 120, 133, 147, 167, 186, 203, 219, 232, 240,
+  27, 44, 56, 70, 84, 99, 113, 127, 140, 156, 175, 193, 209, 226, 236, 244,
+  33, 51, 68, 79, 94, 110, 125, 138, 149, 162, 184, 202, 217, 229, 241, 247,
+  42, 61, 77, 90, 106, 121, 134, 148, 160, 173, 191, 211, 225, 238, 245, 251,
+  50, 72, 87, 100, 118, 128, 145, 158, 168, 183, 204, 222, 233, 242, 249, 253,
+  57, 80, 97, 111, 131, 143, 155, 169, 178, 192, 214, 231, 239, 246, 250, 254,
+  65, 88, 107, 124, 139, 152, 163, 177, 185, 199, 221, 234, 243, 248, 252, 255,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, vp9_row_iscan_16x16[256]) = {
+  0, 1, 2, 4, 6, 9, 12, 17, 22, 29, 36, 43, 54, 64, 76, 86,
+  3, 5, 7, 11, 15, 19, 25, 32, 38, 48, 59, 68, 84, 99, 115, 130,
+  8, 10, 13, 18, 23, 27, 33, 42, 51, 60, 72, 88, 103, 119, 142, 167,
+  14, 16, 20, 26, 31, 37, 44, 53, 61, 73, 85, 100, 116, 135, 161, 185,
+  21, 24, 30, 35, 40, 47, 55, 65, 74, 81, 94, 112, 133, 154, 179, 205,
+  28, 34, 39, 45, 50, 58, 67, 77, 87, 96, 106, 121, 146, 169, 196, 212,
+  41, 46, 49, 56, 63, 70, 79, 90, 98, 107, 122, 138, 159, 182, 207, 222,
+  52, 57, 62, 69, 75, 83, 93, 102, 110, 120, 134, 150, 176, 195, 215, 226,
+  66, 71, 78, 82, 91, 97, 108, 113, 127, 136, 148, 168, 188, 202, 221, 232,
+  80, 89, 92, 101, 105, 114, 125, 131, 139, 151, 162, 177, 192, 208, 223, 234,
+  95, 104, 109, 117, 123, 128, 143, 144, 155, 165, 175, 190, 206, 219, 233, 239,
+  111, 118, 124, 129, 140, 147, 157, 164, 170, 181, 191, 203, 224, 230, 240,
+  243, 126, 132, 137, 145, 153, 160, 174, 178, 184, 197, 204, 216, 231, 237,
+  244, 246, 141, 149, 156, 166, 172, 180, 189, 199, 200, 210, 220, 228, 238,
+  242, 249, 251, 152, 163, 171, 183, 186, 193, 201, 211, 214, 218, 227, 236,
+  245, 247, 252, 253, 158, 173, 187, 194, 198, 209, 213, 217, 225, 229, 235,
+  241, 248, 250, 254, 255,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, vp9_default_iscan_16x16[256]) = {
+  0, 2, 5, 9, 17, 24, 36, 44, 55, 72, 88, 104, 128, 143, 166, 179,
+  1, 4, 8, 13, 20, 30, 40, 54, 66, 79, 96, 113, 141, 154, 178, 196,
+  3, 7, 11, 18, 25, 33, 46, 57, 71, 86, 101, 119, 148, 164, 186, 201,
+  6, 12, 16, 23, 31, 39, 53, 64, 78, 92, 110, 127, 153, 169, 193, 208,
+  10, 14, 19, 28, 37, 47, 58, 67, 84, 98, 114, 133, 161, 176, 198, 214,
+  15, 21, 26, 34, 43, 52, 65, 77, 91, 106, 120, 140, 165, 185, 205, 221,
+  22, 27, 32, 41, 48, 60, 73, 85, 99, 116, 130, 151, 175, 190, 211, 225,
+  29, 35, 42, 49, 59, 69, 81, 95, 108, 125, 139, 155, 182, 197, 217, 229,
+  38, 45, 51, 61, 68, 80, 93, 105, 118, 134, 150, 168, 191, 207, 223, 234,
+  50, 56, 63, 74, 83, 94, 109, 117, 129, 147, 163, 177, 199, 213, 228, 238,
+  62, 70, 76, 87, 97, 107, 122, 131, 145, 159, 172, 188, 210, 222, 235, 242,
+  75, 82, 90, 102, 112, 124, 138, 146, 157, 173, 187, 202, 219, 230, 240, 245,
+  89, 100, 111, 123, 132, 142, 156, 167, 180, 189, 203, 216, 231, 237, 246, 250,
+  103, 115, 126, 136, 149, 162, 171, 183, 194, 204, 215, 224, 236, 241, 248,
+  252, 121, 135, 144, 158, 170, 181, 192, 200, 209, 218, 227, 233, 243, 244,
+  251, 254, 137, 152, 160, 174, 184, 195, 206, 212, 220, 226, 232, 239, 247,
+  249, 253, 255,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, vp9_default_iscan_32x32[1024]) = {
+  0, 2, 5, 10, 17, 25, 38, 47, 62, 83, 101, 121, 145, 170, 193, 204,
+  210, 219, 229, 233, 245, 257, 275, 299, 342, 356, 377, 405, 455, 471, 495,
+  527, 1, 4, 8, 15, 22, 30, 45, 58, 74, 92, 112, 133, 158, 184, 203, 215, 222,
+  228, 234, 237, 256, 274, 298, 317, 355, 376, 404, 426, 470, 494, 526, 551,
+  3, 7, 12, 18, 28, 36, 52, 64, 82, 102, 118, 142, 164, 189, 208, 217, 224,
+  231, 235, 238, 273, 297, 316, 329, 375, 403, 425, 440, 493, 525, 550, 567,
+  6, 11, 16, 23, 31, 43, 60, 73, 90, 109, 126, 150, 173, 196, 211, 220, 226,
+  232, 236, 239, 296, 315, 328, 335, 402, 424, 439, 447, 524, 549, 566, 575,
+  9, 14, 19, 29, 37, 50, 65, 78, 95, 116, 134, 157, 179, 201, 214, 223, 244,
+  255, 272, 295, 341, 354, 374, 401, 454, 469, 492, 523, 582, 596, 617, 645,
+  13, 20, 26, 35, 44, 54, 72, 85, 105, 123, 140, 163, 182, 205, 216, 225,
+  254, 271, 294, 314, 353, 373, 400, 423, 468, 491, 522, 548, 595, 616, 644,
+  666, 21, 27, 33, 42, 53, 63, 80, 94, 113, 132, 151, 172, 190, 209, 218, 227,
+  270, 293, 313, 327, 372, 399, 422, 438, 490, 521, 547, 565, 615, 643, 665,
+  680, 24, 32, 39, 48, 57, 71, 88, 104, 120, 139, 159, 178, 197, 212, 221, 230,
+  292, 312, 326, 334, 398, 421, 437, 446, 520, 546, 564, 574, 642, 664, 679,
+  687, 34, 40, 46, 56, 68, 81, 96, 111, 130, 147, 167, 186, 243, 253, 269, 291,
+  340, 352, 371, 397, 453, 467, 489, 519, 581, 594, 614, 641, 693, 705, 723,
+  747, 41, 49, 55, 67, 77, 91, 107, 124, 138, 161, 177, 194, 252, 268, 290,
+  311, 351, 370, 396, 420, 466, 488, 518, 545, 593, 613, 640, 663, 704, 722,
+  746, 765, 51, 59, 66, 76, 89, 99, 119, 131, 149, 168, 181, 200, 267, 289,
+  310, 325, 369, 395, 419, 436, 487, 517, 544, 563, 612, 639, 662, 678, 721,
+  745, 764, 777, 61, 69, 75, 87, 100, 114, 129, 144, 162, 180, 191, 207, 288,
+  309, 324, 333, 394, 418, 435, 445, 516, 543, 562, 573, 638, 661, 677, 686,
+  744, 763, 776, 783, 70, 79, 86, 97, 108, 122, 137, 155, 242, 251, 266, 287,
+  339, 350, 368, 393, 452, 465, 486, 515, 580, 592, 611, 637, 692, 703, 720,
+  743, 788, 798, 813, 833, 84, 93, 103, 110, 125, 141, 154, 171, 250, 265, 286,
+  308, 349, 367, 392, 417, 464, 485, 514, 542, 591, 610, 636, 660, 702, 719,
+  742, 762, 797, 812, 832, 848, 98, 106, 115, 127, 143, 156, 169, 185, 264,
+  285, 307, 323, 366, 391, 416, 434, 484, 513, 541, 561, 609, 635, 659, 676,
+  718, 741, 761, 775, 811, 831, 847, 858, 117, 128, 136, 148, 160, 175, 188,
+  198, 284, 306, 322, 332, 390, 415, 433, 444, 512, 540, 560, 572, 634, 658,
+  675, 685, 740, 760, 774, 782, 830, 846, 857, 863, 135, 146, 152, 165, 241,
+  249, 263, 283, 338, 348, 365, 389, 451, 463, 483, 511, 579, 590, 608, 633,
+  691, 701, 717, 739, 787, 796, 810, 829, 867, 875, 887, 903, 153, 166, 174,
+  183, 248, 262, 282, 305, 347, 364, 388, 414, 462, 482, 510, 539, 589, 607,
+  632, 657, 700, 716, 738, 759, 795, 809, 828, 845, 874, 886, 902, 915, 176,
+  187, 195, 202, 261, 281, 304, 321, 363, 387, 413, 432, 481, 509, 538, 559,
+  606, 631, 656, 674, 715, 737, 758, 773, 808, 827, 844, 856, 885, 901, 914,
+  923, 192, 199, 206, 213, 280, 303, 320, 331, 386, 412, 431, 443, 508, 537,
+  558, 571, 630, 655, 673, 684, 736, 757, 772, 781, 826, 843, 855, 862, 900,
+  913, 922, 927, 240, 247, 260, 279, 337, 346, 362, 385, 450, 461, 480, 507,
+  578, 588, 605, 629, 690, 699, 714, 735, 786, 794, 807, 825, 866, 873, 884,
+  899, 930, 936, 945, 957, 246, 259, 278, 302, 345, 361, 384, 411, 460, 479,
+  506, 536, 587, 604, 628, 654, 698, 713, 734, 756, 793, 806, 824, 842, 872,
+  883, 898, 912, 935, 944, 956, 966, 258, 277, 301, 319, 360, 383, 410, 430,
+  478, 505, 535, 557, 603, 627, 653, 672, 712, 733, 755, 771, 805, 823, 841,
+  854, 882, 897, 911, 921, 943, 955, 965, 972, 276, 300, 318, 330, 382, 409,
+  429, 442, 504, 534, 556, 570, 626, 652, 671, 683, 732, 754, 770, 780, 822,
+  840, 853, 861, 896, 910, 920, 926, 954, 964, 971, 975, 336, 344, 359, 381,
+  449, 459, 477, 503, 577, 586, 602, 625, 689, 697, 711, 731, 785, 792, 804,
+  821, 865, 871, 881, 895, 929, 934, 942, 953, 977, 981, 987, 995, 343, 358,
+  380, 408, 458, 476, 502, 533, 585, 601, 624, 651, 696, 710, 730, 753, 791,
+  803, 820, 839, 870, 880, 894, 909, 933, 941, 952, 963, 980, 986, 994, 1001,
+  357, 379, 407, 428, 475, 501, 532, 555, 600, 623, 650, 670, 709, 729, 752,
+  769, 802, 819, 838, 852, 879, 893, 908, 919, 940, 951, 962, 970, 985, 993,
+  1000, 1005, 378, 406, 427, 441, 500, 531, 554, 569, 622, 649, 669, 682, 728,
+  751, 768, 779, 818, 837, 851, 860, 892, 907, 918, 925, 950, 961, 969, 974,
+  992, 999, 1004, 1007, 448, 457, 474, 499, 576, 584, 599, 621, 688, 695, 708,
+  727, 784, 790, 801, 817, 864, 869, 878, 891, 928, 932, 939, 949, 976, 979,
+  984, 991, 1008, 1010, 1013, 1017, 456, 473, 498, 530, 583, 598, 620, 648,
+  694, 707, 726, 750, 789, 800, 816, 836, 868, 877, 890, 906, 931, 938, 948,
+  960, 978, 983, 990, 998, 1009, 1012, 1016, 1020, 472, 497, 529, 553, 597,
+  619, 647, 668, 706, 725, 749, 767, 799, 815, 835, 850, 876, 889, 905, 917,
+  937, 947, 959, 968, 982, 989, 997, 1003, 1011, 1015, 1019, 1022, 496, 528,
+  552, 568, 618, 646, 667, 681, 724, 748, 766, 778, 814, 834, 849, 859, 888,
+  904, 916, 924, 946, 958, 967, 973, 988, 996, 1002, 1006, 1014, 1018, 1021,
+  1023,
+};
 
 const scan_order vp9_default_scan_orders[TX_SIZES] = {
   {default_scan_4x4,   vp9_default_iscan_4x4,   default_scan_4x4_neighbors},
@@ -295,93 +725,3 @@
     {default_scan_32x32, vp9_default_iscan_32x32, default_scan_32x32_neighbors},
   }
 };
-
-static int find_in_scan(const int16_t *scan, int l, int idx) {
-  int n, l2 = l * l;
-  for (n = 0; n < l2; n++) {
-    int rc = scan[n];
-    if (rc == idx)
-      return  n;
-  }
-  assert(0);
-  return -1;
-}
-
-static void init_scan_neighbors(const int16_t *scan, int16_t *iscan, int l,
-                                int16_t *neighbors) {
-  int l2 = l * l;
-  int n, i, j;
-
-  // dc doesn't use this type of prediction
-  neighbors[MAX_NEIGHBORS * 0 + 0] = 0;
-  neighbors[MAX_NEIGHBORS * 0 + 1] = 0;
-  iscan[0] = find_in_scan(scan, l, 0);
-  for (n = 1; n < l2; n++) {
-    int rc = scan[n];
-    iscan[n] = find_in_scan(scan, l, n);
-    i = rc / l;
-    j = rc % l;
-    if (i > 0 && j > 0) {
-      // col/row scan is used for adst/dct, and generally means that
-      // energy decreases to zero much faster in the dimension in
-      // which ADST is used compared to the direction in which DCT
-      // is used. Likewise, we find much higher correlation between
-      // coefficients within the direction in which DCT is used.
-      // Therefore, if we use ADST/DCT, prefer the DCT neighbor coeff
-      // as a context. If ADST or DCT is used in both directions, we
-      // use the combination of the two as a context.
-      int a = (i - 1) * l + j;
-      int b =  i      * l + j - 1;
-      if (scan == col_scan_4x4 || scan == col_scan_8x8 ||
-          scan == col_scan_16x16) {
-        // in the col/row scan cases (as well as left/top edge cases), we set
-        // both contexts to the same value, so we can branchlessly do a+b+1>>1
-        // which automatically becomes a if a == b
-        neighbors[MAX_NEIGHBORS * n + 0] =
-        neighbors[MAX_NEIGHBORS * n + 1] = a;
-      } else if (scan == row_scan_4x4 || scan == row_scan_8x8 ||
-                 scan == row_scan_16x16) {
-        neighbors[MAX_NEIGHBORS * n + 0] =
-        neighbors[MAX_NEIGHBORS * n + 1] = b;
-      } else {
-        neighbors[MAX_NEIGHBORS * n + 0] = a;
-        neighbors[MAX_NEIGHBORS * n + 1] = b;
-      }
-    } else if (i > 0) {
-      neighbors[MAX_NEIGHBORS * n + 0] =
-      neighbors[MAX_NEIGHBORS * n + 1] = (i - 1) * l + j;
-    } else {
-      assert(j > 0);
-      neighbors[MAX_NEIGHBORS * n + 0] =
-      neighbors[MAX_NEIGHBORS * n + 1] =  i      * l + j - 1;
-    }
-    assert(iscan[neighbors[MAX_NEIGHBORS * n + 0]] < n);
-  }
-  // one padding item so we don't have to add branches in code to handle
-  // calls to get_coef_context() for the token after the final dc token
-  neighbors[MAX_NEIGHBORS * l2 + 0] = 0;
-  neighbors[MAX_NEIGHBORS * l2 + 1] = 0;
-}
-
-void vp9_init_neighbors() {
-  init_scan_neighbors(default_scan_4x4, vp9_default_iscan_4x4, 4,
-                      default_scan_4x4_neighbors);
-  init_scan_neighbors(row_scan_4x4, vp9_row_iscan_4x4, 4,
-                      row_scan_4x4_neighbors);
-  init_scan_neighbors(col_scan_4x4, vp9_col_iscan_4x4, 4,
-                      col_scan_4x4_neighbors);
-  init_scan_neighbors(default_scan_8x8, vp9_default_iscan_8x8, 8,
-                      default_scan_8x8_neighbors);
-  init_scan_neighbors(row_scan_8x8, vp9_row_iscan_8x8, 8,
-                      row_scan_8x8_neighbors);
-  init_scan_neighbors(col_scan_8x8, vp9_col_iscan_8x8, 8,
-                      col_scan_8x8_neighbors);
-  init_scan_neighbors(default_scan_16x16, vp9_default_iscan_16x16, 16,
-                      default_scan_16x16_neighbors);
-  init_scan_neighbors(row_scan_16x16, vp9_row_iscan_16x16, 16,
-                      row_scan_16x16_neighbors);
-  init_scan_neighbors(col_scan_16x16, vp9_col_iscan_16x16, 16,
-                      col_scan_16x16_neighbors);
-  init_scan_neighbors(default_scan_32x32, vp9_default_iscan_32x32, 32,
-                      default_scan_32x32_neighbors);
-}
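
The static vp9_*_iscan_* tables added earlier in this file are the precomputed
output of the runtime initialization removed here: each iscan table is the
inverse permutation of its scan table, which is exactly what find_in_scan()
used to compute entry by entry. A minimal sketch of that relationship
(generate_iscan is an illustrative helper, not code from this patch):

    #include <stdint.h>

    // iscan is the inverse permutation of scan: coefficient scan[m] is the
    // m-th coefficient visited in scan order, so iscan[scan[m]] == m.
    static void generate_iscan(const int16_t *scan, int16_t *iscan, int l) {
      const int l2 = l * l;
      int m;
      for (m = 0; m < l2; ++m)
        iscan[scan[m]] = m;
    }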
diff --git a/source/libvpx/vp9/common/vp9_scan.h b/source/libvpx/vp9/common/vp9_scan.h
index 9613b67..65e2aa6 100644
--- a/source/libvpx/vp9/common/vp9_scan.h
+++ b/source/libvpx/vp9/common/vp9_scan.h
@@ -23,8 +23,6 @@
 
 #define MAX_NEIGHBORS 2
 
-void vp9_init_neighbors();
-
 typedef struct {
   const int16_t *scan;
   const int16_t *iscan;
diff --git a/source/libvpx/vp9/common/x86/vp9_asm_stubs.c b/source/libvpx/vp9/common/x86/vp9_asm_stubs.c
index 407573a..a0a5996 100644
--- a/source/libvpx/vp9/common/x86/vp9_asm_stubs.c
+++ b/source/libvpx/vp9/common/x86/vp9_asm_stubs.c
@@ -142,7 +142,7 @@
 
 #if CONFIG_VP9_HIGHBITDEPTH
 
-typedef void high_filter8_1dfunction (
+typedef void highbd_filter8_1dfunction (
   const uint16_t *src_ptr,
   const ptrdiff_t src_pitch,
   uint16_t *output_ptr,
@@ -153,87 +153,88 @@
 );
 
 #define HIGH_FUN_CONV_1D(name, step_q4, filter, dir, src_start, avg, opt) \
-  void vp9_high_convolve8_##name##_##opt(const uint8_t *src8, \
-                                         ptrdiff_t src_stride, \
-                                         uint8_t *dst8, ptrdiff_t dst_stride, \
-                                         const int16_t *filter_x, \
-                                         int x_step_q4, \
-                                         const int16_t *filter_y, \
-                                         int y_step_q4, \
-                                         int w, int h, int bd) { \
+  void vp9_highbd_convolve8_##name##_##opt(const uint8_t *src8, \
+                                           ptrdiff_t src_stride, \
+                                           uint8_t *dst8, \
+                                           ptrdiff_t dst_stride, \
+                                           const int16_t *filter_x, \
+                                           int x_step_q4, \
+                                           const int16_t *filter_y, \
+                                           int y_step_q4, \
+                                           int w, int h, int bd) { \
   if (step_q4 == 16 && filter[3] != 128) { \
     uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
     uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
     if (filter[0] || filter[1] || filter[2]) { \
       while (w >= 16) { \
-        vp9_high_filter_block1d16_##dir##8_##avg##opt(src_start, \
-                                                      src_stride, \
-                                                      dst, \
-                                                      dst_stride, \
-                                                      h, \
-                                                      filter, \
-                                                      bd); \
+        vp9_highbd_filter_block1d16_##dir##8_##avg##opt(src_start, \
+                                                        src_stride, \
+                                                        dst, \
+                                                        dst_stride, \
+                                                        h, \
+                                                        filter, \
+                                                        bd); \
         src += 16; \
         dst += 16; \
         w -= 16; \
       } \
       while (w >= 8) { \
-        vp9_high_filter_block1d8_##dir##8_##avg##opt(src_start, \
-                                                     src_stride, \
-                                                     dst, \
-                                                     dst_stride, \
-                                                     h, \
-                                                     filter, \
-                                                     bd); \
+        vp9_highbd_filter_block1d8_##dir##8_##avg##opt(src_start, \
+                                                       src_stride, \
+                                                       dst, \
+                                                       dst_stride, \
+                                                       h, \
+                                                       filter, \
+                                                       bd); \
         src += 8; \
         dst += 8; \
         w -= 8; \
       } \
       while (w >= 4) { \
-        vp9_high_filter_block1d4_##dir##8_##avg##opt(src_start, \
-                                                     src_stride, \
-                                                     dst, \
-                                                     dst_stride, \
-                                                     h, \
-                                                     filter, \
-                                                     bd); \
+        vp9_highbd_filter_block1d4_##dir##8_##avg##opt(src_start, \
+                                                       src_stride, \
+                                                       dst, \
+                                                       dst_stride, \
+                                                       h, \
+                                                       filter, \
+                                                       bd); \
         src += 4; \
         dst += 4; \
         w -= 4; \
       } \
     } else { \
       while (w >= 16) { \
-        vp9_high_filter_block1d16_##dir##2_##avg##opt(src, \
-                                                      src_stride, \
-                                                      dst, \
-                                                      dst_stride, \
-                                                      h, \
-                                                      filter, \
-                                                      bd); \
+        vp9_highbd_filter_block1d16_##dir##2_##avg##opt(src, \
+                                                        src_stride, \
+                                                        dst, \
+                                                        dst_stride, \
+                                                        h, \
+                                                        filter, \
+                                                        bd); \
         src += 16; \
         dst += 16; \
         w -= 16; \
       } \
       while (w >= 8) { \
-        vp9_high_filter_block1d8_##dir##2_##avg##opt(src, \
-                                                     src_stride, \
-                                                     dst, \
-                                                     dst_stride, \
-                                                     h, \
-                                                     filter, \
-                                                     bd); \
+        vp9_highbd_filter_block1d8_##dir##2_##avg##opt(src, \
+                                                       src_stride, \
+                                                       dst, \
+                                                       dst_stride, \
+                                                       h, \
+                                                       filter, \
+                                                       bd); \
         src += 8; \
         dst += 8; \
         w -= 8; \
       } \
       while (w >= 4) { \
-        vp9_high_filter_block1d4_##dir##2_##avg##opt(src, \
-                                                     src_stride, \
-                                                     dst, \
-                                                     dst_stride, \
-                                                     h, \
-                                                     filter, \
-                                                     bd); \
+        vp9_highbd_filter_block1d4_##dir##2_##avg##opt(src, \
+                                                       src_stride, \
+                                                       dst, \
+                                                       dst_stride, \
+                                                       h, \
+                                                       filter, \
+                                                       bd); \
         src += 4; \
         dst += 4; \
         w -= 4; \
@@ -241,47 +242,51 @@
     } \
   } \
   if (w) { \
-    vp9_high_convolve8_##name##_c(src8, src_stride, dst8, dst_stride, \
-                                  filter_x, x_step_q4, filter_y, y_step_q4, \
-                                  w, h, bd); \
+    vp9_highbd_convolve8_##name##_c(src8, src_stride, dst8, dst_stride, \
+                                    filter_x, x_step_q4, filter_y, y_step_q4, \
+                                    w, h, bd); \
   } \
 }
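
The control flow HIGH_FUN_CONV_1D expands to is a width-chunked dispatch: the
widest SSE2 kernel consumes as many 16-column blocks as fit, then the 8- and
4-column kernels, and any leftover columns (or an unsupported step/filter
combination) fall through to the plain-C convolve. A simplified sketch of that
shape, with placeholder kernel signatures rather than the real ones:

    // Chunked 1-D convolve dispatch: widest SIMD kernel first, C remainder.
    static void convolve_1d_dispatch_sketch(int w,
                                            void (*k16)(int x),
                                            void (*k8)(int x),
                                            void (*k4)(int x),
                                            void (*c_fallback)(int x, int w)) {
      int x = 0;
      while (w - x >= 16) { k16(x); x += 16; }
      while (w - x >= 8)  { k8(x);  x += 8;  }
      while (w - x >= 4)  { k4(x);  x += 4;  }
      if (w - x) c_fallback(x, w - x);  // e.g. w not a multiple of 4
    }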
 
 #define HIGH_FUN_CONV_2D(avg, opt) \
-void vp9_high_convolve8_##avg##opt(const uint8_t *src, ptrdiff_t src_stride, \
-                                   uint8_t *dst, ptrdiff_t dst_stride, \
-                                   const int16_t *filter_x, int x_step_q4, \
-                                   const int16_t *filter_y, int y_step_q4, \
-                                   int w, int h, int bd) { \
+void vp9_highbd_convolve8_##avg##opt(const uint8_t *src, ptrdiff_t src_stride, \
+                                     uint8_t *dst, ptrdiff_t dst_stride, \
+                                     const int16_t *filter_x, int x_step_q4, \
+                                     const int16_t *filter_y, int y_step_q4, \
+                                     int w, int h, int bd) { \
   assert(w <= 64); \
   assert(h <= 64); \
   if (x_step_q4 == 16 && y_step_q4 == 16) { \
     if (filter_x[0] || filter_x[1] || filter_x[2] || filter_x[3] == 128 || \
         filter_y[0] || filter_y[1] || filter_y[2] || filter_y[3] == 128) { \
       DECLARE_ALIGNED_ARRAY(16, uint16_t, fdata2, 64 * 71); \
-      vp9_high_convolve8_horiz_##opt(src - 3 * src_stride, src_stride, \
-                                     CONVERT_TO_BYTEPTR(fdata2), 64, \
-                                     filter_x, x_step_q4, filter_y, y_step_q4, \
-                                     w, h + 7, bd); \
-      vp9_high_convolve8_##avg##vert_##opt(CONVERT_TO_BYTEPTR(fdata2) + 192, \
-                                           64, dst, dst_stride, \
-                                           filter_x, x_step_q4, filter_y, \
-                                           y_step_q4, w, h, bd); \
+      vp9_highbd_convolve8_horiz_##opt(src - 3 * src_stride, src_stride, \
+                                       CONVERT_TO_BYTEPTR(fdata2), 64, \
+                                       filter_x, x_step_q4, \
+                                       filter_y, y_step_q4, \
+                                       w, h + 7, bd); \
+      vp9_highbd_convolve8_##avg##vert_##opt(CONVERT_TO_BYTEPTR(fdata2) + 192, \
+                                             64, dst, dst_stride, \
+                                             filter_x, x_step_q4, \
+                                             filter_y, y_step_q4, \
+                                             w, h, bd); \
     } else { \
       DECLARE_ALIGNED_ARRAY(16, uint16_t, fdata2, 64 * 65); \
-      vp9_high_convolve8_horiz_##opt(src, src_stride, \
-                                     CONVERT_TO_BYTEPTR(fdata2), 64, \
-                                     filter_x, x_step_q4, filter_y, y_step_q4, \
-                                     w, h + 1, bd); \
-      vp9_high_convolve8_##avg##vert_##opt(CONVERT_TO_BYTEPTR(fdata2), 64, \
-                                           dst, dst_stride, \
-                                           filter_x, x_step_q4, filter_y, \
-                                           y_step_q4, w, h, bd); \
+      vp9_highbd_convolve8_horiz_##opt(src, src_stride, \
+                                       CONVERT_TO_BYTEPTR(fdata2), 64, \
+                                       filter_x, x_step_q4, \
+                                       filter_y, y_step_q4, \
+                                       w, h + 1, bd); \
+      vp9_highbd_convolve8_##avg##vert_##opt(CONVERT_TO_BYTEPTR(fdata2), 64, \
+                                             dst, dst_stride, \
+                                             filter_x, x_step_q4, \
+                                             filter_y, y_step_q4, \
+                                             w, h, bd); \
     } \
   } else { \
-    vp9_high_convolve8_##avg##c(src, src_stride, dst, dst_stride, \
-                                filter_x, x_step_q4, filter_y, y_step_q4, w, \
-                                h, bd); \
+    vp9_highbd_convolve8_##avg##c(src, src_stride, dst, dst_stride, \
+                                  filter_x, x_step_q4, filter_y, y_step_q4, w, \
+                                  h, bd); \
   } \
 }
 #endif  // CONFIG_VP9_HIGHBITDEPTH
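
HIGH_FUN_CONV_2D above is a standard separable 2-D convolve: a horizontal
8-tap pass writes into a 64-column intermediate buffer with 7 extra rows
(3 above, 4 below the block), and a vertical 8-tap pass then reads from row 3
of that buffer (the +192 offset is 3 * 64 uint16_t), so the vertical filter
has full support for every output row. A plain-C sketch of the same two-pass
structure, assuming VP9's Q7 taps (sum 128, rounded with 64 and shifted right
by 7) and a source padded by the caller; this is illustrative, not the macro's
literal expansion:

    #include <stddef.h>   // ptrdiff_t
    #include <stdint.h>   // uint16_t, int16_t

    static void highbd_convolve8_2d_sketch(const uint16_t *src,
                                           ptrdiff_t src_stride,
                                           uint16_t *dst, ptrdiff_t dst_stride,
                                           const int16_t *filter_x,
                                           const int16_t *filter_y,
                                           int w, int h, int bd) {
      uint16_t tmp[64 * 71];  // w <= 64 columns, h + 7 rows
      const int max = (1 << bd) - 1;
      int r, c, k;
      // Pass 1: horizontal filter over h + 7 rows, starting 3 rows above src.
      for (r = 0; r < h + 7; ++r) {
        for (c = 0; c < w; ++c) {
          int sum = 64;  // rounding term for the Q7 taps
          for (k = 0; k < 8; ++k)
            sum += filter_x[k] * src[(r - 3) * src_stride + (c + k - 3)];
          sum >>= 7;
          tmp[r * 64 + c] = (uint16_t)(sum < 0 ? 0 : (sum > max ? max : sum));
        }
      }
      // Pass 2: vertical filter; output row r reads buffered rows r..r+7.
      for (r = 0; r < h; ++r) {
        for (c = 0; c < w; ++c) {
          int sum = 64;
          for (k = 0; k < 8; ++k)
            sum += filter_y[k] * tmp[(r + k) * 64 + c];
          sum >>= 7;
          dst[r * dst_stride + c] =
              (uint16_t)(sum < 0 ? 0 : (sum > max ? max : sum));
        }
      }
    }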
@@ -485,72 +490,84 @@
 FUN_CONV_2D(avg_ , sse2);
 
 #if CONFIG_VP9_HIGHBITDEPTH && ARCH_X86_64
-high_filter8_1dfunction vp9_high_filter_block1d16_v8_sse2;
-high_filter8_1dfunction vp9_high_filter_block1d16_h8_sse2;
-high_filter8_1dfunction vp9_high_filter_block1d8_v8_sse2;
-high_filter8_1dfunction vp9_high_filter_block1d8_h8_sse2;
-high_filter8_1dfunction vp9_high_filter_block1d4_v8_sse2;
-high_filter8_1dfunction vp9_high_filter_block1d4_h8_sse2;
-high_filter8_1dfunction vp9_high_filter_block1d16_v8_avg_sse2;
-high_filter8_1dfunction vp9_high_filter_block1d16_h8_avg_sse2;
-high_filter8_1dfunction vp9_high_filter_block1d8_v8_avg_sse2;
-high_filter8_1dfunction vp9_high_filter_block1d8_h8_avg_sse2;
-high_filter8_1dfunction vp9_high_filter_block1d4_v8_avg_sse2;
-high_filter8_1dfunction vp9_high_filter_block1d4_h8_avg_sse2;
+highbd_filter8_1dfunction vp9_highbd_filter_block1d16_v8_sse2;
+highbd_filter8_1dfunction vp9_highbd_filter_block1d16_h8_sse2;
+highbd_filter8_1dfunction vp9_highbd_filter_block1d8_v8_sse2;
+highbd_filter8_1dfunction vp9_highbd_filter_block1d8_h8_sse2;
+highbd_filter8_1dfunction vp9_highbd_filter_block1d4_v8_sse2;
+highbd_filter8_1dfunction vp9_highbd_filter_block1d4_h8_sse2;
+highbd_filter8_1dfunction vp9_highbd_filter_block1d16_v8_avg_sse2;
+highbd_filter8_1dfunction vp9_highbd_filter_block1d16_h8_avg_sse2;
+highbd_filter8_1dfunction vp9_highbd_filter_block1d8_v8_avg_sse2;
+highbd_filter8_1dfunction vp9_highbd_filter_block1d8_h8_avg_sse2;
+highbd_filter8_1dfunction vp9_highbd_filter_block1d4_v8_avg_sse2;
+highbd_filter8_1dfunction vp9_highbd_filter_block1d4_h8_avg_sse2;
 
-high_filter8_1dfunction vp9_high_filter_block1d16_v2_sse2;
-high_filter8_1dfunction vp9_high_filter_block1d16_h2_sse2;
-high_filter8_1dfunction vp9_high_filter_block1d8_v2_sse2;
-high_filter8_1dfunction vp9_high_filter_block1d8_h2_sse2;
-high_filter8_1dfunction vp9_high_filter_block1d4_v2_sse2;
-high_filter8_1dfunction vp9_high_filter_block1d4_h2_sse2;
-high_filter8_1dfunction vp9_high_filter_block1d16_v2_avg_sse2;
-high_filter8_1dfunction vp9_high_filter_block1d16_h2_avg_sse2;
-high_filter8_1dfunction vp9_high_filter_block1d8_v2_avg_sse2;
-high_filter8_1dfunction vp9_high_filter_block1d8_h2_avg_sse2;
-high_filter8_1dfunction vp9_high_filter_block1d4_v2_avg_sse2;
-high_filter8_1dfunction vp9_high_filter_block1d4_h2_avg_sse2;
+highbd_filter8_1dfunction vp9_highbd_filter_block1d16_v2_sse2;
+highbd_filter8_1dfunction vp9_highbd_filter_block1d16_h2_sse2;
+highbd_filter8_1dfunction vp9_highbd_filter_block1d8_v2_sse2;
+highbd_filter8_1dfunction vp9_highbd_filter_block1d8_h2_sse2;
+highbd_filter8_1dfunction vp9_highbd_filter_block1d4_v2_sse2;
+highbd_filter8_1dfunction vp9_highbd_filter_block1d4_h2_sse2;
+highbd_filter8_1dfunction vp9_highbd_filter_block1d16_v2_avg_sse2;
+highbd_filter8_1dfunction vp9_highbd_filter_block1d16_h2_avg_sse2;
+highbd_filter8_1dfunction vp9_highbd_filter_block1d8_v2_avg_sse2;
+highbd_filter8_1dfunction vp9_highbd_filter_block1d8_h2_avg_sse2;
+highbd_filter8_1dfunction vp9_highbd_filter_block1d4_v2_avg_sse2;
+highbd_filter8_1dfunction vp9_highbd_filter_block1d4_h2_avg_sse2;
 
-// void vp9_high_convolve8_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride,
-//                                    uint8_t *dst, ptrdiff_t dst_stride,
-//                                    const int16_t *filter_x, int x_step_q4,
-//                                    const int16_t *filter_y, int y_step_q4,
-//                                    int w, int h, int bd);
-// void vp9_high_convolve8_vert_sse2(const uint8_t *src, ptrdiff_t src_stride,
-//                                   uint8_t *dst, ptrdiff_t dst_stride,
-//                                   const int16_t *filter_x, int x_step_q4,
-//                                   const int16_t *filter_y, int y_step_q4,
-//                                   int w, int h, int bd);
-// void vp9_high_convolve8_avg_horiz_sse2(const uint8_t *src,
-//                                        ptrdiff_t src_stride,
-//                                        uint8_t *dst, ptrdiff_t dst_stride,
-//                                        const int16_t *filter_x,
-//                                        int x_step_q4,
-//                                        const int16_t *filter_y,
-//                                        int y_step_q4,
-//                                        int w, int h, int bd);
-// void vp9_high_convolve8_avg_vert_sse2(const uint8_t *src,
-//                                       ptrdiff_t src_stride,
-//                                       uint8_t *dst, ptrdiff_t dst_stride,
-//                                       const int16_t *filter_x, int x_step_q4,
-//                                       const int16_t *filter_y, int y_step_q4,
-//                                       int w, int h, int bd);
+// void vp9_highbd_convolve8_horiz_sse2(const uint8_t *src,
+//                                      ptrdiff_t src_stride,
+//                                      uint8_t *dst,
+//                                      ptrdiff_t dst_stride,
+//                                      const int16_t *filter_x,
+//                                      int x_step_q4,
+//                                      const int16_t *filter_y,
+//                                      int y_step_q4,
+//                                      int w, int h, int bd);
+// void vp9_highbd_convolve8_vert_sse2(const uint8_t *src,
+//                                     ptrdiff_t src_stride,
+//                                     uint8_t *dst,
+//                                     ptrdiff_t dst_stride,
+//                                     const int16_t *filter_x,
+//                                     int x_step_q4,
+//                                     const int16_t *filter_y,
+//                                     int y_step_q4,
+//                                     int w, int h, int bd);
+// void vp9_highbd_convolve8_avg_horiz_sse2(const uint8_t *src,
+//                                          ptrdiff_t src_stride,
+//                                          uint8_t *dst,
+//                                          ptrdiff_t dst_stride,
+//                                          const int16_t *filter_x,
+//                                          int x_step_q4,
+//                                          const int16_t *filter_y,
+//                                          int y_step_q4,
+//                                          int w, int h, int bd);
+// void vp9_highbd_convolve8_avg_vert_sse2(const uint8_t *src,
+//                                         ptrdiff_t src_stride,
+//                                         uint8_t *dst,
+//                                         ptrdiff_t dst_stride,
+//                                         const int16_t *filter_x,
+//                                         int x_step_q4,
+//                                         const int16_t *filter_y,
+//                                         int y_step_q4,
+//                                         int w, int h, int bd);
 HIGH_FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , sse2);
 HIGH_FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , sse2);
 HIGH_FUN_CONV_1D(avg_horiz, x_step_q4, filter_x, h, src, avg_, sse2);
 HIGH_FUN_CONV_1D(avg_vert, y_step_q4, filter_y, v, src - src_stride * 3, avg_,
                  sse2);
 
-// void vp9_high_convolve8_sse2(const uint8_t *src, ptrdiff_t src_stride,
-//                              uint8_t *dst, ptrdiff_t dst_stride,
-//                              const int16_t *filter_x, int x_step_q4,
-//                              const int16_t *filter_y, int y_step_q4,
-//                              int w, int h, int bd);
-// void vp9_high_convolve8_avg_sse2(const uint8_t *src, ptrdiff_t src_stride,
-//                                  uint8_t *dst, ptrdiff_t dst_stride,
-//                                  const int16_t *filter_x, int x_step_q4,
-//                                  const int16_t *filter_y, int y_step_q4,
-//                                  int w, int h, int bd);
+// void vp9_highbd_convolve8_sse2(const uint8_t *src, ptrdiff_t src_stride,
+//                                uint8_t *dst, ptrdiff_t dst_stride,
+//                                const int16_t *filter_x, int x_step_q4,
+//                                const int16_t *filter_y, int y_step_q4,
+//                                int w, int h, int bd);
+// void vp9_highbd_convolve8_avg_sse2(const uint8_t *src, ptrdiff_t src_stride,
+//                                    uint8_t *dst, ptrdiff_t dst_stride,
+//                                    const int16_t *filter_x, int x_step_q4,
+//                                    const int16_t *filter_y, int y_step_q4,
+//                                    int w, int h, int bd);
 HIGH_FUN_CONV_2D(, sse2);
 HIGH_FUN_CONV_2D(avg_ , sse2);
 #endif  // CONFIG_VP9_HIGHBITDEPTH && ARCH_X86_64
diff --git a/source/libvpx/vp9/common/x86/vp9_high_intrapred_sse2.asm b/source/libvpx/vp9/common/x86/vp9_high_intrapred_sse2.asm
index ff45071..721126c 100644
--- a/source/libvpx/vp9/common/x86/vp9_high_intrapred_sse2.asm
+++ b/source/libvpx/vp9/common/x86/vp9_high_intrapred_sse2.asm
@@ -18,7 +18,7 @@
 
 SECTION .text
 INIT_MMX sse
-cglobal high_dc_predictor_4x4, 4, 5, 4, dst, stride, above, left, goffset
+cglobal highbd_dc_predictor_4x4, 4, 5, 4, dst, stride, above, left, goffset
   GET_GOT     goffsetq
 
   movq                  m0, [aboveq]
@@ -45,7 +45,7 @@
   RET
 
 INIT_XMM sse2
-cglobal high_dc_predictor_8x8, 4, 5, 4, dst, stride, above, left, goffset
+cglobal highbd_dc_predictor_8x8, 4, 5, 4, dst, stride, above, left, goffset
   GET_GOT     goffsetq
 
   pxor                  m1, m1
@@ -80,7 +80,7 @@
   RET
 
 INIT_XMM sse2
-cglobal high_dc_predictor_16x16, 4, 5, 5, dst, stride, above, left, goffset
+cglobal highbd_dc_predictor_16x16, 4, 5, 5, dst, stride, above, left, goffset
   GET_GOT     goffsetq
 
   pxor                  m1, m1
@@ -124,7 +124,7 @@
 
 %if ARCH_X86_64
 INIT_XMM sse2
-cglobal high_dc_predictor_32x32, 4, 5, 9, dst, stride, above, left, goffset
+cglobal highbd_dc_predictor_32x32, 4, 5, 9, dst, stride, above, left, goffset
   GET_GOT     goffsetq
 
   pxor                  m1, m1
@@ -184,7 +184,7 @@
 %endif
 
 INIT_MMX sse
-cglobal high_v_predictor_4x4, 3, 3, 1, dst, stride, above
+cglobal highbd_v_predictor_4x4, 3, 3, 1, dst, stride, above
   movq                  m0, [aboveq]
   movq    [dstq          ], m0
   movq    [dstq+strideq*2], m0
@@ -194,7 +194,7 @@
   RET
 
 INIT_XMM sse2
-cglobal high_v_predictor_8x8, 3, 3, 1, dst, stride, above
+cglobal highbd_v_predictor_8x8, 3, 3, 1, dst, stride, above
   mova                  m0, [aboveq]
   DEFINE_ARGS dst, stride, stride3
   lea             stride3q, [strideq*3]
@@ -210,7 +210,7 @@
   RET
 
 INIT_XMM sse2
-cglobal high_v_predictor_16x16, 3, 4, 2, dst, stride, above
+cglobal highbd_v_predictor_16x16, 3, 4, 2, dst, stride, above
   mova                  m0, [aboveq]
   mova                  m1, [aboveq+16]
   DEFINE_ARGS dst, stride, stride3, nlines4
@@ -231,7 +231,7 @@
   REP_RET
 
 INIT_XMM sse2
-cglobal high_v_predictor_32x32, 3, 4, 4, dst, stride, above
+cglobal highbd_v_predictor_32x32, 3, 4, 4, dst, stride, above
   mova                  m0, [aboveq]
   mova                  m1, [aboveq+16]
   mova                  m2, [aboveq+32]
@@ -262,7 +262,7 @@
   REP_RET
 
 INIT_MMX sse
-cglobal high_tm_predictor_4x4, 5, 6, 5, dst, stride, above, left, bps, one
+cglobal highbd_tm_predictor_4x4, 5, 6, 5, dst, stride, above, left, bps, one
   movd                  m1, [aboveq-2]
   movq                  m0, [aboveq]
   pshufw                m1, m1, 0x0
@@ -300,7 +300,7 @@
   REP_RET
 
 INIT_XMM sse2
-cglobal high_tm_predictor_8x8, 5, 6, 5, dst, stride, above, left, bps, one
+cglobal highbd_tm_predictor_8x8, 5, 6, 5, dst, stride, above, left, bps, one
   movd                  m1, [aboveq-2]
   mova                  m0, [aboveq]
   pshuflw               m1, m1, 0x0
@@ -345,7 +345,7 @@
 
 %if ARCH_X86_64
 INIT_XMM sse2
-cglobal high_tm_predictor_16x16, 5, 6, 8, dst, stride, above, left, bps, one
+cglobal highbd_tm_predictor_16x16, 5, 6, 8, dst, stride, above, left, bps, one
   movd                  m2, [aboveq-2]
   mova                  m0, [aboveq]
   mova                  m1, [aboveq+16]
@@ -399,7 +399,7 @@
   REP_RET
 
 INIT_XMM sse2
-cglobal high_tm_predictor_32x32, 5, 6, 12, dst, stride, above, left, bps, one
+cglobal highbd_tm_predictor_32x32, 5, 6, 12, dst, stride, above, left, bps, one
   movd                  m0, [aboveq-2]
   mova                  m1, [aboveq]
   mova                  m2, [aboveq+16]
diff --git a/source/libvpx/vp9/common/x86/vp9_high_loopfilter_intrin_sse2.c b/source/libvpx/vp9/common/x86/vp9_high_loopfilter_intrin_sse2.c
new file mode 100644
index 0000000..7e63f38
--- /dev/null
+++ b/source/libvpx/vp9/common/x86/vp9_high_loopfilter_intrin_sse2.c
@@ -0,0 +1,1119 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <emmintrin.h>  // SSE2
+
+#include "./vp9_rtcd.h"
+#include "vp9/common/vp9_loopfilter.h"
+#include "vpx_ports/emmintrin_compat.h"
+
+static INLINE __m128i signed_char_clamp_bd_sse2(__m128i value, int bd) {
+  __m128i ubounded;
+  __m128i lbounded;
+  __m128i retval;
+
+  const __m128i zero = _mm_set1_epi16(0);
+  const __m128i one = _mm_set1_epi16(1);
+  const __m128i t80 = _mm_slli_epi16(_mm_set1_epi16(0x80), bd - 8);
+  const __m128i max = _mm_subs_epi16(
+      _mm_subs_epi16(_mm_slli_epi16(one, bd), one), t80);
+  const __m128i min = _mm_subs_epi16(zero, t80);
+  ubounded = _mm_cmpgt_epi16(value, max);
+  lbounded = _mm_cmplt_epi16(value, min);
+  retval = _mm_andnot_si128(_mm_or_si128(ubounded, lbounded), value);
+  ubounded = _mm_and_si128(ubounded, max);
+  lbounded = _mm_and_si128(lbounded, min);
+  retval = _mm_or_si128(retval, ubounded);
+  retval = _mm_or_si128(retval, lbounded);
+  return retval;
+}
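
In scalar terms, the clamp above limits each 16-bit lane to the signed range
left after subtracting the t80 bias from [0, (1 << bd) - 1]. A per-lane sketch
of the same logic (the SSE2 version additionally uses saturating arithmetic,
but for the bit depths involved, 8 to 12, the values fit in int16_t either
way):

    #include <stdint.h>

    static int16_t signed_clamp_bd_sketch(int32_t value, int bd) {
      const int32_t t80 = 0x80 << (bd - 8);       // bias, e.g. 2048 at bd = 12
      const int32_t max = ((1 << bd) - 1) - t80;  // upper bound after bias
      const int32_t min = -t80;                   // lower bound after bias
      if (value > max) return (int16_t)max;
      if (value < min) return (int16_t)min;
      return (int16_t)value;
    }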
+
+// TODO(debargha, peter): Break up large functions into smaller ones
+// in this file.
+static void highbd_mb_lpf_horizontal_edge_w_sse2_8(uint16_t *s,
+                                                   int p,
+                                                   const uint8_t *_blimit,
+                                                   const uint8_t *_limit,
+                                                   const uint8_t *_thresh,
+                                                   int bd) {
+  const __m128i zero = _mm_set1_epi16(0);
+  const __m128i one = _mm_set1_epi16(1);
+  const __m128i blimit = _mm_slli_epi16(
+      _mm_unpacklo_epi8(
+          _mm_load_si128((const __m128i *)_blimit), zero), bd - 8);
+  const __m128i limit = _mm_slli_epi16(
+      _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero), bd - 8);
+  const __m128i thresh = _mm_slli_epi16(
+      _mm_unpacklo_epi8(
+          _mm_load_si128((const __m128i *)_thresh), zero), bd - 8);
+  __m128i q7, p7, q6, p6, q5, p5, q4, p4, q3, p3, q2, p2, q1, p1, q0, p0;
+  __m128i mask, hev, flat, flat2, abs_p1p0, abs_q1q0;
+  __m128i ps1, qs1, ps0, qs0;
+  __m128i abs_p0q0, abs_p1q1, ffff, work;
+  __m128i filt, work_a, filter1, filter2;
+  __m128i flat2_q6, flat2_p6, flat2_q5, flat2_p5, flat2_q4, flat2_p4;
+  __m128i flat2_q3, flat2_p3, flat2_q2, flat2_p2, flat2_q1, flat2_p1;
+  __m128i flat2_q0, flat2_p0;
+  __m128i flat_q2, flat_p2, flat_q1, flat_p1, flat_q0, flat_p0;
+  __m128i pixelFilter_p, pixelFilter_q;
+  __m128i pixetFilter_p2p1p0, pixetFilter_q2q1q0;
+  __m128i sum_p7, sum_q7, sum_p3, sum_q3;
+  __m128i t4, t3, t80, t1;
+  __m128i eight, four;
+
+  q4 = _mm_load_si128((__m128i *)(s + 4 * p));
+  p4 = _mm_load_si128((__m128i *)(s - 5 * p));
+  q3 = _mm_load_si128((__m128i *)(s + 3 * p));
+  p3 = _mm_load_si128((__m128i *)(s - 4 * p));
+  q2 = _mm_load_si128((__m128i *)(s + 2 * p));
+  p2 = _mm_load_si128((__m128i *)(s - 3 * p));
+  q1 = _mm_load_si128((__m128i *)(s + 1 * p));
+  p1 = _mm_load_si128((__m128i *)(s - 2 * p));
+  q0 = _mm_load_si128((__m128i *)(s + 0 * p));
+  p0 = _mm_load_si128((__m128i *)(s - 1 * p));
+
+  //  highbd_filter_mask
+  abs_p1p0 = _mm_or_si128(_mm_subs_epu16(p1, p0), _mm_subs_epu16(p0, p1));
+  abs_q1q0 = _mm_or_si128(_mm_subs_epu16(q1, q0), _mm_subs_epu16(q0, q1));
+
+  ffff = _mm_cmpeq_epi16(abs_p1p0, abs_p1p0);
+
+  abs_p0q0 = _mm_or_si128(_mm_subs_epu16(p0, q0), _mm_subs_epu16(q0, p0));
+  abs_p1q1 = _mm_or_si128(_mm_subs_epu16(p1, q1), _mm_subs_epu16(q1, p1));
+
+  //  highbd_hev_mask (in C code this is actually called from highbd_filter4)
+  flat = _mm_max_epi16(abs_p1p0, abs_q1q0);
+  hev = _mm_subs_epu16(flat, thresh);
+  hev = _mm_xor_si128(_mm_cmpeq_epi16(hev, zero), ffff);
+
+  abs_p0q0 = _mm_adds_epu16(abs_p0q0, abs_p0q0);  // abs(p0 - q0) * 2
+  abs_p1q1 = _mm_srli_epi16(abs_p1q1, 1);  // abs(p1 - q1) / 2
+  mask = _mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), blimit);
+  mask = _mm_xor_si128(_mm_cmpeq_epi16(mask, zero), ffff);
+  mask = _mm_and_si128(mask, _mm_adds_epu16(limit, one));
+  work = _mm_max_epi16(_mm_or_si128(_mm_subs_epu16(p1, p0),
+                                    _mm_subs_epu16(p0, p1)),
+                       _mm_or_si128(_mm_subs_epu16(q1, q0),
+                                    _mm_subs_epu16(q0, q1)));
+  mask = _mm_max_epi16(work, mask);
+  work = _mm_max_epi16(_mm_or_si128(_mm_subs_epu16(p2, p1),
+                                    _mm_subs_epu16(p1, p2)),
+                       _mm_or_si128(_mm_subs_epu16(q2, q1),
+                                    _mm_subs_epu16(q1, q2)));
+  mask = _mm_max_epi16(work, mask);
+  work = _mm_max_epi16(_mm_or_si128(_mm_subs_epu16(p3, p2),
+                                    _mm_subs_epu16(p2, p3)),
+                       _mm_or_si128(_mm_subs_epu16(q3, q2),
+                                    _mm_subs_epu16(q2, q3)));
+  mask = _mm_max_epi16(work, mask);
+
+  mask = _mm_subs_epu16(mask, limit);
+  mask = _mm_cmpeq_epi16(mask, zero);  // return ~mask
+
+  // lp filter
+  // highbd_filter4
+  t4 = _mm_set1_epi16(4);
+  t3 = _mm_set1_epi16(3);
+  t80 = _mm_slli_epi16(_mm_set1_epi16(0x80), bd - 8);
+  t1 = _mm_set1_epi16(0x1);
+
+  ps1 = _mm_subs_epi16(p1, t80);
+  qs1 = _mm_subs_epi16(q1, t80);
+  ps0 = _mm_subs_epi16(p0, t80);
+  qs0 = _mm_subs_epi16(q0, t80);
+
+  filt = _mm_and_si128(
+      signed_char_clamp_bd_sse2(_mm_subs_epi16(ps1, qs1), bd), hev);
+  work_a = _mm_subs_epi16(qs0, ps0);
+  filt = _mm_adds_epi16(filt, work_a);
+  filt = _mm_adds_epi16(filt, work_a);
+  filt = signed_char_clamp_bd_sse2(_mm_adds_epi16(filt, work_a), bd);
+  filt = _mm_and_si128(filt, mask);
+
+  filter1 = signed_char_clamp_bd_sse2(_mm_adds_epi16(filt, t4), bd);
+  filter2 = signed_char_clamp_bd_sse2(_mm_adds_epi16(filt, t3), bd);
+
+  // Filter1 >> 3, Filter2 >> 3
+  filter1 = _mm_srai_epi16(filter1, 0x3);
+  filter2 = _mm_srai_epi16(filter2, 0x3);
+
+  qs0 = _mm_adds_epi16(
+      signed_char_clamp_bd_sse2(_mm_subs_epi16(qs0, filter1), bd),
+      t80);
+  ps0 = _mm_adds_epi16(
+      signed_char_clamp_bd_sse2(_mm_adds_epi16(ps0, filter2), bd),
+      t80);
+  filt = _mm_adds_epi16(filter1, t1);
+  filt = _mm_srai_epi16(filt, 1);
+  filt = _mm_andnot_si128(hev, filt);
+
+  qs1 = _mm_adds_epi16(
+      signed_char_clamp_bd_sse2(_mm_subs_epi16(qs1, filt), bd),
+      t80);
+  ps1 = _mm_adds_epi16(
+      signed_char_clamp_bd_sse2(_mm_adds_epi16(ps1, filt), bd),
+      t80);
+  // end highbd_filter4
+  // loopfilter done
+
+  // highbd_flat_mask4
+  flat = _mm_max_epi16(_mm_or_si128(_mm_subs_epu16(p2, p0),
+                                    _mm_subs_epu16(p0, p2)),
+                       _mm_or_si128(_mm_subs_epu16(p3, p0),
+                                    _mm_subs_epu16(p0, p3)));
+  work = _mm_max_epi16(_mm_or_si128(_mm_subs_epu16(q2, q0),
+                                    _mm_subs_epu16(q0, q2)),
+                       _mm_or_si128(_mm_subs_epu16(q3, q0),
+                                    _mm_subs_epu16(q0, q3)));
+  flat = _mm_max_epi16(work, flat);
+  work = _mm_max_epi16(abs_p1p0, abs_q1q0);
+  flat = _mm_max_epi16(work, flat);
+  flat = _mm_subs_epu16(flat, _mm_slli_epi16(one, bd - 8));
+  flat = _mm_cmpeq_epi16(flat, zero);
+  // end flat_mask4
+
+  // flat & mask = flat && mask (as used in filter8)
+  // (because, in both vars, each 16-bit lane is either all 1s or all 0s)
+  flat = _mm_and_si128(flat, mask);
+
+  p5 = _mm_load_si128((__m128i *)(s - 6 * p));
+  q5 = _mm_load_si128((__m128i *)(s + 5 * p));
+  p6 = _mm_load_si128((__m128i *)(s - 7 * p));
+  q6 = _mm_load_si128((__m128i *)(s + 6 * p));
+  p7 = _mm_load_si128((__m128i *)(s - 8 * p));
+  q7 = _mm_load_si128((__m128i *)(s + 7 * p));
+
+  // highbd_flat_mask5: the arguments passed in are p0, q0, p4-p7 and q4-q7,
+  // but the function body refers to them as p0-p4 and q0-q4.
+  flat2 = _mm_max_epi16(_mm_or_si128(_mm_subs_epu16(p4, p0),
+                                     _mm_subs_epu16(p0, p4)),
+                        _mm_or_si128(_mm_subs_epu16(q4, q0),
+                                     _mm_subs_epu16(q0, q4)));
+
+  work = _mm_max_epi16(_mm_or_si128(_mm_subs_epu16(p5, p0),
+                                    _mm_subs_epu16(p0, p5)),
+                       _mm_or_si128(_mm_subs_epu16(q5, q0),
+                                    _mm_subs_epu16(q0, q5)));
+  flat2 = _mm_max_epi16(work, flat2);
+
+  work = _mm_max_epi16(_mm_or_si128(_mm_subs_epu16(p6, p0),
+                                    _mm_subs_epu16(p0, p6)),
+                       _mm_or_si128(_mm_subs_epu16(q6, q0),
+                                    _mm_subs_epu16(q0, q6)));
+  flat2 = _mm_max_epi16(work, flat2);
+
+  work = _mm_max_epi16(_mm_or_si128(_mm_subs_epu16(p7, p0),
+                                    _mm_subs_epu16(p0, p7)),
+                       _mm_or_si128(_mm_subs_epu16(q7, q0),
+                                    _mm_subs_epu16(q0, q7)));
+  flat2 = _mm_max_epi16(work, flat2);
+
+  flat2 = _mm_subs_epu16(flat2, _mm_slli_epi16(one, bd - 8));
+  flat2 = _mm_cmpeq_epi16(flat2, zero);
+  flat2 = _mm_and_si128(flat2, flat);  // flat2 & flat & mask
+  // end highbd_flat_mask5
+
+  // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  // flat and wide flat calculations
+  eight = _mm_set1_epi16(8);
+  four = _mm_set1_epi16(4);
+
+  pixelFilter_p = _mm_add_epi16(_mm_add_epi16(p6, p5),
+                                _mm_add_epi16(p4, p3));
+  pixelFilter_q = _mm_add_epi16(_mm_add_epi16(q6, q5),
+                                _mm_add_epi16(q4, q3));
+
+  pixetFilter_p2p1p0 = _mm_add_epi16(p0, _mm_add_epi16(p2, p1));
+  pixelFilter_p = _mm_add_epi16(pixelFilter_p, pixetFilter_p2p1p0);
+
+  pixetFilter_q2q1q0 = _mm_add_epi16(q0, _mm_add_epi16(q2, q1));
+  pixelFilter_q = _mm_add_epi16(pixelFilter_q, pixetFilter_q2q1q0);
+  pixelFilter_p = _mm_add_epi16(eight, _mm_add_epi16(pixelFilter_p,
+                                                      pixelFilter_q));
+  pixetFilter_p2p1p0 = _mm_add_epi16(four,
+                                     _mm_add_epi16(pixetFilter_p2p1p0,
+                                                   pixetFilter_q2q1q0));
+  flat2_p0 = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p,
+                                          _mm_add_epi16(p7, p0)), 4);
+  flat2_q0 = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p,
+                                          _mm_add_epi16(q7, q0)), 4);
+  flat_p0 = _mm_srli_epi16(_mm_add_epi16(pixetFilter_p2p1p0,
+                                         _mm_add_epi16(p3, p0)), 3);
+  flat_q0 = _mm_srli_epi16(_mm_add_epi16(pixetFilter_p2p1p0,
+                                         _mm_add_epi16(q3, q0)), 3);
+
+  sum_p7 = _mm_add_epi16(p7, p7);
+  sum_q7 = _mm_add_epi16(q7, q7);
+  sum_p3 = _mm_add_epi16(p3, p3);
+  sum_q3 = _mm_add_epi16(q3, q3);
+
+  pixelFilter_q = _mm_sub_epi16(pixelFilter_p, p6);
+  pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q6);
+  flat2_p1 = _mm_srli_epi16(
+      _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p1)), 4);
+  flat2_q1 = _mm_srli_epi16(
+      _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q1)), 4);
+
+  pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_p2p1p0, p2);
+  pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q2);
+  flat_p1 = _mm_srli_epi16(_mm_add_epi16(pixetFilter_p2p1p0,
+                                         _mm_add_epi16(sum_p3, p1)), 3);
+  flat_q1 = _mm_srli_epi16(_mm_add_epi16(pixetFilter_q2q1q0,
+                                         _mm_add_epi16(sum_q3, q1)), 3);
+
+  sum_p7 = _mm_add_epi16(sum_p7, p7);
+  sum_q7 = _mm_add_epi16(sum_q7, q7);
+  sum_p3 = _mm_add_epi16(sum_p3, p3);
+  sum_q3 = _mm_add_epi16(sum_q3, q3);
+
+  pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q5);
+  pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p5);
+  flat2_p2 = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p,
+                                          _mm_add_epi16(sum_p7, p2)), 4);
+  flat2_q2 = _mm_srli_epi16(_mm_add_epi16(pixelFilter_q,
+                                          _mm_add_epi16(sum_q7, q2)), 4);
+
+  pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q1);
+  pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_q2q1q0, p1);
+  flat_p2 = _mm_srli_epi16(_mm_add_epi16(pixetFilter_p2p1p0,
+                                         _mm_add_epi16(sum_p3, p2)), 3);
+  flat_q2 = _mm_srli_epi16(_mm_add_epi16(pixetFilter_q2q1q0,
+                                         _mm_add_epi16(sum_q3, q2)), 3);
+
+  sum_p7 = _mm_add_epi16(sum_p7, p7);
+  sum_q7 = _mm_add_epi16(sum_q7, q7);
+  pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q4);
+  pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p4);
+  flat2_p3 = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p,
+                                          _mm_add_epi16(sum_p7, p3)), 4);
+  flat2_q3 = _mm_srli_epi16(_mm_add_epi16(pixelFilter_q,
+                                          _mm_add_epi16(sum_q7, q3)), 4);
+
+  sum_p7 = _mm_add_epi16(sum_p7, p7);
+  sum_q7 = _mm_add_epi16(sum_q7, q7);
+  pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q3);
+  pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p3);
+  flat2_p4 = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p,
+                                          _mm_add_epi16(sum_p7, p4)), 4);
+  flat2_q4 = _mm_srli_epi16(_mm_add_epi16(pixelFilter_q,
+                                          _mm_add_epi16(sum_q7, q4)), 4);
+
+  sum_p7 = _mm_add_epi16(sum_p7, p7);
+  sum_q7 = _mm_add_epi16(sum_q7, q7);
+  pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q2);
+  pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p2);
+  flat2_p5 = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p,
+                                          _mm_add_epi16(sum_p7, p5)), 4);
+  flat2_q5 = _mm_srli_epi16(_mm_add_epi16(pixelFilter_q,
+                                          _mm_add_epi16(sum_q7, q5)), 4);
+
+  sum_p7 = _mm_add_epi16(sum_p7, p7);
+  sum_q7 = _mm_add_epi16(sum_q7, q7);
+  pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q1);
+  pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p1);
+  flat2_p6 = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p,
+                                          _mm_add_epi16(sum_p7, p6)), 4);
+  flat2_q6 = _mm_srli_epi16(_mm_add_epi16(pixelFilter_q,
+                                          _mm_add_epi16(sum_q7, q6)), 4);
+
+  //  wide flat
+  //  ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+  //  highbd_filter8
+  p2 = _mm_andnot_si128(flat, p2);
+  //  p2 remains unchanged if !(flat && mask)
+  flat_p2 = _mm_and_si128(flat, flat_p2);
+  //  when (flat && mask)
+  p2 = _mm_or_si128(p2, flat_p2);  // full list of p2 values
+  q2 = _mm_andnot_si128(flat, q2);
+  flat_q2 = _mm_and_si128(flat, flat_q2);
+  q2 = _mm_or_si128(q2, flat_q2);  // full list of q2 values
+
+  ps1 = _mm_andnot_si128(flat, ps1);
+  //  p1 takes the value assigned to it in filter4 if !(flat && mask)
+  flat_p1 = _mm_and_si128(flat, flat_p1);
+  //  when (flat && mask)
+  p1 = _mm_or_si128(ps1, flat_p1);  // full list of p1 values
+  qs1 = _mm_andnot_si128(flat, qs1);
+  flat_q1 = _mm_and_si128(flat, flat_q1);
+  q1 = _mm_or_si128(qs1, flat_q1);  // full list of q1 values
+
+  ps0 = _mm_andnot_si128(flat, ps0);
+  //  p0 takes the value assigned to it in filter4 if !(flat && mask)
+  flat_p0 = _mm_and_si128(flat, flat_p0);
+  //  when (flat && mask)
+  p0 = _mm_or_si128(ps0, flat_p0);  // full list of p0 values
+  qs0 = _mm_andnot_si128(flat, qs0);
+  flat_q0 = _mm_and_si128(flat, flat_q0);
+  q0 = _mm_or_si128(qs0, flat_q0);  // full list of q0 values
+  // end highbd_filter8
+
+  // highbd_filter16
+  p6 = _mm_andnot_si128(flat2, p6);
+  //  p6 remains unchanged if !(flat2 && flat && mask)
+  flat2_p6 = _mm_and_si128(flat2, flat2_p6);
+  //  get values for when (flat2 && flat && mask)
+  p6 = _mm_or_si128(p6, flat2_p6);  // full list of p6 values
+  q6 = _mm_andnot_si128(flat2, q6);
+  //  q6 remains unchanged if !(flat2 && flat && mask)
+  flat2_q6 = _mm_and_si128(flat2, flat2_q6);
+  //  get values for when (flat2 && flat && mask)
+  q6 = _mm_or_si128(q6, flat2_q6);  // full list of q6 values
+  _mm_store_si128((__m128i *)(s - 7 * p), p6);
+  _mm_store_si128((__m128i *)(s + 6 * p), q6);
+
+  p5 = _mm_andnot_si128(flat2, p5);
+  //  p5 remains unchanged if !(flat2 && flat && mask)
+  flat2_p5 = _mm_and_si128(flat2, flat2_p5);
+  //  get values for when (flat2 && flat && mask)
+  p5 = _mm_or_si128(p5, flat2_p5);
+  //  full list of p5 values
+  q5 = _mm_andnot_si128(flat2, q5);
+  //  q5 remains unchanged if !(flat2 && flat && mask)
+  flat2_q5 = _mm_and_si128(flat2, flat2_q5);
+  //  get values for when (flat2 && flat && mask)
+  q5 = _mm_or_si128(q5, flat2_q5);
+  //  full list of q5 values
+  _mm_store_si128((__m128i *)(s - 6 * p), p5);
+  _mm_store_si128((__m128i *)(s + 5 * p), q5);
+
+  p4 = _mm_andnot_si128(flat2, p4);
+  //  p4 remains unchanged if !(flat2 && flat && mask)
+  flat2_p4 = _mm_and_si128(flat2, flat2_p4);
+  //  get values for when (flat2 && flat && mask)
+  p4 = _mm_or_si128(p4, flat2_p4);  // full list of p4 values
+  q4 = _mm_andnot_si128(flat2, q4);
+  //  q4 remains unchanged if !(flat2 && flat && mask)
+  flat2_q4 = _mm_and_si128(flat2, flat2_q4);
+  //  get values for when (flat2 && flat && mask)
+  q4 = _mm_or_si128(q4, flat2_q4);  // full list of q4 values
+  _mm_store_si128((__m128i *)(s - 5 * p), p4);
+  _mm_store_si128((__m128i *)(s + 4 * p), q4);
+
+  p3 = _mm_andnot_si128(flat2, p3);
+  //  p3 takes value from highbd_filter8 if !(flat2 && flat && mask)
+  flat2_p3 = _mm_and_si128(flat2, flat2_p3);
+  //  get values for when (flat2 && flat && mask)
+  p3 = _mm_or_si128(p3, flat2_p3);  // full list of p3 values
+  q3 = _mm_andnot_si128(flat2, q3);
+  //  q3 takes value from highbd_filter8 if !(flat2 && flat && mask)
+  flat2_q3 = _mm_and_si128(flat2, flat2_q3);
+  //  get values for when (flat2 && flat && mask)
+  q3 = _mm_or_si128(q3, flat2_q3);  // full list of q3 values
+  _mm_store_si128((__m128i *)(s - 4 * p), p3);
+  _mm_store_si128((__m128i *)(s + 3 * p), q3);
+
+  p2 = _mm_andnot_si128(flat2, p2);
+  //  p2 takes value from highbd_filter8 if !(flat2 && flat && mask)
+  flat2_p2 = _mm_and_si128(flat2, flat2_p2);
+  //  get values for when (flat2 && flat && mask)
+  p2 = _mm_or_si128(p2, flat2_p2);  // full list of p2 values
+  q2 = _mm_andnot_si128(flat2, q2);
+  //  q2 takes value from highbd_filter8 if !(flat2 && flat && mask)
+  flat2_q2 = _mm_and_si128(flat2, flat2_q2);
+  //  get values for when (flat2 && flat && mask)
+  q2 = _mm_or_si128(q2, flat2_q2);  // full list of q2 values
+  _mm_store_si128((__m128i *)(s - 3 * p), p2);
+  _mm_store_si128((__m128i *)(s + 2 * p), q2);
+
+  p1 = _mm_andnot_si128(flat2, p1);
+  //  p1 takes value from highbd_filter8 if !(flat2 && flat && mask)
+  flat2_p1 = _mm_and_si128(flat2, flat2_p1);
+  //  get values for when (flat2 && flat && mask)
+  p1 = _mm_or_si128(p1, flat2_p1);  // full list of p1 values
+  q1 = _mm_andnot_si128(flat2, q1);
+  //  q1 takes value from highbd_filter8 if !(flat2 && flat && mask)
+  flat2_q1 = _mm_and_si128(flat2, flat2_q1);
+  //  get values for when (flat2 && flat && mask)
+  q1 = _mm_or_si128(q1, flat2_q1);  // full list of q1 values
+  _mm_store_si128((__m128i *)(s - 2 * p), p1);
+  _mm_store_si128((__m128i *)(s + 1 * p), q1);
+
+  p0 = _mm_andnot_si128(flat2, p0);
+  //  p0 takes value from highbd_filter8 if !(flat2 && flat && mask)
+  flat2_p0 = _mm_and_si128(flat2, flat2_p0);
+  //  get values for when (flat2 && flat && mask)
+  p0 = _mm_or_si128(p0, flat2_p0);  // full list of p0 values
+  q0 = _mm_andnot_si128(flat2, q0);
+  //  q0 takes value from highbd_filter8 if !(flat2 && flat && mask)
+  flat2_q0 = _mm_and_si128(flat2, flat2_q0);
+  //  get values for when (flat2 && flat && mask)
+  q0 = _mm_or_si128(q0, flat2_q0);  // full list of q0 values
+  _mm_store_si128((__m128i *)(s - 1 * p), p0);
+  _mm_store_si128((__m128i *)(s + 0 * p), q0);
+}
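+
+// The andnot/and/or triples above implement a branchless per-lane select:
+// mask lanes are all-ones or all-zeros, so (a & ~mask) | (b & mask) keeps a
+// where the condition failed and takes b where it held. A minimal sketch of
+// the idiom as a helper (hypothetical, not part of libvpx):
+//
+//   static INLINE __m128i select_lanes(__m128i mask, __m128i if_set,
+//                                      __m128i if_clear) {
+//     return _mm_or_si128(_mm_and_si128(mask, if_set),
+//                         _mm_andnot_si128(mask, if_clear));
+//   }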
+
+static void highbd_mb_lpf_horizontal_edge_w_sse2_16(uint16_t *s,
+                                                    int p,
+                                                    const uint8_t *_blimit,
+                                                    const uint8_t *_limit,
+                                                    const uint8_t *_thresh,
+                                                    int bd) {
+  highbd_mb_lpf_horizontal_edge_w_sse2_8(s, p, _blimit, _limit, _thresh, bd);
+  highbd_mb_lpf_horizontal_edge_w_sse2_8(s + 8, p, _blimit, _limit, _thresh,
+                                         bd);
+}
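+
+// The 16-wide variant simply runs the 8-wide kernel on the two adjacent
+// 8-pixel halves of the edge.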
+
+// TODO(yunqingwang): remove count and call these two functions (8 or 16) directly.
+void vp9_highbd_lpf_horizontal_16_sse2(uint16_t *s, int p,
+                                       const uint8_t *_blimit,
+                                       const uint8_t *_limit,
+                                       const uint8_t *_thresh,
+                                       int count, int bd) {
+  if (count == 1)
+    highbd_mb_lpf_horizontal_edge_w_sse2_8(s, p, _blimit, _limit, _thresh, bd);
+  else
+    highbd_mb_lpf_horizontal_edge_w_sse2_16(s, p, _blimit, _limit, _thresh, bd);
+}
+
+void vp9_highbd_lpf_horizontal_8_sse2(uint16_t *s, int p,
+                                      const uint8_t *_blimit,
+                                      const uint8_t *_limit,
+                                      const uint8_t *_thresh,
+                                      int count, int bd) {
+  DECLARE_ALIGNED_ARRAY(16, uint16_t, flat_op2, 16);
+  DECLARE_ALIGNED_ARRAY(16, uint16_t, flat_op1, 16);
+  DECLARE_ALIGNED_ARRAY(16, uint16_t, flat_op0, 16);
+  DECLARE_ALIGNED_ARRAY(16, uint16_t, flat_oq2, 16);
+  DECLARE_ALIGNED_ARRAY(16, uint16_t, flat_oq1, 16);
+  DECLARE_ALIGNED_ARRAY(16, uint16_t, flat_oq0, 16);
+  const __m128i zero = _mm_set1_epi16(0);
+  const __m128i blimit = _mm_slli_epi16(
+      _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero),
+      bd - 8);
+  const __m128i limit = _mm_slli_epi16(
+      _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero),
+      bd - 8);
+  const __m128i thresh = _mm_slli_epi16(
+      _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero),
+      bd - 8);
+  __m128i mask, hev, flat;
+  __m128i p3 = _mm_load_si128((__m128i *)(s - 4 * p));
+  __m128i q3 = _mm_load_si128((__m128i *)(s + 3 * p));
+  __m128i p2 = _mm_load_si128((__m128i *)(s - 3 * p));
+  __m128i q2 = _mm_load_si128((__m128i *)(s + 2 * p));
+  __m128i p1 = _mm_load_si128((__m128i *)(s - 2 * p));
+  __m128i q1 = _mm_load_si128((__m128i *)(s + 1 * p));
+  __m128i p0 = _mm_load_si128((__m128i *)(s - 1 * p));
+  __m128i q0 = _mm_load_si128((__m128i *)(s + 0 * p));
+  const __m128i one = _mm_set1_epi16(1);
+  const __m128i ffff = _mm_cmpeq_epi16(one, one);
+  __m128i abs_p1q1, abs_p0q0, abs_q1q0, abs_p1p0, work;
+  const __m128i four = _mm_set1_epi16(4);
+  __m128i workp_a, workp_b, workp_shft;
+
+  const __m128i t4 = _mm_set1_epi16(4);
+  const __m128i t3 = _mm_set1_epi16(3);
+  const __m128i t80 = _mm_slli_epi16(_mm_set1_epi16(0x80), bd - 8);
+  const __m128i t1 = _mm_set1_epi16(0x1);
+  const __m128i ps1 = _mm_subs_epi16(p1, t80);
+  const __m128i ps0 = _mm_subs_epi16(p0, t80);
+  const __m128i qs0 = _mm_subs_epi16(q0, t80);
+  const __m128i qs1 = _mm_subs_epi16(q1, t80);
+  __m128i filt;
+  __m128i work_a;
+  __m128i filter1, filter2;
+
+  (void)count;
+
+  // filter_mask and hev_mask
+  abs_p1p0 = _mm_or_si128(_mm_subs_epu16(p1, p0),
+                          _mm_subs_epu16(p0, p1));
+  abs_q1q0 = _mm_or_si128(_mm_subs_epu16(q1, q0),
+                          _mm_subs_epu16(q0, q1));
+
+  abs_p0q0 = _mm_or_si128(_mm_subs_epu16(p0, q0),
+                          _mm_subs_epu16(q0, p0));
+  abs_p1q1 = _mm_or_si128(_mm_subs_epu16(p1, q1),
+                          _mm_subs_epu16(q1, p1));
+  flat = _mm_max_epi16(abs_p1p0, abs_q1q0);
+  hev = _mm_subs_epu16(flat, thresh);
+  hev = _mm_xor_si128(_mm_cmpeq_epi16(hev, zero), ffff);
+
+  abs_p0q0 = _mm_adds_epu16(abs_p0q0, abs_p0q0);
+  abs_p1q1 = _mm_srli_epi16(abs_p1q1, 1);
+  mask = _mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), blimit);
+  mask = _mm_xor_si128(_mm_cmpeq_epi16(mask, zero), ffff);
+  // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
+  // So taking maximums continues to work:
+  mask = _mm_and_si128(mask, _mm_adds_epu16(limit, one));
+  mask = _mm_max_epi16(abs_p1p0, mask);
+  // mask |= (abs(p1 - p0) > limit) * -1;
+  mask = _mm_max_epi16(abs_q1q0, mask);
+  // mask |= (abs(q1 - q0) > limit) * -1;
+
+  work = _mm_max_epi16(_mm_or_si128(_mm_subs_epu16(p2, p1),
+                                    _mm_subs_epu16(p1, p2)),
+                       _mm_or_si128(_mm_subs_epu16(q2, q1),
+                                    _mm_subs_epu16(q1, q2)));
+  mask = _mm_max_epi16(work, mask);
+  work = _mm_max_epi16(_mm_or_si128(_mm_subs_epu16(p3, p2),
+                                    _mm_subs_epu16(p2, p3)),
+                       _mm_or_si128(_mm_subs_epu16(q3, q2),
+                                    _mm_subs_epu16(q2, q3)));
+  mask = _mm_max_epi16(work, mask);
+  mask = _mm_subs_epu16(mask, limit);
+  mask = _mm_cmpeq_epi16(mask, zero);
+
+  // flat_mask4
+  flat = _mm_max_epi16(_mm_or_si128(_mm_subs_epu16(p2, p0),
+                                    _mm_subs_epu16(p0, p2)),
+                       _mm_or_si128(_mm_subs_epu16(q2, q0),
+                                    _mm_subs_epu16(q0, q2)));
+  work = _mm_max_epi16(_mm_or_si128(_mm_subs_epu16(p3, p0),
+                                    _mm_subs_epu16(p0, p3)),
+                       _mm_or_si128(_mm_subs_epu16(q3, q0),
+                                    _mm_subs_epu16(q0, q3)));
+  flat = _mm_max_epi16(work, flat);
+  flat = _mm_max_epi16(abs_p1p0, flat);
+  flat = _mm_max_epi16(abs_q1q0, flat);
+  flat = _mm_subs_epu16(flat, _mm_slli_epi16(one, bd - 8));
+  flat = _mm_cmpeq_epi16(flat, zero);
+  flat = _mm_and_si128(flat, mask);  // flat & mask
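+  // The filter8 results computed below are applied only where both the
+  // flat_mask4 predicate and the filter mask hold.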
+
+  // 'four' is added before the shift as the rounding term of ROUND_POWER_OF_TWO
+
+  workp_a = _mm_add_epi16(_mm_add_epi16(p3, p3), _mm_add_epi16(p2, p1));
+  workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), p0);
+  workp_b = _mm_add_epi16(_mm_add_epi16(q0, p2), p3);
+  workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+  _mm_store_si128((__m128i *)&flat_op2[0], workp_shft);
+
+  workp_b = _mm_add_epi16(_mm_add_epi16(q0, q1), p1);
+  workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+  _mm_store_si128((__m128i *)&flat_op1[0], workp_shft);
+
+  workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q2);
+  workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p1), p0);
+  workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+  _mm_store_si128((__m128i *)&flat_op0[0], workp_shft);
+
+  workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q3);
+  workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p0), q0);
+  workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+  _mm_store_si128((__m128i *)&flat_oq0[0], workp_shft);
+
+  workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p2), q3);
+  workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q0), q1);
+  workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+  _mm_store_si128((__m128i *)&flat_oq1[0], workp_shft);
+
+  workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p1), q3);
+  workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q1), q2);
+  workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+  _mm_store_si128((__m128i *)&flat_oq2[0], workp_shft);
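+  // Each store above is one output of the scalar filter8: the first is
+  // flat_op2 = ROUND_POWER_OF_TWO(3 * p3 + 2 * p2 + p1 + p0 + q0, 3),
+  // with 'four' supplying the rounding term, and each later step slides
+  // the window by subtracting the oldest tap and adding the next one.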
+
+  // lp filter
+  filt = signed_char_clamp_bd_sse2(_mm_subs_epi16(ps1, qs1), bd);
+  filt = _mm_and_si128(filt, hev);
+  work_a = _mm_subs_epi16(qs0, ps0);
+  filt = _mm_adds_epi16(filt, work_a);
+  filt = _mm_adds_epi16(filt, work_a);
+  filt = _mm_adds_epi16(filt, work_a);
+  // (vp9_filter + 3 * (qs0 - ps0)) & mask
+  filt = signed_char_clamp_bd_sse2(filt, bd);
+  filt = _mm_and_si128(filt, mask);
+
+  filter1 = _mm_adds_epi16(filt, t4);
+  filter2 = _mm_adds_epi16(filt, t3);
+
+  // Filter1 >> 3
+  filter1 = signed_char_clamp_bd_sse2(filter1, bd);
+  filter1 = _mm_srai_epi16(filter1, 3);
+
+  // Filter2 >> 3
+  filter2 = signed_char_clamp_bd_sse2(filter2, bd);
+  filter2 = _mm_srai_epi16(filter2, 3);
+
+  // filt >> 1
+  filt = _mm_adds_epi16(filter1, t1);
+  filt = _mm_srai_epi16(filt, 1);
+  // filter = ROUND_POWER_OF_TWO(filter1, 1) & ~hev;
+  filt = _mm_andnot_si128(hev, filt);
+
+  work_a = signed_char_clamp_bd_sse2(_mm_subs_epi16(qs0, filter1), bd);
+  work_a = _mm_adds_epi16(work_a, t80);
+  q0 = _mm_load_si128((__m128i *)flat_oq0);
+  work_a = _mm_andnot_si128(flat, work_a);
+  q0 = _mm_and_si128(flat, q0);
+  q0 = _mm_or_si128(work_a, q0);
+
+  work_a = signed_char_clamp_bd_sse2(_mm_subs_epi16(qs1, filt), bd);
+  work_a = _mm_adds_epi16(work_a, t80);
+  q1 = _mm_load_si128((__m128i *)flat_oq1);
+  work_a = _mm_andnot_si128(flat, work_a);
+  q1 = _mm_and_si128(flat, q1);
+  q1 = _mm_or_si128(work_a, q1);
+
+  work_a = _mm_loadu_si128((__m128i *)(s + 2 * p));
+  q2 = _mm_load_si128((__m128i *)flat_oq2);
+  work_a = _mm_andnot_si128(flat, work_a);
+  q2 = _mm_and_si128(flat, q2);
+  q2 = _mm_or_si128(work_a, q2);
+
+  work_a = signed_char_clamp_bd_sse2(_mm_adds_epi16(ps0, filter2), bd);
+  work_a = _mm_adds_epi16(work_a, t80);
+  p0 = _mm_load_si128((__m128i *)flat_op0);
+  work_a = _mm_andnot_si128(flat, work_a);
+  p0 = _mm_and_si128(flat, p0);
+  p0 = _mm_or_si128(work_a, p0);
+
+  work_a = signed_char_clamp_bd_sse2(_mm_adds_epi16(ps1, filt), bd);
+  work_a = _mm_adds_epi16(work_a, t80);
+  p1 = _mm_load_si128((__m128i *)flat_op1);
+  work_a = _mm_andnot_si128(flat, work_a);
+  p1 = _mm_and_si128(flat, p1);
+  p1 = _mm_or_si128(work_a, p1);
+
+  work_a = _mm_loadu_si128((__m128i *)(s - 3 * p));
+  p2 = _mm_load_si128((__m128i *)flat_op2);
+  work_a = _mm_andnot_si128(flat, work_a);
+  p2 = _mm_and_si128(flat, p2);
+  p2 = _mm_or_si128(work_a, p2);
+
+  _mm_store_si128((__m128i *)(s - 3 * p), p2);
+  _mm_store_si128((__m128i *)(s - 2 * p), p1);
+  _mm_store_si128((__m128i *)(s - 1 * p), p0);
+  _mm_store_si128((__m128i *)(s + 0 * p), q0);
+  _mm_store_si128((__m128i *)(s + 1 * p), q1);
+  _mm_store_si128((__m128i *)(s + 2 * p), q2);
+}
+
+void vp9_highbd_lpf_horizontal_8_dual_sse2(uint16_t *s, int p,
+                                           const uint8_t *_blimit0,
+                                           const uint8_t *_limit0,
+                                           const uint8_t *_thresh0,
+                                           const uint8_t *_blimit1,
+                                           const uint8_t *_limit1,
+                                           const uint8_t *_thresh1,
+                                           int bd) {
+  vp9_highbd_lpf_horizontal_8_sse2(s, p, _blimit0, _limit0, _thresh0, 1, bd);
+  vp9_highbd_lpf_horizontal_8_sse2(s + 8, p, _blimit1, _limit1, _thresh1,
+                                   1, bd);
+}
+
+void vp9_highbd_lpf_horizontal_4_sse2(uint16_t *s, int p,
+                                      const uint8_t *_blimit,
+                                      const uint8_t *_limit,
+                                      const uint8_t *_thresh,
+                                      int count, int bd) {
+  const __m128i zero = _mm_set1_epi16(0);
+  const __m128i blimit = _mm_slli_epi16(
+      _mm_unpacklo_epi8(
+          _mm_load_si128((const __m128i *)_blimit), zero), bd - 8);
+  const __m128i limit = _mm_slli_epi16(
+      _mm_unpacklo_epi8(
+          _mm_load_si128((const __m128i *)_limit), zero), bd - 8);
+  const __m128i thresh = _mm_slli_epi16(
+      _mm_unpacklo_epi8(
+          _mm_load_si128((const __m128i *)_thresh), zero), bd - 8);
+  __m128i mask, hev, flat;
+  __m128i p3 = _mm_loadu_si128((__m128i *)(s - 4 * p));
+  __m128i p2 = _mm_loadu_si128((__m128i *)(s - 3 * p));
+  __m128i p1 = _mm_loadu_si128((__m128i *)(s - 2 * p));
+  __m128i p0 = _mm_loadu_si128((__m128i *)(s - 1 * p));
+  __m128i q0 = _mm_loadu_si128((__m128i *)(s + 0 * p));
+  __m128i q1 = _mm_loadu_si128((__m128i *)(s + 1 * p));
+  __m128i q2 = _mm_loadu_si128((__m128i *)(s + 2 * p));
+  __m128i q3 = _mm_loadu_si128((__m128i *)(s + 3 * p));
+  const __m128i abs_p1p0 = _mm_or_si128(_mm_subs_epu16(p1, p0),
+                                        _mm_subs_epu16(p0, p1));
+  const __m128i abs_q1q0 = _mm_or_si128(_mm_subs_epu16(q1, q0),
+                                        _mm_subs_epu16(q0, q1));
+  const __m128i ffff = _mm_cmpeq_epi16(abs_p1p0, abs_p1p0);
+  const __m128i one = _mm_set1_epi16(1);
+  __m128i abs_p0q0 = _mm_or_si128(_mm_subs_epu16(p0, q0),
+                                  _mm_subs_epu16(q0, p0));
+  __m128i abs_p1q1 = _mm_or_si128(_mm_subs_epu16(p1, q1),
+                                  _mm_subs_epu16(q1, p1));
+  __m128i work;
+  const __m128i t4 = _mm_set1_epi16(4);
+  const __m128i t3 = _mm_set1_epi16(3);
+  const __m128i t80 = _mm_slli_epi16(_mm_set1_epi16(0x80), bd - 8);
+  const __m128i tff80 = _mm_slli_epi16(_mm_set1_epi16(0xff80), bd - 8);
+  const __m128i tffe0 = _mm_slli_epi16(_mm_set1_epi16(0xffe0), bd - 8);
+  const __m128i t1f = _mm_srli_epi16(_mm_set1_epi16(0x1fff), 16 - bd);
+  // equivalent to shifting 0x1f left by bitdepth - 8
+  // and setting new bits to 1
+  const __m128i t1 = _mm_set1_epi16(0x1);
+  const __m128i t7f = _mm_srli_epi16(_mm_set1_epi16(0x7fff), 16 - bd);
+  // equivalent to shifting 0x7f left by bitdepth - 8
+  // and setting new bits to 1
+  const __m128i ps1 = _mm_subs_epi16(_mm_loadu_si128((__m128i *)(s - 2 * p)),
+                                     t80);
+  const __m128i ps0 = _mm_subs_epi16(_mm_loadu_si128((__m128i *)(s - 1 * p)),
+                                     t80);
+  const __m128i qs0 = _mm_subs_epi16(_mm_loadu_si128((__m128i *)(s + 0 * p)),
+                                     t80);
+  const __m128i qs1 = _mm_subs_epi16(_mm_loadu_si128((__m128i *)(s + 1 * p)),
+                                     t80);
+  __m128i filt;
+  __m128i work_a;
+  __m128i filter1, filter2;
+
+  (void)count;
+
+  // filter_mask and hev_mask
+  flat = _mm_max_epi16(abs_p1p0, abs_q1q0);
+  hev = _mm_subs_epu16(flat, thresh);
+  hev = _mm_xor_si128(_mm_cmpeq_epi16(hev, zero), ffff);
+
+  abs_p0q0 = _mm_adds_epu16(abs_p0q0, abs_p0q0);
+  abs_p1q1 = _mm_srli_epi16(abs_p1q1, 1);
+  mask = _mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), blimit);
+  mask = _mm_xor_si128(_mm_cmpeq_epi16(mask, zero), ffff);
+  // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
+  // So taking maximums continues to work:
+  mask = _mm_and_si128(mask, _mm_adds_epu16(limit, one));
+  mask = _mm_max_epi16(flat, mask);
+  // mask |= (abs(p1 - p0) > limit) * -1;
+  // mask |= (abs(q1 - q0) > limit) * -1;
+  work = _mm_max_epi16(_mm_or_si128(_mm_subs_epu16(p2, p1),
+                                    _mm_subs_epu16(p1, p2)),
+                       _mm_or_si128(_mm_subs_epu16(p3, p2),
+                                    _mm_subs_epu16(p2, p3)));
+  mask = _mm_max_epi16(work, mask);
+  work = _mm_max_epi16(_mm_or_si128(_mm_subs_epu16(q2, q1),
+                                    _mm_subs_epu16(q1, q2)),
+                       _mm_or_si128(_mm_subs_epu16(q3, q2),
+                                    _mm_subs_epu16(q2, q3)));
+  mask = _mm_max_epi16(work, mask);
+  mask = _mm_subs_epu16(mask, limit);
+  mask = _mm_cmpeq_epi16(mask, zero);
+
+  // filter4
+  filt = signed_char_clamp_bd_sse2(_mm_subs_epi16(ps1, qs1), bd);
+  filt = _mm_and_si128(filt, hev);
+  work_a = _mm_subs_epi16(qs0, ps0);
+  filt = _mm_adds_epi16(filt, work_a);
+  filt = _mm_adds_epi16(filt, work_a);
+  filt = signed_char_clamp_bd_sse2(_mm_adds_epi16(filt, work_a), bd);
+  // (vp9_filter + 3 * (qs0 - ps0)) & mask
+  filt = _mm_and_si128(filt, mask);
+
+  filter1 = signed_char_clamp_bd_sse2(_mm_adds_epi16(filt, t4), bd);
+  filter2 = signed_char_clamp_bd_sse2(_mm_adds_epi16(filt, t3), bd);
+
+  // Filter1 >> 3
+  work_a = _mm_cmpgt_epi16(zero, filter1);  // get the values that are <0
+  filter1 = _mm_srli_epi16(filter1, 3);
+  work_a = _mm_and_si128(work_a, tffe0);  // sign bits for the values < 0
+  filter1 = _mm_and_si128(filter1, t1f);  // clamp the range
+  filter1 = _mm_or_si128(filter1, work_a);  // reinsert the sign bits
+
+  // Filter2 >> 3
+  work_a = _mm_cmpgt_epi16(zero, filter2);
+  filter2 = _mm_srli_epi16(filter2, 3);
+  work_a = _mm_and_si128(work_a, tffe0);
+  filter2 = _mm_and_si128(filter2, t1f);
+  filter2 = _mm_or_si128(filter2, work_a);
+
+  // filt >> 1
+  filt = _mm_adds_epi16(filter1, t1);
+  work_a = _mm_cmpgt_epi16(zero, filt);
+  filt = _mm_srli_epi16(filt, 1);
+  work_a = _mm_and_si128(work_a, tff80);
+  filt = _mm_and_si128(filt, t7f);
+  filt = _mm_or_si128(filt, work_a);
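+  // The three shift blocks above emulate an arithmetic shift right via a
+  // logical shift plus sign reinsertion: work_a captures the negative
+  // lanes, t1f/t7f clamp the shifted magnitude to the bd-bit range, and
+  // tffe0/tff80 restore the sign bits. For operands already clamped to the
+  // bd-bit signed range this matches _mm_srai_epi16; the pattern appears
+  // to be carried over from the 8-bit code, where SSE2 has no packed 8-bit
+  // arithmetic shift.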
+
+  filt = _mm_andnot_si128(hev, filt);
+
+  q0 = _mm_adds_epi16(
+      signed_char_clamp_bd_sse2(_mm_subs_epi16(qs0, filter1), bd), t80);
+  q1 = _mm_adds_epi16(
+      signed_char_clamp_bd_sse2(_mm_subs_epi16(qs1, filt), bd), t80);
+  p0 = _mm_adds_epi16(
+      signed_char_clamp_bd_sse2(_mm_adds_epi16(ps0, filter2), bd), t80);
+  p1 = _mm_adds_epi16(
+      signed_char_clamp_bd_sse2(_mm_adds_epi16(ps1, filt), bd), t80);
+
+  _mm_storeu_si128((__m128i *)(s - 2 * p), p1);
+  _mm_storeu_si128((__m128i *)(s - 1 * p), p0);
+  _mm_storeu_si128((__m128i *)(s + 0 * p), q0);
+  _mm_storeu_si128((__m128i *)(s + 1 * p), q1);
+}
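+
+// Scalar sketch of the filter4 core above, with pixels biased into signed
+// range by t80 and clamp() standing for signed_char_clamp_bd_sse2:
+//
+//   filt    = clamp(ps1 - qs1) & hev;
+//   filt    = clamp(filt + 3 * (qs0 - ps0)) & mask;
+//   filter1 = clamp(filt + 4) >> 3;
+//   filter2 = clamp(filt + 3) >> 3;
+//   q0'     = clamp(qs0 - filter1) + t80;
+//   p0'     = clamp(ps0 + filter2) + t80;
+//   d       = ((filter1 + 1) >> 1) & ~hev;
+//   q1'     = clamp(qs1 - d) + t80;
+//   p1'     = clamp(ps1 + d) + t80;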
+
+void vp9_highbd_lpf_horizontal_4_dual_sse2(uint16_t *s, int p,
+                                           const uint8_t *_blimit0,
+                                           const uint8_t *_limit0,
+                                           const uint8_t *_thresh0,
+                                           const uint8_t *_blimit1,
+                                           const uint8_t *_limit1,
+                                           const uint8_t *_thresh1,
+                                           int bd) {
+  vp9_highbd_lpf_horizontal_4_sse2(s, p, _blimit0, _limit0, _thresh0, 1, bd);
+  vp9_highbd_lpf_horizontal_4_sse2(s + 8, p, _blimit1, _limit1, _thresh1, 1,
+                                   bd);
+}
+
+static INLINE void highbd_transpose(uint16_t *src[], int in_p,
+                                    uint16_t *dst[], int out_p,
+                                    int num_8x8_to_transpose) {
+  int idx8x8 = 0;
+  __m128i p0, p1, p2, p3, p4, p5, p6, p7, x0, x1, x2, x3, x4, x5, x6, x7;
+  do {
+    uint16_t *in = src[idx8x8];
+    uint16_t *out = dst[idx8x8];
+
+    p0 = _mm_loadu_si128((__m128i *)(in + 0*in_p));  // 00 01 02 03 04 05 06 07
+    p1 = _mm_loadu_si128((__m128i *)(in + 1*in_p));  // 10 11 12 13 14 15 16 17
+    p2 = _mm_loadu_si128((__m128i *)(in + 2*in_p));  // 20 21 22 23 24 25 26 27
+    p3 = _mm_loadu_si128((__m128i *)(in + 3*in_p));  // 30 31 32 33 34 35 36 37
+    p4 = _mm_loadu_si128((__m128i *)(in + 4*in_p));  // 40 41 42 43 44 45 46 47
+    p5 = _mm_loadu_si128((__m128i *)(in + 5*in_p));  // 50 51 52 53 54 55 56 57
+    p6 = _mm_loadu_si128((__m128i *)(in + 6*in_p));  // 60 61 62 63 64 65 66 67
+    p7 = _mm_loadu_si128((__m128i *)(in + 7*in_p));  // 70 71 72 73 74 75 76 77
+    // 00 10 01 11 02 12 03 13
+    x0 = _mm_unpacklo_epi16(p0, p1);
+    // 20 30 21 31 22 32 23 33
+    x1 = _mm_unpacklo_epi16(p2, p3);
+    // 40 50 41 51 42 52 43 53
+    x2 = _mm_unpacklo_epi16(p4, p5);
+    // 60 70 61 71 62 72 63 73
+    x3 = _mm_unpacklo_epi16(p6, p7);
+    // 00 10 20 30 01 11 21 31
+    x4 = _mm_unpacklo_epi32(x0, x1);
+    // 40 50 60 70 41 51 61 71
+    x5 = _mm_unpacklo_epi32(x2, x3);
+    // 00 10 20 30 40 50 60 70
+    x6 = _mm_unpacklo_epi64(x4, x5);
+    // 01 11 21 31 41 51 61 71
+    x7 = _mm_unpackhi_epi64(x4, x5);
+
+    _mm_storeu_si128((__m128i *)(out + 0*out_p), x6);
+    // 00 10 20 30 40 50 60 70
+    _mm_storeu_si128((__m128i *)(out + 1*out_p), x7);
+    // 01 11 21 31 41 51 61 71
+
+    // 02 12 22 32 03 13 23 33
+    x4 = _mm_unpackhi_epi32(x0, x1);
+    // 42 52 62 72 43 53 63 73
+    x5 = _mm_unpackhi_epi32(x2, x3);
+    // 02 12 22 32 42 52 62 72
+    x6 = _mm_unpacklo_epi64(x4, x5);
+    // 03 13 23 33 43 53 63 73
+    x7 = _mm_unpackhi_epi64(x4, x5);
+
+    _mm_storeu_si128((__m128i *)(out + 2*out_p), x6);
+    // 02 12 22 32 42 52 62 72
+    _mm_storeu_si128((__m128i *)(out + 3*out_p), x7);
+    // 03 13 23 33 43 53 63 73
+
+    // 04 14 05 15 06 16 07 17
+    x0 = _mm_unpackhi_epi16(p0, p1);
+    // 24 34 25 35 26 36 27 37
+    x1 = _mm_unpackhi_epi16(p2, p3);
+    // 44 54 45 55 46 56 47 57
+    x2 = _mm_unpackhi_epi16(p4, p5);
+    // 64 74 65 75 66 76 67 77
+    x3 = _mm_unpackhi_epi16(p6, p7);
+    // 04 14 24 34 05 15 25 35
+    x4 = _mm_unpacklo_epi32(x0, x1);
+    // 44 54 64 74 45 55 65 75
+    x5 = _mm_unpacklo_epi32(x2, x3);
+    // 04 14 24 34 44 54 64 74
+    x6 = _mm_unpacklo_epi64(x4, x5);
+    // 05 15 25 35 45 55 65 75
+    x7 = _mm_unpackhi_epi64(x4, x5);
+
+    _mm_storeu_si128((__m128i *)(out + 4*out_p), x6);
+    // 04 14 24 34 44 54 64 74
+    _mm_storeu_si128((__m128i *)(out + 5*out_p), x7);
+    // 05 15 25 35 45 55 65 75
+
+    // 06 16 26 36 07 17 27 37
+    x4 = _mm_unpackhi_epi32(x0, x1);
+    // 46 56 66 76 47 57 67 77
+    x5 = _mm_unpackhi_epi32(x2, x3);
+    // 06 16 26 36 46 56 66 76
+    x6 = _mm_unpacklo_epi64(x4, x5);
+    // 07 17 27 37 47 57 67 77
+    x7 = _mm_unpackhi_epi64(x4, x5);
+
+    _mm_storeu_si128((__m128i *)(out + 6*out_p), x6);
+    // 06 16 26 36 46 56 66 76
+    _mm_storeu_si128((__m128i *)(out + 7*out_p), x7);
+    // 07 17 27 37 47 57 67 77
+  } while (++idx8x8 < num_8x8_to_transpose);
+}
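+
+// The unpack ladder above is the standard SSE2 8x8 transpose; its scalar
+// meaning is simply (sketch):
+//
+//   for (i = 0; i < 8; ++i)
+//     for (j = 0; j < 8; ++j)
+//       out[j * out_p + i] = in[i * in_p + j];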
+
+static INLINE void highbd_transpose8x16(uint16_t *in0, uint16_t *in1,
+                                        int in_p, uint16_t *out, int out_p) {
+  uint16_t *src0[1];
+  uint16_t *src1[1];
+  uint16_t *dest0[1];
+  uint16_t *dest1[1];
+  src0[0] = in0;
+  src1[0] = in1;
+  dest0[0] = out;
+  dest1[0] = out + 8;
+  highbd_transpose(src0, in_p, dest0, out_p, 1);
+  highbd_transpose(src1, in_p, dest1, out_p, 1);
+}
+
+void vp9_highbd_lpf_vertical_4_sse2(uint16_t *s, int p,
+                                    const uint8_t *blimit,
+                                    const uint8_t *limit,
+                                    const uint8_t *thresh,
+                                    int count, int bd) {
+  DECLARE_ALIGNED_ARRAY(16, uint16_t, t_dst, 8 * 8);
+  uint16_t *src[1];
+  uint16_t *dst[1];
+  (void)count;
+
+  // Transpose 8x8
+  src[0] = s - 4;
+  dst[0] = t_dst;
+
+  highbd_transpose(src, p, dst, 8, 1);
+
+  // Loop filtering
+  vp9_highbd_lpf_horizontal_4_sse2(t_dst + 4 * 8, 8, blimit, limit, thresh, 1,
+                                   bd);
+
+  src[0] = t_dst;
+  dst[0] = s - 4;
+
+  // Transpose back
+  highbd_transpose(src, 8, dst, p, 1);
+}
+
+void vp9_highbd_lpf_vertical_4_dual_sse2(uint16_t *s, int p,
+                                         const uint8_t *blimit0,
+                                         const uint8_t *limit0,
+                                         const uint8_t *thresh0,
+                                         const uint8_t *blimit1,
+                                         const uint8_t *limit1,
+                                         const uint8_t *thresh1,
+                                         int bd) {
+  DECLARE_ALIGNED_ARRAY(16, uint16_t, t_dst, 16 * 8);
+  uint16_t *src[2];
+  uint16_t *dst[2];
+
+  // Transpose 8x16
+  highbd_transpose8x16(s - 4, s - 4 + p * 8, p, t_dst, 16);
+
+  // Loop filtering
+  vp9_highbd_lpf_horizontal_4_dual_sse2(t_dst + 4 * 16, 16, blimit0, limit0,
+                                        thresh0, blimit1, limit1, thresh1, bd);
+  src[0] = t_dst;
+  src[1] = t_dst + 8;
+  dst[0] = s - 4;
+  dst[1] = s - 4 + p * 8;
+
+  // Transpose back
+  highbd_transpose(src, 16, dst, p, 2);
+}
+
+void vp9_highbd_lpf_vertical_8_sse2(uint16_t *s, int p,
+                                    const uint8_t *blimit,
+                                    const uint8_t *limit,
+                                    const uint8_t *thresh,
+                                    int count, int bd) {
+  DECLARE_ALIGNED_ARRAY(16, uint16_t, t_dst, 8 * 8);
+  uint16_t *src[1];
+  uint16_t *dst[1];
+  (void)count;
+
+  // Transpose 8x8
+  src[0] = s - 4;
+  dst[0] = t_dst;
+
+  highbd_transpose(src, p, dst, 8, 1);
+
+  // Loop filtering
+  vp9_highbd_lpf_horizontal_8_sse2(t_dst + 4 * 8, 8, blimit, limit, thresh, 1,
+                                   bd);
+
+  src[0] = t_dst;
+  dst[0] = s - 4;
+
+  // Transpose back
+  highbd_transpose(src, 8, dst, p, 1);
+}
+
+void vp9_highbd_lpf_vertical_8_dual_sse2(uint16_t *s, int p,
+                                         const uint8_t *blimit0,
+                                         const uint8_t *limit0,
+                                         const uint8_t *thresh0,
+                                         const uint8_t *blimit1,
+                                         const uint8_t *limit1,
+                                         const uint8_t *thresh1,
+                                         int bd) {
+  DECLARE_ALIGNED_ARRAY(16, uint16_t, t_dst, 16 * 8);
+  uint16_t *src[2];
+  uint16_t *dst[2];
+
+  // Transpose 8x16
+  highbd_transpose8x16(s - 4, s - 4 + p * 8, p, t_dst, 16);
+
+  // Loop filtering
+  vp9_highbd_lpf_horizontal_8_dual_sse2(t_dst + 4 * 16, 16, blimit0, limit0,
+                                        thresh0, blimit1, limit1, thresh1, bd);
+  src[0] = t_dst;
+  src[1] = t_dst + 8;
+
+  dst[0] = s - 4;
+  dst[1] = s - 4 + p * 8;
+
+  // Transpose back
+  highbd_transpose(src, 16, dst, p, 2);
+}
+
+void vp9_highbd_lpf_vertical_16_sse2(uint16_t *s, int p,
+                                     const uint8_t *blimit,
+                                     const uint8_t *limit,
+                                     const uint8_t *thresh,
+                                     int bd) {
+  DECLARE_ALIGNED_ARRAY(16, uint16_t, t_dst, 8 * 16);
+  uint16_t *src[2];
+  uint16_t *dst[2];
+
+  src[0] = s - 8;
+  src[1] = s;
+  dst[0] = t_dst;
+  dst[1] = t_dst + 8 * 8;
+
+  // Transpose 16x8
+  highbd_transpose(src, p, dst, 8, 2);
+
+  // Loop filtering
+  highbd_mb_lpf_horizontal_edge_w_sse2_8(t_dst + 8 * 8, 8, blimit, limit,
+                                         thresh, bd);
+  src[0] = t_dst;
+  src[1] = t_dst + 8 * 8;
+  dst[0] = s - 8;
+  dst[1] = s;
+
+  // Transpose back
+  highbd_transpose(src, 8, dst, p, 2);
+}
+
+void vp9_highbd_lpf_vertical_16_dual_sse2(uint16_t *s,
+                                          int p,
+                                          const uint8_t *blimit,
+                                          const uint8_t *limit,
+                                          const uint8_t *thresh,
+                                          int bd) {
+  DECLARE_ALIGNED_ARRAY(16, uint16_t, t_dst, 256);
+
+  //  Transpose 16x16
+  highbd_transpose8x16(s - 8, s - 8 + 8 * p, p, t_dst, 16);
+  highbd_transpose8x16(s, s + 8 * p, p, t_dst + 8 * 16, 16);
+
+  //  Loop filtering
+  highbd_mb_lpf_horizontal_edge_w_sse2_16(t_dst + 8 * 16, 16, blimit, limit,
+                                          thresh, bd);
+
+  //  Transpose back
+  highbd_transpose8x16(t_dst, t_dst + 8 * 16, 16, s - 8, p);
+  highbd_transpose8x16(t_dst + 8, t_dst + 8 + 8 * 16, 16, s - 8 + 8 * p, p);
+}
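+
+// All of the vertical variants above reuse the horizontal kernels: an
+// 8- or 16-pixel-wide strip around the edge is transposed into the aligned
+// t_dst scratch buffer, filtered there as rows, and transposed back into
+// place, trading two transposes for a stride-friendly filter path.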
diff --git a/source/libvpx/vp9/common/x86/vp9_high_subpixel_8t_sse2.asm b/source/libvpx/vp9/common/x86/vp9_high_subpixel_8t_sse2.asm
index 4bdbb83..29ec151 100644
--- a/source/libvpx/vp9/common/x86/vp9_high_subpixel_8t_sse2.asm
+++ b/source/libvpx/vp9/common/x86/vp9_high_subpixel_8t_sse2.asm
@@ -206,8 +206,8 @@
 ;    unsigned int   output_height,
 ;    short *filter
 ;)
-global sym(vp9_high_filter_block1d4_v8_sse2) PRIVATE
-sym(vp9_high_filter_block1d4_v8_sse2):
+global sym(vp9_highbd_filter_block1d4_v8_sse2) PRIVATE
+sym(vp9_highbd_filter_block1d4_v8_sse2):
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 7
@@ -276,8 +276,8 @@
 ;    unsigned int   output_height,
 ;    short *filter
 ;)
-global sym(vp9_high_filter_block1d8_v8_sse2) PRIVATE
-sym(vp9_high_filter_block1d8_v8_sse2):
+global sym(vp9_highbd_filter_block1d8_v8_sse2) PRIVATE
+sym(vp9_highbd_filter_block1d8_v8_sse2):
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 7
@@ -335,8 +335,8 @@
 ;    unsigned int   output_height,
 ;    short *filter
 ;)
-global sym(vp9_high_filter_block1d16_v8_sse2) PRIVATE
-sym(vp9_high_filter_block1d16_v8_sse2):
+global sym(vp9_highbd_filter_block1d16_v8_sse2) PRIVATE
+sym(vp9_highbd_filter_block1d16_v8_sse2):
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 7
@@ -389,8 +389,8 @@
     pop         rbp
     ret
 
-global sym(vp9_high_filter_block1d4_v8_avg_sse2) PRIVATE
-sym(vp9_high_filter_block1d4_v8_avg_sse2):
+global sym(vp9_highbd_filter_block1d4_v8_avg_sse2) PRIVATE
+sym(vp9_highbd_filter_block1d4_v8_avg_sse2):
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 7
@@ -450,8 +450,8 @@
     pop         rbp
     ret
 
-global sym(vp9_high_filter_block1d8_v8_avg_sse2) PRIVATE
-sym(vp9_high_filter_block1d8_v8_avg_sse2):
+global sym(vp9_highbd_filter_block1d8_v8_avg_sse2) PRIVATE
+sym(vp9_highbd_filter_block1d8_v8_avg_sse2):
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 7
@@ -499,8 +499,8 @@
     pop         rbp
     ret
 
-global sym(vp9_high_filter_block1d16_v8_avg_sse2) PRIVATE
-sym(vp9_high_filter_block1d16_v8_avg_sse2):
+global sym(vp9_highbd_filter_block1d16_v8_avg_sse2) PRIVATE
+sym(vp9_highbd_filter_block1d16_v8_avg_sse2):
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 7
@@ -561,8 +561,8 @@
 ;    unsigned int    output_height,
 ;    short *filter
 ;)
-global sym(vp9_high_filter_block1d4_h8_sse2) PRIVATE
-sym(vp9_high_filter_block1d4_h8_sse2):
+global sym(vp9_highbd_filter_block1d4_h8_sse2) PRIVATE
+sym(vp9_highbd_filter_block1d4_h8_sse2):
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 7
@@ -636,8 +636,8 @@
 ;    unsigned int    output_height,
 ;    short *filter
 ;)
-global sym(vp9_high_filter_block1d8_h8_sse2) PRIVATE
-sym(vp9_high_filter_block1d8_h8_sse2):
+global sym(vp9_highbd_filter_block1d8_h8_sse2) PRIVATE
+sym(vp9_highbd_filter_block1d8_h8_sse2):
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 7
@@ -702,8 +702,8 @@
 ;    unsigned int    output_height,
 ;    short *filter
 ;)
-global sym(vp9_high_filter_block1d16_h8_sse2) PRIVATE
-sym(vp9_high_filter_block1d16_h8_sse2):
+global sym(vp9_highbd_filter_block1d16_h8_sse2) PRIVATE
+sym(vp9_highbd_filter_block1d16_h8_sse2):
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 7
@@ -770,8 +770,8 @@
     pop         rbp
     ret
 
-global sym(vp9_high_filter_block1d4_h8_avg_sse2) PRIVATE
-sym(vp9_high_filter_block1d4_h8_avg_sse2):
+global sym(vp9_highbd_filter_block1d4_h8_avg_sse2) PRIVATE
+sym(vp9_highbd_filter_block1d4_h8_avg_sse2):
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 7
@@ -836,8 +836,8 @@
     pop         rbp
     ret
 
-global sym(vp9_high_filter_block1d8_h8_avg_sse2) PRIVATE
-sym(vp9_high_filter_block1d8_h8_avg_sse2):
+global sym(vp9_highbd_filter_block1d8_h8_avg_sse2) PRIVATE
+sym(vp9_highbd_filter_block1d8_h8_avg_sse2):
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 7
@@ -893,8 +893,8 @@
     pop         rbp
     ret
 
-global sym(vp9_high_filter_block1d16_h8_avg_sse2) PRIVATE
-sym(vp9_high_filter_block1d16_h8_avg_sse2):
+global sym(vp9_highbd_filter_block1d16_h8_avg_sse2) PRIVATE
+sym(vp9_highbd_filter_block1d16_h8_avg_sse2):
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 7
diff --git a/source/libvpx/vp9/common/x86/vp9_high_subpixel_bilinear_sse2.asm b/source/libvpx/vp9/common/x86/vp9_high_subpixel_bilinear_sse2.asm
index b7d4a61..9378412 100644
--- a/source/libvpx/vp9/common/x86/vp9_high_subpixel_bilinear_sse2.asm
+++ b/source/libvpx/vp9/common/x86/vp9_high_subpixel_bilinear_sse2.asm
@@ -171,8 +171,8 @@
 %endm
 %endif
 
-global sym(vp9_high_filter_block1d4_v2_sse2) PRIVATE
-sym(vp9_high_filter_block1d4_v2_sse2):
+global sym(vp9_highbd_filter_block1d4_v2_sse2) PRIVATE
+sym(vp9_highbd_filter_block1d4_v2_sse2):
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 7
@@ -196,8 +196,8 @@
     ret
 
 %if ARCH_X86_64
-global sym(vp9_high_filter_block1d8_v2_sse2) PRIVATE
-sym(vp9_high_filter_block1d8_v2_sse2):
+global sym(vp9_highbd_filter_block1d8_v2_sse2) PRIVATE
+sym(vp9_highbd_filter_block1d8_v2_sse2):
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 7
@@ -222,8 +222,8 @@
     pop         rbp
     ret
 
-global sym(vp9_high_filter_block1d16_v2_sse2) PRIVATE
-sym(vp9_high_filter_block1d16_v2_sse2):
+global sym(vp9_highbd_filter_block1d16_v2_sse2) PRIVATE
+sym(vp9_highbd_filter_block1d16_v2_sse2):
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 7
@@ -251,8 +251,8 @@
     ret
 %endif
 
-global sym(vp9_high_filter_block1d4_v2_avg_sse2) PRIVATE
-sym(vp9_high_filter_block1d4_v2_avg_sse2):
+global sym(vp9_highbd_filter_block1d4_v2_avg_sse2) PRIVATE
+sym(vp9_highbd_filter_block1d4_v2_avg_sse2):
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 7
@@ -276,8 +276,8 @@
     ret
 
 %if ARCH_X86_64
-global sym(vp9_high_filter_block1d8_v2_avg_sse2) PRIVATE
-sym(vp9_high_filter_block1d8_v2_avg_sse2):
+global sym(vp9_highbd_filter_block1d8_v2_avg_sse2) PRIVATE
+sym(vp9_highbd_filter_block1d8_v2_avg_sse2):
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 7
@@ -302,8 +302,8 @@
     pop         rbp
     ret
 
-global sym(vp9_high_filter_block1d16_v2_avg_sse2) PRIVATE
-sym(vp9_high_filter_block1d16_v2_avg_sse2):
+global sym(vp9_highbd_filter_block1d16_v2_avg_sse2) PRIVATE
+sym(vp9_highbd_filter_block1d16_v2_avg_sse2):
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 7
@@ -331,8 +331,8 @@
     ret
 %endif
 
-global sym(vp9_high_filter_block1d4_h2_sse2) PRIVATE
-sym(vp9_high_filter_block1d4_h2_sse2):
+global sym(vp9_highbd_filter_block1d4_h2_sse2) PRIVATE
+sym(vp9_highbd_filter_block1d4_h2_sse2):
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 7
@@ -357,8 +357,8 @@
     ret
 
 %if ARCH_X86_64
-global sym(vp9_high_filter_block1d8_h2_sse2) PRIVATE
-sym(vp9_high_filter_block1d8_h2_sse2):
+global sym(vp9_highbd_filter_block1d8_h2_sse2) PRIVATE
+sym(vp9_highbd_filter_block1d8_h2_sse2):
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 7
@@ -383,8 +383,8 @@
     pop         rbp
     ret
 
-global sym(vp9_high_filter_block1d16_h2_sse2) PRIVATE
-sym(vp9_high_filter_block1d16_h2_sse2):
+global sym(vp9_highbd_filter_block1d16_h2_sse2) PRIVATE
+sym(vp9_highbd_filter_block1d16_h2_sse2):
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 7
@@ -412,8 +412,8 @@
     ret
 %endif
 
-global sym(vp9_high_filter_block1d4_h2_avg_sse2) PRIVATE
-sym(vp9_high_filter_block1d4_h2_avg_sse2):
+global sym(vp9_highbd_filter_block1d4_h2_avg_sse2) PRIVATE
+sym(vp9_highbd_filter_block1d4_h2_avg_sse2):
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 7
@@ -438,8 +438,8 @@
     ret
 
 %if ARCH_X86_64
-global sym(vp9_high_filter_block1d8_h2_avg_sse2) PRIVATE
-sym(vp9_high_filter_block1d8_h2_avg_sse2):
+global sym(vp9_highbd_filter_block1d8_h2_avg_sse2) PRIVATE
+sym(vp9_highbd_filter_block1d8_h2_avg_sse2):
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 7
@@ -464,8 +464,8 @@
     pop         rbp
     ret
 
-global sym(vp9_high_filter_block1d16_h2_avg_sse2) PRIVATE
-sym(vp9_high_filter_block1d16_h2_avg_sse2):
+global sym(vp9_highbd_filter_block1d16_h2_avg_sse2) PRIVATE
+sym(vp9_highbd_filter_block1d16_h2_avg_sse2):
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 7
diff --git a/source/libvpx/vp9/decoder/vp9_decodeframe.c b/source/libvpx/vp9/decoder/vp9_decodeframe.c
index 4e85caf..dc712f0 100644
--- a/source/libvpx/vp9/decoder/vp9_decodeframe.c
+++ b/source/libvpx/vp9/decoder/vp9_decodeframe.c
@@ -196,6 +196,64 @@
   if (eob > 0) {
     TX_TYPE tx_type = DCT_DCT;
     tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
+#if CONFIG_VP9_HIGHBITDEPTH
+    if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+      if (xd->lossless) {
+        tx_type = DCT_DCT;
+        vp9_highbd_iwht4x4_add(dqcoeff, dst, stride, eob, xd->bd);
+      } else {
+        const PLANE_TYPE plane_type = pd->plane_type;
+        switch (tx_size) {
+          case TX_4X4:
+            tx_type = get_tx_type_4x4(plane_type, xd, block);
+            vp9_highbd_iht4x4_add(tx_type, dqcoeff, dst, stride, eob, xd->bd);
+            break;
+          case TX_8X8:
+            tx_type = get_tx_type(plane_type, xd);
+            vp9_highbd_iht8x8_add(tx_type, dqcoeff, dst, stride, eob, xd->bd);
+            break;
+          case TX_16X16:
+            tx_type = get_tx_type(plane_type, xd);
+            vp9_highbd_iht16x16_add(tx_type, dqcoeff, dst, stride, eob, xd->bd);
+            break;
+          case TX_32X32:
+            tx_type = DCT_DCT;
+            vp9_highbd_idct32x32_add(dqcoeff, dst, stride, eob, xd->bd);
+            break;
+          default:
+            assert(0 && "Invalid transform size");
+            return;
+        }
+      }
+    } else {
+      if (xd->lossless) {
+        tx_type = DCT_DCT;
+        vp9_iwht4x4_add(dqcoeff, dst, stride, eob);
+      } else {
+        const PLANE_TYPE plane_type = pd->plane_type;
+        switch (tx_size) {
+          case TX_4X4:
+            tx_type = get_tx_type_4x4(plane_type, xd, block);
+            vp9_iht4x4_add(tx_type, dqcoeff, dst, stride, eob);
+            break;
+          case TX_8X8:
+            tx_type = get_tx_type(plane_type, xd);
+            vp9_iht8x8_add(tx_type, dqcoeff, dst, stride, eob);
+            break;
+          case TX_16X16:
+            tx_type = get_tx_type(plane_type, xd);
+            vp9_iht16x16_add(tx_type, dqcoeff, dst, stride, eob);
+            break;
+          case TX_32X32:
+            tx_type = DCT_DCT;
+            vp9_idct32x32_add(dqcoeff, dst, stride, eob);
+            break;
+          default:
+            assert(0 && "Invalid transform size");
+            return;
+        }
+      }
+    }
+#else
     if (xd->lossless) {
       tx_type = DCT_DCT;
       vp9_iwht4x4_add(dqcoeff, dst, stride, eob);
@@ -220,8 +278,10 @@
           break;
         default:
           assert(0 && "Invalid transform size");
+          return;
       }
     }
+#endif  // CONFIG_VP9_HIGHBITDEPTH
 
     if (eob == 1) {
       vpx_memset(dqcoeff, 0, 2 * sizeof(dqcoeff[0]));
@@ -258,7 +318,7 @@
   dst = &pd->dst.buf[4 * y * pd->dst.stride + 4 * x];
 
   vp9_predict_intra_block(xd, block >> (tx_size << 1),
-                          b_width_log2(plane_bsize), tx_size, mode,
+                          b_width_log2_lookup[plane_bsize], tx_size, mode,
                           dst, pd->dst.stride, dst, pd->dst.stride,
                           x, y, plane);
 
@@ -324,22 +384,6 @@
   return &xd->mi[0].mbmi;
 }
 
-static void set_ref(VP9_COMMON *const cm, MACROBLOCKD *const xd,
-                    int idx, int mi_row, int mi_col) {
-  MB_MODE_INFO *const mbmi = &xd->mi[0].src_mi->mbmi;
-  RefBuffer *ref_buffer = &cm->frame_refs[mbmi->ref_frame[idx] - LAST_FRAME];
-  xd->block_refs[idx] = ref_buffer;
-  if (!vp9_is_valid_scale(&ref_buffer->sf))
-    vpx_internal_error(&cm->error, VPX_CODEC_UNSUP_BITSTREAM,
-                       "Invalid scale factors");
-  if (ref_buffer->buf->corrupted)
-    vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME,
-                       "Block reference is corrupt");
-  vp9_setup_pre_planes(xd, idx, ref_buffer->buf, mi_row, mi_col,
-                       &ref_buffer->sf);
-  xd->corrupted |= ref_buffer->buf->corrupted;
-}
-
 static void decode_block(VP9_COMMON *const cm, MACROBLOCKD *const xd,
                          const TileInfo *const tile,
                          int mi_row, int mi_col,
@@ -364,11 +408,6 @@
     vp9_foreach_transformed_block(xd, bsize,
                                   predict_and_reconstruct_intra_block, &arg);
   } else {
-    // Setup
-    set_ref(cm, xd, 0, mi_row, mi_col);
-    if (has_second_ref(mbmi))
-      set_ref(cm, xd, 1, mi_row, mi_col);
-
     // Prediction
     vp9_dec_build_inter_predictors_sb(xd, mi_row, mi_col, bsize);
 
@@ -592,13 +631,18 @@
   update |= read_delta_q(rb, &cm->y_dc_delta_q);
   update |= read_delta_q(rb, &cm->uv_dc_delta_q);
   update |= read_delta_q(rb, &cm->uv_ac_delta_q);
-  if (update)
+  if (update || cm->bit_depth != cm->dequant_bit_depth) {
     vp9_init_dequantizer(cm);
+    cm->dequant_bit_depth = cm->bit_depth;
+  }
 
   xd->lossless = cm->base_qindex == 0 &&
                  cm->y_dc_delta_q == 0 &&
                  cm->uv_dc_delta_q == 0 &&
                  cm->uv_ac_delta_q == 0;
+#if CONFIG_VP9_HIGHBITDEPTH
+  xd->bd = (int)cm->bit_depth;
+#endif
 }
 
 static INTERP_FILTER read_interp_filter(struct vp9_read_bit_buffer *rb) {
@@ -612,10 +656,8 @@
 
 void vp9_read_frame_size(struct vp9_read_bit_buffer *rb,
                          int *width, int *height) {
-  const int w = vp9_rb_read_literal(rb, 16) + 1;
-  const int h = vp9_rb_read_literal(rb, 16) + 1;
-  *width = w;
-  *height = h;
+  *width = vp9_rb_read_literal(rb, 16) + 1;
+  *height = vp9_rb_read_literal(rb, 16) + 1;
 }
 
 static void setup_display_size(VP9_COMMON *cm, struct vp9_read_bit_buffer *rb) {
@@ -670,6 +712,8 @@
     vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
                        "Failed to allocate frame buffer");
   }
+  cm->frame_bufs[cm->new_fb_idx].buf.subsampling_x = cm->subsampling_x;
+  cm->frame_bufs[cm->new_fb_idx].buf.subsampling_y = cm->subsampling_y;
   cm->frame_bufs[cm->new_fb_idx].buf.bit_depth = (unsigned int)cm->bit_depth;
 }
 
@@ -703,7 +747,7 @@
   if (!found)
     vp9_read_frame_size(rb, &width, &height);
 
-  if (width <=0 || height <= 0)
+  if (width <= 0 || height <= 0)
     vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME,
                        "Invalid frame size");
 
@@ -722,8 +766,8 @@
     RefBuffer *const ref_frame = &cm->frame_refs[i];
     if (!valid_ref_frame_img_fmt(
             ref_frame->buf->bit_depth,
-            ref_frame->buf->uv_crop_width < ref_frame->buf->y_crop_width,
-            ref_frame->buf->uv_crop_height < ref_frame->buf->y_crop_height,
+            ref_frame->buf->subsampling_x,
+            ref_frame->buf->subsampling_y,
             cm->bit_depth,
             cm->subsampling_x,
             cm->subsampling_y))
@@ -746,6 +790,8 @@
     vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
                        "Failed to allocate frame buffer");
   }
+  cm->frame_bufs[cm->new_fb_idx].buf.subsampling_x = cm->subsampling_x;
+  cm->frame_bufs[cm->new_fb_idx].buf.subsampling_y = cm->subsampling_y;
   cm->frame_bufs[cm->new_fb_idx].buf.bit_depth = (unsigned int)cm->bit_depth;
 }
 
@@ -1139,8 +1185,17 @@
 
 static void read_bitdepth_colorspace_sampling(
     VP9_COMMON *cm, struct vp9_read_bit_buffer *rb) {
-  if (cm->profile >= PROFILE_2)
+  if (cm->profile >= PROFILE_2) {
     cm->bit_depth = vp9_rb_read_bit(rb) ? VPX_BITS_12 : VPX_BITS_10;
+#if CONFIG_VP9_HIGHBITDEPTH
+    cm->use_highbitdepth = 1;
+#endif
+  } else {
+    cm->bit_depth = VPX_BITS_8;
+#if CONFIG_VP9_HIGHBITDEPTH
+    cm->use_highbitdepth = 0;
+#endif
+  }
   cm->color_space = (COLOR_SPACE)vp9_rb_read_literal(rb, 3);
   if (cm->color_space != SRGB) {
     vp9_rb_read_bit(rb);  // [16,235] (including xvycc) vs [0,255] range
@@ -1244,6 +1299,10 @@
         // case (normative).
         cm->color_space = BT_601;
         cm->subsampling_y = cm->subsampling_x = 1;
+        cm->bit_depth = VPX_BITS_8;
+#if CONFIG_VP9_HIGHBITDEPTH
+        cm->use_highbitdepth = 0;
+#endif
       }
 
       pbi->refresh_frame_flags = vp9_rb_read_literal(rb, REF_FRAMES);
@@ -1284,6 +1343,9 @@
       }
     }
   }
+#if CONFIG_VP9_HIGHBITDEPTH
+  get_frame_new_buffer(cm)->bit_depth = cm->bit_depth;
+#endif
 
   if (pbi->need_resync) {
     vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME,
diff --git a/source/libvpx/vp9/decoder/vp9_decodemv.c b/source/libvpx/vp9/decoder/vp9_decodemv.c
index ef2dc80..a01fe84 100644
--- a/source/libvpx/vp9/decoder/vp9_decodemv.c
+++ b/source/libvpx/vp9/decoder/vp9_decodemv.c
@@ -223,7 +223,6 @@
   fr = vp9_read_tree(r, vp9_mv_fp_tree, class0 ? mvcomp->class0_fp[d]
                                                : mvcomp->fp);
 
-
   // High precision part (if hp is not used, the default value of the hp is 1)
   hp = usehp ? vp9_read(r, class0 ? mvcomp->class0_hp : mvcomp->hp)
              : 1;
@@ -435,11 +434,16 @@
 
   for (ref = 0; ref < 1 + is_compound; ++ref) {
     const MV_REFERENCE_FRAME frame = mbmi->ref_frame[ref];
-    const int ref_idx = frame - LAST_FRAME;
-    if (cm->frame_refs[ref_idx].sf.x_scale_fp == REF_INVALID_SCALE ||
-        cm->frame_refs[ref_idx].sf.y_scale_fp == REF_INVALID_SCALE )
+    RefBuffer *ref_buf = &cm->frame_refs[frame - LAST_FRAME];
+    xd->block_refs[ref] = ref_buf;
+    if ((!vp9_is_valid_scale(&ref_buf->sf)))
       vpx_internal_error(&cm->error, VPX_CODEC_UNSUP_BITSTREAM,
                          "Reference frame has invalid dimensions");
+    if (ref_buf->buf->corrupted)
+      vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME,
+                         "Block reference is corrupt");
+    vp9_setup_pre_planes(xd, ref, ref_buf->buf, mi_row, mi_col,
+                         &ref_buf->sf);
     vp9_find_mv_refs(cm, xd, tile, mi, frame, mbmi->ref_mvs[frame],
                      mi_row, mi_col);
   }
diff --git a/source/libvpx/vp9/decoder/vp9_decoder.c b/source/libvpx/vp9/decoder/vp9_decoder.c
index 6ee3d70..baf6ab7 100644
--- a/source/libvpx/vp9/decoder/vp9_decoder.c
+++ b/source/libvpx/vp9/decoder/vp9_decoder.c
@@ -38,7 +38,6 @@
 
   if (!init_done) {
     vp9_rtcd();
-    vp9_init_neighbors();
     vp9_init_intra_predictors();
     init_done = 1;
   }
@@ -69,6 +68,7 @@
   cm->current_video_frame = 0;
   pbi->ready_for_new_data = 1;
   cm->bit_depth = VPX_BITS_8;
+  cm->dequant_bit_depth = VPX_BITS_8;
 
   // vp9_init_dequantizer() is first called here. Add check in
   // frame_init_dequantizer() to avoid unnecessary calling of
@@ -232,6 +232,8 @@
       cm->frame_refs[0].buf->corrupted = 1;
   }
 
+  pbi->ready_for_new_data = 0;
+
   // Check if the previous frame was a frame without any references to it.
   if (cm->new_fb_idx >= 0 && cm->frame_bufs[cm->new_fb_idx].ref_count == 0)
     cm->release_fb_cb(cm->cb_priv,
@@ -279,8 +281,6 @@
     cm->current_video_frame++;
   }
 
-  pbi->ready_for_new_data = 0;
-
   cm->error.setjmp = 0;
   return retcode;
 }
@@ -296,12 +296,12 @@
   if (pbi->ready_for_new_data == 1)
     return ret;
 
+  pbi->ready_for_new_data = 1;
+
   /* no raw frame to show!!! */
   if (!cm->show_frame)
     return ret;
 
-  pbi->ready_for_new_data = 1;
-
 #if CONFIG_VP9_POSTPROC
   if (!cm->show_existing_frame) {
     ret = vp9_post_proc_frame(cm, sd, flags);
diff --git a/source/libvpx/vp9/decoder/vp9_detokenize.c b/source/libvpx/vp9/decoder/vp9_detokenize.c
index 5778748..421229a 100644
--- a/source/libvpx/vp9/decoder/vp9_detokenize.c
+++ b/source/libvpx/vp9/decoder/vp9_detokenize.c
@@ -190,7 +190,11 @@
       }
     }
     v = (val * dqv) >> dq_shift;
+#if CONFIG_COEFFICIENT_RANGE_CHECKING
+    dqcoeff[scan[c]] = check_range(vp9_read_bit(r) ? -v : v);
+#else
     dqcoeff[scan[c]] = vp9_read_bit(r) ? -v : v;
+#endif
     token_cache[scan[c]] = vp9_pt_energy_class[token];
     ++c;
     ctx = get_coef_context(nb, token_cache, c);
diff --git a/source/libvpx/vp9/decoder/vp9_dthread.c b/source/libvpx/vp9/decoder/vp9_dthread.c
index 62ea6c1..69e4fde 100644
--- a/source/libvpx/vp9/decoder/vp9_dthread.c
+++ b/source/libvpx/vp9/decoder/vp9_dthread.c
@@ -223,14 +223,18 @@
 
     CHECK_MEM_ERROR(cm, lf_sync->mutex_,
                     vpx_malloc(sizeof(*lf_sync->mutex_) * rows));
-    for (i = 0; i < rows; ++i) {
-      pthread_mutex_init(&lf_sync->mutex_[i], NULL);
+    if (lf_sync->mutex_) {
+      for (i = 0; i < rows; ++i) {
+        pthread_mutex_init(&lf_sync->mutex_[i], NULL);
+      }
     }
 
     CHECK_MEM_ERROR(cm, lf_sync->cond_,
                     vpx_malloc(sizeof(*lf_sync->cond_) * rows));
-    for (i = 0; i < rows; ++i) {
-      pthread_cond_init(&lf_sync->cond_[i], NULL);
+    if (lf_sync->cond_) {
+      for (i = 0; i < rows; ++i) {
+        pthread_cond_init(&lf_sync->cond_[i], NULL);
+      }
     }
   }
 #endif  // CONFIG_MULTITHREAD
diff --git a/source/libvpx/vp9/encoder/vp9_aq_variance.c b/source/libvpx/vp9/encoder/vp9_aq_variance.c
index b96f00f..7d75f09 100644
--- a/source/libvpx/vp9/encoder/vp9_aq_variance.c
+++ b/source/libvpx/vp9/encoder/vp9_aq_variance.c
@@ -34,6 +34,9 @@
 #define SEGMENT_ID(i) segment_id[(i) - ENERGY_MIN]
 
 DECLARE_ALIGNED(16, static const uint8_t, vp9_64_zeros[64]) = {0};
+#if CONFIG_VP9_HIGHBITDEPTH
+DECLARE_ALIGNED(16, static const uint16_t, vp9_highbd_64_zeros[64]) = {0};
+#endif
 
 unsigned int vp9_vaq_segment_id(int energy) {
   ENERGY_IN_BOUNDS(energy);
@@ -126,14 +129,40 @@
     const int bw = 8 * num_8x8_blocks_wide_lookup[bs] - right_overflow;
     const int bh = 8 * num_8x8_blocks_high_lookup[bs] - bottom_overflow;
     int avg;
+#if CONFIG_VP9_HIGHBITDEPTH
+    if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+      highbd_variance(x->plane[0].src.buf, x->plane[0].src.stride,
+                      CONVERT_TO_BYTEPTR(vp9_highbd_64_zeros), 0, bw, bh,
+                      &sse, &avg);
+      sse >>= 2 * (xd->bd - 8);
+      avg >>= (xd->bd - 8);
+    } else {
+      variance(x->plane[0].src.buf, x->plane[0].src.stride,
+               vp9_64_zeros, 0, bw, bh, &sse, &avg);
+    }
+#else
     variance(x->plane[0].src.buf, x->plane[0].src.stride,
              vp9_64_zeros, 0, bw, bh, &sse, &avg);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
     var = sse - (((int64_t)avg * avg) / (bw * bh));
     return (256 * var) / (bw * bh);
   } else {
+#if CONFIG_VP9_HIGHBITDEPTH
+    if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+      var = cpi->fn_ptr[bs].vf(x->plane[0].src.buf,
+                               x->plane[0].src.stride,
+                               CONVERT_TO_BYTEPTR(vp9_highbd_64_zeros),
+                               0, &sse);
+    } else {
+      var = cpi->fn_ptr[bs].vf(x->plane[0].src.buf,
+                               x->plane[0].src.stride,
+                               vp9_64_zeros, 0, &sse);
+    }
+#else
     var = cpi->fn_ptr[bs].vf(x->plane[0].src.buf,
                              x->plane[0].src.stride,
                              vp9_64_zeros, 0, &sse);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
     return (256 * var) >> num_pels_log2_lookup[bs];
   }
 }
diff --git a/source/libvpx/vp9/encoder/vp9_avg.c b/source/libvpx/vp9/encoder/vp9_avg.c
new file mode 100644
index 0000000..e9810c8
--- /dev/null
+++ b/source/libvpx/vp9/encoder/vp9_avg.c
@@ -0,0 +1,33 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+#include "vp9/common/vp9_common.h"
+#include "vpx_ports/mem.h"
+
+unsigned int vp9_avg_8x8_c(const uint8_t *s, int p) {
+  int i, j;
+  int sum = 0;
+  for (i = 0; i < 8; ++i, s += p)
+    for (j = 0; j < 8; sum += s[j], ++j) {}
+
+  return (sum + 32) >> 6;
+}
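+
+// Note: (sum + 32) >> 6 above is ROUND_POWER_OF_TWO(sum, 6), i.e. the
+// rounded mean of the 64 samples in the 8x8 block.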
+
+#if CONFIG_VP9_HIGHBITDEPTH
+unsigned int vp9_highbd_avg_8x8_c(const uint8_t *s8, int p) {
+  int i, j;
+  int sum = 0;
+  const uint16_t *s = CONVERT_TO_SHORTPTR(s8);
+  for (i = 0; i < 8; ++i, s += p)
+    for (j = 0; j < 8; sum += s[j], ++j) {}
+
+  return (sum + 32) >> 6;
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
diff --git a/source/libvpx/vp9/encoder/vp9_bitstream.c b/source/libvpx/vp9/encoder/vp9_bitstream.c
index f658dda..421e049 100644
--- a/source/libvpx/vp9/encoder/vp9_bitstream.c
+++ b/source/libvpx/vp9/encoder/vp9_bitstream.c
@@ -120,16 +120,28 @@
 }
 
 static void pack_mb_tokens(vp9_writer *w,
-                           TOKENEXTRA **tp, const TOKENEXTRA *const stop) {
+                           TOKENEXTRA **tp, const TOKENEXTRA *const stop,
+                           vpx_bit_depth_t bit_depth) {
   TOKENEXTRA *p = *tp;
 
   while (p < stop && p->token != EOSB_TOKEN) {
     const int t = p->token;
     const struct vp9_token *const a = &vp9_coef_encodings[t];
-    const vp9_extra_bit *const b = &vp9_extra_bits[t];
     int i = 0;
     int v = a->value;
     int n = a->len;
+#if CONFIG_VP9_HIGHBITDEPTH
+    const vp9_extra_bit *b;
+    if (bit_depth == VPX_BITS_12)
+      b = &vp9_extra_bits_high12[t];
+    else if (bit_depth == VPX_BITS_10)
+      b = &vp9_extra_bits_high10[t];
+    else
+      b = &vp9_extra_bits[t];
+#else
+    const vp9_extra_bit *const b = &vp9_extra_bits[t];
+    (void) bit_depth;
+#endif  // CONFIG_VP9_HIGHBITDEPTH
 
     /* skip one or two nodes */
     if (p->skip_eob_node) {
@@ -387,7 +399,7 @@
   }
 
   assert(*tok < tok_end);
-  pack_mb_tokens(w, tok, tok_end);
+  pack_mb_tokens(w, tok, tok_end, cm->bit_depth);
 }
 
 static void write_partition(const VP9_COMMON *const cm,
@@ -419,7 +431,7 @@
   const VP9_COMMON *const cm = &cpi->common;
   MACROBLOCKD *const xd = &cpi->mb.e_mbd;
 
-  const int bsl = b_width_log2(bsize);
+  const int bsl = b_width_log2_lookup[bsize];
   const int bs = (1 << bsl) / 4;
   PARTITION_TYPE partition;
   BLOCK_SIZE subsize;
@@ -923,26 +935,27 @@
   size_t total_size = 0;
   const int tile_cols = 1 << cm->log2_tile_cols;
   const int tile_rows = 1 << cm->log2_tile_rows;
+  TileInfo tile[4][1 << 6];
+  TOKENEXTRA *pre_tok = cpi->tok;
+  int tile_tok = 0;
 
   vpx_memset(cm->above_seg_context, 0, sizeof(*cm->above_seg_context) *
              mi_cols_aligned_to_sb(cm->mi_cols));
 
-  tok[0][0] = cpi->tok;
-  for (tile_row = 0; tile_row < tile_rows; tile_row++) {
-    if (tile_row)
-      tok[tile_row][0] = tok[tile_row - 1][tile_cols - 1] +
-                         cpi->tok_count[tile_row - 1][tile_cols - 1];
+  for (tile_row = 0; tile_row < tile_rows; ++tile_row) {
+    for (tile_col = 0; tile_col < tile_cols; ++tile_col) {
+      vp9_tile_init(&tile[tile_row][tile_col], cm, tile_row, tile_col);
 
-    for (tile_col = 1; tile_col < tile_cols; tile_col++)
-      tok[tile_row][tile_col] = tok[tile_row][tile_col - 1] +
-                                cpi->tok_count[tile_row][tile_col - 1];
+      tok[tile_row][tile_col] = pre_tok + tile_tok;
+      pre_tok = tok[tile_row][tile_col];
+      tile_tok = allocated_tokens(tile[tile_row][tile_col]);
+    }
   }
 
   for (tile_row = 0; tile_row < tile_rows; tile_row++) {
     for (tile_col = 0; tile_col < tile_cols; tile_col++) {
-      TileInfo tile;
+      const TileInfo *const ptile = &tile[tile_row][tile_col];
 
-      vp9_tile_init(&tile, cm, tile_row, tile_col);
       tok_end = tok[tile_row][tile_col] + cpi->tok_count[tile_row][tile_col];
 
       if (tile_col < tile_cols - 1 || tile_row < tile_rows - 1)
@@ -950,7 +963,7 @@
       else
         vp9_start_encode(&residual_bc, data_ptr + total_size);
 
-      write_modes(cpi, &tile, &residual_bc, &tok[tile_row][tile_col], tok_end);
+      write_modes(cpi, ptile, &residual_bc, &tok[tile_row][tile_col], tok_end);
       assert(tok[tile_row][tile_col] == tok_end);
       vp9_stop_encode(&residual_bc);
       if (tile_col < tile_cols - 1 || tile_row < tile_rows - 1) {
@@ -1001,7 +1014,11 @@
         ((cpi->svc.number_temporal_layers > 1 &&
          cpi->oxcf.rc_mode == VPX_CBR) ||
         (cpi->svc.number_spatial_layers > 1 &&
-         cpi->svc.layer_context[cpi->svc.spatial_layer_id].is_key_frame))) {
+         cpi->svc.layer_context[cpi->svc.spatial_layer_id].is_key_frame) ||
+        (is_two_pass_svc(cpi) &&
+         cpi->svc.encode_empty_frame_state == ENCODING &&
+         cpi->svc.layer_context[0].frames_from_key_frame <
+         cpi->svc.number_temporal_layers + 1))) {
       found = 0;
     }
     vp9_wb_write_bit(wb, found);
@@ -1093,8 +1110,7 @@
     // will change the show_frame flag to 0, then add a one-byte frame with the
     // show_existing_frame flag, which tells the decoder which frame we want to
     // show.
-    if (!cm->show_frame ||
-        (is_two_pass_svc(cpi) && cm->error_resilient_mode == 0))
+    if (!cm->show_frame)
       vp9_wb_write_bit(wb, cm->intra_only);
 
     if (!cm->error_resilient_mode)
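
One note on the tile loop change above: instead of deriving each tile's token pointer from the previous tile's actual token count, the buffer cpi->tok is now pre-partitioned into worst-case slices via allocated_tokens(), so every tile knows its start address before any encoding happens. A toy sketch of that arena-partitioning pattern (2x2 grid, names hypothetical, not the libvpx API):

    typedef struct { int value; } Token;

    /* Hand each tile a slice of one contiguous arena, each slice starting
     * where the previous tile's worst-case allocation ends. */
    static void assign_tile_slices(Token *arena, Token *slice[2][2],
                                   const int alloc[2][2]) {
      Token *pre = arena;
      int prev_alloc = 0;
      for (int r = 0; r < 2; ++r) {
        for (int c = 0; c < 2; ++c) {
          slice[r][c] = pre + prev_alloc;  /* skip the previous slice */
          pre = slice[r][c];
          prev_alloc = alloc[r][c];        /* this tile's capacity */
        }
      }
    }
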
diff --git a/source/libvpx/vp9/encoder/vp9_block.h b/source/libvpx/vp9/encoder/vp9_block.h
index 767bd7f..5194c4c 100644
--- a/source/libvpx/vp9/encoder/vp9_block.h
+++ b/source/libvpx/vp9/encoder/vp9_block.h
@@ -13,8 +13,6 @@
 
 #include "vp9/common/vp9_entropymv.h"
 #include "vp9/common/vp9_entropy.h"
-#include "vpx_ports/mem.h"
-#include "vp9/common/vp9_onyxc_int.h"
 
 #ifdef __cplusplus
 extern "C" {
@@ -122,8 +120,8 @@
   void (*fwd_txm4x4)(const int16_t *input, tran_low_t *output, int stride);
   void (*itxm_add)(const tran_low_t *input, uint8_t *dest, int stride, int eob);
 #if CONFIG_VP9_HIGHBITDEPTH
-  void (*high_itxm_add)(const tran_low_t *input, uint8_t *dest, int stride,
-                        int eob, int bd);
+  void (*highbd_itxm_add)(const tran_low_t *input, uint8_t *dest, int stride,
+                          int eob, int bd);
 #endif
 };
 
diff --git a/source/libvpx/vp9/encoder/vp9_context_tree.h b/source/libvpx/vp9/encoder/vp9_context_tree.h
index 97f0741..6b28ee5 100644
--- a/source/libvpx/vp9/encoder/vp9_context_tree.h
+++ b/source/libvpx/vp9/encoder/vp9_context_tree.h
@@ -11,9 +11,10 @@
 #ifndef VP9_ENCODER_VP9_CONTEXT_TREE_H_
 #define VP9_ENCODER_VP9_CONTEXT_TREE_H_
 
-#include "vp9/common/vp9_onyxc_int.h"
+#include "vp9/common/vp9_blockd.h"
 
 struct VP9_COMP;
+struct VP9Common;
 
 // Structure to hold snapshot of coding context during the mode picking process
 typedef struct {
diff --git a/source/libvpx/vp9/encoder/vp9_dct.c b/source/libvpx/vp9/encoder/vp9_dct.c
index eff8996..1090d04 100644
--- a/source/libvpx/vp9/encoder/vp9_dct.c
+++ b/source/libvpx/vp9/encoder/vp9_dct.c
@@ -1440,59 +1440,62 @@
 }
 
 #if CONFIG_VP9_HIGHBITDEPTH
-void vp9_high_fdct4x4_c(const int16_t *input, tran_low_t *output, int stride) {
+void vp9_highbd_fdct4x4_c(const int16_t *input, tran_low_t *output,
+                          int stride) {
   vp9_fdct4x4_c(input, output, stride);
 }
 
-void vp9_high_fht4x4_c(const int16_t *input, tran_low_t *output,
-                       int stride, int tx_type) {
+void vp9_highbd_fht4x4_c(const int16_t *input, tran_low_t *output,
+                         int stride, int tx_type) {
   vp9_fht4x4_c(input, output, stride, tx_type);
 }
 
-void vp9_high_fdct8x8_1_c(const int16_t *input, tran_low_t *final_output,
-                          int stride) {
+void vp9_highbd_fdct8x8_1_c(const int16_t *input, tran_low_t *final_output,
+                            int stride) {
   vp9_fdct8x8_1_c(input, final_output, stride);
 }
 
-void vp9_high_fdct8x8_c(const int16_t *input, tran_low_t *final_output,
-                        int stride) {
+void vp9_highbd_fdct8x8_c(const int16_t *input, tran_low_t *final_output,
+                          int stride) {
   vp9_fdct8x8_c(input, final_output, stride);
 }
 
-void vp9_high_fdct16x16_1_c(const int16_t *input, tran_low_t *output,
-                            int stride) {
+void vp9_highbd_fdct16x16_1_c(const int16_t *input, tran_low_t *output,
+                              int stride) {
   vp9_fdct16x16_1_c(input, output, stride);
 }
 
-void vp9_high_fdct16x16_c(const int16_t *input, tran_low_t *output,
-                          int stride) {
+void vp9_highbd_fdct16x16_c(const int16_t *input, tran_low_t *output,
+                            int stride) {
   vp9_fdct16x16_c(input, output, stride);
 }
 
-void vp9_high_fht8x8_c(const int16_t *input, tran_low_t *output,
-                  int stride, int tx_type) {
+void vp9_highbd_fht8x8_c(const int16_t *input, tran_low_t *output,
+                         int stride, int tx_type) {
   vp9_fht8x8_c(input, output, stride, tx_type);
 }
 
-void vp9_high_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride) {
+void vp9_highbd_fwht4x4_c(const int16_t *input, tran_low_t *output,
+                          int stride) {
   vp9_fwht4x4_c(input, output, stride);
 }
 
-void vp9_high_fht16x16_c(const int16_t *input, tran_low_t *output,
-                    int stride, int tx_type) {
+void vp9_highbd_fht16x16_c(const int16_t *input, tran_low_t *output,
+                           int stride, int tx_type) {
   vp9_fht16x16_c(input, output, stride, tx_type);
 }
 
-void vp9_high_fdct32x32_1_c(const int16_t *input, tran_low_t *out, int stride) {
+void vp9_highbd_fdct32x32_1_c(const int16_t *input, tran_low_t *out,
+                              int stride) {
   vp9_fdct32x32_1_c(input, out, stride);
 }
 
-void vp9_high_fdct32x32_c(const int16_t *input, tran_low_t *out, int stride) {
+void vp9_highbd_fdct32x32_c(const int16_t *input, tran_low_t *out, int stride) {
   vp9_fdct32x32_c(input, out, stride);
 }
 
-void vp9_high_fdct32x32_rd_c(const int16_t *input, tran_low_t *out,
-                             int stride) {
+void vp9_highbd_fdct32x32_rd_c(const int16_t *input, tran_low_t *out,
+                               int stride) {
   vp9_fdct32x32_rd_c(input, out, stride);
 }
 #endif  // CONFIG_VP9_HIGHBITDEPTH
diff --git a/source/libvpx/vp9/encoder/vp9_denoiser.c b/source/libvpx/vp9/encoder/vp9_denoiser.c
index 681b2a5..4deeed2 100644
--- a/source/libvpx/vp9/encoder/vp9_denoiser.c
+++ b/source/libvpx/vp9/encoder/vp9_denoiser.c
@@ -31,9 +31,6 @@
 static void make_grayscale(YV12_BUFFER_CONFIG *yuv);
 #endif
 
-static const int widths[]  = {4, 4, 8, 8,  8, 16, 16, 16, 32, 32, 32, 64, 64};
-static const int heights[] = {4, 8, 4, 8, 16,  8, 16, 32, 16, 32, 64, 32, 64};
-
 static int absdiff_thresh(BLOCK_SIZE bs, int increase_denoising) {
   (void)bs;
   return 3 + (increase_denoising ? 1 : 0);
@@ -52,7 +49,9 @@
 }
 
 static unsigned int sse_thresh(BLOCK_SIZE bs, int increase_denoising) {
-  return widths[bs] * heights[bs] * (increase_denoising ? 60 : 40);
+  return (4 << b_width_log2_lookup[bs]) *
+         (4 << b_height_log2_lookup[bs]) *
+         (increase_denoising ? 60 : 40);
 }
 
 static int sse_diff_thresh(BLOCK_SIZE bs, int increase_denoising,
@@ -61,25 +60,31 @@
       noise_motion_thresh(bs, increase_denoising)) {
     return 0;
   } else {
-    return widths[bs] * heights[bs] * 20;
+    return (4 << b_width_log2_lookup[bs]) *
+           (4 << b_height_log2_lookup[bs]) * 20;
   }
 }
 
-static int total_adj_strong_thresh(BLOCK_SIZE bs, int increase_denoising) {
-  return widths[bs] * heights[bs] * (increase_denoising ? 3 : 2);
+int total_adj_strong_thresh(BLOCK_SIZE bs, int increase_denoising) {
+  return (4 << b_width_log2_lookup[bs]) *
+         (4 << b_height_log2_lookup[bs]) * (increase_denoising ? 3 : 2);
 }
 
 static int total_adj_weak_thresh(BLOCK_SIZE bs, int increase_denoising) {
-  return widths[bs] * heights[bs] * (increase_denoising ? 3 : 2);
+  return (4 << b_width_log2_lookup[bs]) *
+         (4 << b_height_log2_lookup[bs]) * (increase_denoising ? 3 : 2);
 }
 
-static VP9_DENOISER_DECISION denoiser_filter(const uint8_t *sig, int sig_stride,
-                                             const uint8_t *mc_avg,
-                                             int mc_avg_stride,
-                                             uint8_t *avg, int avg_stride,
-                                             int increase_denoising,
-                                             BLOCK_SIZE bs,
-                                             int motion_magnitude) {
+// TODO(jackychen): If increase_denoising is enabled in the future,
+// we might need to update the code for calculating 'total_adj' in
+// case the C code is not bit-exact with the corresponding SSE2 code.
+int vp9_denoiser_filter_c(const uint8_t *sig, int sig_stride,
+                          const uint8_t *mc_avg,
+                          int mc_avg_stride,
+                          uint8_t *avg, int avg_stride,
+                          int increase_denoising,
+                          BLOCK_SIZE bs,
+                          int motion_magnitude) {
   int r, c;
   const uint8_t *sig_start = sig;
   const uint8_t *mc_avg_start = mc_avg;
@@ -102,8 +107,8 @@
   }
 
   // First attempt to apply a strong temporal denoising filter.
-  for (r = 0; r < heights[bs]; ++r) {
-    for (c = 0; c < widths[bs]; ++c) {
+  for (r = 0; r < (4 << b_height_log2_lookup[bs]); ++r) {
+    for (c = 0; c < (4 << b_width_log2_lookup[bs]); ++c) {
       diff = mc_avg[c] - sig[c];
       absdiff = abs(diff);
 
@@ -143,7 +148,7 @@
 
   // Otherwise, we try to dampen the filter if the delta is not too high.
   delta = ((abs(total_adj) - total_adj_strong_thresh(bs, increase_denoising))
-           >> 8) + 1;
+           >> num_pels_log2_lookup[bs]) + 1;
 
   if (delta >= delta_thresh(bs, increase_denoising)) {
     return COPY_BLOCK;
@@ -152,8 +157,8 @@
   mc_avg =  mc_avg_start;
   avg = avg_start;
   sig = sig_start;
-  for (r = 0; r < heights[bs]; ++r) {
-    for (c = 0; c < widths[bs]; ++c) {
+  for (r = 0; r < (4 << b_height_log2_lookup[bs]); ++r) {
+    for (c = 0; c < (4 << b_width_log2_lookup[bs]); ++c) {
       diff = mc_avg[c] - sig[c];
       adj = abs(diff);
       if (adj > delta) {
@@ -193,8 +198,8 @@
 static void copy_block(uint8_t *dest, int dest_stride,
                        const uint8_t *src, int src_stride, BLOCK_SIZE bs) {
   int r;
-  for (r = 0; r < heights[bs]; ++r) {
-    vpx_memcpy(dest, src, widths[bs]);
+  for (r = 0; r < (4 << b_height_log2_lookup[bs]); ++r) {
+    vpx_memcpy(dest, src, (4 << b_width_log2_lookup[bs]));
     dest += dest_stride;
     src += src_stride;
   }
@@ -336,10 +341,10 @@
                                          &motion_magnitude);
 
   if (decision == FILTER_BLOCK) {
-    decision = denoiser_filter(src.buf, src.stride,
-                               mc_avg_start, mc_avg.y_stride,
-                               avg_start, avg.y_stride,
-                               0, bs, motion_magnitude);
+    decision = vp9_denoiser_filter(src.buf, src.stride,
+                                   mc_avg_start, mc_avg.y_stride,
+                                   avg_start, avg.y_stride,
+                                   0, bs, motion_magnitude);
   }
 
   if (decision == FILTER_BLOCK) {
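
The widths[]/heights[] tables deleted above are redundant with the shared block-size lookups: a block's pixel width is 4 << b_width_log2_lookup[bs], and likewise for height, which is what every former table lookup was replaced with. A quick equivalence check, with values copied from the deleted tables and, as an assumption, from vp9_common_data.c:

    static const int widths[] =
        {4, 4, 8, 8, 8, 16, 16, 16, 32, 32, 32, 64, 64};
    static const int b_width_log2_vals[] =
        {0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4};

    /* Returns 1 if widths[bs] == 4 << b_width_log2_lookup[bs] for all
     * 13 block sizes, as the rewrite relies on. */
    static int tables_agree(void) {
      for (int bs = 0; bs < 13; ++bs)
        if (widths[bs] != (4 << b_width_log2_vals[bs]))
          return 0;
      return 1;
    }
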
diff --git a/source/libvpx/vp9/encoder/vp9_denoiser.h b/source/libvpx/vp9/encoder/vp9_denoiser.h
index fa714b1..421dfcd 100644
--- a/source/libvpx/vp9/encoder/vp9_denoiser.h
+++ b/source/libvpx/vp9/encoder/vp9_denoiser.h
@@ -55,6 +55,10 @@
 #endif
                        int border);
 
+#if CONFIG_VP9_TEMPORAL_DENOISING
+int total_adj_strong_thresh(BLOCK_SIZE bs, int increase_denoising);
+#endif
+
 void vp9_denoiser_free(VP9_DENOISER *denoiser);
 
 #ifdef __cplusplus
diff --git a/source/libvpx/vp9/encoder/vp9_encodeframe.c b/source/libvpx/vp9/encoder/vp9_encodeframe.c
index be5ee7b..6eff8c5 100644
--- a/source/libvpx/vp9/encoder/vp9_encodeframe.c
+++ b/source/libvpx/vp9/encoder/vp9_encodeframe.c
@@ -61,16 +61,51 @@
 // Eventually this should be replaced by custom no-reference routines,
 //  which will be faster.
 static const uint8_t VP9_VAR_OFFS[64] = {
-  128, 128, 128, 128, 128, 128, 128, 128,
-  128, 128, 128, 128, 128, 128, 128, 128,
-  128, 128, 128, 128, 128, 128, 128, 128,
-  128, 128, 128, 128, 128, 128, 128, 128,
-  128, 128, 128, 128, 128, 128, 128, 128,
-  128, 128, 128, 128, 128, 128, 128, 128,
-  128, 128, 128, 128, 128, 128, 128, 128,
-  128, 128, 128, 128, 128, 128, 128, 128
+    128, 128, 128, 128, 128, 128, 128, 128,
+    128, 128, 128, 128, 128, 128, 128, 128,
+    128, 128, 128, 128, 128, 128, 128, 128,
+    128, 128, 128, 128, 128, 128, 128, 128,
+    128, 128, 128, 128, 128, 128, 128, 128,
+    128, 128, 128, 128, 128, 128, 128, 128,
+    128, 128, 128, 128, 128, 128, 128, 128,
+    128, 128, 128, 128, 128, 128, 128, 128
 };
 
+#if CONFIG_VP9_HIGHBITDEPTH
+static const uint16_t VP9_HIGH_VAR_OFFS_8[64] = {
+    128, 128, 128, 128, 128, 128, 128, 128,
+    128, 128, 128, 128, 128, 128, 128, 128,
+    128, 128, 128, 128, 128, 128, 128, 128,
+    128, 128, 128, 128, 128, 128, 128, 128,
+    128, 128, 128, 128, 128, 128, 128, 128,
+    128, 128, 128, 128, 128, 128, 128, 128,
+    128, 128, 128, 128, 128, 128, 128, 128,
+    128, 128, 128, 128, 128, 128, 128, 128
+};
+
+static const uint16_t VP9_HIGH_VAR_OFFS_10[64] = {
+    128*4, 128*4, 128*4, 128*4, 128*4, 128*4, 128*4, 128*4,
+    128*4, 128*4, 128*4, 128*4, 128*4, 128*4, 128*4, 128*4,
+    128*4, 128*4, 128*4, 128*4, 128*4, 128*4, 128*4, 128*4,
+    128*4, 128*4, 128*4, 128*4, 128*4, 128*4, 128*4, 128*4,
+    128*4, 128*4, 128*4, 128*4, 128*4, 128*4, 128*4, 128*4,
+    128*4, 128*4, 128*4, 128*4, 128*4, 128*4, 128*4, 128*4,
+    128*4, 128*4, 128*4, 128*4, 128*4, 128*4, 128*4, 128*4,
+    128*4, 128*4, 128*4, 128*4, 128*4, 128*4, 128*4, 128*4
+};
+
+static const uint16_t VP9_HIGH_VAR_OFFS_12[64] = {
+    128*16, 128*16, 128*16, 128*16, 128*16, 128*16, 128*16, 128*16,
+    128*16, 128*16, 128*16, 128*16, 128*16, 128*16, 128*16, 128*16,
+    128*16, 128*16, 128*16, 128*16, 128*16, 128*16, 128*16, 128*16,
+    128*16, 128*16, 128*16, 128*16, 128*16, 128*16, 128*16, 128*16,
+    128*16, 128*16, 128*16, 128*16, 128*16, 128*16, 128*16, 128*16,
+    128*16, 128*16, 128*16, 128*16, 128*16, 128*16, 128*16, 128*16,
+    128*16, 128*16, 128*16, 128*16, 128*16, 128*16, 128*16, 128*16,
+    128*16, 128*16, 128*16, 128*16, 128*16, 128*16, 128*16, 128*16
+};
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
 static unsigned int get_sby_perpixel_variance(VP9_COMP *cpi,
                                               const struct buf_2d *ref,
                                               BLOCK_SIZE bs) {
@@ -80,6 +115,32 @@
   return ROUND_POWER_OF_TWO(var, num_pels_log2_lookup[bs]);
 }
 
+#if CONFIG_VP9_HIGHBITDEPTH
+static unsigned int high_get_sby_perpixel_variance(
+    VP9_COMP *cpi, const struct buf_2d *ref, BLOCK_SIZE bs, int bd) {
+  unsigned int var, sse;
+  switch (bd) {
+    case 10:
+      var = cpi->fn_ptr[bs].vf(ref->buf, ref->stride,
+                               CONVERT_TO_BYTEPTR(VP9_HIGH_VAR_OFFS_10),
+                               0, &sse);
+      break;
+    case 12:
+      var = cpi->fn_ptr[bs].vf(ref->buf, ref->stride,
+                               CONVERT_TO_BYTEPTR(VP9_HIGH_VAR_OFFS_12),
+                               0, &sse);
+      break;
+    case 8:
+    default:
+      var = cpi->fn_ptr[bs].vf(ref->buf, ref->stride,
+                               CONVERT_TO_BYTEPTR(VP9_HIGH_VAR_OFFS_8),
+                               0, &sse);
+      break;
+  }
+  return ROUND_POWER_OF_TWO(var, num_pels_log2_lookup[bs]);
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
 static unsigned int get_sby_perpixel_diff_variance(VP9_COMP *cpi,
                                                    const struct buf_2d *ref,
                                                    int mi_row, int mi_col,
@@ -335,10 +396,11 @@
   const int block_width = num_8x8_blocks_wide_lookup[bsize];
   const int block_height = num_8x8_blocks_high_lookup[bsize];
   // TODO(debargha): Choose this more intelligently.
-  const int64_t threshold_multiplier = 25;
-  int64_t threshold = threshold_multiplier * cpi->common.base_qindex;
+  const int threshold_multiplier = cm->frame_type == KEY_FRAME ? 64 : 4;
+  int64_t threshold =
+      (int64_t)(threshold_multiplier *
+                vp9_convert_qindex_to_q(cm->base_qindex, cm->bit_depth));
   assert(block_height == block_width);
-
   tree_to_node(data, bsize, &vt);
 
   // Split none is available only if we have more than half a block size
@@ -350,29 +412,47 @@
     return 1;
   }
 
-  // Vertical split is available on all but the bottom border.
-  if (mi_row + block_height / 2 < cm->mi_rows &&
-      vt.part_variances->vert[0].variance < threshold &&
-      vt.part_variances->vert[1].variance < threshold) {
-    BLOCK_SIZE subsize = get_subsize(bsize, PARTITION_VERT);
-    set_block_size(cpi, mi_row, mi_col, subsize);
-    set_block_size(cpi, mi_row, mi_col + block_width / 2, subsize);
-    return 1;
+  // Only allow split for blocks above 16x16.
+  if (bsize > BLOCK_16X16) {
+    // Vertical split is available on all but the bottom border.
+    if (mi_row + block_height / 2 < cm->mi_rows &&
+        vt.part_variances->vert[0].variance < threshold &&
+        vt.part_variances->vert[1].variance < threshold) {
+      BLOCK_SIZE subsize = get_subsize(bsize, PARTITION_VERT);
+      set_block_size(cpi, mi_row, mi_col, subsize);
+      set_block_size(cpi, mi_row, mi_col + block_width / 2, subsize);
+      return 1;
+    }
+
+    // Horizontal split is available on all but the right border.
+    if (mi_col + block_width / 2 < cm->mi_cols &&
+        vt.part_variances->horz[0].variance < threshold &&
+        vt.part_variances->horz[1].variance < threshold) {
+      BLOCK_SIZE subsize = get_subsize(bsize, PARTITION_HORZ);
+      set_block_size(cpi, mi_row, mi_col, subsize);
+      set_block_size(cpi, mi_row + block_height / 2, mi_col, subsize);
+      return 1;
+    }
   }
 
-  // Horizontal split is available on all but the right border.
-  if (mi_col + block_width / 2 < cm->mi_cols &&
-      vt.part_variances->horz[0].variance < threshold &&
-      vt.part_variances->horz[1].variance < threshold) {
-    BLOCK_SIZE subsize = get_subsize(bsize, PARTITION_HORZ);
-    set_block_size(cpi, mi_row, mi_col, subsize);
-    set_block_size(cpi, mi_row + block_height / 2, mi_col, subsize);
-    return 1;
+  // This will only allow 8x8 if the 16x16 variance is very large.
+  if (bsize == BLOCK_16X16) {
+    if (mi_col + block_width / 2 < cm->mi_cols &&
+        mi_row + block_height / 2 < cm->mi_rows &&
+        vt.part_variances->none.variance < (threshold << 6)) {
+      set_block_size(cpi, mi_row, mi_col, bsize);
+      return 1;
+    }
   }
   return 0;
 }
 
-// TODO(debargha): Fix this function and make it work as expected.
+// This function chooses partitioning based on the variance between the
+// source and the reconstructed last frame, where the variance is
+// computed for 8x8 downsampled inputs. Some things to check: using the
+// last source rather than the reconstructed last frame, and allowing
+// for smaller downsampling (4x4 or 2x2) for the selection of smaller
+// block sizes (i.e., < 16x16).
 static void choose_partitioning(VP9_COMP *cpi,
                                 const TileInfo *const tile,
                                 int mi_row, int mi_col) {
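
A note on the retuned threshold above: it now scales with the real quantizer step rather than the raw qindex, and key frames get a much larger multiplier (64 vs. 4), making splits harder to take on intra frames. A hedged sketch of the computation, with vp9_convert_qindex_to_q() stubbed out since its table is not shown here:

    /* Placeholder for vp9_convert_qindex_to_q(); the real function maps
     * a qindex (0..255) to the quantizer step size for the bit depth. */
    static double q_of(int qindex) { return 0.25 * qindex; }

    static long long variance_split_threshold(int is_key_frame,
                                              int base_qindex) {
      const int multiplier = is_key_frame ? 64 : 4;
      return (long long)(multiplier * q_of(base_qindex));
    }
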
@@ -391,6 +471,7 @@
   const YV12_BUFFER_CONFIG *yv12 = get_ref_frame_buffer(cpi, LAST_FRAME);
   const struct scale_factors *const sf = &cm->frame_refs[LAST_FRAME - 1].sf;
 
+  vp9_clear_system_state();
   vp9_zero(vt);
   set_offsets(cpi, tile, mi_row, mi_col, BLOCK_64X64);
 
@@ -419,6 +500,22 @@
   } else {
     d = VP9_VAR_OFFS;
     dp = 0;
+#if CONFIG_VP9_HIGHBITDEPTH
+    if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+      switch (xd->bd) {
+        case 10:
+          d = CONVERT_TO_BYTEPTR(VP9_HIGH_VAR_OFFS_10);
+          break;
+        case 12:
+          d = CONVERT_TO_BYTEPTR(VP9_HIGH_VAR_OFFS_12);
+          break;
+        case 8:
+        default:
+          d = CONVERT_TO_BYTEPTR(VP9_HIGH_VAR_OFFS_8);
+          break;
+      }
+    }
+#endif  // CONFIG_VP9_HIGHBITDEPTH
   }
 
   // Fill in the entire tree of 8x8 variances for splits.
@@ -434,10 +531,28 @@
         int y_idx = y16_idx + ((k >> 1) << 3);
         unsigned int sse = 0;
         int sum = 0;
-        if (x_idx < pixels_wide && y_idx < pixels_high)
-          vp9_get8x8var(s + y_idx * sp + x_idx, sp,
-                        d + y_idx * dp + x_idx, dp, &sse, &sum);
-        fill_variance(sse, sum, 64, &vst->split[k].part_variances.none);
+
+        if (x_idx < pixels_wide && y_idx < pixels_high) {
+          int s_avg, d_avg;
+#if CONFIG_VP9_HIGHBITDEPTH
+          if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+            s_avg = vp9_highbd_avg_8x8(s + y_idx * sp + x_idx, sp);
+            d_avg = vp9_highbd_avg_8x8(d + y_idx * dp + x_idx, dp);
+          } else {
+            s_avg = vp9_avg_8x8(s + y_idx * sp + x_idx, sp);
+            d_avg = vp9_avg_8x8(d + y_idx * dp + x_idx, dp);
+          }
+#else
+          s_avg = vp9_avg_8x8(s + y_idx * sp + x_idx, sp);
+          d_avg = vp9_avg_8x8(d + y_idx * dp + x_idx, dp);
+#endif
+          sum = s_avg - d_avg;
+          sse = sum * sum;
+        }
+        // For an 8x8 block we have just one value, the average of all 64
+        // pixels, so use a count of 1. This of course means there is no
+        // variance within an 8x8 block.
+        fill_variance(sse, sum, 1, &vst->split[k].part_variances.none);
       }
     }
   }
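
The arithmetic behind the count of 1 above is worth spelling out: with sum = s_avg - d_avg and sse = sum * sum, any variance formula of the usual form sse - sum^2 / count evaluates to zero at the leaf, so 8x8 nodes contribute spread only when the four leaf averages are merged into their 16x16 parent. A small check under that assumed formula (the exact internals of fill_variance() are not shown in this hunk):

    #include <stdint.h>

    /* Population-style variance as assumed for fill_variance(). */
    static int64_t node_variance(int64_t sse, int64_t sum, int count) {
      return sse - (sum * sum) / count;
    }

    /* Leaf: sum = s_avg - d_avg, sse = sum * sum, count = 1, so
     * node_variance(49, 7, 1) == 0, i.e. no variance inside the leaf. */
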
@@ -453,8 +568,8 @@
   // Now go through the entire structure, splitting every block size until
   // we get to one that's got a variance lower than our threshold, or we
   // hit 8x8.
-  if (!set_vt_partitioning(cpi, &vt, BLOCK_64X64,
-                           mi_row, mi_col)) {
+  if (mi_col + 8 > cm->mi_cols || mi_row + 8 > cm->mi_rows ||
+      !set_vt_partitioning(cpi, &vt, BLOCK_64X64, mi_row, mi_col)) {
     for (i = 0; i < 4; ++i) {
       const int x32_idx = ((i & 1) << 2);
       const int y32_idx = ((i >> 1) << 2);
@@ -463,31 +578,15 @@
         for (j = 0; j < 4; ++j) {
           const int x16_idx = ((j & 1) << 1);
           const int y16_idx = ((j >> 1) << 1);
-          // NOTE: This is a temporary hack to disable 8x8 partitions,
-          // since it works really bad - possibly due to a bug
-#define DISABLE_8X8_VAR_BASED_PARTITION
-#ifdef DISABLE_8X8_VAR_BASED_PARTITION
-          if (mi_row + y32_idx + y16_idx + 1 < cm->mi_rows &&
-              mi_row + x32_idx + x16_idx + 1 < cm->mi_cols) {
-            set_block_size(cpi,
-                           (mi_row + y32_idx + y16_idx),
-                           (mi_col + x32_idx + x16_idx),
-                           BLOCK_16X16);
-          } else {
-            for (k = 0; k < 4; ++k) {
-              const int x8_idx = (k & 1);
-              const int y8_idx = (k >> 1);
-              set_block_size(cpi,
-                             (mi_row + y32_idx + y16_idx + y8_idx),
-                             (mi_col + x32_idx + x16_idx + x8_idx),
-                             BLOCK_8X8);
-            }
-          }
-#else
-          if (!set_vt_partitioning(cpi, &vt.split[i].split[j], tile,
+          // NOTE: Since this uses 8x8 downsampling for the variance
+          // calculation we cannot really select block size 8x8 (or even
+          // 8x16/16x8), since we do not have sufficient samples for variance.
+          // For now, 8x8 partition is only set if the variance of the 16x16
+          // block is very high. This is controlled in set_vt_partitioning.
+          if (!set_vt_partitioning(cpi, &vt.split[i].split[j],
                                    BLOCK_16X16,
-                                   (mi_row + y32_idx + y16_idx),
-                                   (mi_col + x32_idx + x16_idx), 2)) {
+                                   mi_row + y32_idx + y16_idx,
+                                   mi_col + x32_idx + x16_idx)) {
             for (k = 0; k < 4; ++k) {
               const int x8_idx = (k & 1);
               const int y8_idx = (k >> 1);
@@ -497,7 +596,6 @@
                              BLOCK_8X8);
             }
           }
-#endif
         }
       }
     }
@@ -684,10 +782,9 @@
 }
 
 static void rd_pick_sb_modes(VP9_COMP *cpi, const TileInfo *const tile,
-                             int mi_row, int mi_col,
-                             int *totalrate, int64_t *totaldist,
+                             int mi_row, int mi_col, RD_COST *rd_cost,
                              BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx,
-                             int64_t best_rd, int block) {
+                             int64_t best_rd) {
   VP9_COMMON *const cm = &cpi->common;
   MACROBLOCK *const x = &cpi->mb;
   MACROBLOCKD *const xd = &x->e_mbd;
@@ -704,19 +801,6 @@
   // Use the lower precision, but faster, 32x32 fdct for mode selection.
   x->use_lp32x32fdct = 1;
 
-  // TODO(JBB): Most other places in the code instead of calling the function
-  // and then checking if its not the first 8x8 we put the check in the
-  // calling function.  Do that here.
-  if (bsize < BLOCK_8X8) {
-    // When ab_index = 0 all sub-blocks are handled, so for ab_index != 0
-    // there is nothing to be done.
-    if (block != 0) {
-      *totalrate = 0;
-      *totaldist = 0;
-      return;
-    }
-  }
-
   set_offsets(cpi, tile, mi_row, mi_col, bsize);
   mbmi = &xd->mi[0].src_mi->mbmi;
   mbmi->sb_type = bsize;
@@ -734,7 +818,17 @@
   // Set to zero to make sure we do not use the previous encoded frame stats
   mbmi->skip = 0;
 
+#if CONFIG_VP9_HIGHBITDEPTH
+  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+    x->source_variance =
+        high_get_sby_perpixel_variance(cpi, &x->plane[0].src, bsize, xd->bd);
+  } else {
+    x->source_variance =
+        get_sby_perpixel_variance(cpi, &x->plane[0].src, bsize);
+  }
+#else
   x->source_variance = get_sby_perpixel_variance(cpi, &x->plane[0].src, bsize);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
 
   // Save rdmult before it might be changed, so it can be restored later.
   orig_rdmult = x->rdmult;
@@ -774,28 +868,33 @@
   // Find best coding mode & reconstruct the MB so it is available
   // as a predictor for MBs that follow in the SB
   if (frame_is_intra_only(cm)) {
-    vp9_rd_pick_intra_mode_sb(cpi, x, totalrate, totaldist, bsize, ctx,
-                              best_rd);
+    vp9_rd_pick_intra_mode_sb(cpi, x, rd_cost, bsize, ctx, best_rd);
   } else {
     if (bsize >= BLOCK_8X8) {
       if (vp9_segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP))
-        vp9_rd_pick_inter_mode_sb_seg_skip(cpi, x, totalrate, totaldist, bsize,
+        vp9_rd_pick_inter_mode_sb_seg_skip(cpi, x, rd_cost, bsize,
                                            ctx, best_rd);
       else
         vp9_rd_pick_inter_mode_sb(cpi, x, tile, mi_row, mi_col,
-                                  totalrate, totaldist, bsize, ctx, best_rd);
+                                  rd_cost, bsize, ctx, best_rd);
     } else {
-      vp9_rd_pick_inter_mode_sub8x8(cpi, x, tile, mi_row, mi_col, totalrate,
-                                    totaldist, bsize, ctx, best_rd);
+      vp9_rd_pick_inter_mode_sub8x8(cpi, x, tile, mi_row, mi_col, rd_cost,
+                                    bsize, ctx, best_rd);
     }
   }
 
+  if (aq_mode == VARIANCE_AQ && rd_cost->rate != INT_MAX) {
+    vp9_clear_system_state();
+    rd_cost->rate = (int)round(rd_cost->rate * rdmult_ratio);
+    rd_cost->rdcost = RDCOST(x->rdmult, x->rddiv, rd_cost->rate, rd_cost->dist);
+  }
+
   x->rdmult = orig_rdmult;
 
-  if (aq_mode == VARIANCE_AQ && *totalrate != INT_MAX) {
-    vp9_clear_system_state();
-    *totalrate = (int)round(*totalrate * rdmult_ratio);
-  }
+  // TODO(jingning): The rate-distortion optimization flow needs to be
+  // refactored to provide proper exit/return handling.
+  if (rd_cost->rate == INT_MAX)
+    rd_cost->rdcost = INT64_MAX;
 }
 
 static void update_stats(VP9_COMMON *cm, const MACROBLOCK *x) {
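
The (rate, dist) pairs threaded through the mode search are folded into the RD_COST struct introduced upstream. A sketch of the struct and its helpers as their usage in this patch implies (field layout assumed, not copied from vp9_rd.h):

    #include <limits.h>
    #include <stdint.h>

    typedef struct {
      int rate;
      int64_t dist;
      int64_t rdcost;
    } RD_COST_SKETCH;

    /* Start of an accumulation: all components zero. */
    static void rd_cost_init(RD_COST_SKETCH *rd) {
      rd->rate = 0;
      rd->dist = 0;
      rd->rdcost = 0;
    }

    /* "No valid mode" sentinel, tested via rate == INT_MAX below. */
    static void rd_cost_reset(RD_COST_SKETCH *rd) {
      rd->rate = INT_MAX;
      rd->dist = INT64_MAX;
      rd->rdcost = INT64_MAX;
    }
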
@@ -925,7 +1024,7 @@
   MACROBLOCK *const x = &cpi->mb;
   MACROBLOCKD *const xd = &x->e_mbd;
 
-  const int bsl = b_width_log2(bsize), hbs = (1 << bsl) / 4;
+  const int bsl = b_width_log2_lookup[bsize], hbs = (1 << bsl) / 4;
   int ctx;
   PARTITION_TYPE partition;
   BLOCK_SIZE subsize = bsize;
@@ -1297,12 +1396,18 @@
   *(xd->mi[0].src_mi) = ctx->mic;
   xd->mi[0].src_mi = &xd->mi[0];
 
-
-  // For in frame adaptive Q, check for reseting the segment_id and updating
-  // the cyclic refresh map.
-  if ((cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ) && seg->enabled) {
-    vp9_cyclic_refresh_update_segment(cpi, &xd->mi[0].src_mi->mbmi,
-                                      mi_row, mi_col, bsize, 1);
+  if (seg->enabled && cpi->oxcf.aq_mode) {
+    // For in frame complexity AQ or variance AQ, copy segment_id from
+    // segmentation_map.
+    if (cpi->oxcf.aq_mode == COMPLEXITY_AQ ||
+        cpi->oxcf.aq_mode == VARIANCE_AQ) {
+      const uint8_t *const map = seg->update_map ? cpi->segmentation_map
+                                                 : cm->last_frame_seg_map;
+      mbmi->segment_id = vp9_get_segment_id(cm, map, bsize, mi_row, mi_col);
+    } else {
+      // Setting segmentation map for cyclic_refresh.
+      vp9_cyclic_refresh_update_segment(cpi, mbmi, mi_row, mi_col, bsize, 1);
+    }
     vp9_init_plane_quantizers(cpi, x);
   }
 
@@ -1348,7 +1453,7 @@
   MACROBLOCK *const x = &cpi->mb;
   MACROBLOCKD *const xd = &x->e_mbd;
 
-  const int bsl = b_width_log2(bsize), hbs = (1 << bsl) / 4;
+  const int bsl = b_width_log2_lookup[bsize], hbs = (1 << bsl) / 4;
   int ctx;
   PARTITION_TYPE partition;
   BLOCK_SIZE subsize;
@@ -1411,17 +1516,16 @@
     update_partition_context(xd, mi_row, mi_col, subsize, bsize);
 }
 
-static void rd_use_partition(VP9_COMP *cpi,
-                             const TileInfo *const tile,
-                             MODE_INFO *mi_8x8,
-                             TOKENEXTRA **tp, int mi_row, int mi_col,
+static void rd_use_partition(VP9_COMP *cpi, const TileInfo *const tile,
+                             MODE_INFO *mi_8x8, TOKENEXTRA **tp,
+                             int mi_row, int mi_col,
                              BLOCK_SIZE bsize, int *rate, int64_t *dist,
                              int do_recon, PC_TREE *pc_tree) {
   VP9_COMMON *const cm = &cpi->common;
   MACROBLOCK *const x = &cpi->mb;
   MACROBLOCKD *const xd = &x->e_mbd;
   const int mis = cm->mi_stride;
-  const int bsl = b_width_log2(bsize);
+  const int bsl = b_width_log2_lookup[bsize];
   const int mi_step = num_4x4_blocks_wide_lookup[bsize] / 2;
   const int bss = (1 << bsl) / 4;
   int i, pl;
@@ -1429,15 +1533,7 @@
   BLOCK_SIZE subsize;
   ENTROPY_CONTEXT l[16 * MAX_MB_PLANE], a[16 * MAX_MB_PLANE];
   PARTITION_CONTEXT sl[8], sa[8];
-  int last_part_rate = INT_MAX;
-  int64_t last_part_dist = INT64_MAX;
-  int64_t last_part_rd = INT64_MAX;
-  int none_rate = INT_MAX;
-  int64_t none_dist = INT64_MAX;
-  int64_t none_rd = INT64_MAX;
-  int chosen_rate = INT_MAX;
-  int64_t chosen_dist = INT64_MAX;
-  int64_t chosen_rd = INT64_MAX;
+  RD_COST last_part_rdc, none_rdc, chosen_rdc;
   BLOCK_SIZE sub_subsize = BLOCK_4X4;
   int splits_below = 0;
   BLOCK_SIZE bs_type = mi_8x8[0].src_mi->mbmi.sb_type;
@@ -1450,6 +1546,10 @@
   assert(num_4x4_blocks_wide_lookup[bsize] ==
          num_4x4_blocks_high_lookup[bsize]);
 
+  vp9_rd_cost_reset(&last_part_rdc);
+  vp9_rd_cost_reset(&none_rdc);
+  vp9_rd_cost_reset(&chosen_rdc);
+
   partition = partition_lookup[bsl][bs_type];
   subsize = get_subsize(bsize, partition);
 
@@ -1483,14 +1583,15 @@
         mi_row + (mi_step >> 1) < cm->mi_rows &&
         mi_col + (mi_step >> 1) < cm->mi_cols) {
       pc_tree->partitioning = PARTITION_NONE;
-      rd_pick_sb_modes(cpi, tile, mi_row, mi_col, &none_rate, &none_dist, bsize,
-                       ctx, INT64_MAX, 0);
+      rd_pick_sb_modes(cpi, tile, mi_row, mi_col, &none_rdc, bsize,
+                       ctx, INT64_MAX);
 
       pl = partition_plane_context(xd, mi_row, mi_col, bsize);
 
-      if (none_rate < INT_MAX) {
-        none_rate += cpi->partition_cost[pl][PARTITION_NONE];
-        none_rd = RDCOST(x->rdmult, x->rddiv, none_rate, none_dist);
+      if (none_rdc.rate < INT_MAX) {
+        none_rdc.rate += cpi->partition_cost[pl][PARTITION_NONE];
+        none_rdc.rdcost = RDCOST(x->rdmult, x->rddiv, none_rdc.rate,
+                                 none_rdc.dist);
       }
 
       restore_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize);
@@ -1501,84 +1602,81 @@
 
   switch (partition) {
     case PARTITION_NONE:
-      rd_pick_sb_modes(cpi, tile, mi_row, mi_col, &last_part_rate,
-                       &last_part_dist, bsize, ctx, INT64_MAX, 0);
+      rd_pick_sb_modes(cpi, tile, mi_row, mi_col, &last_part_rdc,
+                       bsize, ctx, INT64_MAX);
       break;
     case PARTITION_HORZ:
-      rd_pick_sb_modes(cpi, tile, mi_row, mi_col, &last_part_rate,
-                       &last_part_dist, subsize, &pc_tree->horizontal[0],
-                       INT64_MAX, 0);
-      if (last_part_rate != INT_MAX &&
+      rd_pick_sb_modes(cpi, tile, mi_row, mi_col, &last_part_rdc,
+                       subsize, &pc_tree->horizontal[0],
+                       INT64_MAX);
+      if (last_part_rdc.rate != INT_MAX &&
           bsize >= BLOCK_8X8 && mi_row + (mi_step >> 1) < cm->mi_rows) {
-        int rt = 0;
-        int64_t dt = 0;
+        RD_COST tmp_rdc;
         PICK_MODE_CONTEXT *ctx = &pc_tree->horizontal[0];
+        vp9_rd_cost_init(&tmp_rdc);
         update_state(cpi, ctx, mi_row, mi_col, subsize, 0);
         encode_superblock(cpi, tp, 0, mi_row, mi_col, subsize, ctx);
-        rd_pick_sb_modes(cpi, tile, mi_row + (mi_step >> 1), mi_col, &rt, &dt,
-                         subsize, &pc_tree->horizontal[1], INT64_MAX, 1);
-        if (rt == INT_MAX || dt == INT64_MAX) {
-          last_part_rate = INT_MAX;
-          last_part_dist = INT64_MAX;
+        rd_pick_sb_modes(cpi, tile, mi_row + (mi_step >> 1), mi_col, &tmp_rdc,
+                         subsize, &pc_tree->horizontal[1], INT64_MAX);
+        if (tmp_rdc.rate == INT_MAX || tmp_rdc.dist == INT64_MAX) {
+          vp9_rd_cost_reset(&last_part_rdc);
           break;
         }
-
-        last_part_rate += rt;
-        last_part_dist += dt;
+        last_part_rdc.rate += tmp_rdc.rate;
+        last_part_rdc.dist += tmp_rdc.dist;
+        last_part_rdc.rdcost += tmp_rdc.rdcost;
       }
       break;
     case PARTITION_VERT:
-      rd_pick_sb_modes(cpi, tile, mi_row, mi_col, &last_part_rate,
-                       &last_part_dist, subsize, &pc_tree->vertical[0],
-                       INT64_MAX, 0);
-      if (last_part_rate != INT_MAX &&
+      rd_pick_sb_modes(cpi, tile, mi_row, mi_col, &last_part_rdc,
+                       subsize, &pc_tree->vertical[0], INT64_MAX);
+      if (last_part_rdc.rate != INT_MAX &&
           bsize >= BLOCK_8X8 && mi_col + (mi_step >> 1) < cm->mi_cols) {
-        int rt = 0;
-        int64_t dt = 0;
+        RD_COST tmp_rdc;
         PICK_MODE_CONTEXT *ctx = &pc_tree->vertical[0];
+        vp9_rd_cost_init(&tmp_rdc);
         update_state(cpi, ctx, mi_row, mi_col, subsize, 0);
         encode_superblock(cpi, tp, 0, mi_row, mi_col, subsize, ctx);
-        rd_pick_sb_modes(cpi, tile, mi_row, mi_col + (mi_step >> 1), &rt, &dt,
+        rd_pick_sb_modes(cpi, tile, mi_row, mi_col + (mi_step >> 1), &tmp_rdc,
                          subsize, &pc_tree->vertical[bsize > BLOCK_8X8],
-                         INT64_MAX, 1);
-        if (rt == INT_MAX || dt == INT64_MAX) {
-          last_part_rate = INT_MAX;
-          last_part_dist = INT64_MAX;
+                         INT64_MAX);
+        if (tmp_rdc.rate == INT_MAX || tmp_rdc.dist == INT64_MAX) {
+          vp9_rd_cost_reset(&last_part_rdc);
           break;
         }
-        last_part_rate += rt;
-        last_part_dist += dt;
+        last_part_rdc.rate += tmp_rdc.rate;
+        last_part_rdc.dist += tmp_rdc.dist;
+        last_part_rdc.rdcost += tmp_rdc.rdcost;
       }
       break;
     case PARTITION_SPLIT:
       if (bsize == BLOCK_8X8) {
-        rd_pick_sb_modes(cpi, tile, mi_row, mi_col, &last_part_rate,
-                         &last_part_dist, subsize, pc_tree->leaf_split[0],
-                         INT64_MAX, 0);
+        rd_pick_sb_modes(cpi, tile, mi_row, mi_col, &last_part_rdc,
+                         subsize, pc_tree->leaf_split[0], INT64_MAX);
         break;
       }
-      last_part_rate = 0;
-      last_part_dist = 0;
+      last_part_rdc.rate = 0;
+      last_part_rdc.dist = 0;
+      last_part_rdc.rdcost = 0;
       for (i = 0; i < 4; i++) {
         int x_idx = (i & 1) * (mi_step >> 1);
         int y_idx = (i >> 1) * (mi_step >> 1);
         int jj = i >> 1, ii = i & 0x01;
-        int rt;
-        int64_t dt;
-
+        RD_COST tmp_rdc;
         if ((mi_row + y_idx >= cm->mi_rows) || (mi_col + x_idx >= cm->mi_cols))
           continue;
 
+        vp9_rd_cost_init(&tmp_rdc);
         rd_use_partition(cpi, tile, mi_8x8 + jj * bss * mis + ii * bss, tp,
-                         mi_row + y_idx, mi_col + x_idx, subsize, &rt, &dt,
+                         mi_row + y_idx, mi_col + x_idx, subsize,
+                         &tmp_rdc.rate, &tmp_rdc.dist,
                          i != 3, pc_tree->split[i]);
-        if (rt == INT_MAX || dt == INT64_MAX) {
-          last_part_rate = INT_MAX;
-          last_part_dist = INT64_MAX;
+        if (tmp_rdc.rate == INT_MAX || tmp_rdc.dist == INT64_MAX) {
+          vp9_rd_cost_reset(&last_part_rdc);
           break;
         }
-        last_part_rate += rt;
-        last_part_dist += dt;
+        last_part_rdc.rate += tmp_rdc.rate;
+        last_part_rdc.dist += tmp_rdc.dist;
       }
       break;
     default:
@@ -1587,9 +1685,10 @@
   }
 
   pl = partition_plane_context(xd, mi_row, mi_col, bsize);
-  if (last_part_rate < INT_MAX) {
-    last_part_rate += cpi->partition_cost[pl][partition];
-    last_part_rd = RDCOST(x->rdmult, x->rddiv, last_part_rate, last_part_dist);
+  if (last_part_rdc.rate < INT_MAX) {
+    last_part_rdc.rate += cpi->partition_cost[pl][partition];
+    last_part_rdc.rdcost = RDCOST(x->rdmult, x->rddiv,
+                                  last_part_rdc.rate, last_part_rdc.dist);
   }
 
   if (do_partition_search
@@ -1601,8 +1700,8 @@
       && (mi_col + mi_step < cm->mi_cols ||
           mi_col + (mi_step >> 1) == cm->mi_cols)) {
     BLOCK_SIZE split_subsize = get_subsize(bsize, PARTITION_SPLIT);
-    chosen_rate = 0;
-    chosen_dist = 0;
+    chosen_rdc.rate = 0;
+    chosen_rdc.dist = 0;
     restore_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize);
     pc_tree->partitioning = PARTITION_SPLIT;
 
@@ -1610,8 +1709,7 @@
     for (i = 0; i < 4; i++) {
       int x_idx = (i & 1) * (mi_step >> 1);
       int y_idx = (i >> 1) * (mi_step >> 1);
-      int rt = 0;
-      int64_t dt = 0;
+      RD_COST tmp_rdc;
       ENTROPY_CONTEXT l[16 * MAX_MB_PLANE], a[16 * MAX_MB_PLANE];
       PARTITION_CONTEXT sl[8], sa[8];
 
@@ -1620,20 +1718,18 @@
 
       save_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize);
       pc_tree->split[i]->partitioning = PARTITION_NONE;
-      rd_pick_sb_modes(cpi, tile, mi_row + y_idx, mi_col + x_idx, &rt, &dt,
-                       split_subsize, &pc_tree->split[i]->none,
-                       INT64_MAX, i);
+      rd_pick_sb_modes(cpi, tile, mi_row + y_idx, mi_col + x_idx, &tmp_rdc,
+                       split_subsize, &pc_tree->split[i]->none, INT64_MAX);
 
       restore_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize);
 
-      if (rt == INT_MAX || dt == INT64_MAX) {
-        chosen_rate = INT_MAX;
-        chosen_dist = INT64_MAX;
+      if (tmp_rdc.rate == INT_MAX || tmp_rdc.dist == INT64_MAX) {
+        vp9_rd_cost_reset(&chosen_rdc);
         break;
       }
 
-      chosen_rate += rt;
-      chosen_dist += dt;
+      chosen_rdc.rate += tmp_rdc.rate;
+      chosen_rdc.dist += tmp_rdc.dist;
 
       if (i != 3)
         encode_sb(cpi, tile, tp,  mi_row + y_idx, mi_col + x_idx, 0,
@@ -1641,38 +1737,36 @@
 
       pl = partition_plane_context(xd, mi_row + y_idx, mi_col + x_idx,
                                    split_subsize);
-      chosen_rate += cpi->partition_cost[pl][PARTITION_NONE];
+      chosen_rdc.rate += cpi->partition_cost[pl][PARTITION_NONE];
     }
     pl = partition_plane_context(xd, mi_row, mi_col, bsize);
-    if (chosen_rate < INT_MAX) {
-      chosen_rate += cpi->partition_cost[pl][PARTITION_SPLIT];
-      chosen_rd = RDCOST(x->rdmult, x->rddiv, chosen_rate, chosen_dist);
+    if (chosen_rdc.rate < INT_MAX) {
+      chosen_rdc.rate += cpi->partition_cost[pl][PARTITION_SPLIT];
+      chosen_rdc.rdcost = RDCOST(x->rdmult, x->rddiv,
+                                 chosen_rdc.rate, chosen_rdc.dist);
     }
   }
 
   // If last_part is better set the partitioning to that.
-  if (last_part_rd < chosen_rd) {
+  if (last_part_rdc.rdcost < chosen_rdc.rdcost) {
     mi_8x8[0].src_mi->mbmi.sb_type = bsize;
     if (bsize >= BLOCK_8X8)
       pc_tree->partitioning = partition;
-    chosen_rate = last_part_rate;
-    chosen_dist = last_part_dist;
-    chosen_rd = last_part_rd;
+    chosen_rdc = last_part_rdc;
   }
   // If none was better set the partitioning to that.
-  if (none_rd < chosen_rd) {
+  if (none_rdc.rdcost < chosen_rdc.rdcost) {
     if (bsize >= BLOCK_8X8)
       pc_tree->partitioning = PARTITION_NONE;
-    chosen_rate = none_rate;
-    chosen_dist = none_dist;
+    chosen_rdc = none_rdc;
   }
 
   restore_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize);
 
   // We must have chosen a partitioning and encoding or we'll fail later on.
   // No other opportunities for success.
-  if ( bsize == BLOCK_64X64)
-    assert(chosen_rate < INT_MAX && chosen_dist < INT64_MAX);
+  if (bsize == BLOCK_64X64)
+    assert(chosen_rdc.rate < INT_MAX && chosen_rdc.dist < INT64_MAX);
 
   if (do_recon) {
     int output_enabled = (bsize == BLOCK_64X64);
@@ -1682,18 +1776,18 @@
     // closer to the target.
     if ((cpi->oxcf.aq_mode == COMPLEXITY_AQ) && cm->seg.update_map) {
       vp9_select_in_frame_q_segment(cpi, mi_row, mi_col,
-                                    output_enabled, chosen_rate);
+                                    output_enabled, chosen_rdc.rate);
     }
 
     if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ)
       vp9_cyclic_refresh_set_rate_and_dist_sb(cpi->cyclic_refresh,
-                                              chosen_rate, chosen_dist);
+                                              chosen_rdc.rate, chosen_rdc.dist);
     encode_sb(cpi, tile, tp, mi_row, mi_col, output_enabled, bsize,
               pc_tree);
   }
 
-  *rate = chosen_rate;
-  *dist = chosen_dist;
+  *rate = chosen_rdc.rate;
+  *dist = chosen_rdc.dist;
 }
 
 static const BLOCK_SIZE min_partition_size[BLOCK_SIZES] = {
@@ -1863,7 +1957,7 @@
   int bh, bw;
   BLOCK_SIZE min_size = BLOCK_32X32;
   BLOCK_SIZE max_size = BLOCK_8X8;
-  int bsl = mi_width_log2(BLOCK_64X64);
+  int bsl = mi_width_log2_lookup[BLOCK_64X64];
   const int search_range_ctrl = (((mi_row + mi_col) >> bsl) +
                        get_chessboard_index(cm->current_video_frame)) & 0x1;
   // Trap case where we do not have a prediction.
@@ -2022,10 +2116,9 @@
 // unlikely to be selected depending on previous rate-distortion optimization
 // results, for encoding speed-up.
 static void rd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile,
-                              TOKENEXTRA **tp, int mi_row,
-                              int mi_col, BLOCK_SIZE bsize, int *rate,
-                              int64_t *dist, int64_t best_rd,
-                              PC_TREE *pc_tree) {
+                              TOKENEXTRA **tp, int mi_row, int mi_col,
+                              BLOCK_SIZE bsize, RD_COST *rd_cost,
+                              int64_t best_rd, PC_TREE *pc_tree) {
   VP9_COMMON *const cm = &cpi->common;
   MACROBLOCK *const x = &cpi->mb;
   MACROBLOCKD *const xd = &x->e_mbd;
@@ -2036,9 +2129,7 @@
   PICK_MODE_CONTEXT *ctx = &pc_tree->none;
   int i, pl;
   BLOCK_SIZE subsize;
-  int this_rate, sum_rate = 0, best_rate = INT_MAX;
-  int64_t this_dist, sum_dist = 0, best_dist = INT64_MAX;
-  int64_t sum_rd = 0;
+  RD_COST this_rdc, sum_rdc, best_rdc;
   int do_split = bsize >= BLOCK_8X8;
   int do_rect = 1;
 
@@ -2066,6 +2157,11 @@
   assert(num_8x8_blocks_wide_lookup[bsize] ==
              num_8x8_blocks_high_lookup[bsize]);
 
+  vp9_rd_cost_init(&this_rdc);
+  vp9_rd_cost_init(&sum_rdc);
+  vp9_rd_cost_reset(&best_rdc);
+  best_rdc.rdcost = best_rd;
+
   set_offsets(cpi, tile, mi_row, mi_col, bsize);
 
   if (bsize == BLOCK_16X16 && cpi->oxcf.aq_mode)
@@ -2157,28 +2253,29 @@
 
   // PARTITION_NONE
   if (partition_none_allowed) {
-    rd_pick_sb_modes(cpi, tile, mi_row, mi_col, &this_rate, &this_dist, bsize,
-                     ctx, best_rd, 0);
-    if (this_rate != INT_MAX) {
+    rd_pick_sb_modes(cpi, tile, mi_row, mi_col, &this_rdc, bsize, ctx,
+                     best_rdc.rdcost);
+    if (this_rdc.rate != INT_MAX) {
       if (bsize >= BLOCK_8X8) {
         pl = partition_plane_context(xd, mi_row, mi_col, bsize);
-        this_rate += cpi->partition_cost[pl][PARTITION_NONE];
+        this_rdc.rate += cpi->partition_cost[pl][PARTITION_NONE];
+        this_rdc.rdcost = RDCOST(x->rdmult, x->rddiv,
+                                 this_rdc.rate, this_rdc.dist);
       }
-      sum_rd = RDCOST(x->rdmult, x->rddiv, this_rate, this_dist);
 
-      if (sum_rd < best_rd) {
+      if (this_rdc.rdcost < best_rdc.rdcost) {
         int64_t dist_breakout_thr = cpi->sf.partition_search_breakout_dist_thr;
         int rate_breakout_thr = cpi->sf.partition_search_breakout_rate_thr;
 
-        best_rate = this_rate;
-        best_dist = this_dist;
-        best_rd = sum_rd;
+        best_rdc = this_rdc;
         if (bsize >= BLOCK_8X8)
           pc_tree->partitioning = PARTITION_NONE;
 
         // Adjust dist breakout threshold according to the partition size.
-        dist_breakout_thr >>= 8 - (b_width_log2(bsize) +
-            b_height_log2(bsize));
+        dist_breakout_thr >>= 8 - (b_width_log2_lookup[bsize] +
+            b_height_log2_lookup[bsize]);
+
+        rate_breakout_thr *= num_pels_log2_lookup[bsize];
 
         // If all y, u, v transform blocks in this partition are skippable, and
         // the dist & rate are within the thresholds, the partition search is
@@ -2186,8 +2283,8 @@
         // The dist & rate thresholds are set to 0 at speed 0 to disable the
         // early termination at that speed.
         if (!x->e_mbd.lossless &&
-            (ctx->skippable && best_dist < dist_breakout_thr &&
-            best_rate < rate_breakout_thr)) {
+            (ctx->skippable && best_rdc.dist < dist_breakout_thr &&
+            best_rdc.rate < rate_breakout_thr)) {
           do_split = 0;
           do_rect = 0;
         }
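
The breakout thresholds above are specified at the 64x64 scale and adapted per partition: the shift 8 - (b_width_log2 + b_height_log2) leaves the distortion threshold untouched for 64x64 blocks and divides it by up to 64 for 8x8, while the rate threshold is multiplied by num_pels_log2_lookup[bsize]. Worked values, assuming the usual lookup tables:

    /* dist_breakout_thr >>= 8 - (b_width_log2 + b_height_log2):    */
    /* BLOCK_64X64: 8 - (4 + 4) = 0  ->  threshold unchanged        */
    /* BLOCK_16X16: 8 - (2 + 2) = 4  ->  threshold / 16             */
    /* BLOCK_8X8:   8 - (1 + 1) = 6  ->  threshold / 64             */
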
@@ -2247,7 +2344,6 @@
     store_pred_mv(x, ctx);
 
   // PARTITION_SPLIT
-  sum_rd = 0;
   // TODO(jingning): use the motion vectors given by the above search as
   // the starting point of motion search in the following partition type check.
   if (do_split) {
@@ -2257,14 +2353,12 @@
       if (cpi->sf.adaptive_pred_interp_filter && partition_none_allowed)
         pc_tree->leaf_split[0]->pred_interp_filter =
             ctx->mic.mbmi.interp_filter;
-      rd_pick_sb_modes(cpi, tile, mi_row, mi_col, &sum_rate, &sum_dist, subsize,
-                       pc_tree->leaf_split[0], best_rd, 0);
-      if (sum_rate == INT_MAX)
-        sum_rd = INT64_MAX;
-      else
-        sum_rd = RDCOST(x->rdmult, x->rddiv, sum_rate, sum_dist);
+      rd_pick_sb_modes(cpi, tile, mi_row, mi_col, &sum_rdc, subsize,
+                       pc_tree->leaf_split[0], best_rdc.rdcost);
+      if (sum_rdc.rate == INT_MAX)
+        sum_rdc.rdcost = INT64_MAX;
     } else {
-      for (i = 0; i < 4 && sum_rd < best_rd; ++i) {
+      for (i = 0; i < 4 && sum_rdc.rdcost < best_rdc.rdcost; ++i) {
       const int x_idx = (i & 1) * mi_step;
       const int y_idx = (i >> 1) * mi_step;
 
@@ -2276,28 +2370,28 @@
 
         pc_tree->split[i]->index = i;
         rd_pick_partition(cpi, tile, tp, mi_row + y_idx, mi_col + x_idx,
-                          subsize, &this_rate, &this_dist,
-                          best_rd - sum_rd, pc_tree->split[i]);
+                          subsize, &this_rdc,
+                          best_rdc.rdcost - sum_rdc.rdcost, pc_tree->split[i]);
 
-        if (this_rate == INT_MAX) {
-          sum_rd = INT64_MAX;
+        if (this_rdc.rate == INT_MAX) {
+          sum_rdc.rdcost = INT64_MAX;
+          break;
         } else {
-          sum_rate += this_rate;
-          sum_dist += this_dist;
-          sum_rd = RDCOST(x->rdmult, x->rddiv, sum_rate, sum_dist);
+          sum_rdc.rate += this_rdc.rate;
+          sum_rdc.dist += this_rdc.dist;
+          sum_rdc.rdcost += this_rdc.rdcost;
         }
       }
     }
 
-    if (sum_rd < best_rd && i == 4) {
+    if (sum_rdc.rdcost < best_rdc.rdcost && i == 4) {
       pl = partition_plane_context(xd, mi_row, mi_col, bsize);
-      sum_rate += cpi->partition_cost[pl][PARTITION_SPLIT];
-      sum_rd = RDCOST(x->rdmult, x->rddiv, sum_rate, sum_dist);
+      sum_rdc.rate += cpi->partition_cost[pl][PARTITION_SPLIT];
+      sum_rdc.rdcost = RDCOST(x->rdmult, x->rddiv,
+                              sum_rdc.rate, sum_rdc.dist);
 
-      if (sum_rd < best_rd) {
-        best_rate = sum_rate;
-        best_dist = sum_dist;
-        best_rd = sum_rd;
+      if (sum_rdc.rdcost < best_rdc.rdcost) {
+        best_rdc = sum_rdc;
         pc_tree->partitioning = PARTITION_SPLIT;
       }
     } else {
@@ -2318,11 +2412,11 @@
         partition_none_allowed)
       pc_tree->horizontal[0].pred_interp_filter =
           ctx->mic.mbmi.interp_filter;
-    rd_pick_sb_modes(cpi, tile, mi_row, mi_col, &sum_rate, &sum_dist, subsize,
-                     &pc_tree->horizontal[0], best_rd, 0);
-    sum_rd = RDCOST(x->rdmult, x->rddiv, sum_rate, sum_dist);
+    rd_pick_sb_modes(cpi, tile, mi_row, mi_col, &sum_rdc, subsize,
+                     &pc_tree->horizontal[0], best_rdc.rdcost);
 
-    if (sum_rd < best_rd && mi_row + mi_step < cm->mi_rows) {
+    if (sum_rdc.rdcost < best_rdc.rdcost && mi_row + mi_step < cm->mi_rows &&
+        bsize > BLOCK_8X8) {
       PICK_MODE_CONTEXT *ctx = &pc_tree->horizontal[0];
       update_state(cpi, ctx, mi_row, mi_col, subsize, 0);
       encode_superblock(cpi, tp, 0, mi_row, mi_col, subsize, ctx);
@@ -2333,25 +2427,24 @@
           partition_none_allowed)
         pc_tree->horizontal[1].pred_interp_filter =
             ctx->mic.mbmi.interp_filter;
-      rd_pick_sb_modes(cpi, tile, mi_row + mi_step, mi_col, &this_rate,
-                       &this_dist, subsize, &pc_tree->horizontal[1],
-                       best_rd - sum_rd, 1);
-      if (this_rate == INT_MAX) {
-        sum_rd = INT64_MAX;
+      rd_pick_sb_modes(cpi, tile, mi_row + mi_step, mi_col, &this_rdc,
+                       subsize, &pc_tree->horizontal[1],
+                       best_rdc.rdcost - sum_rdc.rdcost);
+      if (this_rdc.rate == INT_MAX) {
+        sum_rdc.rdcost = INT64_MAX;
       } else {
-        sum_rate += this_rate;
-        sum_dist += this_dist;
-        sum_rd = RDCOST(x->rdmult, x->rddiv, sum_rate, sum_dist);
+        sum_rdc.rate += this_rdc.rate;
+        sum_rdc.dist += this_rdc.dist;
+        sum_rdc.rdcost += this_rdc.rdcost;
       }
     }
-    if (sum_rd < best_rd) {
+
+    if (sum_rdc.rdcost < best_rdc.rdcost) {
       pl = partition_plane_context(xd, mi_row, mi_col, bsize);
-      sum_rate += cpi->partition_cost[pl][PARTITION_HORZ];
-      sum_rd = RDCOST(x->rdmult, x->rddiv, sum_rate, sum_dist);
-      if (sum_rd < best_rd) {
-        best_rd = sum_rd;
-        best_rate = sum_rate;
-        best_dist = sum_dist;
+      sum_rdc.rate += cpi->partition_cost[pl][PARTITION_HORZ];
+      sum_rdc.rdcost = RDCOST(x->rdmult, x->rddiv, sum_rdc.rate, sum_rdc.dist);
+      if (sum_rdc.rdcost < best_rdc.rdcost) {
+        best_rdc = sum_rdc;
         pc_tree->partitioning = PARTITION_HORZ;
       }
     }
@@ -2367,10 +2460,10 @@
         partition_none_allowed)
       pc_tree->vertical[0].pred_interp_filter =
           ctx->mic.mbmi.interp_filter;
-    rd_pick_sb_modes(cpi, tile, mi_row, mi_col, &sum_rate, &sum_dist, subsize,
-                     &pc_tree->vertical[0], best_rd, 0);
-    sum_rd = RDCOST(x->rdmult, x->rddiv, sum_rate, sum_dist);
-    if (sum_rd < best_rd && mi_col + mi_step < cm->mi_cols) {
+    rd_pick_sb_modes(cpi, tile, mi_row, mi_col, &sum_rdc, subsize,
+                     &pc_tree->vertical[0], best_rdc.rdcost);
+    if (sum_rdc.rdcost < best_rdc.rdcost && mi_col + mi_step < cm->mi_cols &&
+        bsize > BLOCK_8X8) {
       update_state(cpi, &pc_tree->vertical[0], mi_row, mi_col, subsize, 0);
       encode_superblock(cpi, tp, 0, mi_row, mi_col, subsize,
                         &pc_tree->vertical[0]);
@@ -2381,26 +2474,24 @@
           partition_none_allowed)
         pc_tree->vertical[1].pred_interp_filter =
             ctx->mic.mbmi.interp_filter;
-      rd_pick_sb_modes(cpi, tile, mi_row, mi_col + mi_step, &this_rate,
-                       &this_dist, subsize,
-                       &pc_tree->vertical[1], best_rd - sum_rd,
-                       1);
-      if (this_rate == INT_MAX) {
-        sum_rd = INT64_MAX;
+      rd_pick_sb_modes(cpi, tile, mi_row, mi_col + mi_step, &this_rdc, subsize,
+                       &pc_tree->vertical[1], best_rdc.rdcost - sum_rdc.rdcost);
+      if (this_rdc.rate == INT_MAX) {
+        sum_rdc.rdcost = INT64_MAX;
       } else {
-        sum_rate += this_rate;
-        sum_dist += this_dist;
-        sum_rd = RDCOST(x->rdmult, x->rddiv, sum_rate, sum_dist);
+        sum_rdc.rate += this_rdc.rate;
+        sum_rdc.dist += this_rdc.dist;
+        sum_rdc.rdcost += this_rdc.rdcost;
       }
     }
-    if (sum_rd < best_rd) {
+
+    if (sum_rdc.rdcost < best_rdc.rdcost) {
       pl = partition_plane_context(xd, mi_row, mi_col, bsize);
-      sum_rate += cpi->partition_cost[pl][PARTITION_VERT];
-      sum_rd = RDCOST(x->rdmult, x->rddiv, sum_rate, sum_dist);
-      if (sum_rd < best_rd) {
-        best_rate = sum_rate;
-        best_dist = sum_dist;
-        best_rd = sum_rd;
+      sum_rdc.rate += cpi->partition_cost[pl][PARTITION_VERT];
+      sum_rdc.rdcost = RDCOST(x->rdmult, x->rddiv,
+                              sum_rdc.rate, sum_rdc.dist);
+      if (sum_rdc.rdcost < best_rdc.rdcost) {
+        best_rdc = sum_rdc;
         pc_tree->partitioning = PARTITION_VERT;
       }
     }
@@ -2412,10 +2503,11 @@
   // point.  This code should be refactored so that the duplicate
   // checks occur in some sub function and thus are used...
   (void) best_rd;
-  *rate = best_rate;
-  *dist = best_dist;
+  *rd_cost = best_rdc;
 
-  if (best_rate < INT_MAX && best_dist < INT64_MAX && pc_tree->index != 3) {
+
+  if (best_rdc.rate < INT_MAX && best_rdc.dist < INT64_MAX &&
+      pc_tree->index != 3) {
     int output_enabled = (bsize == BLOCK_64X64);
 
     // Check the projected output rate for this SB against it's target
@@ -2423,18 +2515,18 @@
     // closer to the target.
     if ((cpi->oxcf.aq_mode == COMPLEXITY_AQ) && cm->seg.update_map)
       vp9_select_in_frame_q_segment(cpi, mi_row, mi_col, output_enabled,
-                                    best_rate);
+                                    best_rdc.rate);
     if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ)
       vp9_cyclic_refresh_set_rate_and_dist_sb(cpi->cyclic_refresh,
-                                              best_rate, best_dist);
+                                              best_rdc.rate, best_rdc.dist);
 
     encode_sb(cpi, tile, tp, mi_row, mi_col, output_enabled, bsize, pc_tree);
   }
 
   if (bsize == BLOCK_64X64) {
     assert(tp_orig < *tp);
-    assert(best_rate < INT_MAX);
-    assert(best_dist < INT64_MAX);
+    assert(best_rdc.rate < INT_MAX);
+    assert(best_rdc.dist < INT64_MAX);
   } else {
     assert(tp_orig == *tp);
   }
@@ -2456,9 +2548,16 @@
        mi_col += MI_BLOCK_SIZE) {
     int dummy_rate;
     int64_t dummy_dist;
-
+    RD_COST dummy_rdc;
     int i;
 
+    const int idx_str = cm->mi_stride * mi_row + mi_col;
+    MODE_INFO *mi = cm->mi + idx_str;
+    MODE_INFO *prev_mi = NULL;
+
+    if (cm->frame_type != KEY_FRAME)
+      prev_mi = (cm->prev_mip + cm->mi_stride + 1 + idx_str)->src_mi;
+
     if (sf->adaptive_pred_interp_filter) {
       for (i = 0; i < 64; ++i)
         cpi->leaf_tree[i].pred_interp_filter = SWITCHABLE;
@@ -2477,71 +2576,44 @@
     // TODO(yunqingwang): use_lastframe_partitioning is no longer used in
     // good-quality encoding. Evaluate it for real-time encoding later to
     // decide whether it can be removed too, and then do the code cleanup.
-    if ((sf->partition_search_type == SEARCH_PARTITION &&
-         sf->use_lastframe_partitioning) ||
-         sf->partition_search_type == FIXED_PARTITION ||
-         sf->partition_search_type == VAR_BASED_PARTITION ||
-         sf->partition_search_type == VAR_BASED_FIXED_PARTITION) {
-      const int idx_str = cm->mi_stride * mi_row + mi_col;
-      MODE_INFO *mi = cm->mi + idx_str;
-      MODE_INFO *prev_mi = (cm->prev_mip + cm->mi_stride + 1 + idx_str)->src_mi;
-      cpi->mb.source_variance = UINT_MAX;
-      if (sf->partition_search_type == FIXED_PARTITION) {
-        set_offsets(cpi, tile, mi_row, mi_col, BLOCK_64X64);
-        set_fixed_partitioning(cpi, tile, mi, mi_row, mi_col,
-                               sf->always_this_block_size);
-        rd_use_partition(cpi, tile, mi, tp, mi_row, mi_col, BLOCK_64X64,
-                         &dummy_rate, &dummy_dist, 1, cpi->pc_root);
-      } else if (cpi->skippable_frame ||
-                 sf->partition_search_type == VAR_BASED_FIXED_PARTITION) {
-        BLOCK_SIZE bsize;
-        set_offsets(cpi, tile, mi_row, mi_col, BLOCK_64X64);
-        bsize = get_rd_var_based_fixed_partition(cpi, mi_row, mi_col);
-        set_fixed_partitioning(cpi, tile, mi, mi_row, mi_col, bsize);
-        rd_use_partition(cpi, tile, mi, tp, mi_row, mi_col, BLOCK_64X64,
-                         &dummy_rate, &dummy_dist, 1, cpi->pc_root);
-      } else if (sf->partition_search_type == VAR_BASED_PARTITION) {
-        choose_partitioning(cpi, tile, mi_row, mi_col);
-        rd_use_partition(cpi, tile, mi, tp, mi_row, mi_col, BLOCK_64X64,
-                         &dummy_rate, &dummy_dist, 1, cpi->pc_root);
-      } else {
-        GF_GROUP * gf_grp = &cpi->twopass.gf_group;
-        int last_was_mid_sequence_overlay = 0;
-        if ((cpi->oxcf.pass == 2) && (gf_grp->index)) {
-          if (gf_grp->update_type[gf_grp->index - 1] == OVERLAY_UPDATE)
-            last_was_mid_sequence_overlay = 1;
-        }
-        if ((cpi->rc.frames_since_key
-            % sf->last_partitioning_redo_frequency) == 0
-            || last_was_mid_sequence_overlay
-            || cm->prev_mi == 0
-            || cm->show_frame == 0
-            || cm->frame_type == KEY_FRAME
-            || cpi->rc.is_src_frame_alt_ref
-            || ((sf->use_lastframe_partitioning ==
-                 LAST_FRAME_PARTITION_LOW_MOTION) &&
-                 sb_has_motion(cm, prev_mi, sf->lf_motion_threshold))) {
-          // If required set upper and lower partition size limits
-          if (sf->auto_min_max_partition_size) {
-            set_offsets(cpi, tile, mi_row, mi_col, BLOCK_64X64);
-            rd_auto_partition_range(cpi, tile, mi_row, mi_col,
-                                    &sf->min_partition_size,
-                                    &sf->max_partition_size);
-          }
-          rd_pick_partition(cpi, tile, tp, mi_row, mi_col, BLOCK_64X64,
-                            &dummy_rate, &dummy_dist, INT64_MAX,
-                            cpi->pc_root);
-        } else {
-          if (sf->constrain_copy_partition &&
-              sb_has_motion(cm, prev_mi, sf->lf_motion_threshold))
-            constrain_copy_partitioning(cpi, tile, mi, prev_mi,
-                                        mi_row, mi_col, BLOCK_16X16);
-          else
-            copy_partitioning(cm, mi, prev_mi);
-          rd_use_partition(cpi, tile, mi, tp, mi_row, mi_col, BLOCK_64X64,
-                           &dummy_rate, &dummy_dist, 1, cpi->pc_root);
-        }
-      }
+    cpi->mb.source_variance = UINT_MAX;
+    if (sf->partition_search_type == FIXED_PARTITION) {
+      set_offsets(cpi, tile, mi_row, mi_col, BLOCK_64X64);
+      set_fixed_partitioning(cpi, tile, mi, mi_row, mi_col,
+                             sf->always_this_block_size);
+      rd_use_partition(cpi, tile, mi, tp, mi_row, mi_col, BLOCK_64X64,
+                       &dummy_rate, &dummy_dist, 1, cpi->pc_root);
+    } else if (cpi->partition_search_skippable_frame) {
+      BLOCK_SIZE bsize;
+      set_offsets(cpi, tile, mi_row, mi_col, BLOCK_64X64);
+      bsize = get_rd_var_based_fixed_partition(cpi, mi_row, mi_col);
+      set_fixed_partitioning(cpi, tile, mi, mi_row, mi_col, bsize);
+      rd_use_partition(cpi, tile, mi, tp, mi_row, mi_col, BLOCK_64X64,
+                       &dummy_rate, &dummy_dist, 1, cpi->pc_root);
+    } else if (sf->partition_search_type == VAR_BASED_PARTITION &&
+               cm->frame_type != KEY_FRAME) {
+      choose_partitioning(cpi, tile, mi_row, mi_col);
+      rd_use_partition(cpi, tile, mi, tp, mi_row, mi_col, BLOCK_64X64,
+                       &dummy_rate, &dummy_dist, 1, cpi->pc_root);
+    } else if (sf->partition_search_type == SEARCH_PARTITION &&
+               sf->use_lastframe_partitioning &&
+               (cpi->rc.frames_since_key %
+                   sf->last_partitioning_redo_frequency) &&
+               cm->prev_mi &&
+               cm->show_frame &&
+               cm->frame_type != KEY_FRAME &&
+               !cpi->rc.is_src_frame_alt_ref &&
+               ((sf->use_lastframe_partitioning !=
+                   LAST_FRAME_PARTITION_LOW_MOTION) ||
+                   !sb_has_motion(cm, prev_mi, sf->lf_motion_threshold))) {
+      if (sf->constrain_copy_partition &&
+          sb_has_motion(cm, prev_mi, sf->lf_motion_threshold))
+        constrain_copy_partitioning(cpi, tile, mi, prev_mi,
+                                    mi_row, mi_col, BLOCK_16X16);
+      else
+        copy_partitioning(cm, mi, prev_mi);
+      rd_use_partition(cpi, tile, mi, tp, mi_row, mi_col, BLOCK_64X64,
+                       &dummy_rate, &dummy_dist, 1, cpi->pc_root);
     } else {
       // If required set upper and lower partition size limits
       if (sf->auto_min_max_partition_size) {
@@ -2551,7 +2623,7 @@
                                 &sf->max_partition_size);
       }
       rd_pick_partition(cpi, tile, tp, mi_row, mi_col, BLOCK_64X64,
-                        &dummy_rate, &dummy_dist, INT64_MAX, cpi->pc_root);
+                        &dummy_rdc, INT64_MAX, cpi->pc_root);
     }
   }
 }
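
The flattened chain above picks a partitioning strategy per 64x64 superblock in a fixed priority order; a condensed summary, with the helper names taken from the calls above:

  /* Per-SB partition selection, in the order tested above:
   *  1. FIXED_PARTITION                    -> set_fixed_partitioning()
   *  2. partition_search_skippable_frame   -> variance-based fixed size
   *  3. VAR_BASED_PARTITION (inter only)   -> choose_partitioning()
   *  4. SEARCH_PARTITION + use_lastframe_partitioning, when the previous
   *     frame's partitioning is still trustworthy -> copy_partitioning()
   *     (prev_mi is NULL on key frames, hence the KEY_FRAME guards)
   *  5. otherwise                          -> full rd_pick_partition()  */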
@@ -2652,7 +2724,7 @@
                               BLOCK_SIZE bsize, BLOCK_SIZE subsize,
                               PC_TREE *pc_tree) {
   MACROBLOCKD *xd = &x->e_mbd;
-  int bsl = b_width_log2(bsize), hbs = (1 << bsl) / 4;
+  int bsl = b_width_log2_lookup[bsize], hbs = (1 << bsl) / 4;
   PARTITION_TYPE partition = pc_tree->partitioning;
 
   assert(bsize >= BLOCK_8X8);
@@ -2771,8 +2843,13 @@
       this_rate += cpi->partition_cost[pl][PARTITION_NONE];
       sum_rd = RDCOST(x->rdmult, x->rddiv, this_rate, this_dist);
       if (sum_rd < best_rd) {
-        int64_t stop_thresh = 4096;
-        int64_t stop_thresh_rd;
+        int dist_breakout_thr = sf->partition_search_breakout_dist_thr;
+        int64_t rate_breakout_thr = sf->partition_search_breakout_rate_thr;
+
+        dist_breakout_thr >>= 8 - (b_width_log2_lookup[bsize] +
+            b_height_log2_lookup[bsize]);
+
+        rate_breakout_thr *= num_pels_log2_lookup[bsize];
 
         best_rate = this_rate;
         best_dist = this_dist;
@@ -2780,14 +2857,9 @@
         if (bsize >= BLOCK_8X8)
           pc_tree->partitioning = PARTITION_NONE;
 
-        // Adjust threshold according to partition size.
-        stop_thresh >>= 8 - (b_width_log2(bsize) +
-            b_height_log2(bsize));
-
-        stop_thresh_rd = RDCOST(x->rdmult, x->rddiv, 0, stop_thresh);
-        // If obtained distortion is very small, choose current partition
-        // and stop splitting.
-        if (!x->e_mbd.lossless && best_rd < stop_thresh_rd) {
+        if (!x->e_mbd.lossless &&
+            this_rate < rate_breakout_thr &&
+            this_dist < dist_breakout_thr) {
           do_split = 0;
           do_rect = 0;
         }
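
A worked example of the breakout scaling above, assuming the standard VP9 lookup values (b_width_log2_lookup: 1 at BLOCK_8X8, 4 at BLOCK_64X64; num_pels_log2_lookup: 6 and 12 respectively):

  /* Distortion threshold shrinks for small blocks:
   *   BLOCK_64X64: dist_breakout_thr >>= 8 - (4 + 4) = 0  -> unchanged
   *   BLOCK_8X8:   dist_breakout_thr >>= 8 - (1 + 1) = 6  -> divided by 64
   * Rate threshold grows with log2(pixel count):
   *   BLOCK_64X64: rate_breakout_thr *= 12
   *   BLOCK_8X8:   rate_breakout_thr *= 6                                */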
@@ -2974,7 +3046,7 @@
   VP9_COMMON *const cm = &cpi->common;
   MACROBLOCK *const x = &cpi->mb;
   MACROBLOCKD *const xd = &x->e_mbd;
-  const int bsl = b_width_log2(bsize), hbs = (1 << bsl) / 4;
+  const int bsl = b_width_log2_lookup[bsize], hbs = (1 << bsl) / 4;
   const int mis = cm->mi_stride;
   PARTITION_TYPE partition;
   BLOCK_SIZE subsize;
@@ -3095,7 +3167,6 @@
     int64_t dummy_dist = 0;
     const int idx_str = cm->mi_stride * mi_row + mi_col;
     MODE_INFO *mi = cm->mi + idx_str;
-    MODE_INFO *prev_mi = (cm->prev_mip + cm->mi_stride + 1 + idx_str)->src_mi;
     BLOCK_SIZE bsize;
     x->in_static_area = 0;
     x->source_variance = UINT_MAX;
@@ -3113,7 +3184,6 @@
         nonrd_use_partition(cpi, tile, mi, tp, mi_row, mi_col, BLOCK_64X64,
                             1, &dummy_rate, &dummy_dist, cpi->pc_root);
         break;
-      case VAR_BASED_FIXED_PARTITION:
       case FIXED_PARTITION:
         bsize = sf->partition_search_type == FIXED_PARTITION ?
                 sf->always_this_block_size :
@@ -3133,7 +3203,7 @@
                                &dummy_rate, &dummy_dist, 1, INT64_MAX,
                                cpi->pc_root);
         } else {
-          copy_partitioning(cm, mi, prev_mi);
+          choose_partitioning(cpi, tile, mi_row, mi_col);
           nonrd_use_partition(cpi, tile, mi, tp, mi_row, mi_col,
                               BLOCK_64X64, 1, &dummy_rate, &dummy_dist,
                               cpi->pc_root);
@@ -3170,9 +3240,34 @@
 
   for (i = 0; i < cm->mb_rows; i++) {
     for (j = 0; j < cm->mb_cols; j++) {
+#if CONFIG_VP9_HIGHBITDEPTH
+      if (cm->use_highbitdepth) {
+        switch (cm->bit_depth) {
+          case VPX_BITS_8:
+            vp9_highbd_get16x16var(src, src_stride, last_src, last_stride,
+                                   &var16->sse, &var16->sum);
+            break;
+          case VPX_BITS_10:
+            vp9_highbd_10_get16x16var(src, src_stride, last_src, last_stride,
+                                    &var16->sse, &var16->sum);
+            break;
+          case VPX_BITS_12:
+            vp9_highbd_12_get16x16var(src, src_stride, last_src, last_stride,
+                                      &var16->sse, &var16->sum);
+            break;
+          default:
+            assert(0 && "cm->bit_depth should be VPX_BITS_8, VPX_BITS_10"
+                   " or VPX_BITS_12");
+            return -1;
+        }
+      } else {
+        vp9_get16x16var(src, src_stride, last_src, last_stride,
+                        &var16->sse, &var16->sum);
+      }
+#else
       vp9_get16x16var(src, src_stride, last_src, last_stride,
                       &var16->sse, &var16->sum);
-
+#endif  // CONFIG_VP9_HIGHBITDEPTH
       var16->var = var16->sse -
           (((uint32_t)var16->sum * var16->sum) >> 8);
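
The computation above is the usual identity var = sse - sum^2 / N, the sum of squared deviations from the block mean, with N = 256 for a 16x16 block (hence the >> 8). A standalone sketch of the same identity:

  #include <stdint.h>

  /* Sum of squared deviations of a 2^n_log2-pixel block, given its sum of
   * squared samples (sse) and sum of samples; n_log2 == 8 above. */
  static unsigned int block_variance(unsigned int sse, int sum, int n_log2) {
    return sse - (unsigned int)(((int64_t)sum * sum) >> n_log2);
  }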
 
@@ -3252,25 +3347,39 @@
   const VP9_COMMON *const cm = &cpi->common;
   const int tile_cols = 1 << cm->log2_tile_cols;
   const int tile_rows = 1 << cm->log2_tile_rows;
+
   int tile_col, tile_row;
-  TOKENEXTRA *tok = cpi->tok;
+  TileInfo tile[4][1 << 6];
+  TOKENEXTRA *tok[4][1 << 6];
+  TOKENEXTRA *pre_tok = cpi->tok;
+  int tile_tok = 0;
 
   for (tile_row = 0; tile_row < tile_rows; ++tile_row) {
     for (tile_col = 0; tile_col < tile_cols; ++tile_col) {
-      TileInfo tile;
-      TOKENEXTRA *old_tok = tok;
+      vp9_tile_init(&tile[tile_row][tile_col], cm, tile_row, tile_col);
+
+      tok[tile_row][tile_col] = pre_tok + tile_tok;
+      pre_tok = tok[tile_row][tile_col];
+      tile_tok = allocated_tokens(tile[tile_row][tile_col]);
+    }
+  }
+
+  for (tile_row = 0; tile_row < tile_rows; ++tile_row) {
+    for (tile_col = 0; tile_col < tile_cols; ++tile_col) {
+      const TileInfo *const ptile = &tile[tile_row][tile_col];
+      TOKENEXTRA *const old_tok = tok[tile_row][tile_col];
       int mi_row;
 
-      vp9_tile_init(&tile, cm, tile_row, tile_col);
-      for (mi_row = tile.mi_row_start; mi_row < tile.mi_row_end;
+      for (mi_row = ptile->mi_row_start; mi_row < ptile->mi_row_end;
            mi_row += MI_BLOCK_SIZE) {
         if (cpi->sf.use_nonrd_pick_mode && !frame_is_intra_only(cm))
-          encode_nonrd_sb_row(cpi, &tile, mi_row, &tok);
+          encode_nonrd_sb_row(cpi, ptile, mi_row, &tok[tile_row][tile_col]);
         else
-          encode_rd_sb_row(cpi, &tile, mi_row, &tok);
+          encode_rd_sb_row(cpi, ptile, mi_row, &tok[tile_row][tile_col]);
       }
-      cpi->tok_count[tile_row][tile_col] = (unsigned int)(tok - old_tok);
-      assert(tok - cpi->tok <= get_token_alloc(cm->mb_rows, cm->mb_cols));
+      cpi->tok_count[tile_row][tile_col] =
+          (unsigned int)(tok[tile_row][tile_col] - old_tok);
+      assert(tok[tile_row][tile_col] - old_tok <= allocated_tokens(*ptile));
     }
   }
 }
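
The restructured loop above first carves cpi->tok into one region per tile and only then encodes, so a tile's token write position no longer depends on how many tokens earlier tiles actually emitted. The resulting layout (capacities, not actual counts):

  /* tok[0][0] = cpi->tok
   * tok[0][1] = tok[0][0] + allocated_tokens(tile[0][0])
   * tok[0][2] = tok[0][1] + allocated_tokens(tile[0][1])
   * ...
   * Each tile encodes into its own budget, and tok_count records how
   * much of that budget was actually used.                              */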
@@ -3314,7 +3423,16 @@
 
   cm->tx_mode = select_tx_mode(cpi);
 
+#if CONFIG_VP9_HIGHBITDEPTH
+  if (cm->use_highbitdepth)
+    x->fwd_txm4x4 = xd->lossless ? vp9_highbd_fwht4x4 : vp9_highbd_fdct4x4;
+  else
+    x->fwd_txm4x4 = xd->lossless ? vp9_fwht4x4 : vp9_fdct4x4;
+  x->highbd_itxm_add = xd->lossless ? vp9_highbd_iwht4x4_add :
+                                      vp9_highbd_idct4x4_add;
+#else
   x->fwd_txm4x4 = xd->lossless ? vp9_fwht4x4 : vp9_fdct4x4;
+#endif  // CONFIG_VP9_HIGHBITDEPTH
   x->itxm_add = xd->lossless ? vp9_iwht4x4_add : vp9_idct4x4_add;
 
   if (xd->lossless) {
diff --git a/source/libvpx/vp9/encoder/vp9_encodemb.c b/source/libvpx/vp9/encoder/vp9_encodemb.c
index 2eae149..f5faa7c 100644
--- a/source/libvpx/vp9/encoder/vp9_encodemb.c
+++ b/source/libvpx/vp9/encoder/vp9_encodemb.c
@@ -51,6 +51,29 @@
   }
 }
 
+#if CONFIG_VP9_HIGHBITDEPTH
+void vp9_highbd_subtract_block_c(int rows, int cols,
+                                 int16_t *diff, ptrdiff_t diff_stride,
+                                 const uint8_t *src8, ptrdiff_t src_stride,
+                                 const uint8_t *pred8, ptrdiff_t pred_stride,
+                                 int bd) {
+  int r, c;
+  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+  uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
+  (void) bd;
+
+  for (r = 0; r < rows; r++) {
+    for (c = 0; c < cols; c++) {
+      diff[c] = src[c] - pred[c];
+    }
+
+    diff += diff_stride;
+    pred += pred_stride;
+    src  += src_stride;
+  }
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
 void vp9_subtract_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane) {
   struct macroblock_plane *const p = &x->plane[plane];
   const struct macroblockd_plane *const pd = &x->e_mbd.plane[plane];
@@ -58,6 +81,14 @@
   const int bw = 4 * num_4x4_blocks_wide_lookup[plane_bsize];
   const int bh = 4 * num_4x4_blocks_high_lookup[plane_bsize];
 
+#if CONFIG_VP9_HIGHBITDEPTH
+  if (x->e_mbd.cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+    vp9_highbd_subtract_block(bh, bw, p->src_diff, bw, p->src.buf,
+                              p->src.stride, pd->dst.buf, pd->dst.stride,
+                              x->e_mbd.bd);
+    return;
+  }
+#endif  // CONFIG_VP9_HIGHBITDEPTH
   vp9_subtract_block(bh, bw, p->src_diff, bw, p->src.buf, p->src.stride,
                      pd->dst.buf, pd->dst.stride);
 }
@@ -124,6 +155,8 @@
   int64_t rd_cost0, rd_cost1;
   int rate0, rate1, error0, error1, t0, t1;
   int best, band, pt, i, final_eob;
+  const TOKENVALUE *dct_value_tokens;
+  const int16_t *dct_value_cost;
 
   assert((!type && !plane) || (type && plane));
   assert(eob <= default_eob);
@@ -140,9 +173,24 @@
   tokens[eob][0].qc = 0;
   tokens[eob][1] = tokens[eob][0];
 
+#if CONFIG_VP9_HIGHBITDEPTH
+  if (xd->bd == 12) {
+    dct_value_tokens = vp9_dct_value_tokens_high12_ptr;
+    dct_value_cost = vp9_dct_value_cost_high12_ptr;
+  } else if (xd->bd == 10) {
+    dct_value_tokens = vp9_dct_value_tokens_high10_ptr;
+    dct_value_cost = vp9_dct_value_cost_high10_ptr;
+  } else {
+    dct_value_tokens = vp9_dct_value_tokens_ptr;
+    dct_value_cost = vp9_dct_value_cost_ptr;
+  }
+#else
+  dct_value_tokens = vp9_dct_value_tokens_ptr;
+  dct_value_cost = vp9_dct_value_cost_ptr;
+#endif  // CONFIG_VP9_HIGHBITDEPTH
   for (i = 0; i < eob; i++)
     token_cache[scan[i]] =
-        vp9_pt_energy_class[vp9_dct_value_tokens_ptr[qcoeff[scan[i]]].token];
+        vp9_pt_energy_class[dct_value_tokens[qcoeff[scan[i]]].token];
 
   for (i = eob; i-- > 0;) {
     int base_bits, d2, dx;
@@ -156,7 +204,7 @@
       /* Evaluate the first possibility for this state. */
       rate0 = tokens[next][0].rate;
       rate1 = tokens[next][1].rate;
-      t0 = (vp9_dct_value_tokens_ptr + x)->token;
+      t0 = (dct_value_tokens + x)->token;
       /* Consider both possible successor states. */
       if (next < default_eob) {
         band = band_translate[i + 1];
@@ -169,8 +217,13 @@
       UPDATE_RD_COST();
       /* And pick the best. */
       best = rd_cost1 < rd_cost0;
-      base_bits = vp9_dct_value_cost_ptr[x];
+      base_bits = dct_value_cost[x];
       dx = mul * (dqcoeff[rc] - coeff[rc]);
+#if CONFIG_VP9_HIGHBITDEPTH
+      if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+        dx >>= xd->bd - 8;
+      }
+#endif  // CONFIG_VP9_HIGHBITDEPTH
       d2 = dx * dx;
       tokens[i][0].rate = base_bits + (best ? rate1 : rate0);
       tokens[i][0].error = d2 + (best ? error1 : error0);
@@ -203,7 +256,7 @@
         t0 = tokens[next][0].token == EOB_TOKEN ? EOB_TOKEN : ZERO_TOKEN;
         t1 = tokens[next][1].token == EOB_TOKEN ? EOB_TOKEN : ZERO_TOKEN;
       } else {
-        t0 = t1 = (vp9_dct_value_tokens_ptr + x)->token;
+        t0 = t1 = (dct_value_tokens + x)->token;
       }
       if (next < default_eob) {
         band = band_translate[i + 1];
@@ -222,10 +275,18 @@
       UPDATE_RD_COST();
       /* And pick the best. */
       best = rd_cost1 < rd_cost0;
-      base_bits = vp9_dct_value_cost_ptr[x];
+      base_bits = dct_value_cost[x];
 
       if (shortcut) {
+#if CONFIG_VP9_HIGHBITDEPTH
+        if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+          dx -= ((dequant_ptr[rc != 0] >> (xd->bd - 8)) + sz) ^ sz;
+        } else {
+          dx -= (dequant_ptr[rc != 0] + sz) ^ sz;
+        }
+#else
         dx -= (dequant_ptr[rc != 0] + sz) ^ sz;
+#endif  // CONFIG_VP9_HIGHBITDEPTH
         d2 = dx * dx;
       }
       tokens[i][1].rate = base_bits + (best ? rate1 : rate0);
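
Both dx adjustments above keep trellis distortion on the 8-bit scale that the rate-distortion multipliers were tuned for; pixel errors grow linearly with bit depth. Worked through at 12 bits:

  /* 12-bit input: xd->bd - 8 == 4
   *   dx >>= 4      -> error expressed in 8-bit units again
   *   d2 = dx * dx  -> squared error shrinks by 2^8 = 256
   * The dequant step in the shortcut branch is rescaled the same way
   * before being folded into dx, keeping the two paths consistent.     */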
@@ -303,14 +364,14 @@
 }
 
 #if CONFIG_VP9_HIGHBITDEPTH
-static INLINE void high_fdct32x32(int rd_transform, const int16_t *src,
-                                  tran_low_t *dst, int src_stride) {
+static INLINE void highbd_fdct32x32(int rd_transform, const int16_t *src,
+                                    tran_low_t *dst, int src_stride) {
   if (rd_transform)
-    vp9_high_fdct32x32_rd(src, dst, src_stride);
+    vp9_highbd_fdct32x32_rd(src, dst, src_stride);
   else
-    vp9_high_fdct32x32(src, dst, src_stride);
+    vp9_highbd_fdct32x32(src, dst, src_stride);
 }
-#endif
+#endif  // CONFIG_VP9_HIGHBITDEPTH
 
 void vp9_xform_quant_fp(MACROBLOCK *x, int plane, int block,
                         BLOCK_SIZE plane_bsize, TX_SIZE tx_size) {
@@ -328,6 +389,45 @@
   txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &i, &j);
   src_diff = &p->src_diff[4 * (j * diff_stride + i)];
 
+#if CONFIG_VP9_HIGHBITDEPTH
+  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+    switch (tx_size) {
+      case TX_32X32:
+        highbd_fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride);
+        vp9_highbd_quantize_fp_32x32(coeff, 1024, x->skip_block, p->zbin,
+                                     p->round_fp, p->quant_fp, p->quant_shift,
+                                     qcoeff, dqcoeff, pd->dequant,
+                                     p->zbin_extra, eob, scan_order->scan,
+                                     scan_order->iscan);
+        break;
+      case TX_16X16:
+        vp9_highbd_fdct16x16(src_diff, coeff, diff_stride);
+        vp9_highbd_quantize_fp(coeff, 256, x->skip_block, p->zbin, p->round_fp,
+                               p->quant_fp, p->quant_shift, qcoeff, dqcoeff,
+                               pd->dequant, p->zbin_extra, eob,
+                               scan_order->scan, scan_order->iscan);
+        break;
+      case TX_8X8:
+        vp9_highbd_fdct8x8(src_diff, coeff, diff_stride);
+        vp9_highbd_quantize_fp(coeff, 64, x->skip_block, p->zbin, p->round_fp,
+                               p->quant_fp, p->quant_shift, qcoeff, dqcoeff,
+                               pd->dequant, p->zbin_extra, eob,
+                               scan_order->scan, scan_order->iscan);
+        break;
+      case TX_4X4:
+        x->fwd_txm4x4(src_diff, coeff, diff_stride);
+        vp9_highbd_quantize_fp(coeff, 16, x->skip_block, p->zbin, p->round_fp,
+                               p->quant_fp, p->quant_shift, qcoeff, dqcoeff,
+                               pd->dequant, p->zbin_extra, eob,
+                               scan_order->scan, scan_order->iscan);
+        break;
+      default:
+        assert(0);
+    }
+    return;
+  }
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
   switch (tx_size) {
     case TX_32X32:
       fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride);
@@ -379,6 +479,40 @@
   txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &i, &j);
   src_diff = &p->src_diff[4 * (j * diff_stride + i)];
 
+#if CONFIG_VP9_HIGHBITDEPTH
+  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+    switch (tx_size) {
+      case TX_32X32:
+        vp9_highbd_fdct32x32_1(src_diff, coeff, diff_stride);
+        vp9_highbd_quantize_dc_32x32(coeff, x->skip_block, p->round,
+                                     p->quant_fp[0], qcoeff, dqcoeff,
+                                     pd->dequant[0], eob);
+        break;
+      case TX_16X16:
+        vp9_highbd_fdct16x16_1(src_diff, coeff, diff_stride);
+        vp9_highbd_quantize_dc(coeff, x->skip_block, p->round,
+                               p->quant_fp[0], qcoeff, dqcoeff,
+                               pd->dequant[0], eob);
+        break;
+      case TX_8X8:
+        vp9_highbd_fdct8x8_1(src_diff, coeff, diff_stride);
+        vp9_highbd_quantize_dc(coeff, x->skip_block, p->round,
+                               p->quant_fp[0], qcoeff, dqcoeff,
+                               pd->dequant[0], eob);
+        break;
+      case TX_4X4:
+        x->fwd_txm4x4(src_diff, coeff, diff_stride);
+        vp9_highbd_quantize_dc(coeff, x->skip_block, p->round,
+                               p->quant_fp[0], qcoeff, dqcoeff,
+                               pd->dequant[0], eob);
+        break;
+      default:
+        assert(0);
+    }
+    return;
+  }
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
   switch (tx_size) {
     case TX_32X32:
       vp9_fdct32x32_1(src_diff, coeff, diff_stride);
@@ -426,6 +560,44 @@
   txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &i, &j);
   src_diff = &p->src_diff[4 * (j * diff_stride + i)];
 
+#if CONFIG_VP9_HIGHBITDEPTH
+  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+    switch (tx_size) {
+      case TX_32X32:
+        highbd_fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride);
+        vp9_highbd_quantize_b_32x32(coeff, 1024, x->skip_block, p->zbin,
+                                    p->round, p->quant, p->quant_shift, qcoeff,
+                                    dqcoeff, pd->dequant, p->zbin_extra, eob,
+                                    scan_order->scan, scan_order->iscan);
+        break;
+      case TX_16X16:
+        vp9_highbd_fdct16x16(src_diff, coeff, diff_stride);
+        vp9_highbd_quantize_b(coeff, 256, x->skip_block, p->zbin, p->round,
+                              p->quant, p->quant_shift, qcoeff, dqcoeff,
+                              pd->dequant, p->zbin_extra, eob,
+                              scan_order->scan, scan_order->iscan);
+        break;
+      case TX_8X8:
+        vp9_highbd_fdct8x8(src_diff, coeff, diff_stride);
+        vp9_highbd_quantize_b(coeff, 64, x->skip_block, p->zbin, p->round,
+                              p->quant, p->quant_shift, qcoeff, dqcoeff,
+                              pd->dequant, p->zbin_extra, eob,
+                              scan_order->scan, scan_order->iscan);
+        break;
+      case TX_4X4:
+        x->fwd_txm4x4(src_diff, coeff, diff_stride);
+        vp9_highbd_quantize_b(coeff, 16, x->skip_block, p->zbin, p->round,
+                              p->quant, p->quant_shift, qcoeff, dqcoeff,
+                              pd->dequant, p->zbin_extra, eob,
+                              scan_order->scan, scan_order->iscan);
+        break;
+      default:
+        assert(0);
+    }
+    return;
+  }
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
   switch (tx_size) {
     case TX_32X32:
       fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride);
@@ -520,6 +692,34 @@
 
   if (x->skip_encode || p->eobs[block] == 0)
     return;
+#if CONFIG_VP9_HIGHBITDEPTH
+  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+    switch (tx_size) {
+      case TX_32X32:
+        vp9_highbd_idct32x32_add(dqcoeff, dst, pd->dst.stride,
+                                 p->eobs[block], xd->bd);
+        break;
+      case TX_16X16:
+        vp9_highbd_idct16x16_add(dqcoeff, dst, pd->dst.stride,
+                                 p->eobs[block], xd->bd);
+        break;
+      case TX_8X8:
+        vp9_highbd_idct8x8_add(dqcoeff, dst, pd->dst.stride,
+                               p->eobs[block], xd->bd);
+        break;
+      case TX_4X4:
+        // this is like vp9_short_idct4x4 but has a special case around eob<=1
+        // which is significant (not just an optimization) for the lossless
+        // case.
+        x->highbd_itxm_add(dqcoeff, dst, pd->dst.stride,
+                           p->eobs[block], xd->bd);
+        break;
+      default:
+        assert(0 && "Invalid transform size");
+    }
+    return;
+  }
+#endif  // CONFIG_VP9_HIGHBITDEPTH
 
   switch (tx_size) {
     case TX_32X32:
@@ -557,8 +757,15 @@
 
   vp9_xform_quant(x, plane, block, plane_bsize, tx_size);
 
-  if (p->eobs[block] > 0)
+  if (p->eobs[block] > 0) {
+#if CONFIG_VP9_HIGHBITDEPTH
+    if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+      x->highbd_itxm_add(dqcoeff, dst, pd->dst.stride, p->eobs[block], xd->bd);
+      return;
+    }
+#endif  // CONFIG_VP9_HIGHBITDEPTH
     x->itxm_add(dqcoeff, dst, pd->dst.stride, p->eobs[block]);
+  }
 }
 
 void vp9_encode_sby_pass1(MACROBLOCK *x, BLOCK_SIZE bsize) {
@@ -609,7 +816,7 @@
   const scan_order *scan_order;
   TX_TYPE tx_type;
   PREDICTION_MODE mode;
-  const int bwl = b_width_log2(plane_bsize);
+  const int bwl = b_width_log2_lookup[plane_bsize];
   const int diff_stride = 4 * (1 << bwl);
   uint8_t *src, *dst;
   int16_t *src_diff;
@@ -622,6 +829,117 @@
   src = &p->src.buf[4 * (j * src_stride + i)];
   src_diff = &p->src_diff[4 * (j * diff_stride + i)];
 
+#if CONFIG_VP9_HIGHBITDEPTH
+  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+    switch (tx_size) {
+      case TX_32X32:
+        scan_order = &vp9_default_scan_orders[TX_32X32];
+        mode = plane == 0 ? mbmi->mode : mbmi->uv_mode;
+        vp9_predict_intra_block(xd, block >> 6, bwl, TX_32X32, mode,
+                                x->skip_encode ? src : dst,
+                                x->skip_encode ? src_stride : dst_stride,
+                                dst, dst_stride, i, j, plane);
+        if (!x->skip_recode) {
+          vp9_highbd_subtract_block(32, 32, src_diff, diff_stride,
+                                    src, src_stride, dst, dst_stride, xd->bd);
+          highbd_fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride);
+          vp9_highbd_quantize_b_32x32(coeff, 1024, x->skip_block, p->zbin,
+                                      p->round, p->quant, p->quant_shift,
+                                      qcoeff, dqcoeff, pd->dequant,
+                                      p->zbin_extra, eob,
+                                      scan_order->scan, scan_order->iscan);
+        }
+        if (!x->skip_encode && *eob) {
+          vp9_highbd_idct32x32_add(dqcoeff, dst, dst_stride, *eob, xd->bd);
+        }
+        break;
+      case TX_16X16:
+        tx_type = get_tx_type(pd->plane_type, xd);
+        scan_order = &vp9_scan_orders[TX_16X16][tx_type];
+        mode = plane == 0 ? mbmi->mode : mbmi->uv_mode;
+        vp9_predict_intra_block(xd, block >> 4, bwl, TX_16X16, mode,
+                                x->skip_encode ? src : dst,
+                                x->skip_encode ? src_stride : dst_stride,
+                                dst, dst_stride, i, j, plane);
+        if (!x->skip_recode) {
+          vp9_highbd_subtract_block(16, 16, src_diff, diff_stride,
+                                    src, src_stride, dst, dst_stride, xd->bd);
+          vp9_highbd_fht16x16(src_diff, coeff, diff_stride, tx_type);
+          vp9_highbd_quantize_b(coeff, 256, x->skip_block, p->zbin, p->round,
+                                p->quant, p->quant_shift, qcoeff, dqcoeff,
+                                pd->dequant, p->zbin_extra, eob,
+                                scan_order->scan, scan_order->iscan);
+        }
+        if (!x->skip_encode && *eob) {
+          vp9_highbd_iht16x16_add(tx_type, dqcoeff, dst, dst_stride,
+                                  *eob, xd->bd);
+        }
+        break;
+      case TX_8X8:
+        tx_type = get_tx_type(pd->plane_type, xd);
+        scan_order = &vp9_scan_orders[TX_8X8][tx_type];
+        mode = plane == 0 ? mbmi->mode : mbmi->uv_mode;
+        vp9_predict_intra_block(xd, block >> 2, bwl, TX_8X8, mode,
+                                x->skip_encode ? src : dst,
+                                x->skip_encode ? src_stride : dst_stride,
+                                dst, dst_stride, i, j, plane);
+        if (!x->skip_recode) {
+          vp9_highbd_subtract_block(8, 8, src_diff, diff_stride,
+                                    src, src_stride, dst, dst_stride, xd->bd);
+          vp9_highbd_fht8x8(src_diff, coeff, diff_stride, tx_type);
+          vp9_highbd_quantize_b(coeff, 64, x->skip_block, p->zbin, p->round,
+                                p->quant, p->quant_shift, qcoeff, dqcoeff,
+                                pd->dequant, p->zbin_extra, eob,
+                                scan_order->scan, scan_order->iscan);
+        }
+        if (!x->skip_encode && *eob) {
+          vp9_highbd_iht8x8_add(tx_type, dqcoeff, dst, dst_stride, *eob,
+                                xd->bd);
+        }
+        break;
+      case TX_4X4:
+        tx_type = get_tx_type_4x4(pd->plane_type, xd, block);
+        scan_order = &vp9_scan_orders[TX_4X4][tx_type];
+        mode = plane == 0 ? get_y_mode(xd->mi[0].src_mi, block) : mbmi->uv_mode;
+        vp9_predict_intra_block(xd, block, bwl, TX_4X4, mode,
+                                x->skip_encode ? src : dst,
+                                x->skip_encode ? src_stride : dst_stride,
+                                dst, dst_stride, i, j, plane);
+
+        if (!x->skip_recode) {
+          vp9_highbd_subtract_block(4, 4, src_diff, diff_stride,
+                                    src, src_stride, dst, dst_stride, xd->bd);
+          if (tx_type != DCT_DCT)
+            vp9_highbd_fht4x4(src_diff, coeff, diff_stride, tx_type);
+          else
+            x->fwd_txm4x4(src_diff, coeff, diff_stride);
+          vp9_highbd_quantize_b(coeff, 16, x->skip_block, p->zbin, p->round,
+                                p->quant, p->quant_shift, qcoeff, dqcoeff,
+                                pd->dequant, p->zbin_extra, eob,
+                                scan_order->scan, scan_order->iscan);
+        }
+
+        if (!x->skip_encode && *eob) {
+          if (tx_type == DCT_DCT) {
+            // this is like vp9_short_idct4x4 but has a special case around
+            // eob<=1 which is significant (not just an optimization) for the
+            // lossless case.
+            x->highbd_itxm_add(dqcoeff, dst, dst_stride, *eob, xd->bd);
+          } else {
+            vp9_highbd_iht4x4_16_add(dqcoeff, dst, dst_stride, tx_type, xd->bd);
+          }
+        }
+        break;
+      default:
+        assert(0);
+        return;
+    }
+    if (*eob)
+      *(args->skip) = 0;
+    return;
+  }
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
   switch (tx_size) {
     case TX_32X32:
       scan_order = &vp9_default_scan_orders[TX_32X32];
diff --git a/source/libvpx/vp9/encoder/vp9_encodemb.h b/source/libvpx/vp9/encoder/vp9_encodemb.h
index 1999718..54d2b37 100644
--- a/source/libvpx/vp9/encoder/vp9_encodemb.h
+++ b/source/libvpx/vp9/encoder/vp9_encodemb.h
@@ -13,8 +13,6 @@
 
 #include "./vpx_config.h"
 #include "vp9/encoder/vp9_block.h"
-#include "vp9/encoder/vp9_encoder.h"
-#include "vp9/common/vp9_onyxc_int.h"
 
 #ifdef __cplusplus
 extern "C" {
diff --git a/source/libvpx/vp9/encoder/vp9_encoder.c b/source/libvpx/vp9/encoder/vp9_encoder.c
index 5f5af19..1758e3f 100644
--- a/source/libvpx/vp9/encoder/vp9_encoder.c
+++ b/source/libvpx/vp9/encoder/vp9_encoder.c
@@ -144,7 +144,6 @@
 
   if (!init_done) {
     vp9_rtcd();
-    vp9_init_neighbors();
     vp9_init_intra_predictors();
     vp9_coef_tree_initialize();
     vp9_tokenize_initialize();
@@ -226,6 +225,9 @@
   }
   vpx_memset(&cpi->svc.scaled_frames[0], 0,
              MAX_LAG_BUFFERS * sizeof(cpi->svc.scaled_frames[0]));
+
+  vp9_free_frame_buffer(&cpi->svc.empty_frame.img);
+  vpx_memset(&cpi->svc.empty_frame, 0, sizeof(cpi->svc.empty_frame));
 }
 
 static void save_coding_context(VP9_COMP *cpi) {
@@ -586,8 +588,6 @@
   cpi->ref_frame_flags = 0;
 
   init_buffer_indices(cpi);
-
-  set_tile_limits(cpi);
 }
 
 static void set_rc_buffer_sizes(RATE_CONTROL *rc,
@@ -604,6 +604,612 @@
                                            : maximum * bandwidth / 1000;
 }
 
+#if CONFIG_VP9_HIGHBITDEPTH
+#define HIGHBD_BFP(BT, SDF, SDAF, VF, SVF, SVAF, SDX3F, SDX8F, SDX4DF) \
+    cpi->fn_ptr[BT].sdf = SDF; \
+    cpi->fn_ptr[BT].sdaf = SDAF; \
+    cpi->fn_ptr[BT].vf = VF; \
+    cpi->fn_ptr[BT].svf = SVF; \
+    cpi->fn_ptr[BT].svaf = SVAF; \
+    cpi->fn_ptr[BT].sdx3f = SDX3F; \
+    cpi->fn_ptr[BT].sdx8f = SDX8F; \
+    cpi->fn_ptr[BT].sdx4df = SDX4DF;
+
+#define MAKE_BFP_SAD_WRAPPER(fnname) \
+static unsigned int fnname##_bits8(const uint8_t *src_ptr, \
+                                   int source_stride, \
+                                   const uint8_t *ref_ptr, \
+                                   int ref_stride) {  \
+  return fnname(src_ptr, source_stride, ref_ptr, ref_stride); \
+} \
+static unsigned int fnname##_bits10(const uint8_t *src_ptr, \
+                                    int source_stride, \
+                                    const uint8_t *ref_ptr, \
+                                    int ref_stride) {  \
+  return fnname(src_ptr, source_stride, ref_ptr, ref_stride) >> 2; \
+} \
+static unsigned int fnname##_bits12(const uint8_t *src_ptr, \
+                                    int source_stride, \
+                                    const uint8_t *ref_ptr, \
+                                    int ref_stride) {  \
+  return fnname(src_ptr, source_stride, ref_ptr, ref_stride) >> 4; \
+}
+
+#define MAKE_BFP_SADAVG_WRAPPER(fnname) static unsigned int \
+fnname##_bits8(const uint8_t *src_ptr, \
+               int source_stride, \
+               const uint8_t *ref_ptr, \
+               int ref_stride, \
+               const uint8_t *second_pred) {  \
+  return fnname(src_ptr, source_stride, ref_ptr, ref_stride, second_pred); \
+} \
+static unsigned int fnname##_bits10(const uint8_t *src_ptr, \
+                                    int source_stride, \
+                                    const uint8_t *ref_ptr, \
+                                    int ref_stride, \
+                                    const uint8_t *second_pred) {  \
+  return fnname(src_ptr, source_stride, ref_ptr, ref_stride, \
+                second_pred) >> 2; \
+} \
+static unsigned int fnname##_bits12(const uint8_t *src_ptr, \
+                                    int source_stride, \
+                                    const uint8_t *ref_ptr, \
+                                    int ref_stride, \
+                                    const uint8_t *second_pred) {  \
+  return fnname(src_ptr, source_stride, ref_ptr, ref_stride, \
+                second_pred) >> 4; \
+}
+
+#define MAKE_BFP_SAD3_WRAPPER(fnname) \
+static void fnname##_bits8(const uint8_t *src_ptr, \
+                           int source_stride, \
+                           const uint8_t *ref_ptr, \
+                           int  ref_stride, \
+                           unsigned int *sad_array) {  \
+  fnname(src_ptr, source_stride, ref_ptr, ref_stride, sad_array); \
+} \
+static void fnname##_bits10(const uint8_t *src_ptr, \
+                            int source_stride, \
+                            const uint8_t *ref_ptr, \
+                            int  ref_stride, \
+                            unsigned int *sad_array) {  \
+  int i; \
+  fnname(src_ptr, source_stride, ref_ptr, ref_stride, sad_array); \
+  for (i = 0; i < 3; i++) \
+    sad_array[i] >>= 2; \
+} \
+static void fnname##_bits12(const uint8_t *src_ptr, \
+                            int source_stride, \
+                            const uint8_t *ref_ptr, \
+                            int  ref_stride, \
+                            unsigned int *sad_array) {  \
+  int i; \
+  fnname(src_ptr, source_stride, ref_ptr, ref_stride, sad_array); \
+  for (i = 0; i < 3; i++) \
+    sad_array[i] >>= 4; \
+}
+
+#define MAKE_BFP_SAD8_WRAPPER(fnname) \
+static void fnname##_bits8(const uint8_t *src_ptr, \
+                           int source_stride, \
+                           const uint8_t *ref_ptr, \
+                           int  ref_stride, \
+                           unsigned int *sad_array) {  \
+  fnname(src_ptr, source_stride, ref_ptr, ref_stride, sad_array); \
+} \
+static void fnname##_bits10(const uint8_t *src_ptr, \
+                            int source_stride, \
+                            const uint8_t *ref_ptr, \
+                            int  ref_stride, \
+                            unsigned int *sad_array) {  \
+  int i; \
+  fnname(src_ptr, source_stride, ref_ptr, ref_stride, sad_array); \
+  for (i = 0; i < 8; i++) \
+    sad_array[i] >>= 2; \
+} \
+static void fnname##_bits12(const uint8_t *src_ptr, \
+                            int source_stride, \
+                            const uint8_t *ref_ptr, \
+                            int  ref_stride, \
+                            unsigned int *sad_array) {  \
+  int i; \
+  fnname(src_ptr, source_stride, ref_ptr, ref_stride, sad_array); \
+  for (i = 0; i < 8; i++) \
+    sad_array[i] >>= 4; \
+}
+#define MAKE_BFP_SAD4D_WRAPPER(fnname) \
+static void fnname##_bits8(const uint8_t *src_ptr, \
+                           int source_stride, \
+                           const uint8_t* const ref_ptr[], \
+                           int  ref_stride, \
+                           unsigned int *sad_array) {  \
+  fnname(src_ptr, source_stride, ref_ptr, ref_stride, sad_array); \
+} \
+static void fnname##_bits10(const uint8_t *src_ptr, \
+                            int source_stride, \
+                            const uint8_t* const ref_ptr[], \
+                            int  ref_stride, \
+                            unsigned int *sad_array) {  \
+  int i; \
+  fnname(src_ptr, source_stride, ref_ptr, ref_stride, sad_array); \
+  for (i = 0; i < 4; i++) \
+    sad_array[i] >>= 2; \
+} \
+static void fnname##_bits12(const uint8_t *src_ptr, \
+                            int source_stride, \
+                            const uint8_t* const ref_ptr[], \
+                            int  ref_stride, \
+                            unsigned int *sad_array) {  \
+  int i; \
+  fnname(src_ptr, source_stride, ref_ptr, ref_stride, sad_array); \
+  for (i = 0; i < 4; i++) \
+    sad_array[i] >>= 4; \
+}
+
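/* Note: all four wrapper families apply the same normalization. SAD grows
 * linearly with bit depth, so the _bits10 wrappers divide by 4 (>> 2) and
 * the _bits12 wrappers by 16 (>> 4), keeping results comparable with the
 * 8-bit-tuned motion search thresholds. Sketch with hypothetical locals:
 *
 *   unsigned int sad8  = vp9_highbd_sad16x16_bits8(src, stride, ref, stride);
 *   unsigned int sad10 = vp9_highbd_sad16x16_bits10(src, stride, ref, stride);
 *   // sad10 is pre-shifted, so both compare against the same thresholds
 */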
+MAKE_BFP_SAD_WRAPPER(vp9_highbd_sad32x16)
+MAKE_BFP_SADAVG_WRAPPER(vp9_highbd_sad32x16_avg)
+MAKE_BFP_SAD4D_WRAPPER(vp9_highbd_sad32x16x4d)
+MAKE_BFP_SAD_WRAPPER(vp9_highbd_sad16x32)
+MAKE_BFP_SADAVG_WRAPPER(vp9_highbd_sad16x32_avg)
+MAKE_BFP_SAD4D_WRAPPER(vp9_highbd_sad16x32x4d)
+MAKE_BFP_SAD_WRAPPER(vp9_highbd_sad64x32)
+MAKE_BFP_SADAVG_WRAPPER(vp9_highbd_sad64x32_avg)
+MAKE_BFP_SAD4D_WRAPPER(vp9_highbd_sad64x32x4d)
+MAKE_BFP_SAD_WRAPPER(vp9_highbd_sad32x64)
+MAKE_BFP_SADAVG_WRAPPER(vp9_highbd_sad32x64_avg)
+MAKE_BFP_SAD4D_WRAPPER(vp9_highbd_sad32x64x4d)
+MAKE_BFP_SAD_WRAPPER(vp9_highbd_sad32x32)
+MAKE_BFP_SADAVG_WRAPPER(vp9_highbd_sad32x32_avg)
+MAKE_BFP_SAD3_WRAPPER(vp9_highbd_sad32x32x3)
+MAKE_BFP_SAD8_WRAPPER(vp9_highbd_sad32x32x8)
+MAKE_BFP_SAD4D_WRAPPER(vp9_highbd_sad32x32x4d)
+MAKE_BFP_SAD_WRAPPER(vp9_highbd_sad64x64)
+MAKE_BFP_SADAVG_WRAPPER(vp9_highbd_sad64x64_avg)
+MAKE_BFP_SAD3_WRAPPER(vp9_highbd_sad64x64x3)
+MAKE_BFP_SAD8_WRAPPER(vp9_highbd_sad64x64x8)
+MAKE_BFP_SAD4D_WRAPPER(vp9_highbd_sad64x64x4d)
+MAKE_BFP_SAD_WRAPPER(vp9_highbd_sad16x16)
+MAKE_BFP_SADAVG_WRAPPER(vp9_highbd_sad16x16_avg)
+MAKE_BFP_SAD3_WRAPPER(vp9_highbd_sad16x16x3)
+MAKE_BFP_SAD8_WRAPPER(vp9_highbd_sad16x16x8)
+MAKE_BFP_SAD4D_WRAPPER(vp9_highbd_sad16x16x4d)
+MAKE_BFP_SAD_WRAPPER(vp9_highbd_sad16x8)
+MAKE_BFP_SADAVG_WRAPPER(vp9_highbd_sad16x8_avg)
+MAKE_BFP_SAD3_WRAPPER(vp9_highbd_sad16x8x3)
+MAKE_BFP_SAD8_WRAPPER(vp9_highbd_sad16x8x8)
+MAKE_BFP_SAD4D_WRAPPER(vp9_highbd_sad16x8x4d)
+MAKE_BFP_SAD_WRAPPER(vp9_highbd_sad8x16)
+MAKE_BFP_SADAVG_WRAPPER(vp9_highbd_sad8x16_avg)
+MAKE_BFP_SAD3_WRAPPER(vp9_highbd_sad8x16x3)
+MAKE_BFP_SAD8_WRAPPER(vp9_highbd_sad8x16x8)
+MAKE_BFP_SAD4D_WRAPPER(vp9_highbd_sad8x16x4d)
+MAKE_BFP_SAD_WRAPPER(vp9_highbd_sad8x8)
+MAKE_BFP_SADAVG_WRAPPER(vp9_highbd_sad8x8_avg)
+MAKE_BFP_SAD3_WRAPPER(vp9_highbd_sad8x8x3)
+MAKE_BFP_SAD8_WRAPPER(vp9_highbd_sad8x8x8)
+MAKE_BFP_SAD4D_WRAPPER(vp9_highbd_sad8x8x4d)
+MAKE_BFP_SAD_WRAPPER(vp9_highbd_sad8x4)
+MAKE_BFP_SADAVG_WRAPPER(vp9_highbd_sad8x4_avg)
+MAKE_BFP_SAD8_WRAPPER(vp9_highbd_sad8x4x8)
+MAKE_BFP_SAD4D_WRAPPER(vp9_highbd_sad8x4x4d)
+MAKE_BFP_SAD_WRAPPER(vp9_highbd_sad4x8)
+MAKE_BFP_SADAVG_WRAPPER(vp9_highbd_sad4x8_avg)
+MAKE_BFP_SAD8_WRAPPER(vp9_highbd_sad4x8x8)
+MAKE_BFP_SAD4D_WRAPPER(vp9_highbd_sad4x8x4d)
+MAKE_BFP_SAD_WRAPPER(vp9_highbd_sad4x4)
+MAKE_BFP_SADAVG_WRAPPER(vp9_highbd_sad4x4_avg)
+MAKE_BFP_SAD3_WRAPPER(vp9_highbd_sad4x4x3)
+MAKE_BFP_SAD8_WRAPPER(vp9_highbd_sad4x4x8)
+MAKE_BFP_SAD4D_WRAPPER(vp9_highbd_sad4x4x4d)
+
+static void highbd_set_var_fns(VP9_COMP *const cpi) {
+  VP9_COMMON *const cm = &cpi->common;
+  if (cm->use_highbitdepth) {
+    switch (cm->bit_depth) {
+      case VPX_BITS_8:
+        HIGHBD_BFP(BLOCK_32X16,
+                   vp9_highbd_sad32x16_bits8,
+                   vp9_highbd_sad32x16_avg_bits8,
+                   vp9_highbd_variance32x16,
+                   vp9_highbd_sub_pixel_variance32x16,
+                   vp9_highbd_sub_pixel_avg_variance32x16,
+                   NULL,
+                   NULL,
+                   vp9_highbd_sad32x16x4d_bits8)
+
+        HIGHBD_BFP(BLOCK_16X32,
+                   vp9_highbd_sad16x32_bits8,
+                   vp9_highbd_sad16x32_avg_bits8,
+                   vp9_highbd_variance16x32,
+                   vp9_highbd_sub_pixel_variance16x32,
+                   vp9_highbd_sub_pixel_avg_variance16x32,
+                   NULL,
+                   NULL,
+                   vp9_highbd_sad16x32x4d_bits8)
+
+        HIGHBD_BFP(BLOCK_64X32,
+                   vp9_highbd_sad64x32_bits8,
+                   vp9_highbd_sad64x32_avg_bits8,
+                   vp9_highbd_variance64x32,
+                   vp9_highbd_sub_pixel_variance64x32,
+                   vp9_highbd_sub_pixel_avg_variance64x32,
+                   NULL,
+                   NULL,
+                   vp9_highbd_sad64x32x4d_bits8)
+
+        HIGHBD_BFP(BLOCK_32X64,
+                   vp9_highbd_sad32x64_bits8,
+                   vp9_highbd_sad32x64_avg_bits8,
+                   vp9_highbd_variance32x64,
+                   vp9_highbd_sub_pixel_variance32x64,
+                   vp9_highbd_sub_pixel_avg_variance32x64,
+                   NULL,
+                   NULL,
+                   vp9_highbd_sad32x64x4d_bits8)
+
+        HIGHBD_BFP(BLOCK_32X32,
+                   vp9_highbd_sad32x32_bits8,
+                   vp9_highbd_sad32x32_avg_bits8,
+                   vp9_highbd_variance32x32,
+                   vp9_highbd_sub_pixel_variance32x32,
+                   vp9_highbd_sub_pixel_avg_variance32x32,
+                   vp9_highbd_sad32x32x3_bits8,
+                   vp9_highbd_sad32x32x8_bits8,
+                   vp9_highbd_sad32x32x4d_bits8)
+
+        HIGHBD_BFP(BLOCK_64X64,
+                   vp9_highbd_sad64x64_bits8,
+                   vp9_highbd_sad64x64_avg_bits8,
+                   vp9_highbd_variance64x64,
+                   vp9_highbd_sub_pixel_variance64x64,
+                   vp9_highbd_sub_pixel_avg_variance64x64,
+                   vp9_highbd_sad64x64x3_bits8,
+                   vp9_highbd_sad64x64x8_bits8,
+                   vp9_highbd_sad64x64x4d_bits8)
+
+        HIGHBD_BFP(BLOCK_16X16,
+                   vp9_highbd_sad16x16_bits8,
+                   vp9_highbd_sad16x16_avg_bits8,
+                   vp9_highbd_variance16x16,
+                   vp9_highbd_sub_pixel_variance16x16,
+                   vp9_highbd_sub_pixel_avg_variance16x16,
+                   vp9_highbd_sad16x16x3_bits8,
+                   vp9_highbd_sad16x16x8_bits8,
+                   vp9_highbd_sad16x16x4d_bits8)
+
+        HIGHBD_BFP(BLOCK_16X8,
+                   vp9_highbd_sad16x8_bits8,
+                   vp9_highbd_sad16x8_avg_bits8,
+                   vp9_highbd_variance16x8,
+                   vp9_highbd_sub_pixel_variance16x8,
+                   vp9_highbd_sub_pixel_avg_variance16x8,
+                   vp9_highbd_sad16x8x3_bits8,
+                   vp9_highbd_sad16x8x8_bits8,
+                   vp9_highbd_sad16x8x4d_bits8)
+
+        HIGHBD_BFP(BLOCK_8X16,
+                   vp9_highbd_sad8x16_bits8,
+                   vp9_highbd_sad8x16_avg_bits8,
+                   vp9_highbd_variance8x16,
+                   vp9_highbd_sub_pixel_variance8x16,
+                   vp9_highbd_sub_pixel_avg_variance8x16,
+                   vp9_highbd_sad8x16x3_bits8,
+                   vp9_highbd_sad8x16x8_bits8,
+                   vp9_highbd_sad8x16x4d_bits8)
+
+        HIGHBD_BFP(BLOCK_8X8,
+                   vp9_highbd_sad8x8_bits8,
+                   vp9_highbd_sad8x8_avg_bits8,
+                   vp9_highbd_variance8x8,
+                   vp9_highbd_sub_pixel_variance8x8,
+                   vp9_highbd_sub_pixel_avg_variance8x8,
+                   vp9_highbd_sad8x8x3_bits8,
+                   vp9_highbd_sad8x8x8_bits8,
+                   vp9_highbd_sad8x8x4d_bits8)
+
+        HIGHBD_BFP(BLOCK_8X4,
+                   vp9_highbd_sad8x4_bits8,
+                   vp9_highbd_sad8x4_avg_bits8,
+                   vp9_highbd_variance8x4,
+                   vp9_highbd_sub_pixel_variance8x4,
+                   vp9_highbd_sub_pixel_avg_variance8x4,
+                   NULL,
+                   vp9_highbd_sad8x4x8_bits8,
+                   vp9_highbd_sad8x4x4d_bits8)
+
+        HIGHBD_BFP(BLOCK_4X8,
+                   vp9_highbd_sad4x8_bits8,
+                   vp9_highbd_sad4x8_avg_bits8,
+                   vp9_highbd_variance4x8,
+                   vp9_highbd_sub_pixel_variance4x8,
+                   vp9_highbd_sub_pixel_avg_variance4x8,
+                   NULL,
+                   vp9_highbd_sad4x8x8_bits8,
+                   vp9_highbd_sad4x8x4d_bits8)
+
+        HIGHBD_BFP(BLOCK_4X4,
+                   vp9_highbd_sad4x4_bits8,
+                   vp9_highbd_sad4x4_avg_bits8,
+                   vp9_highbd_variance4x4,
+                   vp9_highbd_sub_pixel_variance4x4,
+                   vp9_highbd_sub_pixel_avg_variance4x4,
+                   vp9_highbd_sad4x4x3_bits8,
+                   vp9_highbd_sad4x4x8_bits8,
+                   vp9_highbd_sad4x4x4d_bits8)
+        break;
+
+      case VPX_BITS_10:
+        HIGHBD_BFP(BLOCK_32X16,
+                   vp9_highbd_sad32x16_bits10,
+                   vp9_highbd_sad32x16_avg_bits10,
+                   vp9_highbd_10_variance32x16,
+                   vp9_highbd_10_sub_pixel_variance32x16,
+                   vp9_highbd_10_sub_pixel_avg_variance32x16,
+                   NULL,
+                   NULL,
+                   vp9_highbd_sad32x16x4d_bits10)
+
+        HIGHBD_BFP(BLOCK_16X32,
+                   vp9_highbd_sad16x32_bits10,
+                   vp9_highbd_sad16x32_avg_bits10,
+                   vp9_highbd_10_variance16x32,
+                   vp9_highbd_10_sub_pixel_variance16x32,
+                   vp9_highbd_10_sub_pixel_avg_variance16x32,
+                   NULL,
+                   NULL,
+                   vp9_highbd_sad16x32x4d_bits10)
+
+        HIGHBD_BFP(BLOCK_64X32,
+                   vp9_highbd_sad64x32_bits10,
+                   vp9_highbd_sad64x32_avg_bits10,
+                   vp9_highbd_10_variance64x32,
+                   vp9_highbd_10_sub_pixel_variance64x32,
+                   vp9_highbd_10_sub_pixel_avg_variance64x32,
+                   NULL,
+                   NULL,
+                   vp9_highbd_sad64x32x4d_bits10)
+
+        HIGHBD_BFP(BLOCK_32X64,
+                   vp9_highbd_sad32x64_bits10,
+                   vp9_highbd_sad32x64_avg_bits10,
+                   vp9_highbd_10_variance32x64,
+                   vp9_highbd_10_sub_pixel_variance32x64,
+                   vp9_highbd_10_sub_pixel_avg_variance32x64,
+                   NULL,
+                   NULL,
+                   vp9_highbd_sad32x64x4d_bits10)
+
+        HIGHBD_BFP(BLOCK_32X32,
+                   vp9_highbd_sad32x32_bits10,
+                   vp9_highbd_sad32x32_avg_bits10,
+                   vp9_highbd_10_variance32x32,
+                   vp9_highbd_10_sub_pixel_variance32x32,
+                   vp9_highbd_10_sub_pixel_avg_variance32x32,
+                   vp9_highbd_sad32x32x3_bits10,
+                   vp9_highbd_sad32x32x8_bits10,
+                   vp9_highbd_sad32x32x4d_bits10)
+
+        HIGHBD_BFP(BLOCK_64X64,
+                   vp9_highbd_sad64x64_bits10,
+                   vp9_highbd_sad64x64_avg_bits10,
+                   vp9_highbd_10_variance64x64,
+                   vp9_highbd_10_sub_pixel_variance64x64,
+                   vp9_highbd_10_sub_pixel_avg_variance64x64,
+                   vp9_highbd_sad64x64x3_bits10,
+                   vp9_highbd_sad64x64x8_bits10,
+                   vp9_highbd_sad64x64x4d_bits10)
+
+        HIGHBD_BFP(BLOCK_16X16,
+                   vp9_highbd_sad16x16_bits10,
+                   vp9_highbd_sad16x16_avg_bits10,
+                   vp9_highbd_10_variance16x16,
+                   vp9_highbd_10_sub_pixel_variance16x16,
+                   vp9_highbd_10_sub_pixel_avg_variance16x16,
+                   vp9_highbd_sad16x16x3_bits10,
+                   vp9_highbd_sad16x16x8_bits10,
+                   vp9_highbd_sad16x16x4d_bits10)
+
+        HIGHBD_BFP(BLOCK_16X8,
+                   vp9_highbd_sad16x8_bits10,
+                   vp9_highbd_sad16x8_avg_bits10,
+                   vp9_highbd_10_variance16x8,
+                   vp9_highbd_10_sub_pixel_variance16x8,
+                   vp9_highbd_10_sub_pixel_avg_variance16x8,
+                   vp9_highbd_sad16x8x3_bits10,
+                   vp9_highbd_sad16x8x8_bits10,
+                   vp9_highbd_sad16x8x4d_bits10)
+
+        HIGHBD_BFP(BLOCK_8X16,
+                   vp9_highbd_sad8x16_bits10,
+                   vp9_highbd_sad8x16_avg_bits10,
+                   vp9_highbd_10_variance8x16,
+                   vp9_highbd_10_sub_pixel_variance8x16,
+                   vp9_highbd_10_sub_pixel_avg_variance8x16,
+                   vp9_highbd_sad8x16x3_bits10,
+                   vp9_highbd_sad8x16x8_bits10,
+                   vp9_highbd_sad8x16x4d_bits10)
+
+        HIGHBD_BFP(BLOCK_8X8,
+                   vp9_highbd_sad8x8_bits10,
+                   vp9_highbd_sad8x8_avg_bits10,
+                   vp9_highbd_10_variance8x8,
+                   vp9_highbd_10_sub_pixel_variance8x8,
+                   vp9_highbd_10_sub_pixel_avg_variance8x8,
+                   vp9_highbd_sad8x8x3_bits10,
+                   vp9_highbd_sad8x8x8_bits10,
+                   vp9_highbd_sad8x8x4d_bits10)
+
+        HIGHBD_BFP(BLOCK_8X4,
+                   vp9_highbd_sad8x4_bits10,
+                   vp9_highbd_sad8x4_avg_bits10,
+                   vp9_highbd_10_variance8x4,
+                   vp9_highbd_10_sub_pixel_variance8x4,
+                   vp9_highbd_10_sub_pixel_avg_variance8x4,
+                   NULL,
+                   vp9_highbd_sad8x4x8_bits10,
+                   vp9_highbd_sad8x4x4d_bits10)
+
+        HIGHBD_BFP(BLOCK_4X8,
+                   vp9_highbd_sad4x8_bits10,
+                   vp9_highbd_sad4x8_avg_bits10,
+                   vp9_highbd_10_variance4x8,
+                   vp9_highbd_10_sub_pixel_variance4x8,
+                   vp9_highbd_10_sub_pixel_avg_variance4x8,
+                   NULL,
+                   vp9_highbd_sad4x8x8_bits10,
+                   vp9_highbd_sad4x8x4d_bits10)
+
+        HIGHBD_BFP(BLOCK_4X4,
+                   vp9_highbd_sad4x4_bits10,
+                   vp9_highbd_sad4x4_avg_bits10,
+                   vp9_highbd_10_variance4x4,
+                   vp9_highbd_10_sub_pixel_variance4x4,
+                   vp9_highbd_10_sub_pixel_avg_variance4x4,
+                   vp9_highbd_sad4x4x3_bits10,
+                   vp9_highbd_sad4x4x8_bits10,
+                   vp9_highbd_sad4x4x4d_bits10)
+        break;
+
+      case VPX_BITS_12:
+        HIGHBD_BFP(BLOCK_32X16,
+                   vp9_highbd_sad32x16_bits12,
+                   vp9_highbd_sad32x16_avg_bits12,
+                   vp9_highbd_12_variance32x16,
+                   vp9_highbd_12_sub_pixel_variance32x16,
+                   vp9_highbd_12_sub_pixel_avg_variance32x16,
+                   NULL,
+                   NULL,
+                   vp9_highbd_sad32x16x4d_bits12)
+
+        HIGHBD_BFP(BLOCK_16X32,
+                   vp9_highbd_sad16x32_bits12,
+                   vp9_highbd_sad16x32_avg_bits12,
+                   vp9_highbd_12_variance16x32,
+                   vp9_highbd_12_sub_pixel_variance16x32,
+                   vp9_highbd_12_sub_pixel_avg_variance16x32,
+                   NULL,
+                   NULL,
+                   vp9_highbd_sad16x32x4d_bits12)
+
+        HIGHBD_BFP(BLOCK_64X32,
+                   vp9_highbd_sad64x32_bits12,
+                   vp9_highbd_sad64x32_avg_bits12,
+                   vp9_highbd_12_variance64x32,
+                   vp9_highbd_12_sub_pixel_variance64x32,
+                   vp9_highbd_12_sub_pixel_avg_variance64x32,
+                   NULL,
+                   NULL,
+                   vp9_highbd_sad64x32x4d_bits12)
+
+        HIGHBD_BFP(BLOCK_32X64,
+                   vp9_highbd_sad32x64_bits12,
+                   vp9_highbd_sad32x64_avg_bits12,
+                   vp9_highbd_12_variance32x64,
+                   vp9_highbd_12_sub_pixel_variance32x64,
+                   vp9_highbd_12_sub_pixel_avg_variance32x64,
+                   NULL,
+                   NULL,
+                   vp9_highbd_sad32x64x4d_bits12)
+
+        HIGHBD_BFP(BLOCK_32X32,
+                   vp9_highbd_sad32x32_bits12,
+                   vp9_highbd_sad32x32_avg_bits12,
+                   vp9_highbd_12_variance32x32,
+                   vp9_highbd_12_sub_pixel_variance32x32,
+                   vp9_highbd_12_sub_pixel_avg_variance32x32,
+                   vp9_highbd_sad32x32x3_bits12,
+                   vp9_highbd_sad32x32x8_bits12,
+                   vp9_highbd_sad32x32x4d_bits12)
+
+        HIGHBD_BFP(BLOCK_64X64,
+                   vp9_highbd_sad64x64_bits12,
+                   vp9_highbd_sad64x64_avg_bits12,
+                   vp9_highbd_12_variance64x64,
+                   vp9_highbd_12_sub_pixel_variance64x64,
+                   vp9_highbd_12_sub_pixel_avg_variance64x64,
+                   vp9_highbd_sad64x64x3_bits12,
+                   vp9_highbd_sad64x64x8_bits12,
+                   vp9_highbd_sad64x64x4d_bits12)
+
+        HIGHBD_BFP(BLOCK_16X16,
+                   vp9_highbd_sad16x16_bits12,
+                   vp9_highbd_sad16x16_avg_bits12,
+                   vp9_highbd_12_variance16x16,
+                   vp9_highbd_12_sub_pixel_variance16x16,
+                   vp9_highbd_12_sub_pixel_avg_variance16x16,
+                   vp9_highbd_sad16x16x3_bits12,
+                   vp9_highbd_sad16x16x8_bits12,
+                   vp9_highbd_sad16x16x4d_bits12)
+
+        HIGHBD_BFP(BLOCK_16X8,
+                   vp9_highbd_sad16x8_bits12,
+                   vp9_highbd_sad16x8_avg_bits12,
+                   vp9_highbd_12_variance16x8,
+                   vp9_highbd_12_sub_pixel_variance16x8,
+                   vp9_highbd_12_sub_pixel_avg_variance16x8,
+                   vp9_highbd_sad16x8x3_bits12,
+                   vp9_highbd_sad16x8x8_bits12,
+                   vp9_highbd_sad16x8x4d_bits12)
+
+        HIGHBD_BFP(BLOCK_8X16,
+                   vp9_highbd_sad8x16_bits12,
+                   vp9_highbd_sad8x16_avg_bits12,
+                   vp9_highbd_12_variance8x16,
+                   vp9_highbd_12_sub_pixel_variance8x16,
+                   vp9_highbd_12_sub_pixel_avg_variance8x16,
+                   vp9_highbd_sad8x16x3_bits12,
+                   vp9_highbd_sad8x16x8_bits12,
+                   vp9_highbd_sad8x16x4d_bits12)
+
+        HIGHBD_BFP(BLOCK_8X8,
+                   vp9_highbd_sad8x8_bits12,
+                   vp9_highbd_sad8x8_avg_bits12,
+                   vp9_highbd_12_variance8x8,
+                   vp9_highbd_12_sub_pixel_variance8x8,
+                   vp9_highbd_12_sub_pixel_avg_variance8x8,
+                   vp9_highbd_sad8x8x3_bits12,
+                   vp9_highbd_sad8x8x8_bits12,
+                   vp9_highbd_sad8x8x4d_bits12)
+
+        HIGHBD_BFP(BLOCK_8X4,
+                   vp9_highbd_sad8x4_bits12,
+                   vp9_highbd_sad8x4_avg_bits12,
+                   vp9_highbd_12_variance8x4,
+                   vp9_highbd_12_sub_pixel_variance8x4,
+                   vp9_highbd_12_sub_pixel_avg_variance8x4,
+                   NULL,
+                   vp9_highbd_sad8x4x8_bits12,
+                   vp9_highbd_sad8x4x4d_bits12)
+
+        HIGHBD_BFP(BLOCK_4X8,
+                   vp9_highbd_sad4x8_bits12,
+                   vp9_highbd_sad4x8_avg_bits12,
+                   vp9_highbd_12_variance4x8,
+                   vp9_highbd_12_sub_pixel_variance4x8,
+                   vp9_highbd_12_sub_pixel_avg_variance4x8,
+                   NULL,
+                   vp9_highbd_sad4x8x8_bits12,
+                   vp9_highbd_sad4x8x4d_bits12)
+
+        HIGHBD_BFP(BLOCK_4X4,
+                   vp9_highbd_sad4x4_bits12,
+                   vp9_highbd_sad4x4_avg_bits12,
+                   vp9_highbd_12_variance4x4,
+                   vp9_highbd_12_sub_pixel_variance4x4,
+                   vp9_highbd_12_sub_pixel_avg_variance4x4,
+                   vp9_highbd_sad4x4x3_bits12,
+                   vp9_highbd_sad4x4x8_bits12,
+                   vp9_highbd_sad4x4x4d_bits12)
+        break;
+
+      default:
+        assert(0 && "cm->bit_depth should be VPX_BITS_8, "
+                    "VPX_BITS_10 or VPX_BITS_12");
+    }
+  }
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
 void vp9_change_config(struct VP9_COMP *cpi, const VP9EncoderConfig *oxcf) {
   VP9_COMMON *const cm = &cpi->common;
   RATE_CONTROL *const rc = &cpi->rc;
@@ -619,10 +1225,8 @@
 
   cpi->oxcf = *oxcf;
 #if CONFIG_VP9_HIGHBITDEPTH
-  if (cpi->oxcf.use_highbitdepth) {
-    cpi->mb.e_mbd.bd = (int)cm->bit_depth;
-  }
-#endif
+  cpi->mb.e_mbd.bd = (int)cm->bit_depth;
+#endif  // CONFIG_VP9_HIGHBITDEPTH
 
   rc->baseline_gf_interval = DEFAULT_GF_INTERVAL;
 
@@ -693,6 +1297,10 @@
   cpi->ext_refresh_frame_flags_pending = 0;
   cpi->ext_refresh_frame_context_pending = 0;
 
+#if CONFIG_VP9_HIGHBITDEPTH
+  highbd_set_var_fns(cpi);
+#endif
+
 #if CONFIG_VP9_TEMPORAL_DENOISING
   if (cpi->oxcf.noise_sensitivity > 0) {
     vp9_denoiser_alloc(&(cpi->denoiser), cm->width, cm->height,
@@ -772,7 +1380,7 @@
   vp9_rc_init(&cpi->oxcf, oxcf->pass, &cpi->rc);
 
   cm->current_video_frame = 0;
-  cpi->skippable_frame = 0;
+  cpi->partition_search_skippable_frame = 0;
 
   // Create the encoder segmentation map and set all entries to 0
   CHECK_MEM_ERROR(cm, cpi->segmentation_map,
@@ -1066,6 +1674,10 @@
       vp9_sub_pixel_avg_variance4x4,
       vp9_sad4x4x3, vp9_sad4x4x8, vp9_sad4x4x4d)
 
+#if CONFIG_VP9_HIGHBITDEPTH
+  highbd_set_var_fns(cpi);
+#endif
+
   /* vp9_init_quantizer() is first called here. Add check in
    * vp9_frame_init_quantizer() so that vp9_init_quantizer is only
    * called later when needed. This will avoid unnecessary calls of
@@ -1098,15 +1710,16 @@
                              - cpi->first_time_stamp_ever) / 10000000.000;
       double total_encode_time = (cpi->time_receive_data +
                                   cpi->time_compress_data)   / 1000.000;
-      double dr = (double)cpi->bytes * (double) 8 / (double)1000
-                  / time_encoded;
+      const double dr =
+          (double)cpi->bytes * 8.0 / 1000.0 / time_encoded;
+      const double peak = (double)((1 << cpi->oxcf.input_bit_depth) - 1);
 
       if (cpi->b_calculate_psnr) {
         const double total_psnr =
-            vpx_sse_to_psnr((double)cpi->total_samples, 255.0,
+            vpx_sse_to_psnr((double)cpi->total_samples, peak,
                             (double)cpi->total_sq_error);
         const double totalp_psnr =
-            vpx_sse_to_psnr((double)cpi->totalp_samples, 255.0,
+            vpx_sse_to_psnr((double)cpi->totalp_samples, peak,
                             (double)cpi->totalp_sq_error);
         const double total_ssim = 100 * pow(cpi->summed_quality /
                                                 cpi->summed_weights, 8.0);
@@ -1193,6 +1806,7 @@
 
 #endif
 }
+
 static int64_t get_sse(const uint8_t *a, int a_stride,
                        const uint8_t *b, int b_stride,
                        int width, int height) {
@@ -1234,6 +1848,63 @@
   return total_sse;
 }
 
+#if CONFIG_VP9_HIGHBITDEPTH
+static int64_t highbd_get_sse_shift(const uint8_t *a8, int a_stride,
+                                    const uint8_t *b8, int b_stride,
+                                    int width, int height,
+                                    unsigned int input_shift) {
+  const uint16_t *a = CONVERT_TO_SHORTPTR(a8);
+  const uint16_t *b = CONVERT_TO_SHORTPTR(b8);
+  int64_t total_sse = 0;
+  int x, y;
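+  // Shift both sources down to the original input bit depth so the error
+  // is measured at source precision.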
+  for (y = 0; y < height; ++y) {
+    for (x = 0; x < width; ++x) {
+      const int64_t diff = (a[x] >> input_shift) - (b[x] >> input_shift);
+      total_sse += diff * diff;
+    }
+    a += a_stride;
+    b += b_stride;
+  }
+  return total_sse;
+}
+
+static int64_t highbd_get_sse(const uint8_t *a, int a_stride,
+                              const uint8_t *b, int b_stride,
+                              int width, int height) {
+  int64_t total_sse = 0;
+  int x, y;
+  const int dw = width % 16;
+  const int dh = height % 16;
+  unsigned int sse = 0;
+  int sum = 0;
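+  // Handle the right-edge strip (width % 16) over the full height and the
+  // bottom strip (height % 16) with the generic variance routine, then
+  // cover the remaining area with optimized 16x16 MSE blocks.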
+  if (dw > 0) {
+    highbd_variance(&a[width - dw], a_stride, &b[width - dw], b_stride,
+                    dw, height, &sse, &sum);
+    total_sse += sse;
+  }
+  if (dh > 0) {
+    highbd_variance(&a[(height - dh) * a_stride], a_stride,
+                    &b[(height - dh) * b_stride], b_stride,
+                    width - dw, dh, &sse, &sum);
+    total_sse += sse;
+  }
+  for (y = 0; y < height / 16; ++y) {
+    const uint8_t *pa = a;
+    const uint8_t *pb = b;
+    for (x = 0; x < width / 16; ++x) {
+      vp9_highbd_mse16x16(pa, a_stride, pb, b_stride, &sse);
+      total_sse += sse;
+      pa += 16;
+      pb += 16;
+    }
+    a += 16 * a_stride;
+    b += 16 * b_stride;
+  }
+  return total_sse;
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
 typedef struct {
   double psnr[4];       // total/y/u/v
   uint64_t sse[4];      // total/y/u/v
@@ -1242,6 +1913,7 @@
 
 static void calc_psnr(const YV12_BUFFER_CONFIG *a, const YV12_BUFFER_CONFIG *b,
                       PSNR_STATS *psnr) {
+  static const double peak = 255.0;
   const int widths[3]        = {a->y_width,  a->uv_width,  a->uv_width };
   const int heights[3]       = {a->y_height, a->uv_height, a->uv_height};
   const uint8_t *a_planes[3] = {a->y_buffer, a->u_buffer,  a->v_buffer };
@@ -1261,7 +1933,7 @@
                                  w, h);
     psnr->sse[1 + i] = sse;
     psnr->samples[1 + i] = samples;
-    psnr->psnr[1 + i] = vpx_sse_to_psnr(samples, 255.0, (double)sse);
+    psnr->psnr[1 + i] = vpx_sse_to_psnr(samples, peak, (double)sse);
 
     total_sse += sse;
     total_samples += samples;
@@ -1269,15 +1941,73 @@
 
   psnr->sse[0] = total_sse;
   psnr->samples[0] = total_samples;
-  psnr->psnr[0] = vpx_sse_to_psnr((double)total_samples, 255.0,
+  psnr->psnr[0] = vpx_sse_to_psnr((double)total_samples, peak,
                                   (double)total_sse);
 }
 
+#if CONFIG_VP9_HIGHBITDEPTH
+static void calc_highbd_psnr(const YV12_BUFFER_CONFIG *a,
+                             const YV12_BUFFER_CONFIG *b,
+                             PSNR_STATS *psnr,
+                             unsigned int bit_depth,
+                             unsigned int in_bit_depth) {
+  const int widths[3] = {a->y_width,  a->uv_width,  a->uv_width };
+  const int heights[3] = {a->y_height, a->uv_height, a->uv_height};
+  const uint8_t *a_planes[3] = {a->y_buffer, a->u_buffer,  a->v_buffer };
+  const int a_strides[3] = {a->y_stride, a->uv_stride, a->uv_stride};
+  const uint8_t *b_planes[3] = {b->y_buffer, b->u_buffer,  b->v_buffer };
+  const int b_strides[3] = {b->y_stride, b->uv_stride, b->uv_stride};
+  int i;
+  uint64_t total_sse = 0;
+  uint32_t total_samples = 0;
+  const double peak = (double)((1 << in_bit_depth) - 1);
+  const unsigned int input_shift = bit_depth - in_bit_depth;
+
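+  // When frames are coded at a higher bit depth than the input, samples are
+  // shifted back down before the SSE computation.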
+  for (i = 0; i < 3; ++i) {
+    const int w = widths[i];
+    const int h = heights[i];
+    const uint32_t samples = w * h;
+    uint64_t sse;
+    if (a->flags & YV12_FLAG_HIGHBITDEPTH) {
+      if (input_shift) {
+        sse = highbd_get_sse_shift(a_planes[i], a_strides[i],
+                                   b_planes[i], b_strides[i], w, h,
+                                   input_shift);
+      } else {
+        sse = highbd_get_sse(a_planes[i], a_strides[i],
+                             b_planes[i], b_strides[i], w, h);
+      }
+    } else {
+      sse = get_sse(a_planes[i], a_strides[i],
+                    b_planes[i], b_strides[i],
+                    w, h);
+    }
+    psnr->sse[1 + i] = sse;
+    psnr->samples[1 + i] = samples;
+    psnr->psnr[1 + i] = vpx_sse_to_psnr(samples, peak, (double)sse);
+
+    total_sse += sse;
+    total_samples += samples;
+  }
+
+  psnr->sse[0] = total_sse;
+  psnr->samples[0] = total_samples;
+  psnr->psnr[0] = vpx_sse_to_psnr((double)total_samples, peak,
+                                  (double)total_sse);
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
 static void generate_psnr_packet(VP9_COMP *cpi) {
   struct vpx_codec_cx_pkt pkt;
   int i;
   PSNR_STATS psnr;
+#if CONFIG_VP9_HIGHBITDEPTH
+  calc_highbd_psnr(cpi->Source, cpi->common.frame_to_show, &psnr,
+                   cpi->mb.e_mbd.bd, cpi->oxcf.input_bit_depth);
+#else
   calc_psnr(cpi->Source, cpi->common.frame_to_show, &psnr);
+#endif
+
   for (i = 0; i < 4; ++i) {
     pkt.data.psnr.samples[i] = psnr.samples[i];
     pkt.data.psnr.sse[i] = psnr.sse[i];
@@ -1386,6 +2116,36 @@
   uint8_t *src = s->y_buffer;
   int h = cm->height;
 
+#if CONFIG_VP9_HIGHBITDEPTH
+  if (s->flags & YV12_FLAG_HIGHBITDEPTH) {
+    uint16_t *src16 = CONVERT_TO_SHORTPTR(s->y_buffer);
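+
+    // High-bitdepth buffers hold 16-bit samples, so write two bytes per
+    // pixel.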
+
+    do {
+      fwrite(src16, s->y_width, 2,  yuv_rec_file);
+      src16 += s->y_stride;
+    } while (--h);
+
+    src16 = CONVERT_TO_SHORTPTR(s->u_buffer);
+    h = s->uv_height;
+
+    do {
+      fwrite(src16, s->uv_width, 2,  yuv_rec_file);
+      src16 += s->uv_stride;
+    } while (--h);
+
+    src16 = CONVERT_TO_SHORTPTR(s->v_buffer);
+    h = s->uv_height;
+
+    do {
+      fwrite(src16, s->uv_width, 2, yuv_rec_file);
+      src16 += s->uv_stride;
+    } while (--h);
+
+    fflush(yuv_rec_file);
+    return;
+  }
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
   do {
     fwrite(src, s->y_width, 1,  yuv_rec_file);
     src += s->y_stride;
@@ -1411,8 +2171,14 @@
 }
 #endif
 
+#if CONFIG_VP9_HIGHBITDEPTH
+static void scale_and_extend_frame_nonnormative(const YV12_BUFFER_CONFIG *src,
+                                                YV12_BUFFER_CONFIG *dst,
+                                                int bd) {
+#else
 static void scale_and_extend_frame_nonnormative(const YV12_BUFFER_CONFIG *src,
                                                 YV12_BUFFER_CONFIG *dst) {
+#endif  // CONFIG_VP9_HIGHBITDEPTH
   // TODO(dkovalev): replace YV12_BUFFER_CONFIG with vpx_image_t
   int i;
   const uint8_t *const srcs[3] = {src->y_buffer, src->u_buffer, src->v_buffer};
@@ -1428,15 +2194,31 @@
   const int dst_heights[3] = {dst->y_crop_height, dst->uv_crop_height,
                               dst->uv_crop_height};
 
-  for (i = 0; i < MAX_MB_PLANE; ++i)
+  for (i = 0; i < MAX_MB_PLANE; ++i) {
+#if CONFIG_VP9_HIGHBITDEPTH
+    if (src->flags & YV12_FLAG_HIGHBITDEPTH) {
+      vp9_highbd_resize_plane(srcs[i], src_heights[i], src_widths[i],
+                              src_strides[i], dsts[i], dst_heights[i],
+                              dst_widths[i], dst_strides[i], bd);
+    } else {
+      vp9_resize_plane(srcs[i], src_heights[i], src_widths[i], src_strides[i],
+                       dsts[i], dst_heights[i], dst_widths[i], dst_strides[i]);
+    }
+#else
     vp9_resize_plane(srcs[i], src_heights[i], src_widths[i], src_strides[i],
                      dsts[i], dst_heights[i], dst_widths[i], dst_strides[i]);
-
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+  }
   vp9_extend_frame_borders(dst);
 }
 
+#if CONFIG_VP9_HIGHBITDEPTH
+static void scale_and_extend_frame(const YV12_BUFFER_CONFIG *src,
+                                   YV12_BUFFER_CONFIG *dst, int bd) {
+#else
 static void scale_and_extend_frame(const YV12_BUFFER_CONFIG *src,
                                    YV12_BUFFER_CONFIG *dst) {
+#endif  // CONFIG_VP9_HIGHBITDEPTH
   const int src_w = src->y_crop_width;
   const int src_h = src->y_crop_height;
   const int dst_w = dst->y_crop_width;
@@ -1460,10 +2242,24 @@
                                      src_stride + (x / factor) * src_w / dst_w;
         uint8_t *dst_ptr = dsts[i] + (y / factor) * dst_stride + (x / factor);
 
+#if CONFIG_VP9_HIGHBITDEPTH
+        if (src->flags & YV12_FLAG_HIGHBITDEPTH) {
+          vp9_highbd_convolve8(src_ptr, src_stride, dst_ptr, dst_stride,
+                               kernel[x_q4 & 0xf], 16 * src_w / dst_w,
+                               kernel[y_q4 & 0xf], 16 * src_h / dst_h,
+                               16 / factor, 16 / factor, bd);
+        } else {
+          vp9_convolve8(src_ptr, src_stride, dst_ptr, dst_stride,
+                        kernel[x_q4 & 0xf], 16 * src_w / dst_w,
+                        kernel[y_q4 & 0xf], 16 * src_h / dst_h,
+                        16 / factor, 16 / factor);
+        }
+#else
         vp9_convolve8(src_ptr, src_stride, dst_ptr, dst_stride,
                       kernel[x_q4 & 0xf], 16 * src_w / dst_w,
                       kernel[y_q4 & 0xf], 16 * src_h / dst_h,
                       16 / factor, 16 / factor);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
       }
     }
   }
@@ -1632,9 +2428,14 @@
                                cm->subsampling_x, cm->subsampling_y,
 #if CONFIG_VP9_HIGHBITDEPTH
                                cm->use_highbitdepth,
-#endif
+#endif  // CONFIG_VP9_HIGHBITDEPTH
                                VP9_ENC_BORDER_IN_PIXELS, NULL, NULL, NULL);
+#if CONFIG_VP9_HIGHBITDEPTH
+      scale_and_extend_frame(ref, &cm->frame_bufs[new_fb].buf,
+                             (int)cm->bit_depth);
+#else
       scale_and_extend_frame(ref, &cm->frame_bufs[new_fb].buf);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
       cpi->scaled_ref_idx[ref_frame - 1] = new_fb;
     } else {
       cpi->scaled_ref_idx[ref_frame - 1] = idx;
@@ -1698,11 +2499,12 @@
         cpi->rc.total_target_vs_actual,
         (cpi->rc.starting_buffer_level - cpi->rc.bits_off_target),
         cpi->rc.total_actual_bits, cm->base_qindex,
-        vp9_convert_qindex_to_q(cm->base_qindex),
-        (double)vp9_dc_quant(cm->base_qindex, 0) / 4.0,
-        vp9_convert_qindex_to_q(cpi->twopass.active_worst_quality),
+        vp9_convert_qindex_to_q(cm->base_qindex, cm->bit_depth),
+        (double)vp9_dc_quant(cm->base_qindex, 0, cm->bit_depth) / 4.0,
+        vp9_convert_qindex_to_q(cpi->twopass.active_worst_quality,
+                                cm->bit_depth),
         cpi->rc.avg_q,
-        vp9_convert_qindex_to_q(cpi->oxcf.cq_level),
+        vp9_convert_qindex_to_q(cpi->oxcf.cq_level, cm->bit_depth),
         cpi->refresh_last_frame, cpi->refresh_golden_frame,
         cpi->refresh_alt_ref_frame, cm->frame_type, cpi->rc.gfu_boost,
         cpi->twopass.bits_left,
@@ -1824,11 +2626,22 @@
            rc->this_key_frame_forced &&
            (rc->projected_frame_size < rc->max_frame_bandwidth)) {
         int last_q = q;
-        int kf_err = vp9_get_y_sse(cpi->Source, get_frame_new_buffer(cm));
+        int kf_err;
 
         int high_err_target = cpi->ambient_err;
         int low_err_target = cpi->ambient_err >> 1;
 
+#if CONFIG_VP9_HIGHBITDEPTH
+        if (cm->use_highbitdepth) {
+          kf_err = vp9_highbd_get_y_sse(cpi->Source, get_frame_new_buffer(cm),
+                                        cm->bit_depth);
+        } else {
+          kf_err = vp9_get_y_sse(cpi->Source, get_frame_new_buffer(cm));
+        }
+#else
+        kf_err = vp9_get_y_sse(cpi->Source, get_frame_new_buffer(cm));
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
         // Prevent possible divide by zero error below for perfect KF
         kf_err += !kf_err;
 
@@ -1999,7 +2812,11 @@
                                           YV12_BUFFER_CONFIG *scaled) {
   if (cm->mi_cols * MI_SIZE != unscaled->y_width ||
       cm->mi_rows * MI_SIZE != unscaled->y_height) {
+#if CONFIG_VP9_HIGHBITDEPTH
+    scale_and_extend_frame_nonnormative(unscaled, scaled, (int)cm->bit_depth);
+#else
     scale_and_extend_frame_nonnormative(unscaled, scaled);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
     return scaled;
   } else {
     return unscaled;
@@ -2165,7 +2982,9 @@
     }
   }
   if (is_two_pass_svc(cpi) && cm->error_resilient_mode == 0) {
+    // Use the last frame context for the empty frame.
     cm->frame_context_idx =
+        (cpi->svc.encode_empty_frame_state == ENCODING) ? FRAME_CONTEXTS - 1 :
         cpi->svc.spatial_layer_id * cpi->svc.number_temporal_layers +
         cpi->svc.temporal_layer_id;
 
@@ -2203,9 +3022,9 @@
 
   // Check if the current frame is skippable for the partition search in the
   // second pass according to the first pass stats
-  if (oxcf->pass == 2 &&
+  if (cpi->sf.allow_partition_search_skip && oxcf->pass == 2 &&
       (!cpi->use_svc || is_two_pass_svc(cpi))) {
-    cpi->skippable_frame = is_skippable_frame(cpi);
+    cpi->partition_search_skippable_frame = is_skippable_frame(cpi);
   }
 
   // For 1 pass CBR, check if we are dropping this frame.
@@ -2289,7 +3108,17 @@
   // fixed interval. Note the reconstruction error if it is the frame before
   // the force key frame
   if (cpi->rc.next_key_frame_forced && cpi->rc.frames_to_key == 1) {
+#if CONFIG_VP9_HIGHBITDEPTH
+    if (cm->use_highbitdepth) {
+      cpi->ambient_err = vp9_highbd_get_y_sse(cpi->Source,
+                                              get_frame_new_buffer(cm),
+                                              cm->bit_depth);
+    } else {
+      cpi->ambient_err = vp9_get_y_sse(cpi->Source, get_frame_new_buffer(cm));
+    }
+#else
     cpi->ambient_err = vp9_get_y_sse(cpi->Source, get_frame_new_buffer(cm));
+#endif  // CONFIG_VP9_HIGHBITDEPTH
   }
 
   // If the encoder forced a KEY_FRAME decision
@@ -2336,7 +3165,9 @@
   cpi->ref_frame_flags = get_ref_frame_flags(cpi);
 
   cm->last_frame_type = cm->frame_type;
-  vp9_rc_postencode_update(cpi, *size);
+
+  if (!(is_two_pass_svc(cpi) && cpi->svc.encode_empty_frame_state == ENCODING))
+    vp9_rc_postencode_update(cpi, *size);
 
 #if 0
   output_frame_level_debug_stats(cpi);
@@ -2360,12 +3191,8 @@
   cm->last_height = cm->height;
 
   // reset to normal state now that we are done.
-  if (!cm->show_existing_frame) {
-    if (is_two_pass_svc(cpi) && cm->error_resilient_mode == 0)
-      cm->last_show_frame = 0;
-    else
-      cm->last_show_frame = cm->show_frame;
-  }
+  if (!cm->show_existing_frame)
+    cm->last_show_frame = cm->show_frame;
 
   if (cm->show_frame) {
     vp9_swap_mi_and_prev_mi(cm);
@@ -2402,7 +3229,9 @@
                         uint8_t *dest, unsigned int *frame_flags) {
   cpi->allow_encode_breakout = ENCODE_BREAKOUT_ENABLED;
   encode_frame_to_data_rate(cpi, size, dest, frame_flags);
-  vp9_twopass_postencode_update(cpi);
+
+  if (!(is_two_pass_svc(cpi) && cpi->svc.encode_empty_frame_state == ENCODING))
+    vp9_twopass_postencode_update(cpi);
 }
 
 static void init_motion_estimation(VP9_COMP *cpi) {
@@ -2415,13 +3244,19 @@
   }
 }
 
-static void check_initial_width(VP9_COMP *cpi, int subsampling_x,
-                                int subsampling_y) {
+static void check_initial_width(VP9_COMP *cpi,
+#if CONFIG_VP9_HIGHBITDEPTH
+                                int use_highbitdepth,
+#endif
+                                int subsampling_x, int subsampling_y) {
   VP9_COMMON *const cm = &cpi->common;
 
   if (!cpi->initial_width) {
     cm->subsampling_x = subsampling_x;
     cm->subsampling_y = subsampling_y;
+#if CONFIG_VP9_HIGHBITDEPTH
+    cm->use_highbitdepth = use_highbitdepth;
+#endif
 
     alloc_raw_frame_buffers(cpi);
     alloc_ref_frame_buffers(cpi);
@@ -2441,10 +3276,14 @@
   VP9_COMMON *cm = &cpi->common;
   struct vpx_usec_timer timer;
   int res = 0;
-  const int subsampling_x = sd->uv_width  < sd->y_width;
-  const int subsampling_y = sd->uv_height < sd->y_height;
-
+  const int subsampling_x = sd->subsampling_x;
+  const int subsampling_y = sd->subsampling_y;
+#if CONFIG_VP9_HIGHBITDEPTH
+  const int use_highbitdepth = sd->flags & YV12_FLAG_HIGHBITDEPTH;
+  check_initial_width(cpi, use_highbitdepth, subsampling_x, subsampling_y);
+#else
   check_initial_width(cpi, subsampling_x, subsampling_y);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
 
   vpx_usec_timer_start(&timer);
 
@@ -2580,6 +3419,9 @@
   if (is_two_pass_svc(cpi)) {
 #if CONFIG_SPATIAL_SVC
     vp9_svc_start_frame(cpi);
+    // Use a small empty frame instead of a real frame
+    if (cpi->svc.encode_empty_frame_state == ENCODING)
+      source = &cpi->svc.empty_frame;
 #endif
     if (oxcf->pass == 2)
       vp9_restore_layer_context(cpi);
@@ -2598,6 +3440,11 @@
 
   // Should we encode an arf frame.
   arf_src_index = get_arf_src_index(cpi);
+
+  // Skip the alt ref frame if we are encoding the empty frame
+  if (is_two_pass_svc(cpi) && source != NULL)
+    arf_src_index = 0;
+
   if (arf_src_index) {
     assert(arf_src_index <= rc->frames_to_key);
 
@@ -2708,7 +3555,10 @@
 
   // For two pass encodes analyse the first pass stats and determine
   // the bit allocation and other parameters for this frame / group of frames.
-  if ((oxcf->pass == 2) && (!cpi->use_svc || is_two_pass_svc(cpi))) {
+  if ((oxcf->pass == 2) &&
+      (!cpi->use_svc ||
+       (is_two_pass_svc(cpi) &&
+        cpi->svc.encode_empty_frame_state != ENCODING))) {
     vp9_rc_get_second_pass_params(cpi);
   }
 
@@ -2760,7 +3610,7 @@
     vp9_setup_scale_factors_for_frame(&ref_buf->sf,
                                       buf->y_crop_width, buf->y_crop_height,
                                       cm->width, cm->height);
-#endif
+#endif  // CONFIG_VP9_HIGHBITDEPTH
     if (vp9_is_scaled(&ref_buf->sf))
       vp9_extend_frame_borders(buf);
   }
@@ -2776,14 +3626,14 @@
     const int lossless = is_lossless_requested(oxcf);
 #if CONFIG_VP9_HIGHBITDEPTH
     if (cpi->oxcf.use_highbitdepth)
-      cpi->mb.fwd_txm4x4 = lossless ? vp9_high_fwht4x4 : vp9_high_fdct4x4;
+      cpi->mb.fwd_txm4x4 = lossless ? vp9_highbd_fwht4x4 : vp9_highbd_fdct4x4;
     else
       cpi->mb.fwd_txm4x4 = lossless ? vp9_fwht4x4 : vp9_fdct4x4;
-    cpi->mb.high_itxm_add = lossless ? vp9_high_iwht4x4_add :
-                                       vp9_high_idct4x4_add;
+    cpi->mb.highbd_itxm_add = lossless ? vp9_highbd_iwht4x4_add :
+                                         vp9_highbd_idct4x4_add;
 #else
     cpi->mb.fwd_txm4x4 = lossless ? vp9_fwht4x4 : vp9_fdct4x4;
-#endif
+#endif  // CONFIG_VP9_HIGHBITDEPTH
     cpi->mb.itxm_add = lossless ? vp9_iwht4x4_add : vp9_idct4x4_add;
     vp9_first_pass(cpi, source);
   } else if (oxcf->pass == 2 &&
@@ -2836,7 +3686,12 @@
         YV12_BUFFER_CONFIG *recon = cpi->common.frame_to_show;
         YV12_BUFFER_CONFIG *pp = &cm->post_proc_buffer;
         PSNR_STATS psnr;
+#if CONFIG_VP9_HIGHBITDEPTH
+        calc_highbd_psnr(orig, recon, &psnr, cpi->mb.e_mbd.bd,
+                         cpi->oxcf.input_bit_depth);
+#else
         calc_psnr(orig, recon, &psnr);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
 
         cpi->total += psnr.psnr[0];
         cpi->total_y += psnr.psnr[1];
@@ -2856,7 +3711,12 @@
 #endif
           vp9_clear_system_state();
 
+#if CONFIG_VP9_HIGHBITDEPTH
+          calc_highbd_psnr(orig, pp, &psnr2, cpi->mb.e_mbd.bd,
+                           cpi->oxcf.input_bit_depth);
+#else
           calc_psnr(orig, pp, &psnr2);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
 
           cpi->totalp += psnr2.psnr[0];
           cpi->totalp_y += psnr2.psnr[1];
@@ -2865,12 +3725,29 @@
           cpi->totalp_sq_error += psnr2.sse[0];
           cpi->totalp_samples += psnr2.samples[0];
 
+#if CONFIG_VP9_HIGHBITDEPTH
+          if (cm->use_highbitdepth) {
+            frame_ssim2 = vp9_highbd_calc_ssim(orig, recon, &weight, xd->bd);
+          } else {
+            frame_ssim2 = vp9_calc_ssim(orig, recon, &weight);
+          }
+#else
           frame_ssim2 = vp9_calc_ssim(orig, recon, &weight);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
 
           cpi->summed_quality += frame_ssim2 * weight;
           cpi->summed_weights += weight;
 
+#if CONFIG_VP9_HIGHBITDEPTH
+          if (cm->use_highbitdepth) {
+            frame_ssim2 = vp9_highbd_calc_ssim(
+                orig, &cm->post_proc_buffer, &weight, xd->bd);
+          } else {
+            frame_ssim2 = vp9_calc_ssim(orig, &cm->post_proc_buffer, &weight);
+          }
+#else
           frame_ssim2 = vp9_calc_ssim(orig, &cm->post_proc_buffer, &weight);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
 
           cpi->summedp_quality += frame_ssim2 * weight;
           cpi->summedp_weights += weight;
@@ -2889,7 +3766,17 @@
 
       if (cpi->b_calculate_ssimg) {
         double y, u, v, frame_all;
+#if CONFIG_VP9_HIGHBITDEPTH
+        if (cm->use_highbitdepth) {
+          frame_all = vp9_highbd_calc_ssimg(cpi->Source, cm->frame_to_show, &y,
+                                            &u, &v, xd->bd);
+        } else {
+          frame_all = vp9_calc_ssimg(cpi->Source, cm->frame_to_show, &y, &u,
+                                     &v);
+        }
+#else
         frame_all = vp9_calc_ssimg(cpi->Source, cm->frame_to_show, &y, &u, &v);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
         cpi->total_ssimg_y += y;
         cpi->total_ssimg_u += u;
         cpi->total_ssimg_v += v;
@@ -2900,10 +3787,18 @@
 
 #endif
 
-  if (is_two_pass_svc(cpi) && cm->show_frame) {
-    ++cpi->svc.spatial_layer_to_encode;
-    if (cpi->svc.spatial_layer_to_encode >= cpi->svc.number_spatial_layers)
-      cpi->svc.spatial_layer_to_encode = 0;
+  if (is_two_pass_svc(cpi)) {
+    if (cpi->svc.encode_empty_frame_state == ENCODING)
+      cpi->svc.encode_empty_frame_state = ENCODED;
+
+    if (cm->show_frame) {
+      ++cpi->svc.spatial_layer_to_encode;
+      if (cpi->svc.spatial_layer_to_encode >= cpi->svc.number_spatial_layers)
+        cpi->svc.spatial_layer_to_encode = 0;
+
+      // May need the empty frame after a visible frame.
+      cpi->svc.encode_empty_frame_state = NEED_TO_ENCODE;
+    }
   }
   return 0;
 }
@@ -2986,15 +3881,14 @@
 int vp9_set_size_literal(VP9_COMP *cpi, unsigned int width,
                          unsigned int height) {
   VP9_COMMON *cm = &cpi->common;
-
+#if CONFIG_VP9_HIGHBITDEPTH
+  check_initial_width(cpi, cm->use_highbitdepth, 1, 1);
+#else
   check_initial_width(cpi, 1, 1);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
 
   if (width) {
     cm->width = width;
-    if (cm->width * 5 < cpi->initial_width) {
-      cm->width = cpi->initial_width / 5 + 1;
-      printf("Warning: Desired width too small, changed to %d\n", cm->width);
-    }
     if (cm->width > cpi->initial_width) {
       cm->width = cpi->initial_width;
       printf("Warning: Desired width too large, changed to %d\n", cm->width);
@@ -3003,10 +3897,6 @@
 
   if (height) {
     cm->height = height;
-    if (cm->height * 5 < cpi->initial_height) {
-      cm->height = cpi->initial_height / 5 + 1;
-      printf("Warning: Desired height too small, changed to %d\n", cm->height);
-    }
     if (cm->height > cpi->initial_height) {
       cm->height = cpi->initial_height;
       printf("Warning: Desired height too large, changed to %d\n", cm->height);
@@ -3033,6 +3923,35 @@
                       a->y_crop_width, a->y_crop_height);
 }
 
+#if CONFIG_VP9_HIGHBITDEPTH
+int vp9_highbd_get_y_sse(const YV12_BUFFER_CONFIG *a,
+                         const YV12_BUFFER_CONFIG *b,
+                         vpx_bit_depth_t bit_depth) {
+  unsigned int sse;
+  int sum;
+  assert(a->y_crop_width == b->y_crop_width);
+  assert(a->y_crop_height == b->y_crop_height);
+  assert((a->flags & YV12_FLAG_HIGHBITDEPTH) != 0);
+  assert((b->flags & YV12_FLAG_HIGHBITDEPTH) != 0);
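+  // The 10- and 12-bit variance routines round the accumulated SSE back
+  // down to an 8-bit scale.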
+  switch (bit_depth) {
+    case VPX_BITS_8:
+      highbd_variance(a->y_buffer, a->y_stride, b->y_buffer, b->y_stride,
+                      a->y_crop_width, a->y_crop_height, &sse, &sum);
+      return (int) sse;
+    case VPX_BITS_10:
+      highbd_10_variance(a->y_buffer, a->y_stride, b->y_buffer, b->y_stride,
+                         a->y_crop_width, a->y_crop_height, &sse, &sum);
+      return (int) sse;
+    case VPX_BITS_12:
+      highbd_12_variance(a->y_buffer, a->y_stride, b->y_buffer, b->y_stride,
+                         a->y_crop_width, a->y_crop_height, &sse, &sum);
+      return (int) sse;
+    default:
+      assert(0 && "bit_depth should be VPX_BITS_8, VPX_BITS_10 or VPX_BITS_12");
+      return -1;
+  }
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
 
 int vp9_get_quantizer(VP9_COMP *cpi) {
   return cpi->common.base_qindex;
diff --git a/source/libvpx/vp9/encoder/vp9_encoder.h b/source/libvpx/vp9/encoder/vp9_encoder.h
index 80774de..2c56b81 100644
--- a/source/libvpx/vp9/encoder/vp9_encoder.h
+++ b/source/libvpx/vp9/encoder/vp9_encoder.h
@@ -14,12 +14,10 @@
 #include <stdio.h>
 
 #include "./vpx_config.h"
-#include "vpx_ports/mem.h"
 #include "vpx/internal/vpx_codec_internal.h"
 #include "vpx/vp8cx.h"
 
 #include "vp9/common/vp9_ppflags.h"
-#include "vp9/common/vp9_entropy.h"
 #include "vp9/common/vp9_entropymode.h"
 #include "vp9/common/vp9_onyxc_int.h"
 
@@ -241,7 +239,8 @@
   YV12_BUFFER_CONFIG *unscaled_last_source;
   YV12_BUFFER_CONFIG scaled_last_source;
 
-  int skippable_frame;
+  // For a still frame, this flag is set to 1 to skip partition search.
+  int partition_search_skippable_frame;
 
   int scaled_ref_idx[3];
   int lst_fb_idx;
@@ -482,7 +481,21 @@
   return mb_rows * mb_cols * (16 * 16 * 3 + 4);
 }
 
+// Get the number of tokens allocated to a tile, using the same calculation
+// as the frame-level token allocation.
+static INLINE int allocated_tokens(TileInfo tile) {
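+  // mi units are 8x8 pixels; (n + 1) >> 1 rounds up to whole 16x16
+  // macroblocks.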
+  int tile_mb_rows = (tile.mi_row_end - tile.mi_row_start + 1) >> 1;
+  int tile_mb_cols = (tile.mi_col_end - tile.mi_col_start + 1) >> 1;
+
+  return get_token_alloc(tile_mb_rows, tile_mb_cols);
+}
+
 int vp9_get_y_sse(const YV12_BUFFER_CONFIG *a, const YV12_BUFFER_CONFIG *b);
+#if CONFIG_VP9_HIGHBITDEPTH
+int vp9_highbd_get_y_sse(const YV12_BUFFER_CONFIG *a,
+                         const YV12_BUFFER_CONFIG *b,
+                         vpx_bit_depth_t bit_depth);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
 
 void vp9_alloc_compressor_data(VP9_COMP *cpi);
 
@@ -525,8 +538,8 @@
   return frame_index & 0x1;
 }
 
-static INLINE int *cond_sad_list(const struct VP9_COMP *cpi, int *sad_list) {
-  return cpi->sf.mv.subpel_search_method != SUBPEL_TREE ? sad_list : NULL;
+static INLINE int *cond_cost_list(const struct VP9_COMP *cpi, int *cost_list) {
+  return cpi->sf.mv.subpel_search_method != SUBPEL_TREE ? cost_list : NULL;
 }
 
 #ifdef __cplusplus
diff --git a/source/libvpx/vp9/encoder/vp9_extend.c b/source/libvpx/vp9/encoder/vp9_extend.c
index e8517c8..c9b2131 100644
--- a/source/libvpx/vp9/encoder/vp9_extend.c
+++ b/source/libvpx/vp9/encoder/vp9_extend.c
@@ -55,6 +55,52 @@
   }
 }
 
+#if CONFIG_VP9_HIGHBITDEPTH
+static void highbd_copy_and_extend_plane(const uint8_t *src8, int src_pitch,
+                                         uint8_t *dst8, int dst_pitch,
+                                         int w, int h,
+                                         int extend_top, int extend_left,
+                                         int extend_bottom, int extend_right) {
+  int i, linesize;
+  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
+
+  // Copy the left- and right-most columns out.
+  const uint16_t *src_ptr1 = src;
+  const uint16_t *src_ptr2 = src + w - 1;
+  uint16_t *dst_ptr1 = dst - extend_left;
+  uint16_t *dst_ptr2 = dst + w;
+
+  for (i = 0; i < h; i++) {
+    vpx_memset16(dst_ptr1, src_ptr1[0], extend_left);
+    vpx_memcpy(dst_ptr1 + extend_left, src_ptr1, w * sizeof(uint16_t));
+    vpx_memset16(dst_ptr2, src_ptr2[0], extend_right);
+    src_ptr1 += src_pitch;
+    src_ptr2 += src_pitch;
+    dst_ptr1 += dst_pitch;
+    dst_ptr2 += dst_pitch;
+  }
+
+  // Now copy the top and bottom lines into each line of the respective
+  // borders
+  src_ptr1 = dst - extend_left;
+  src_ptr2 = dst + dst_pitch * (h - 1) - extend_left;
+  dst_ptr1 = dst + dst_pitch * (-extend_top) - extend_left;
+  dst_ptr2 = dst + dst_pitch * (h) - extend_left;
+  linesize = extend_left + extend_right + w;
+
+  for (i = 0; i < extend_top; i++) {
+    vpx_memcpy(dst_ptr1, src_ptr1, linesize * sizeof(uint16_t));
+    dst_ptr1 += dst_pitch;
+  }
+
+  for (i = 0; i < extend_bottom; i++) {
+    vpx_memcpy(dst_ptr2, src_ptr2, linesize * sizeof(uint16_t));
+    dst_ptr2 += dst_pitch;
+  }
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
 void vp9_copy_and_extend_frame(const YV12_BUFFER_CONFIG *src,
                                YV12_BUFFER_CONFIG *dst) {
   // Extend src frame in buffer
@@ -64,10 +110,10 @@
   // Motion estimation may use src block variance with the block size up
   // to 64x64, so the right and bottom need to be extended to a multiple of
   // 64, or by at least 16 pixels, whichever is greater.
-  const int eb_y = MAX(ALIGN_POWER_OF_TWO(src->y_width, 6) - src->y_width,
-                       16);
-  const int er_y = MAX(ALIGN_POWER_OF_TWO(src->y_height, 6) - src->y_height,
-                       16);
+  const int eb_y = MAX(src->y_width + 16, ALIGN_POWER_OF_TWO(src->y_width, 6))
+      - src->y_crop_width;
+  const int er_y = MAX(src->y_height + 16, ALIGN_POWER_OF_TWO(src->y_height, 6))
+      - src->y_crop_height;
   const int uv_width_subsampling = (src->uv_width != src->y_width);
   const int uv_height_subsampling = (src->uv_height != src->y_height);
   const int et_uv = et_y >> uv_height_subsampling;
@@ -75,19 +121,39 @@
   const int eb_uv = eb_y >> uv_height_subsampling;
   const int er_uv = er_y >> uv_width_subsampling;
 
+#if CONFIG_VP9_HIGHBITDEPTH
+  if (src->flags & YV12_FLAG_HIGHBITDEPTH) {
+    highbd_copy_and_extend_plane(src->y_buffer, src->y_stride,
+                                 dst->y_buffer, dst->y_stride,
+                                 src->y_crop_width, src->y_crop_height,
+                                 et_y, el_y, eb_y, er_y);
+
+    highbd_copy_and_extend_plane(src->u_buffer, src->uv_stride,
+                                 dst->u_buffer, dst->uv_stride,
+                                 src->uv_crop_width, src->uv_crop_height,
+                                 et_uv, el_uv, eb_uv, er_uv);
+
+    highbd_copy_and_extend_plane(src->v_buffer, src->uv_stride,
+                                 dst->v_buffer, dst->uv_stride,
+                                 src->uv_crop_width, src->uv_crop_height,
+                                 et_uv, el_uv, eb_uv, er_uv);
+    return;
+  }
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
   copy_and_extend_plane(src->y_buffer, src->y_stride,
                         dst->y_buffer, dst->y_stride,
-                        src->y_width, src->y_height,
+                        src->y_crop_width, src->y_crop_height,
                         et_y, el_y, eb_y, er_y);
 
   copy_and_extend_plane(src->u_buffer, src->uv_stride,
                         dst->u_buffer, dst->uv_stride,
-                        src->uv_width, src->uv_height,
+                        src->uv_crop_width, src->uv_crop_height,
                         et_uv, el_uv, eb_uv, er_uv);
 
   copy_and_extend_plane(src->v_buffer, src->uv_stride,
                         dst->v_buffer, dst->uv_stride,
-                        src->uv_width, src->uv_height,
+                        src->uv_crop_width, src->uv_crop_height,
                         et_uv, el_uv, eb_uv, er_uv);
 }
 
diff --git a/source/libvpx/vp9/encoder/vp9_firstpass.c b/source/libvpx/vp9/encoder/vp9_firstpass.c
index 0282e9f..f1baf83 100644
--- a/source/libvpx/vp9/encoder/vp9_firstpass.c
+++ b/source/libvpx/vp9/encoder/vp9_firstpass.c
@@ -38,14 +38,16 @@
 #define OUTPUT_FPF          0
 #define ARF_STATS_OUTPUT    0
 
+#define BOOST_BREAKOUT      12.5
 #define BOOST_FACTOR        12.5
-#define ERR_DIVISOR         100.0
-#define FACTOR_PT_LOW       0.5
-#define FACTOR_PT_HIGH      0.9
+#define ERR_DIVISOR         128.0
+#define FACTOR_PT_LOW       0.70
+#define FACTOR_PT_HIGH      0.90
 #define FIRST_PASS_Q        10.0
 #define GF_MAX_BOOST        96.0
 #define INTRA_MODE_PENALTY  1024
 #define KF_MAX_BOOST        128.0
+#define MIN_ARF_GF_BOOST    240
 #define MIN_DECAY_FACTOR    0.01
 #define MIN_GF_INTERVAL     4
 #define MIN_KF_BOOST        300
@@ -64,13 +66,6 @@
   *b = temp;
 }
 
-static int gfboost_qadjust(int qindex, vpx_bit_depth_t bit_depth) {
-  const double q = vp9_convert_qindex_to_q(qindex, bit_depth);
-  return (int)((0.00000828 * q * q * q) +
-               (-0.0055 * q * q) +
-               (1.32 * q) + 79.3);
-}
-
 // Resets the first pass file to the given position using a relative seek from
 // the current position.
 static void reset_fpf_position(TWO_PASS *p,
@@ -281,6 +276,60 @@
   return sse;
 }
 
+#if CONFIG_VP9_HIGHBITDEPTH
+static vp9_variance_fn_t highbd_get_block_variance_fn(BLOCK_SIZE bsize,
+                                                      int bd) {
+  switch (bd) {
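+    // 8-bit content takes the default branch.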
+    default:
+      switch (bsize) {
+        case BLOCK_8X8:
+          return vp9_highbd_mse8x8;
+        case BLOCK_16X8:
+          return vp9_highbd_mse16x8;
+        case BLOCK_8X16:
+          return vp9_highbd_mse8x16;
+        default:
+          return vp9_highbd_mse16x16;
+      }
+      break;
+    case 10:
+      switch (bsize) {
+        case BLOCK_8X8:
+          return vp9_highbd_10_mse8x8;
+        case BLOCK_16X8:
+          return vp9_highbd_10_mse16x8;
+        case BLOCK_8X16:
+          return vp9_highbd_10_mse8x16;
+        default:
+          return vp9_highbd_10_mse16x16;
+      }
+      break;
+    case 12:
+      switch (bsize) {
+        case BLOCK_8X8:
+          return vp9_highbd_12_mse8x8;
+        case BLOCK_16X8:
+          return vp9_highbd_12_mse16x8;
+        case BLOCK_8X16:
+          return vp9_highbd_12_mse8x16;
+        default:
+          return vp9_highbd_12_mse16x16;
+      }
+      break;
+  }
+}
+
+static unsigned int highbd_get_prediction_error(BLOCK_SIZE bsize,
+                                                const struct buf_2d *src,
+                                                const struct buf_2d *ref,
+                                                int bd) {
+  unsigned int sse;
+  const vp9_variance_fn_t fn = highbd_get_block_variance_fn(bsize, bd);
+  fn(src->buf, src->stride, ref->buf, ref->stride, &sse);
+  return sse;
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
 // Refine the motion search range according to the frame dimension
 // for first pass test.
 static int get_search_range(const VP9_COMMON *cm) {
@@ -311,6 +360,11 @@
 
   // Override the default variance function to use MSE.
   v_fn_ptr.vf = get_block_variance_fn(bsize);
+#if CONFIG_VP9_HIGHBITDEPTH
+  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+    v_fn_ptr.vf = highbd_get_block_variance_fn(bsize, xd->bd);
+  }
+#endif  // CONFIG_VP9_HIGHBITDEPTH
 
   // Center the initial step/diamond search on best mv.
   tmp_err = cpi->diamond_search_sad(x, &cpi->ss_cfg, &ref_mv_full, &tmp_mv,
@@ -562,6 +616,24 @@
          (bsize >= BLOCK_16X16 ? TX_16X16 : TX_8X8) : TX_4X4;
       vp9_encode_intra_block_plane(x, bsize, 0);
       this_error = vp9_get_mb_ss(x->plane[0].src_diff);
+#if CONFIG_VP9_HIGHBITDEPTH
+      if (cm->use_highbitdepth) {
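+        // Scale the intra error back to an 8-bit basis: squaring doubles
+        // the extra bits, hence >> 4 for 10-bit and >> 8 for 12-bit input.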
+        switch (cm->bit_depth) {
+          case VPX_BITS_8:
+            break;
+          case VPX_BITS_10:
+            this_error >>= 4;
+            break;
+          case VPX_BITS_12:
+            this_error >>= 8;
+            break;
+          default:
+            assert(0 && "cm->bit_depth should be VPX_BITS_8, "
+                        "VPX_BITS_10 or VPX_BITS_12");
+            return;
+        }
+      }
+#endif  // CONFIG_VP9_HIGHBITDEPTH
 
       if (cpi->oxcf.aq_mode == VARIANCE_AQ) {
         vp9_clear_system_state();
@@ -601,8 +673,18 @@
         struct buf_2d unscaled_last_source_buf_2d;
 
         xd->plane[0].pre[0].buf = first_ref_buf->y_buffer + recon_yoffset;
-        motion_error = get_prediction_error(bsize, &x->plane[0].src,
-                                            &xd->plane[0].pre[0]);
+#if CONFIG_VP9_HIGHBITDEPTH
+        if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+          motion_error = highbd_get_prediction_error(
+              bsize, &x->plane[0].src, &xd->plane[0].pre[0], xd->bd);
+        } else {
+          motion_error = get_prediction_error(
+              bsize, &x->plane[0].src, &xd->plane[0].pre[0]);
+        }
+#else
+        motion_error = get_prediction_error(
+            bsize, &x->plane[0].src, &xd->plane[0].pre[0]);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
 
         // Compute the motion error of the 0,0 motion using the last source
         // frame as the reference. Skip the further motion search on
@@ -611,8 +693,18 @@
             cpi->unscaled_last_source->y_buffer + recon_yoffset;
         unscaled_last_source_buf_2d.stride =
             cpi->unscaled_last_source->y_stride;
-        raw_motion_error = get_prediction_error(bsize, &x->plane[0].src,
-                                                &unscaled_last_source_buf_2d);
+#if CONFIG_VP9_HIGHBITDEPTH
+        if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+          raw_motion_error = highbd_get_prediction_error(
+              bsize, &x->plane[0].src, &unscaled_last_source_buf_2d, xd->bd);
+        } else {
+          raw_motion_error = get_prediction_error(
+              bsize, &x->plane[0].src, &unscaled_last_source_buf_2d);
+        }
+#else
+        raw_motion_error = get_prediction_error(
+            bsize, &x->plane[0].src, &unscaled_last_source_buf_2d);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
 
         // TODO(pengchong): Replace the hard-coded threshold
         if (raw_motion_error > 25 || lc != NULL) {
@@ -648,8 +740,18 @@
             int gf_motion_error;
 
             xd->plane[0].pre[0].buf = gld_yv12->y_buffer + recon_yoffset;
-            gf_motion_error = get_prediction_error(bsize, &x->plane[0].src,
-                                                   &xd->plane[0].pre[0]);
+#if CONFIG_VP9_HIGHBITDEPTH
+            if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+              gf_motion_error = highbd_get_prediction_error(
+                  bsize, &x->plane[0].src, &xd->plane[0].pre[0], xd->bd);
+            } else {
+              gf_motion_error = get_prediction_error(
+                  bsize, &x->plane[0].src, &xd->plane[0].pre[0]);
+            }
+#else
+            gf_motion_error = get_prediction_error(
+                bsize, &x->plane[0].src, &xd->plane[0].pre[0]);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
 
             first_pass_motion_search(cpi, x, &zero_mv, &tmp_mv,
                                      &gf_motion_error);
@@ -949,7 +1051,7 @@
 
   // Adjustment based on actual quantizer to power term.
   const double power_term =
-      MIN(vp9_convert_qindex_to_q(q, bit_depth) * 0.0125 + pt_low, pt_high);
+      MIN(vp9_convert_qindex_to_q(q, bit_depth) * 0.01 + pt_low, pt_high);
 
   // Calculate correction factor.
   if (power_term < 1.0)
@@ -958,6 +1060,11 @@
   return fclamp(pow(error_term, power_term), 0.05, 5.0);
 }
 
+// Larger image formats are expected to be a little harder to code, relative
+// to the same prediction error score, partly because of the increased size
+// and hence coding cost of motion vectors.
+#define EDIV_SIZE_FACTOR 800
+
 static int get_twopass_worst_quality(const VP9_COMP *cpi,
                                      const FIRSTPASS_STATS *stats,
                                      int section_target_bandwidth) {
@@ -971,8 +1078,10 @@
     const double section_err = stats->coded_error / stats->count;
     const double err_per_mb = section_err / num_mbs;
     const double speed_term = 1.0 + 0.04 * oxcf->speed;
+    const double ediv_size_correction = (double)num_mbs / EDIV_SIZE_FACTOR;
     const int target_norm_bits_per_mb = ((uint64_t)section_target_bandwidth <<
                                          BPER_MB_NORMBITS) / num_mbs;
+
     int q;
     int is_svc_upper_layer = 0;
     if (is_two_pass_svc(cpi) && cpi->svc.spatial_layer_id > 0)
@@ -982,7 +1091,7 @@
     // content at the given rate.
     for (q = rc->best_quality; q < rc->worst_quality; ++q) {
       const double factor =
-          calc_correction_factor(err_per_mb, ERR_DIVISOR,
+          calc_correction_factor(err_per_mb, ERR_DIVISOR - ediv_size_correction,
                                  is_svc_upper_layer ? SVC_FACTOR_PT_LOW :
                                  FACTOR_PT_LOW, FACTOR_PT_HIGH, q,
                                  cpi->common.bit_depth);
@@ -1065,6 +1174,8 @@
   // Reset the vbr bits off target counter
   cpi->rc.vbr_bits_off_target = 0;
 
+  cpi->rc.rate_error_estimate = 0;
+
   // Static sequence monitor variables.
   twopass->kf_zeromotion_pct = 100;
   twopass->last_kfgroup_zeromotion_pct = 100;
@@ -1199,11 +1310,15 @@
                                double this_frame_mv_in_out,
                                double max_boost) {
   double frame_boost;
+  const double lq =
+    vp9_convert_qindex_to_q(cpi->rc.avg_frame_qindex[INTER_FRAME],
+                            cpi->common.bit_depth);
+  const double boost_correction = MIN((0.5 + (lq * 0.015)), 1.5);
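+  // Track the average inter-frame Q: damp the boost at low Q (floor ~0.5)
+  // and raise it at high Q (cap 1.5).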
 
   // Underlying boost factor is based on inter error ratio.
   frame_boost = (BASELINE_ERR_PER_MB * cpi->common.MBs) /
                 DOUBLE_DIVIDE_CHECK(this_frame->coded_error);
-  frame_boost = frame_boost * BOOST_FACTOR;
+  frame_boost = frame_boost * BOOST_FACTOR * boost_correction;
 
   // Increase boost for frames where new data coming into frame (e.g. zoom out).
   // Slightly reduce boost if there is a net balance of motion out of the frame
@@ -1214,7 +1329,7 @@
   else
     frame_boost += frame_boost * (this_frame_mv_in_out / 2.0);
 
-  return MIN(frame_boost, max_boost);
+  return MIN(frame_boost, max_boost * boost_correction);
 }
 
 static int calc_arf_boost(VP9_COMP *cpi, int offset,
@@ -1303,6 +1418,7 @@
   arf_boost = (*f_boost + *b_boost);
   if (arf_boost < ((b_frames + f_frames) * 20))
     arf_boost = ((b_frames + f_frames) * 20);
+  arf_boost = MAX(arf_boost, MIN_ARF_GF_BOOST);
 
   return arf_boost;
 }
@@ -1580,6 +1696,7 @@
   int b_boost = 0;
   int flash_detected;
   int active_max_gf_interval;
+  int active_min_gf_interval;
   int64_t gf_group_bits;
   double gf_group_error_left;
   int gf_arf_bits;
@@ -1608,21 +1725,30 @@
   // Motion breakout threshold for loop below depends on image size.
   mv_ratio_accumulator_thresh = (cpi->common.width + cpi->common.height) / 4.0;
 
-  // Work out a maximum interval for the GF group.
+  // Set a maximum and minimum interval for the GF group.
   // If the image appears almost completely static we can extend beyond this.
-  if (cpi->multi_arf_allowed) {
-    active_max_gf_interval = rc->max_gf_interval;
-  } else {
-   // The value chosen depends on the active Q range. At low Q we have
-   // bits to spare and are better with a smaller interval and smaller boost.
-   // At high Q when there are few bits to spare we are better with a longer
-   // interval to spread the cost of the GF.
-   active_max_gf_interval =
-     12 + ((int)vp9_convert_qindex_to_q(rc->last_q[INTER_FRAME],
-                                        cpi->common.bit_depth) >> 5);
+  {
+    int int_max_q =
+      (int)(vp9_convert_qindex_to_q(twopass->active_worst_quality,
+                                   cpi->common.bit_depth));
+    int int_lbq =
+      (int)(vp9_convert_qindex_to_q(rc->last_boosted_qindex,
+                                   cpi->common.bit_depth));
+    active_min_gf_interval = MIN_GF_INTERVAL + MIN(2, int_max_q / 200);
+    if (active_min_gf_interval > rc->max_gf_interval)
+      active_min_gf_interval = rc->max_gf_interval;
 
-   if (active_max_gf_interval > rc->max_gf_interval)
-     active_max_gf_interval = rc->max_gf_interval;
+    if (cpi->multi_arf_allowed) {
+      active_max_gf_interval = rc->max_gf_interval;
+    } else {
+      // The value chosen depends on the active Q range. At low Q we have
+      // bits to spare and are better with a smaller interval and smaller boost.
+      // At high Q when there are few bits to spare we are better with a longer
+      // interval to spread the cost of the GF.
+      active_max_gf_interval = 12 + MIN(4, (int_lbq / 6));
+      if (active_max_gf_interval > rc->max_gf_interval)
+        active_max_gf_interval = rc->max_gf_interval;
+    }
   }
 
   i = 0;
@@ -1678,12 +1804,12 @@
       (i >= active_max_gf_interval && (zero_motion_accumulator < 0.995)) ||
       (
         // Don't break out with a very short interval.
-        (i > MIN_GF_INTERVAL) &&
+        (i > active_min_gf_interval) &&
         (!flash_detected) &&
         ((mv_ratio_accumulator > mv_ratio_accumulator_thresh) ||
          (abs_mv_in_out_accumulator > 3.0) ||
          (mv_in_out_accumulator < -2.0) ||
-         ((boost_score - old_boost_score) < BOOST_FACTOR)))) {
+         ((boost_score - old_boost_score) < BOOST_BREAKOUT)))) {
       boost_score = old_boost_score;
       break;
     }
@@ -1731,7 +1857,7 @@
       (cpi->multi_arf_allowed && (rc->baseline_gf_interval >= 6) &&
       (zero_motion_accumulator < 0.995)) ? 1 : 0;
   } else {
-    rc->gfu_boost = MAX((int)boost_score, 125);
+    rc->gfu_boost = MAX((int)boost_score, MIN_ARF_GF_BOOST);
     rc->source_alt_ref_pending = 0;
   }
 
@@ -1742,18 +1868,8 @@
   gf_group_bits = calculate_total_gf_group_bits(cpi, gf_group_err);
 
   // Calculate the extra bits to be used for boosted frame(s)
-  {
-    int q = rc->last_q[INTER_FRAME];
-    int boost =
-        (rc->gfu_boost * gfboost_qadjust(q, cpi->common.bit_depth)) / 100;
-
-    // Set max and minimum boost and hence minimum allocation.
-    boost = clamp(boost, 125, (rc->baseline_gf_interval + 1) * 200);
-
-    // Calculate the extra bits to be used for boosted frame(s)
-    gf_arf_bits = calculate_boost_bits(rc->baseline_gf_interval,
-                                       boost, gf_group_bits);
-  }
+  gf_arf_bits = calculate_boost_bits(rc->baseline_gf_interval,
+                                     rc->gfu_boost, gf_group_bits);
 
   // Adjust KF group bits and error remaining.
   twopass->kf_group_error_left -= (int64_t)gf_group_err;
@@ -2101,10 +2217,23 @@
   twopass->modified_error_left -= kf_group_err;
 }
 
+#define VBR_PCT_ADJUSTMENT_LIMIT 50
 // For VBR...adjustment to the frame target based on error from previous frames
-void vbr_rate_correction(int * this_frame_target,
+void vbr_rate_correction(VP9_COMP *cpi,
+                         int *this_frame_target,
                          const int64_t vbr_bits_off_target) {
-  int max_delta = (*this_frame_target * 15) / 100;
+  int max_delta;
+  double position_factor = 1.0;
+
+  // How far through the clip we are, in the range 0 - 1.0.
+  // This number is used to damp the per-frame rate correction.
+  if (cpi->twopass.total_stats.count) {
+    position_factor = sqrt((double)cpi->common.current_video_frame /
+                           cpi->twopass.total_stats.count);
+  }
+  max_delta = (int)(position_factor *
+                    ((*this_frame_target * VBR_PCT_ADJUSTMENT_LIMIT) / 100));
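+  // E.g. halfway through the clip position_factor = sqrt(0.5) ~= 0.71, so
+  // the per-frame correction is capped at roughly 35% of the frame target.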
 
   // vbr_bits_off_target > 0 means we have extra bits to spend
   if (vbr_bits_off_target > 0) {
@@ -2202,7 +2331,7 @@
 
     // Correction to rate target based on prior over or under shoot.
     if (cpi->oxcf.rc_mode == VPX_VBR)
-      vbr_rate_correction(&target_rate, rc->vbr_bits_off_target);
+      vbr_rate_correction(cpi, &target_rate, rc->vbr_bits_off_target);
 
     vp9_rc_set_frame_target(cpi, target_rate);
     cm->frame_type = INTER_FRAME;
@@ -2234,7 +2363,11 @@
                                                 section_target_bandwidth);
     twopass->active_worst_quality = tmp_q;
     rc->ni_av_qi = tmp_q;
+    rc->last_q[INTER_FRAME] = tmp_q;
     rc->avg_q = vp9_convert_qindex_to_q(tmp_q, cm->bit_depth);
+    rc->avg_frame_qindex[INTER_FRAME] = tmp_q;
+    rc->last_q[KEY_FRAME] = (tmp_q + cpi->oxcf.best_allowed_q) / 2;
+    rc->avg_frame_qindex[KEY_FRAME] = rc->last_q[KEY_FRAME];
   }
   vp9_zero(this_frame);
   if (EOF == input_stats(twopass, &this_frame))
@@ -2259,6 +2392,9 @@
         cpi->ref_frame_flags &=
             (~VP9_LAST_FLAG & ~VP9_GOLD_FLAG & ~VP9_ALT_FLAG);
         lc->frames_from_key_frame = 0;
+        // Reset the empty frame resolution since we have a key frame.
+        cpi->svc.empty_frame_width = cm->width;
+        cpi->svc.empty_frame_height = cm->height;
       }
     } else {
       cm->frame_type = INTER_FRAME;
@@ -2275,16 +2411,6 @@
   if (rc->frames_till_gf_update_due == 0) {
     define_gf_group(cpi, &this_frame_copy);
 
-    if (twopass->gf_zeromotion_pct > 995) {
-      // As long as max_thresh for encode breakout is small enough, it is ok
-      // to enable it for show frame, i.e. set allow_encode_breakout to
-      // ENCODE_BREAKOUT_LIMITED.
-      if (!cm->show_frame)
-        cpi->allow_encode_breakout = ENCODE_BREAKOUT_DISABLED;
-      else
-        cpi->allow_encode_breakout = ENCODE_BREAKOUT_LIMITED;
-    }
-
     rc->frames_till_gf_update_due = rc->baseline_gf_interval;
     if (lc != NULL)
       cpi->refresh_golden_frame = 1;
@@ -2294,8 +2420,9 @@
       FILE *fpfile;
       fpfile = fopen("arf.stt", "a");
       ++arf_count;
-      fprintf(fpfile, "%10d %10d %10d %10ld\n",
-              cm->current_video_frame, rc->kf_boost, arf_count, rc->gfu_boost);
+      fprintf(fpfile, "%10d %10ld %10d %10d %10ld\n",
+              cm->current_video_frame, rc->frames_till_gf_update_due,
+              rc->kf_boost, arf_count, rc->gfu_boost);
 
       fclose(fpfile);
     }
@@ -2314,7 +2441,7 @@
 
   // Correction to rate target based on prior over or under shoot.
   if (cpi->oxcf.rc_mode == VPX_VBR)
-    vbr_rate_correction(&target_rate, rc->vbr_bits_off_target);
+    vbr_rate_correction(cpi, &target_rate, rc->vbr_bits_off_target);
 
   vp9_rc_set_frame_target(cpi, target_rate);
 
@@ -2322,20 +2449,30 @@
   subtract_stats(&twopass->total_left_stats, &this_frame);
 }
 
+#define MINQ_ADJ_LIMIT 32
+#define Q_LIMIT_STEP 1
 void vp9_twopass_postencode_update(VP9_COMP *cpi) {
   TWO_PASS *const twopass = &cpi->twopass;
   RATE_CONTROL *const rc = &cpi->rc;
+  const int bits_used = rc->base_frame_target;
 
   // VBR correction is done through rc->vbr_bits_off_target. Based on the
   // sign of this value, a limited % adjustment is made to the target rate
   // of subsequent frames, to try and push it back towards 0. This method
   // is designed to prevent extreme behaviour at the end of a clip
   // or group of frames.
-  const int bits_used = rc->base_frame_target;
   rc->vbr_bits_off_target += rc->base_frame_target - rc->projected_frame_size;
-
   twopass->bits_left = MAX(twopass->bits_left - bits_used, 0);
 
+  // Calculate the pct rc error.
+  if (rc->total_actual_bits) {
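+    // Positive values indicate undershoot (unused bits); negative values
+    // indicate overshoot.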
+    rc->rate_error_estimate =
+      (int)((rc->vbr_bits_off_target * 100) / rc->total_actual_bits);
+    rc->rate_error_estimate = clamp(rc->rate_error_estimate, -100, 100);
+  } else {
+    rc->rate_error_estimate = 0;
+  }
+
   if (cpi->common.frame_type != KEY_FRAME &&
       !vp9_is_upper_layer_key_frame(cpi)) {
     twopass->kf_group_bits -= bits_used;
@@ -2345,4 +2482,32 @@
 
   // Increment the gf group index ready for the next frame.
   ++twopass->gf_group.index;
+
+  // If the rate control is drifting, consider adjusting the min or max Q.
+  // Only make adjustments on GF/ARF frames.
+  if ((cpi->oxcf.rc_mode == VPX_VBR) &&
+      (cpi->twopass.gf_zeromotion_pct < VLOW_MOTION_THRESHOLD) &&
+      !cpi->rc.is_src_frame_alt_ref) {
+    const int maxq_adj_limit =
+      rc->worst_quality - twopass->active_worst_quality;
+
+    // Undershoot.
+    if (rc->rate_error_estimate > cpi->oxcf.under_shoot_pct) {
+      --twopass->extend_maxq;
+      if (rc->rolling_target_bits >= rc->rolling_actual_bits)
+        twopass->extend_minq += Q_LIMIT_STEP;
+    // Overshoot.
+    } else if (rc->rate_error_estimate < -cpi->oxcf.over_shoot_pct) {
+      --twopass->extend_minq;
+      if (rc->rolling_target_bits < rc->rolling_actual_bits)
+        twopass->extend_maxq += Q_LIMIT_STEP;
+    } else {
+      if (rc->rolling_target_bits < rc->rolling_actual_bits)
+        --twopass->extend_minq;
+      if (rc->rolling_target_bits > rc->rolling_actual_bits)
+        --twopass->extend_maxq;
+    }
+    twopass->extend_minq = clamp(twopass->extend_minq, 0, MINQ_ADJ_LIMIT);
+    twopass->extend_maxq = clamp(twopass->extend_maxq, 0, maxq_adj_limit);
+  }
 }
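
Note: the postencode hunk above adds a feedback loop: the running VBR error is
folded into a clamped percentage (rate_error_estimate), which then walks
extend_minq/extend_maxq by Q_LIMIT_STEP within [0, MINQ_ADJ_LIMIT] and
[0, worst_quality - active_worst_quality]. A minimal standalone sketch of that
logic follows; names mirror the patch, but the struct is reduced to the fields
this logic actually touches and is not the real RATE_CONTROL type.

/* Minimal standalone sketch of the VBR drift control added above. */
#include <stdio.h>

#define MINQ_ADJ_LIMIT 32
#define Q_LIMIT_STEP 1

static int clamp_int(int v, int lo, int hi) {
  return v < lo ? lo : (v > hi ? hi : v);
}

typedef struct {
  long long vbr_bits_off_target;  /* accumulated target - actual bits */
  long long total_actual_bits;
  long long rolling_target_bits;
  long long rolling_actual_bits;
  int rate_error_estimate;        /* clamped percentage error */
  int extend_minq, extend_maxq;
} drift_state;

/* under/over are the percent tolerances (oxcf.under_shoot_pct /
 * oxcf.over_shoot_pct); maxq_adj_limit corresponds to
 * worst_quality - active_worst_quality in the patch. */
static void adjust_q_limits(drift_state *s, int under, int over,
                            int maxq_adj_limit) {
  s->rate_error_estimate = s->total_actual_bits
      ? clamp_int((int)((s->vbr_bits_off_target * 100) /
                        s->total_actual_bits), -100, 100)
      : 0;
  if (s->rate_error_estimate > under) {         /* persistent undershoot */
    --s->extend_maxq;
    if (s->rolling_target_bits >= s->rolling_actual_bits)
      s->extend_minq += Q_LIMIT_STEP;
  } else if (s->rate_error_estimate < -over) {  /* persistent overshoot */
    --s->extend_minq;
    if (s->rolling_target_bits < s->rolling_actual_bits)
      s->extend_maxq += Q_LIMIT_STEP;
  } else {                                      /* inside tolerance: relax */
    if (s->rolling_target_bits < s->rolling_actual_bits)
      --s->extend_minq;
    if (s->rolling_target_bits > s->rolling_actual_bits)
      --s->extend_maxq;
  }
  s->extend_minq = clamp_int(s->extend_minq, 0, MINQ_ADJ_LIMIT);
  s->extend_maxq = clamp_int(s->extend_maxq, 0, maxq_adj_limit);
}

int main(void) {
  /* 12% undershoot against an 8%/8% tolerance: minq is pushed up. */
  drift_state s = { 120000, 1000000, 500000, 440000, 0, 0, 0 };
  adjust_q_limits(&s, 8, 8, 63);
  printf("err=%d%% extend_minq=%d extend_maxq=%d\n",
         s.rate_error_estimate, s.extend_minq, s.extend_maxq);
  return 0;
}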
diff --git a/source/libvpx/vp9/encoder/vp9_firstpass.h b/source/libvpx/vp9/encoder/vp9_firstpass.h
index 0b82d32..e21d869 100644
--- a/source/libvpx/vp9/encoder/vp9_firstpass.h
+++ b/source/libvpx/vp9/encoder/vp9_firstpass.h
@@ -39,6 +39,8 @@
 } FIRSTPASS_MB_STATS;
 #endif
 
+#define VLOW_MOTION_THRESHOLD 950
+
 typedef struct {
   double frame;
   double intra_error;
@@ -110,8 +112,9 @@
   int kf_zeromotion_pct;
   int last_kfgroup_zeromotion_pct;
   int gf_zeromotion_pct;
-
   int active_worst_quality;
+  int extend_minq;
+  int extend_maxq;
 
   GF_GROUP gf_group;
 } TWO_PASS;
diff --git a/source/libvpx/vp9/encoder/vp9_mbgraph.c b/source/libvpx/vp9/encoder/vp9_mbgraph.c
index 42981d8..bd04c56 100644
--- a/source/libvpx/vp9/encoder/vp9_mbgraph.c
+++ b/source/libvpx/vp9/encoder/vp9_mbgraph.c
@@ -34,7 +34,7 @@
   const int tmp_row_min = x->mv_row_min;
   const int tmp_row_max = x->mv_row_max;
   MV ref_full;
-  int sad_list[5];
+  int cost_list[5];
 
   // Further step/diamond searches as necessary
   int step_param = mv_sf->reduce_first_step_size;
@@ -47,7 +47,7 @@
 
   /*cpi->sf.search_method == HEX*/
   vp9_hex_search(x, &ref_full, step_param, x->errorperbit, 0,
-                 cond_sad_list(cpi, sad_list),
+                 cond_cost_list(cpi, cost_list),
                  &v_fn_ptr, 0, ref_mv, dst_mv);
 
   // Try sub-pixel MC
@@ -58,7 +58,7 @@
     cpi->find_fractional_mv_step(
         x, dst_mv, ref_mv, cpi->common.allow_high_precision_mv, x->errorperbit,
         &v_fn_ptr, 0, mv_sf->subpel_iters_per_step,
-        cond_sad_list(cpi, sad_list),
+        cond_cost_list(cpi, cost_list),
         NULL, NULL,
         &distortion, &sse, NULL, 0, 0);
   }
diff --git a/source/libvpx/vp9/encoder/vp9_mcomp.c b/source/libvpx/vp9/encoder/vp9_mcomp.c
index a25dc61..69b4193 100644
--- a/source/libvpx/vp9/encoder/vp9_mcomp.c
+++ b/source/libvpx/vp9/encoder/vp9_mcomp.c
@@ -284,17 +284,220 @@
   int tc = bc;                                                             \
                                                                            \
   bestmv->row *= 8;                                                        \
-  bestmv->col *= 8;                                                        \
-  if (second_pred != NULL) {                                               \
-    DECLARE_ALIGNED_ARRAY(16, uint8_t, comp_pred, 64 * 64);                \
-    vp9_comp_avg_pred(comp_pred, second_pred, w, h, y + offset, y_stride); \
-    besterr = vfp->vf(comp_pred, w, z, src_stride, sse1);                  \
-  } else {                                                                 \
-    besterr = vfp->vf(y + offset, y_stride, z, src_stride, sse1);          \
-  }                                                                        \
-  *distortion = besterr;                                                   \
+  bestmv->col *= 8;
+
+#if CONFIG_VP9_HIGHBITDEPTH
+#define SETUP_CENTER_ERROR                                                   \
+  if (second_pred != NULL) {                                                 \
+    if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {                       \
+      DECLARE_ALIGNED_ARRAY(16, uint16_t, comp_pred16, 64 * 64);             \
+      vp9_highbd_comp_avg_pred(comp_pred16, second_pred, w, h, y + offset,   \
+                               y_stride);                                    \
+      besterr = vfp->vf(CONVERT_TO_BYTEPTR(comp_pred16), w, z, src_stride,   \
+                        sse1);                                               \
+    } else {                                                                 \
+      DECLARE_ALIGNED_ARRAY(16, uint8_t, comp_pred, 64 * 64);                \
+      vp9_comp_avg_pred(comp_pred, second_pred, w, h, y + offset, y_stride); \
+      besterr = vfp->vf(comp_pred, w, z, src_stride, sse1);                  \
+    }                                                                        \
+  } else {                                                                   \
+    besterr = vfp->vf(y + offset, y_stride, z, src_stride, sse1);            \
+  }                                                                          \
+  *distortion = besterr;                                                     \
   besterr += mv_err_cost(bestmv, ref_mv, mvjcost, mvcost, error_per_bit);
 
+#else
+
+#define SETUP_CENTER_ERROR                                                   \
+  if (second_pred != NULL) {                                                 \
+    DECLARE_ALIGNED_ARRAY(16, uint8_t, comp_pred, 64 * 64);                  \
+    vp9_comp_avg_pred(comp_pred, second_pred, w, h, y + offset, y_stride);   \
+    besterr = vfp->vf(comp_pred, w, z, src_stride, sse1);                    \
+  } else {                                                                   \
+    besterr = vfp->vf(y + offset, y_stride, z, src_stride, sse1);            \
+  }                                                                          \
+  *distortion = besterr;                                                     \
+  besterr += mv_err_cost(bestmv, ref_mv, mvjcost, mvcost, error_per_bit);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+
+
+
+static INLINE int divide_and_round(const int n, const int d) {
+  return ((n < 0) ^ (d < 0)) ? ((n - d / 2) / d) : ((n + d / 2) / d);
+}
+
+static INLINE int is_cost_list_wellbehaved(int *cost_list) {
+  return cost_list[0] < cost_list[1] &&
+         cost_list[0] < cost_list[2] &&
+         cost_list[0] < cost_list[3] &&
+         cost_list[0] < cost_list[4];
+}
+
+// Returns a surface minimum estimate at a precision of 1/2^bits pel.
+// Assume a model for the cost surface: S = A(x - x0)^2 + B(y - y0)^2 + C.
+// For a given set of costs S0, S1, S2, S3, S4 at points
+// (y, x) = (0, 0), (0, -1), (1, 0), (0, 1) and (-1, 0) respectively,
+// the solution for the location of the minimum (x0, y0) is given by:
+// x0 = 1/2 (S1 - S3)/(S1 + S3 - 2*S0),
+// y0 = 1/2 (S4 - S2)/(S4 + S2 - 2*S0).
+// The code below is an integerized version of that.
+static void get_cost_surf_min(int *cost_list, int *ir, int *ic,
+                              int bits) {
+  *ic = divide_and_round((cost_list[1] - cost_list[3]) * (1 << (bits - 1)),
+                         (cost_list[1] - 2 * cost_list[0] + cost_list[3]));
+  *ir = divide_and_round((cost_list[4] - cost_list[2]) * (1 << (bits - 1)),
+                         (cost_list[4] - 2 * cost_list[0] + cost_list[2]));
+}
+
+int vp9_find_best_sub_pixel_tree_pruned_evenmore(
+    const MACROBLOCK *x,
+    MV *bestmv, const MV *ref_mv,
+    int allow_hp,
+    int error_per_bit,
+    const vp9_variance_fn_ptr_t *vfp,
+    int forced_stop,
+    int iters_per_step,
+    int *cost_list,
+    int *mvjcost, int *mvcost[2],
+    int *distortion,
+    unsigned int *sse1,
+    const uint8_t *second_pred,
+    int w, int h) {
+  SETUP_SUBPEL_SEARCH;
+  SETUP_CENTER_ERROR;
+  (void) halfiters;
+  (void) quarteriters;
+  (void) eighthiters;
+  (void) whichdir;
+  (void) allow_hp;
+  (void) forced_stop;
+  (void) hstep;
+
+  if (cost_list &&
+      cost_list[0] != INT_MAX && cost_list[1] != INT_MAX &&
+      cost_list[2] != INT_MAX && cost_list[3] != INT_MAX &&
+      cost_list[4] != INT_MAX &&
+      is_cost_list_wellbehaved(cost_list)) {
+    int ir, ic;
+    unsigned int minpt;
+    get_cost_surf_min(cost_list, &ir, &ic, 2);
+    if (ir != 0 || ic != 0) {
+      CHECK_BETTER(minpt, tr + 2 * ir, tc + 2 * ic);
+    }
+  } else {
+    FIRST_LEVEL_CHECKS;
+    if (halfiters > 1) {
+      SECOND_LEVEL_CHECKS;
+    }
+
+    tr = br;
+    tc = bc;
+
+    // Each subsequent iteration checks at least one point in common with
+    // the last iteration, and could check two if the diagonal was selected.
+    // Note forced_stop: 0 - full, 1 - qtr only, 2 - half only
+    if (forced_stop != 2) {
+      hstep >>= 1;
+      FIRST_LEVEL_CHECKS;
+      if (quarteriters > 1) {
+        SECOND_LEVEL_CHECKS;
+      }
+    }
+  }
+
+  tr = br;
+  tc = bc;
+
+  if (allow_hp && vp9_use_mv_hp(ref_mv) && forced_stop == 0) {
+    hstep >>= 1;
+    FIRST_LEVEL_CHECKS;
+    if (eighthiters > 1) {
+      SECOND_LEVEL_CHECKS;
+    }
+  }
+
+  bestmv->row = br;
+  bestmv->col = bc;
+
+  if ((abs(bestmv->col - ref_mv->col) > (MAX_FULL_PEL_VAL << 3)) ||
+      (abs(bestmv->row - ref_mv->row) > (MAX_FULL_PEL_VAL << 3)))
+    return INT_MAX;
+
+  return besterr;
+}
+
+int vp9_find_best_sub_pixel_tree_pruned_more(const MACROBLOCK *x,
+                                             MV *bestmv, const MV *ref_mv,
+                                             int allow_hp,
+                                             int error_per_bit,
+                                             const vp9_variance_fn_ptr_t *vfp,
+                                             int forced_stop,
+                                             int iters_per_step,
+                                             int *cost_list,
+                                             int *mvjcost, int *mvcost[2],
+                                             int *distortion,
+                                             unsigned int *sse1,
+                                             const uint8_t *second_pred,
+                                             int w, int h) {
+  SETUP_SUBPEL_SEARCH;
+  SETUP_CENTER_ERROR;
+  if (cost_list &&
+      cost_list[0] != INT_MAX && cost_list[1] != INT_MAX &&
+      cost_list[2] != INT_MAX && cost_list[3] != INT_MAX &&
+      cost_list[4] != INT_MAX &&
+      is_cost_list_wellbehaved(cost_list)) {
+    unsigned int minpt;
+    int ir, ic;
+    get_cost_surf_min(cost_list, &ir, &ic, 1);
+    if (ir != 0 || ic != 0) {
+      CHECK_BETTER(minpt, tr + ir * hstep, tc + ic * hstep);
+    }
+  } else {
+    FIRST_LEVEL_CHECKS;
+    if (halfiters > 1) {
+      SECOND_LEVEL_CHECKS;
+    }
+  }
+
+  // Each subsequent iteration checks at least one point in common with
+  // the last iteration, and could check two if the diagonal was selected.
+
+  // Note forced_stop: 0 - full, 1 - qtr only, 2 - half only
+  if (forced_stop != 2) {
+    tr = br;
+    tc = bc;
+    hstep >>= 1;
+    FIRST_LEVEL_CHECKS;
+    if (quarteriters > 1) {
+      SECOND_LEVEL_CHECKS;
+    }
+  }
+
+  if (allow_hp && vp9_use_mv_hp(ref_mv) && forced_stop == 0) {
+    tr = br;
+    tc = bc;
+    hstep >>= 1;
+    FIRST_LEVEL_CHECKS;
+    if (eighthiters > 1) {
+      SECOND_LEVEL_CHECKS;
+    }
+  }
+  // These lines ensure static analysis doesn't warn that
+  // tr and tc aren't used after the above point.
+  (void) tr;
+  (void) tc;
+
+  bestmv->row = br;
+  bestmv->col = bc;
+
+  if ((abs(bestmv->col - ref_mv->col) > (MAX_FULL_PEL_VAL << 3)) ||
+      (abs(bestmv->row - ref_mv->row) > (MAX_FULL_PEL_VAL << 3)))
+    return INT_MAX;
+
+  return besterr;
+}
+
 int vp9_find_best_sub_pixel_tree_pruned(const MACROBLOCK *x,
                                         MV *bestmv, const MV *ref_mv,
                                         int allow_hp,
@@ -302,21 +505,21 @@
                                         const vp9_variance_fn_ptr_t *vfp,
                                         int forced_stop,
                                         int iters_per_step,
-                                        int *sad_list,
+                                        int *cost_list,
                                         int *mvjcost, int *mvcost[2],
                                         int *distortion,
                                         unsigned int *sse1,
                                         const uint8_t *second_pred,
                                         int w, int h) {
   SETUP_SUBPEL_SEARCH;
-
-  if (sad_list &&
-      sad_list[0] != INT_MAX && sad_list[1] != INT_MAX &&
-      sad_list[2] != INT_MAX && sad_list[3] != INT_MAX &&
-      sad_list[4] != INT_MAX) {
+  SETUP_CENTER_ERROR;
+  if (cost_list &&
+      cost_list[0] != INT_MAX && cost_list[1] != INT_MAX &&
+      cost_list[2] != INT_MAX && cost_list[3] != INT_MAX &&
+      cost_list[4] != INT_MAX) {
     unsigned int left, right, up, down, diag;
-    whichdir = (sad_list[1] < sad_list[3] ? 0 : 1) +
-               (sad_list[2] < sad_list[4] ? 0 : 2);
+    whichdir = (cost_list[1] < cost_list[3] ? 0 : 1) +
+               (cost_list[2] < cost_list[4] ? 0 : 2);
     switch (whichdir) {
       case 0:
         CHECK_BETTER(left, tr, tc - hstep);
@@ -394,14 +597,15 @@
                                  const vp9_variance_fn_ptr_t *vfp,
                                  int forced_stop,
                                  int iters_per_step,
-                                 int *sad_list,
+                                 int *cost_list,
                                  int *mvjcost, int *mvcost[2],
                                  int *distortion,
                                  unsigned int *sse1,
                                  const uint8_t *second_pred,
                                  int w, int h) {
   SETUP_SUBPEL_SEARCH;
-  (void) sad_list;  // to silence compiler warning
+  SETUP_CENTER_ERROR;
+  (void) cost_list;  // to silence compiler warning
 
   // Each subsequent iteration checks at least one point in
   // common with the last iteration could be 2 ( if diag selected)
@@ -484,6 +688,57 @@
 #define MAX_PATTERN_CANDIDATES      8  // max number of candidates per scale
 #define PATTERN_CANDIDATES_REF      3  // number of refinement candidates
 
+// Calculate and return a variance + mv cost list around an integer best pel.
+static INLINE void calc_int_cost_list(const MACROBLOCK *x,
+                                      const MV *ref_mv,
+                                      int sadpb,
+                                      const vp9_variance_fn_ptr_t *fn_ptr,
+                                      const MV *best_mv,
+                                      int *cost_list) {
+  static const MV neighbors[4] = {{0, -1}, {1, 0}, {0, 1}, {-1, 0}};
+  const struct buf_2d *const what = &x->plane[0].src;
+  const struct buf_2d *const in_what = &x->e_mbd.plane[0].pre[0];
+  const MV fcenter_mv = {ref_mv->row >> 3, ref_mv->col >> 3};
+  int br = best_mv->row;
+  int bc = best_mv->col;
+  MV this_mv;
+  int i;
+  unsigned int sse;
+
+  this_mv.row = br;
+  this_mv.col = bc;
+  cost_list[0] = fn_ptr->vf(what->buf, what->stride,
+                            get_buf_from_mv(in_what, &this_mv),
+                            in_what->stride, &sse) +
+      mvsad_err_cost(x, &this_mv, &fcenter_mv, sadpb);
+  if (check_bounds(x, br, bc, 1)) {
+    for (i = 0; i < 4; i++) {
+      const MV this_mv = {br + neighbors[i].row,
+        bc + neighbors[i].col};
+      cost_list[i + 1] = fn_ptr->vf(what->buf, what->stride,
+                                    get_buf_from_mv(in_what, &this_mv),
+                                    in_what->stride, &sse) +
+          // Note: mv cost here uses mv_err_cost, not mvsad_err_cost.
+          mv_err_cost(&this_mv, &fcenter_mv, x->nmvjointcost, x->mvcost,
+                      x->errorperbit);
+    }
+  } else {
+    for (i = 0; i < 4; i++) {
+      const MV this_mv = {br + neighbors[i].row,
+        bc + neighbors[i].col};
+      if (!is_mv_in(x, &this_mv))
+        cost_list[i + 1] = INT_MAX;
+      else
+        cost_list[i + 1] = fn_ptr->vf(what->buf, what->stride,
+                                      get_buf_from_mv(in_what, &this_mv),
+                                      in_what->stride, &sse) +
+            // Note: mv cost here uses mv_err_cost, not mvsad_err_cost.
+            mv_err_cost(&this_mv, &fcenter_mv, x->nmvjointcost, x->mvcost,
+                        x->errorperbit);
+    }
+  }
+}
+
 // Generic pattern search function that searches over multiple scales.
 // Each scale can have a different number of candidates and shape of
 // candidates as indicated in the num_candidates and candidates arrays
@@ -494,7 +749,7 @@
                               int search_param,
                               int sad_per_bit,
                               int do_init_search,
-                              int *sad_list,
+                              int *cost_list,
                               const vp9_variance_fn_ptr_t *vfp,
                               int use_mvcost,
                               const MV *center_mv,
@@ -646,40 +901,14 @@
   }
 
   // Returns the one-away integer pel cost values around the best as follows:
-  // sad_list[0]: sad at the best integer pel
-  // sad_list[1]: sad at delta {0, -1} (left)   from the best integer pel
-  // sad_list[2]: sad at delta { 1, 0} (bottom) from the best integer pel
-  // sad_list[3]: sad at delta { 0, 1} (right)  from the best integer pel
-  // sad_list[4]: sad at delta {-1, 0} (top)    from the best integer pel
-  if (sad_list) {
-    static const MV neighbors[4] = {{0, -1}, {1, 0}, {0, 1}, {-1, 0}};
-    sad_list[0] = bestsad;
-    if (check_bounds(x, br, bc, 1)) {
-      for (i = 0; i < 4; i++) {
-        const MV this_mv = {br + neighbors[i].row,
-                            bc + neighbors[i].col};
-        sad_list[i + 1] = vfp->sdf(what->buf, what->stride,
-                                   get_buf_from_mv(in_what, &this_mv),
-                                   in_what->stride) +
-            (use_mvcost ?
-             mvsad_err_cost(x, &this_mv, &fcenter_mv, sad_per_bit) :
-             0);
-      }
-    } else {
-      for (i = 0; i < 4; i++) {
-        const MV this_mv = {br + neighbors[i].row,
-                            bc + neighbors[i].col};
-        if (!is_mv_in(x, &this_mv))
-          sad_list[i + 1] = INT_MAX;
-        else
-          sad_list[i + 1] = vfp->sdf(what->buf, what->stride,
-                                     get_buf_from_mv(in_what, &this_mv),
-                                     in_what->stride) +
-              (use_mvcost ?
-               mvsad_err_cost(x, &this_mv, &fcenter_mv, sad_per_bit) :
-               0);
-      }
-    }
+  // cost_list[0]: cost at the best integer pel
+  // cost_list[1]: cost at delta {0, -1} (left)   from the best integer pel
+  // cost_list[2]: cost at delta { 1, 0} (bottom) from the best integer pel
+  // cost_list[3]: cost at delta { 0, 1} (right)  from the best integer pel
+  // cost_list[4]: cost at delta {-1, 0} (top)    from the best integer pel
+  if (cost_list) {
+    const MV best_mv = { br, bc };
+    calc_int_cost_list(x, &fcenter_mv, sad_per_bit, vfp, &best_mv, cost_list);
   }
   best_mv->row = br;
   best_mv->col = bc;
@@ -687,7 +916,7 @@
 }
 
 // A specialized function where the smallest scale search candidates
-// are 4 1-away neighbors, and sad_list is non-null
+// are 4 1-away neighbors, and cost_list is non-null
 // TODO(debargha): Merge this function with the one above. Also remove
 // use_mvcost option since it is always 1, to save unnecessary branches.
 static int vp9_pattern_search_sad(const MACROBLOCK *x,
@@ -695,7 +924,7 @@
                                   int search_param,
                                   int sad_per_bit,
                                   int do_init_search,
-                                  int *sad_list,
+                                  int *cost_list,
                                   const vp9_variance_fn_ptr_t *vfp,
                                   int use_mvcost,
                                   const MV *center_mv,
@@ -720,8 +949,8 @@
   clamp_mv(ref_mv, x->mv_col_min, x->mv_col_max, x->mv_row_min, x->mv_row_max);
   br = ref_mv->row;
   bc = ref_mv->col;
-  if (sad_list != NULL) {
-    sad_list[0] = sad_list[1] = sad_list[2] = sad_list[3] = sad_list[4] =
+  if (cost_list != NULL) {
+    cost_list[0] = cost_list[1] = cost_list[2] = cost_list[3] = cost_list[4] =
         INT_MAX;
   }
 
@@ -775,7 +1004,7 @@
   // If the center point is still the best, just skip this and move to
   // the refinement step.
   if (best_init_s != -1) {
-    int do_sad = (num_candidates[0] == 4 && sad_list != NULL);
+    int do_sad = (num_candidates[0] == 4 && cost_list != NULL);
     int best_site = -1;
     s = best_init_s;
 
@@ -849,15 +1078,15 @@
       } while (best_site != -1);
     }
 
-    // Note: If we enter the if below, then sad_list must be non-NULL.
+    // Note: If we enter the if below, then cost_list must be non-NULL.
     if (s == 0) {
-      sad_list[0] = bestsad;
+      cost_list[0] = bestsad;
       if (!do_init_search || s != best_init_s) {
         if (check_bounds(x, br, bc, 1 << s)) {
           for (i = 0; i < num_candidates[s]; i++) {
             const MV this_mv = {br + candidates[s][i].row,
                                 bc + candidates[s][i].col};
-            sad_list[i + 1] =
+            cost_list[i + 1] =
             thissad = vfp->sdf(what->buf, what->stride,
                                get_buf_from_mv(in_what, &this_mv),
                                in_what->stride);
@@ -869,7 +1098,7 @@
                                 bc + candidates[s][i].col};
             if (!is_mv_in(x, &this_mv))
               continue;
-            sad_list[i + 1] =
+            cost_list[i + 1] =
             thissad = vfp->sdf(what->buf, what->stride,
                                get_buf_from_mv(in_what, &this_mv),
                                in_what->stride);
@@ -889,15 +1118,15 @@
         next_chkpts_indices[0] = (k == 0) ? num_candidates[s] - 1 : k - 1;
         next_chkpts_indices[1] = k;
         next_chkpts_indices[2] = (k == num_candidates[s] - 1) ? 0 : k + 1;
-        sad_list[1] = sad_list[2] = sad_list[3] = sad_list[4] = INT_MAX;
-        sad_list[((k + 2) % 4) + 1] = sad_list[0];
-        sad_list[0] = bestsad;
+        cost_list[1] = cost_list[2] = cost_list[3] = cost_list[4] = INT_MAX;
+        cost_list[((k + 2) % 4) + 1] = cost_list[0];
+        cost_list[0] = bestsad;
 
         if (check_bounds(x, br, bc, 1 << s)) {
           for (i = 0; i < PATTERN_CANDIDATES_REF; i++) {
             const MV this_mv = {br + candidates[s][next_chkpts_indices[i]].row,
                                 bc + candidates[s][next_chkpts_indices[i]].col};
-            sad_list[next_chkpts_indices[i] + 1] =
+            cost_list[next_chkpts_indices[i] + 1] =
             thissad = vfp->sdf(what->buf, what->stride,
                                get_buf_from_mv(in_what, &this_mv),
                                in_what->stride);
@@ -908,10 +1137,10 @@
             const MV this_mv = {br + candidates[s][next_chkpts_indices[i]].row,
                                 bc + candidates[s][next_chkpts_indices[i]].col};
             if (!is_mv_in(x, &this_mv)) {
-              sad_list[next_chkpts_indices[i] + 1] = INT_MAX;
+              cost_list[next_chkpts_indices[i] + 1] = INT_MAX;
               continue;
             }
-            sad_list[next_chkpts_indices[i] + 1] =
+            cost_list[next_chkpts_indices[i] + 1] =
             thissad = vfp->sdf(what->buf, what->stride,
                                get_buf_from_mv(in_what, &this_mv),
                                in_what->stride);
@@ -929,20 +1158,20 @@
   }
 
   // Returns the one-away integer pel sad values around the best as follows:
-  // sad_list[0]: sad at the best integer pel
-  // sad_list[1]: sad at delta {0, -1} (left)   from the best integer pel
-  // sad_list[2]: sad at delta { 1, 0} (bottom) from the best integer pel
-  // sad_list[3]: sad at delta { 0, 1} (right)  from the best integer pel
-  // sad_list[4]: sad at delta {-1, 0} (top)    from the best integer pel
-  if (sad_list) {
+  // cost_list[0]: sad at the best integer pel
+  // cost_list[1]: sad at delta {0, -1} (left)   from the best integer pel
+  // cost_list[2]: sad at delta { 1, 0} (bottom) from the best integer pel
+  // cost_list[3]: sad at delta { 0, 1} (right)  from the best integer pel
+  // cost_list[4]: sad at delta {-1, 0} (top)    from the best integer pel
+  if (cost_list) {
     static const MV neighbors[4] = {{0, -1}, {1, 0}, {0, 1}, {-1, 0}};
-    if (sad_list[0] == INT_MAX) {
-      sad_list[0] = bestsad;
+    if (cost_list[0] == INT_MAX) {
+      cost_list[0] = bestsad;
       if (check_bounds(x, br, bc, 1)) {
         for (i = 0; i < 4; i++) {
-          const MV this_mv = {br + neighbors[i].row,
-            bc + neighbors[i].col};
-          sad_list[i + 1] = vfp->sdf(what->buf, what->stride,
+          const MV this_mv = { br + neighbors[i].row,
+                               bc + neighbors[i].col };
+          cost_list[i + 1] = vfp->sdf(what->buf, what->stride,
                                      get_buf_from_mv(in_what, &this_mv),
                                      in_what->stride);
         }
@@ -951,9 +1180,9 @@
           const MV this_mv = {br + neighbors[i].row,
             bc + neighbors[i].col};
           if (!is_mv_in(x, &this_mv))
-            sad_list[i + 1] = INT_MAX;
+            cost_list[i + 1] = INT_MAX;
           else
-            sad_list[i + 1] = vfp->sdf(what->buf, what->stride,
+            cost_list[i + 1] = vfp->sdf(what->buf, what->stride,
                                        get_buf_from_mv(in_what, &this_mv),
                                        in_what->stride);
         }
@@ -963,8 +1192,8 @@
         for (i = 0; i < 4; i++) {
           const MV this_mv = {br + neighbors[i].row,
             bc + neighbors[i].col};
-          if (sad_list[i + 1] != INT_MAX) {
-            sad_list[i + 1] +=
+          if (cost_list[i + 1] != INT_MAX) {
+            cost_list[i + 1] +=
                 mvsad_err_cost(x, &this_mv, &fcenter_mv, sad_per_bit);
           }
         }
@@ -1014,7 +1243,7 @@
                    int search_param,
                    int sad_per_bit,
                    int do_init_search,
-                   int *sad_list,
+                   int *cost_list,
                    const vp9_variance_fn_ptr_t *vfp,
                    int use_mvcost,
                    const MV *center_mv, MV *best_mv) {
@@ -1039,7 +1268,7 @@
       { -1024, 0}},
   };
   return vp9_pattern_search(x, ref_mv, search_param, sad_per_bit,
-                            do_init_search, sad_list, vfp, use_mvcost,
+                            do_init_search, cost_list, vfp, use_mvcost,
                             center_mv, best_mv,
                             hex_num_candidates, hex_candidates);
 }
@@ -1049,7 +1278,7 @@
                       int search_param,
                       int sad_per_bit,
                       int do_init_search,
-                      int *sad_list,
+                      int *cost_list,
                       const vp9_variance_fn_ptr_t *vfp,
                       int use_mvcost,
                       const MV *center_mv,
@@ -1081,7 +1310,7 @@
       {-512, 512}, {-1024, 0}},
   };
   return vp9_pattern_search_sad(x, ref_mv, search_param, sad_per_bit,
-                                do_init_search, sad_list, vfp, use_mvcost,
+                                do_init_search, cost_list, vfp, use_mvcost,
                                 center_mv, best_mv,
                                 bigdia_num_candidates, bigdia_candidates);
 }
@@ -1091,7 +1320,7 @@
                       int search_param,
                       int sad_per_bit,
                       int do_init_search,
-                      int *sad_list,
+                      int *cost_list,
                       const vp9_variance_fn_ptr_t *vfp,
                       int use_mvcost,
                       const MV *center_mv,
@@ -1123,7 +1352,7 @@
       {0, 1024}, {-1024, 1024}, {-1024, 0}},
   };
   return vp9_pattern_search(x, ref_mv, search_param, sad_per_bit,
-                            do_init_search, sad_list, vfp, use_mvcost,
+                            do_init_search, cost_list, vfp, use_mvcost,
                             center_mv, best_mv,
                             square_num_candidates, square_candidates);
 }
@@ -1133,13 +1362,13 @@
                         int search_param,
                         int sad_per_bit,
                         int do_init_search,  // must be zero for fast_hex
-                        int *sad_list,
+                        int *cost_list,
                         const vp9_variance_fn_ptr_t *vfp,
                         int use_mvcost,
                         const MV *center_mv,
                         MV *best_mv) {
   return vp9_hex_search(x, ref_mv, MAX(MAX_MVSEARCH_STEPS - 2, search_param),
-                        sad_per_bit, do_init_search, sad_list, vfp, use_mvcost,
+                        sad_per_bit, do_init_search, cost_list, vfp, use_mvcost,
                         center_mv, best_mv);
 }
 
@@ -1148,13 +1377,13 @@
                         int search_param,
                         int sad_per_bit,
                         int do_init_search,
-                        int *sad_list,
+                        int *cost_list,
                         const vp9_variance_fn_ptr_t *vfp,
                         int use_mvcost,
                         const MV *center_mv,
                         MV *best_mv) {
   return vp9_bigdia_search(x, ref_mv, MAX(MAX_MVSEARCH_STEPS - 2, search_param),
-                           sad_per_bit, do_init_search, sad_list, vfp,
+                           sad_per_bit, do_init_search, cost_list, vfp,
                            use_mvcost, center_mv, best_mv);
 }
 
@@ -1378,10 +1607,10 @@
 /* do_refine: If last step (1-away) of n-step search doesn't pick the center
               point as the best match, we will do a final 1-away diamond
               refining search  */
-
 int vp9_full_pixel_diamond(const VP9_COMP *cpi, MACROBLOCK *x,
                            MV *mvp_full, int step_param,
                            int sadpb, int further_steps, int do_refine,
+                           int *cost_list,
                            const vp9_variance_fn_ptr_t *fn_ptr,
                            const MV *ref_mv, MV *dst_mv) {
   MV temp_mv;
@@ -1434,6 +1663,11 @@
       *dst_mv = best_mv;
     }
   }
+
+  // Return cost list.
+  if (cost_list) {
+    calc_int_cost_list(x, ref_mv, sadpb, fn_ptr, dst_mv, cost_list);
+  }
   return bestsme;
 }
 
@@ -1753,46 +1987,46 @@
 int vp9_full_pixel_search(VP9_COMP *cpi, MACROBLOCK *x,
                           BLOCK_SIZE bsize, MV *mvp_full,
                           int step_param, int error_per_bit,
-                          int *sad_list,
+                          int *cost_list,
                           const MV *ref_mv, MV *tmp_mv,
                           int var_max, int rd) {
   const SPEED_FEATURES *const sf = &cpi->sf;
   const SEARCH_METHODS method = sf->mv.search_method;
   vp9_variance_fn_ptr_t *fn_ptr = &cpi->fn_ptr[bsize];
   int var = 0;
-  if (sad_list) {
-    sad_list[0] = INT_MAX;
-    sad_list[1] = INT_MAX;
-    sad_list[2] = INT_MAX;
-    sad_list[3] = INT_MAX;
-    sad_list[4] = INT_MAX;
+  if (cost_list) {
+    cost_list[0] = INT_MAX;
+    cost_list[1] = INT_MAX;
+    cost_list[2] = INT_MAX;
+    cost_list[3] = INT_MAX;
+    cost_list[4] = INT_MAX;
   }
 
   switch (method) {
     case FAST_DIAMOND:
       var = vp9_fast_dia_search(x, mvp_full, step_param, error_per_bit, 0,
-                                sad_list, fn_ptr, 1, ref_mv, tmp_mv);
+                                cost_list, fn_ptr, 1, ref_mv, tmp_mv);
       break;
     case FAST_HEX:
       var = vp9_fast_hex_search(x, mvp_full, step_param, error_per_bit, 0,
-                                sad_list, fn_ptr, 1, ref_mv, tmp_mv);
+                                cost_list, fn_ptr, 1, ref_mv, tmp_mv);
       break;
     case HEX:
       var = vp9_hex_search(x, mvp_full, step_param, error_per_bit, 1,
-                           sad_list, fn_ptr, 1, ref_mv, tmp_mv);
+                           cost_list, fn_ptr, 1, ref_mv, tmp_mv);
       break;
     case SQUARE:
       var = vp9_square_search(x, mvp_full, step_param, error_per_bit, 1,
-                              sad_list, fn_ptr, 1, ref_mv, tmp_mv);
+                              cost_list, fn_ptr, 1, ref_mv, tmp_mv);
       break;
     case BIGDIA:
       var = vp9_bigdia_search(x, mvp_full, step_param, error_per_bit, 1,
-                              sad_list, fn_ptr, 1, ref_mv, tmp_mv);
+                              cost_list, fn_ptr, 1, ref_mv, tmp_mv);
       break;
     case NSTEP:
       var = vp9_full_pixel_diamond(cpi, x, mvp_full, step_param, error_per_bit,
                                    MAX_MVSEARCH_STEPS - 1 - step_param,
-                                   1, fn_ptr, ref_mv, tmp_mv);
+                                   1, cost_list, fn_ptr, ref_mv, tmp_mv);
       break;
     default:
       assert(!"Invalid search method.");
diff --git a/source/libvpx/vp9/encoder/vp9_mcomp.h b/source/libvpx/vp9/encoder/vp9_mcomp.h
index 9b4734a..9ddca25 100644
--- a/source/libvpx/vp9/encoder/vp9_mcomp.h
+++ b/source/libvpx/vp9/encoder/vp9_mcomp.h
@@ -70,6 +70,7 @@
 int vp9_full_pixel_diamond(const struct VP9_COMP *cpi, MACROBLOCK *x,
                            MV *mvp_full, int step_param,
                            int sadpb, int further_steps, int do_refine,
+                           int *cost_list,
                            const vp9_variance_fn_ptr_t *fn_ptr,
                            const MV *ref_mv, MV *dst_mv);
 
@@ -79,7 +80,7 @@
     int search_param,
     int error_per_bit,
     int do_init_search,
-    int *sad_list,
+    int *cost_list,
     const vp9_variance_fn_ptr_t *vf,
     int use_mvcost,
     const MV *center_mv,
@@ -99,7 +100,7 @@
     const vp9_variance_fn_ptr_t *vfp,
     int forced_stop,  // 0 - full, 1 - qtr only, 2 - half only
     int iters_per_step,
-    int *sad_list,
+    int *cost_list,
     int *mvjcost, int *mvcost[2],
     int *distortion, unsigned int *sse1,
     const uint8_t *second_pred,
@@ -107,6 +108,8 @@
 
 extern fractional_mv_step_fp vp9_find_best_sub_pixel_tree;
 extern fractional_mv_step_fp vp9_find_best_sub_pixel_tree_pruned;
+extern fractional_mv_step_fp vp9_find_best_sub_pixel_tree_pruned_more;
+extern fractional_mv_step_fp vp9_find_best_sub_pixel_tree_pruned_evenmore;
 
 typedef int (*vp9_full_search_fn_t)(const MACROBLOCK *x,
                                     const MV *ref_mv, int sad_per_bit,
@@ -139,7 +142,7 @@
 int vp9_full_pixel_search(struct VP9_COMP *cpi, MACROBLOCK *x,
                           BLOCK_SIZE bsize, MV *mvp_full,
                           int step_param, int error_per_bit,
-                          int *sad_list,
+                          int *cost_list,
                           const MV *ref_mv, MV *tmp_mv,
                           int var_max, int rd);
 
diff --git a/source/libvpx/vp9/encoder/vp9_picklpf.c b/source/libvpx/vp9/encoder/vp9_picklpf.c
index 2fc05e7..85984fd 100644
--- a/source/libvpx/vp9/encoder/vp9_picklpf.c
+++ b/source/libvpx/vp9/encoder/vp9_picklpf.c
@@ -40,7 +40,15 @@
 
   vp9_loop_filter_frame(cm->frame_to_show, cm, &cpi->mb.e_mbd, filt_level, 1,
                         partial_frame);
+#if CONFIG_VP9_HIGHBITDEPTH
+  if (cm->use_highbitdepth) {
+    filt_err = vp9_highbd_get_y_sse(sd, cm->frame_to_show, cm->bit_depth);
+  } else {
+    filt_err = vp9_get_y_sse(sd, cm->frame_to_show);
+  }
+#else
   filt_err = vp9_get_y_sse(sd, cm->frame_to_show);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
 
   // Re-instate the unfiltered frame
   vpx_yv12_copy_y(&cpi->last_frame_uf, cm->frame_to_show);
@@ -145,7 +153,26 @@
     const int q = vp9_ac_quant(cm->base_qindex, 0, cm->bit_depth);
     // These values were determined by linear fitting the result of the
     // searched level, filt_guess = q * 0.316206 + 3.87252
+#if CONFIG_VP9_HIGHBITDEPTH
+    int filt_guess;
+    switch (cm->bit_depth) {
+      case VPX_BITS_8:
+        filt_guess = ROUND_POWER_OF_TWO(q * 20723 + 1015158, 18);
+        break;
+      case VPX_BITS_10:
+        filt_guess = ROUND_POWER_OF_TWO(q * 20723 + 4060632, 20);
+        break;
+      case VPX_BITS_12:
+        filt_guess = ROUND_POWER_OF_TWO(q * 20723 + 16242526, 22);
+        break;
+      default:
+        assert(0 && "bit_depth should be VPX_BITS_8, VPX_BITS_10 "
+                    "or VPX_BITS_12");
+        return;
+    }
+#else
     int filt_guess = ROUND_POWER_OF_TWO(q * 20723 + 1015158, 18);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
     if (cm->frame_type == KEY_FRAME)
       filt_guess -= 4;
     lf->filter_level = clamp(filt_guess, min_filter_level, max_filter_level);
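
Note: in the filter-level guess above, the 10- and 12-bit constants are exact
rescalings of the 8-bit fit: q is roughly 4x/16x larger at those depths, so the
intercept and the rounding shift scale to match. A small sketch of that
equivalence follows; ROUND_POWER_OF_TWO is copied from libvpx's definition, and
the q values are made up.

/* Sketch: the 10-bit filter guess equals the 8-bit one by construction. */
#include <assert.h>

#define ROUND_POWER_OF_TWO(value, n) (((value) + (1 << ((n) - 1))) >> (n))

int main(void) {
  /* At 10 bits the AC quantizer is 4x its 8-bit value, so the intercept
   * scales by 4 (1015158 -> 4060632) and the shift by 2 (18 -> 20); at
   * 12 bits, by 16 and 4. The guess is thus bit-depth independent. */
  const int q8 = 100;
  const int q10 = q8 * 4;  /* assumption: 10-bit dequant ~= 4x 8-bit */
  const int g8 = ROUND_POWER_OF_TWO(q8 * 20723 + 1015158, 18);
  const int g10 = ROUND_POWER_OF_TWO(q10 * 20723 + 4060632, 20);
  assert(g8 == g10);  /* numerator and shift both scale by exactly 4 */
  return g8 == g10 ? 0 : 1;
}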
diff --git a/source/libvpx/vp9/encoder/vp9_pickmode.c b/source/libvpx/vp9/encoder/vp9_pickmode.c
index a788c1d..b74b2dd 100644
--- a/source/libvpx/vp9/encoder/vp9_pickmode.c
+++ b/source/libvpx/vp9/encoder/vp9_pickmode.c
@@ -132,7 +132,7 @@
   const int tmp_row_min = x->mv_row_min;
   const int tmp_row_max = x->mv_row_max;
   int rv = 0;
-  int sad_list[5];
+  int cost_list[5];
   const YV12_BUFFER_CONFIG *scaled_ref_frame = vp9_get_scaled_ref_frame(cpi,
                                                                         ref);
   if (cpi->common.show_frame &&
@@ -160,7 +160,7 @@
   mvp_full.row >>= 3;
 
   vp9_full_pixel_search(cpi, x, bsize, &mvp_full, step_param, sadpb,
-                        cond_sad_list(cpi, sad_list),
+                        cond_cost_list(cpi, cost_list),
                         &ref_mv, &tmp_mv->as_mv, INT_MAX, 0);
 
   x->mv_col_min = tmp_col_min;
@@ -187,7 +187,7 @@
                                  &cpi->fn_ptr[bsize],
                                  cpi->sf.mv.subpel_force_stop,
                                  cpi->sf.mv.subpel_iters_per_step,
-                                 cond_sad_list(cpi, sad_list),
+                                 cond_cost_list(cpi, cost_list),
                                  x->nmvjointcost, x->mvcost,
                                  &dis, &x->pred_sse[ref], NULL, 0, 0);
     x->pred_mv[ref] = tmp_mv->as_mv;
@@ -235,19 +235,54 @@
               tx_mode_to_biggest_tx_size[cpi->common.tx_mode]);
     else
       xd->mi[0].src_mi->mbmi.tx_size = TX_8X8;
+
+    if (cpi->sf.partition_search_type == VAR_BASED_PARTITION &&
+        xd->mi[0].src_mi->mbmi.tx_size > TX_16X16)
+      xd->mi[0].src_mi->mbmi.tx_size = TX_16X16;
   } else {
     xd->mi[0].src_mi->mbmi.tx_size =
         MIN(max_txsize_lookup[bsize],
             tx_mode_to_biggest_tx_size[cpi->common.tx_mode]);
   }
 
+#if CONFIG_VP9_HIGHBITDEPTH
+  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+    vp9_model_rd_from_var_lapndz(sse - var, 1 << num_pels_log2_lookup[bsize],
+                                 dc_quant >> (xd->bd - 5), &rate, &dist);
+  } else {
+    vp9_model_rd_from_var_lapndz(sse - var, 1 << num_pels_log2_lookup[bsize],
+                                 dc_quant >> 3, &rate, &dist);
+  }
+#else
   vp9_model_rd_from_var_lapndz(sse - var, 1 << num_pels_log2_lookup[bsize],
                                dc_quant >> 3, &rate, &dist);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
   *out_rate_sum = rate >> 1;
   *out_dist_sum = dist << 3;
 
-  vp9_model_rd_from_var_lapndz(var, 1 << num_pels_log2_lookup[bsize],
-                               ac_quant >> 3, &rate, &dist);
+#if CONFIG_VP9_HIGHBITDEPTH
+  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+    vp9_model_rd_from_var_lapndz(var,
+                                 1 << num_pels_log2_lookup[bsize],
+                                 ac_quant >> (xd->bd - 5),
+                                 &rate,
+                                 &dist);
+  } else {
+    vp9_model_rd_from_var_lapndz(var,
+                                 1 << num_pels_log2_lookup[bsize],
+                                 ac_quant >> 3,
+                                 &rate,
+                                 &dist);
+  }
+#else
+  vp9_model_rd_from_var_lapndz(var,
+                               1 << num_pels_log2_lookup[bsize],
+                               ac_quant >> 3,
+                               &rate,
+                               &dist);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
   *out_rate_sum += rate;
   *out_dist_sum += dist << 4;
 }
@@ -293,16 +328,29 @@
     // The encode_breakout input
     const unsigned int min_thresh =
         MIN(((unsigned int)x->encode_breakout << 4), max_thresh);
+#if CONFIG_VP9_HIGHBITDEPTH
+    const int shift = 2 * xd->bd - 16;
+#endif
 
     // Calculate threshold according to dequant value.
     thresh_ac = (xd->plane[0].dequant[1] * xd->plane[0].dequant[1]) / 9;
+#if CONFIG_VP9_HIGHBITDEPTH
+    if ((xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) && shift > 0) {
+      thresh_ac = ROUND_POWER_OF_TWO(thresh_ac, shift);
+    }
+#endif  // CONFIG_VP9_HIGHBITDEPTH
     thresh_ac = clamp(thresh_ac, min_thresh, max_thresh);
 
     // Adjust ac threshold according to partition size.
     thresh_ac >>=
-        8 - (b_width_log2(bsize) + b_height_log2(bsize));
+        8 - (b_width_log2_lookup[bsize] + b_height_log2_lookup[bsize]);
 
     thresh_dc = (xd->plane[0].dequant[0] * xd->plane[0].dequant[0] >> 6);
+#if CONFIG_VP9_HIGHBITDEPTH
+    if ((xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) && shift > 0) {
+      thresh_dc = ROUND_POWER_OF_TWO(thresh_dc, shift);
+    }
+#endif  // CONFIG_VP9_HIGHBITDEPTH
   } else {
     thresh_ac = 0;
     thresh_dc = 0;
@@ -389,7 +437,7 @@
   pd->dst.buf = &dst_buf_base[4 * (j * dst_stride + i)];
   // Use source buffer as an approximation for the fully reconstructed buffer.
   vp9_predict_intra_block(xd, block >> (2 * tx_size),
-                          b_width_log2(plane_bsize),
+                          b_width_log2_lookup[plane_bsize],
                           tx_size, args->mode,
                           p->src.buf, src_stride,
                           pd->dst.buf, dst_stride,
@@ -438,9 +486,12 @@
   // var_y and sse_y are saved to be used in skipping checking
   unsigned int var_y = UINT_MAX;
   unsigned int sse_y = UINT_MAX;
-
-  const int intra_cost_penalty =
-      20 * vp9_dc_quant(cm->base_qindex, cm->y_dc_delta_q, cm->bit_depth);
+  // Reduce the intra cost penalty for small blocks (<=16x16).
+  const int reduction_fac =
+      (cpi->sf.partition_search_type == VAR_BASED_PARTITION &&
+       bsize <= BLOCK_16X16) ? 4 : 1;
+  const int intra_cost_penalty = vp9_get_intra_cost_penalty(
+      cm->base_qindex, cm->y_dc_delta_q, cm->bit_depth) / reduction_fac;
   const int64_t inter_mode_thresh = RDCOST(x->rdmult, x->rddiv,
                                            intra_cost_penalty, 0);
   const int intra_mode_cost = 50;
@@ -449,7 +500,7 @@
   const int *const rd_threshes = cpi->rd.threshes[segment_id][bsize];
   const int *const rd_thresh_freq_fact = cpi->rd.thresh_freq_fact[bsize];
   INTERP_FILTER filter_ref = cm->interp_filter;
-  const int bsl = mi_width_log2(bsize);
+  const int bsl = mi_width_log2_lookup[bsize];
   const int pred_filter_search = cm->interp_filter == SWITCHABLE ?
       (((mi_row + mi_col) >> bsl) +
        get_chessboard_index(cm->current_video_frame)) & 0x1 : 0;
@@ -461,14 +512,25 @@
   // tmp[3] points to dst buffer, and the other 3 point to allocated buffers.
   PRED_BUFFER tmp[4];
   DECLARE_ALIGNED_ARRAY(16, uint8_t, pred_buf, 3 * 64 * 64);
+#if CONFIG_VP9_HIGHBITDEPTH
+  DECLARE_ALIGNED_ARRAY(16, uint16_t, pred_buf_16, 3 * 64 * 64);
+#endif
   struct buf_2d orig_dst = pd->dst;
   PRED_BUFFER *best_pred = NULL;
   PRED_BUFFER *this_mode_pred = NULL;
+  const int pixels_in_block = bh * bw;
 
   if (cpi->sf.reuse_inter_pred_sby) {
     int i;
     for (i = 0; i < 3; i++) {
-      tmp[i].data = &pred_buf[bw * bh * i];
+#if CONFIG_VP9_HIGHBITDEPTH
+      if (cm->use_highbitdepth)
+        tmp[i].data = CONVERT_TO_BYTEPTR(&pred_buf_16[pixels_in_block * i]);
+      else
+        tmp[i].data = &pred_buf[pixels_in_block * i];
+#else
+      tmp[i].data = &pred_buf[pixels_in_block * i];
+#endif  // CONFIG_VP9_HIGHBITDEPTH
       tmp[i].stride = bw;
       tmp[i].in_use = 0;
     }
@@ -557,7 +619,8 @@
         continue;
 
       if (this_mode == NEWMV) {
-        if (this_rd < (int64_t)(1 << num_pels_log2_lookup[bsize]))
+        if (cpi->sf.partition_search_type != VAR_BASED_PARTITION &&
+            this_rd < (int64_t)(1 << num_pels_log2_lookup[bsize]))
           continue;
         if (!combined_motion_search(cpi, x, bsize, mi_row, mi_col,
                                     &frame_mv[NEWMV][ref_frame],
@@ -703,8 +766,18 @@
   if (best_pred != NULL && cpi->sf.reuse_inter_pred_sby &&
       best_pred->data != orig_dst.buf) {
     pd->dst = orig_dst;
+#if CONFIG_VP9_HIGHBITDEPTH
+    if (cm->use_highbitdepth) {
+      vp9_highbd_convolve_copy(best_pred->data, bw, pd->dst.buf, pd->dst.stride,
+                               NULL, 0, NULL, 0, bw, bh, xd->bd);
+    } else {
+      vp9_convolve_copy(best_pred->data, bw, pd->dst.buf, pd->dst.stride,
+                        NULL, 0, NULL, 0, bw, bh);
+    }
+#else
     vp9_convolve_copy(best_pred->data, bw, pd->dst.buf, pd->dst.stride, NULL, 0,
                       NULL, 0, bw, bh);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
   }
 
   mbmi->mode          = best_mode;
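
Note: the encode-breakout hunk above rescales its thresholds for high bit
depth. They are built from squared dequant values, which grow by 2^(2*(bd-8))
at higher depths; shifting right by 2*bd - 16 restores the 8-bit scale the
breakout comparison expects. A sketch with made-up values:

/* Sketch of the high-bit-depth threshold rescale used above. */
#include <stdio.h>

#define ROUND_POWER_OF_TWO(value, n) (((value) + (1 << ((n) - 1))) >> (n))

static unsigned rescale_thresh(unsigned thresh_from_sq_dequant, int bd) {
  const int shift = 2 * bd - 16;  /* 0 at 8 bits, 4 at 10, 8 at 12 */
  return shift > 0 ? ROUND_POWER_OF_TWO(thresh_from_sq_dequant, shift)
                   : thresh_from_sq_dequant;
}

int main(void) {
  /* A 10-bit dequant value is ~4x the 8-bit one, so its square is ~16x;
   * after the shift both land on the same 8-bit-scale threshold. */
  printf("%u vs %u\n", rescale_thresh((64 * 64) / 9, 8),
         rescale_thresh((256 * 256) / 9, 10));
  return 0;
}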
diff --git a/source/libvpx/vp9/encoder/vp9_quantize.c b/source/libvpx/vp9/encoder/vp9_quantize.c
index 3d2c409..2ba1f92 100644
--- a/source/libvpx/vp9/encoder/vp9_quantize.c
+++ b/source/libvpx/vp9/encoder/vp9_quantize.c
@@ -41,10 +41,10 @@
 }
 
 #if CONFIG_VP9_HIGHBITDEPTH
-void vp9_high_quantize_dc(const tran_low_t *coeff_ptr, int skip_block,
-                          const int16_t *round_ptr, const int16_t quant,
-                          tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
-                          const int16_t dequant_ptr, uint16_t *eob_ptr) {
+void vp9_highbd_quantize_dc(const tran_low_t *coeff_ptr, int skip_block,
+                            const int16_t *round_ptr, const int16_t quant,
+                            tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+                            const int16_t dequant_ptr, uint16_t *eob_ptr) {
   int eob = -1;
 
   if (!skip_block) {
@@ -88,10 +88,14 @@
 }
 
 #if CONFIG_VP9_HIGHBITDEPTH
-void vp9_high_quantize_dc_32x32(const tran_low_t *coeff_ptr, int skip_block,
-                                const int16_t *round_ptr, const int16_t quant,
-                                tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
-                                const int16_t dequant_ptr, uint16_t *eob_ptr) {
+void vp9_highbd_quantize_dc_32x32(const tran_low_t *coeff_ptr,
+                                  int skip_block,
+                                  const int16_t *round_ptr,
+                                  const int16_t quant,
+                                  tran_low_t *qcoeff_ptr,
+                                  tran_low_t *dqcoeff_ptr,
+                                  const int16_t dequant_ptr,
+                                  uint16_t *eob_ptr) {
   int eob = -1;
 
   if (!skip_block) {
@@ -154,14 +158,20 @@
 }
 
 #if CONFIG_VP9_HIGHBITDEPTH
-void vp9_high_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t count,
-                            int skip_block, const int16_t *zbin_ptr,
-                            const int16_t *round_ptr, const int16_t *quant_ptr,
-                            const int16_t *quant_shift_ptr,
-                            tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
-                            const int16_t *dequant_ptr,
-                            int zbin_oq_value, uint16_t *eob_ptr,
-                            const int16_t *scan, const int16_t *iscan) {
+void vp9_highbd_quantize_fp_c(const tran_low_t *coeff_ptr,
+                              intptr_t count,
+                              int skip_block,
+                              const int16_t *zbin_ptr,
+                              const int16_t *round_ptr,
+                              const int16_t *quant_ptr,
+                              const int16_t *quant_shift_ptr,
+                              tran_low_t *qcoeff_ptr,
+                              tran_low_t *dqcoeff_ptr,
+                              const int16_t *dequant_ptr,
+                              int zbin_oq_value,
+                              uint16_t *eob_ptr,
+                              const int16_t *scan,
+                              const int16_t *iscan) {
   int i;
   int eob = -1;
   // TODO(jingning) Decide the need of these arguments after the
@@ -242,17 +252,17 @@
 }
 
 #if CONFIG_VP9_HIGHBITDEPTH
-void vp9_high_quantize_fp_32x32_c(const tran_low_t *coeff_ptr,
-                                  intptr_t n_coeffs, int skip_block,
-                                  const int16_t *zbin_ptr,
-                                  const int16_t *round_ptr,
-                                  const int16_t *quant_ptr,
-                                  const int16_t *quant_shift_ptr,
-                                  tran_low_t *qcoeff_ptr,
-                                  tran_low_t *dqcoeff_ptr,
-                                  const int16_t *dequant_ptr,
-                                  int zbin_oq_value, uint16_t *eob_ptr,
-                                  const int16_t *scan, const int16_t *iscan) {
+void vp9_highbd_quantize_fp_32x32_c(const tran_low_t *coeff_ptr,
+                                    intptr_t n_coeffs, int skip_block,
+                                    const int16_t *zbin_ptr,
+                                    const int16_t *round_ptr,
+                                    const int16_t *quant_ptr,
+                                    const int16_t *quant_shift_ptr,
+                                    tran_low_t *qcoeff_ptr,
+                                    tran_low_t *dqcoeff_ptr,
+                                    const int16_t *dequant_ptr,
+                                    int zbin_oq_value, uint16_t *eob_ptr,
+                                    const int16_t *scan, const int16_t *iscan) {
   int i, eob = -1;
   (void)zbin_ptr;
   (void)quant_shift_ptr;
@@ -340,14 +350,14 @@
 }
 
 #if CONFIG_VP9_HIGHBITDEPTH
-void vp9_high_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
-                           int skip_block, const int16_t *zbin_ptr,
-                           const int16_t *round_ptr, const int16_t *quant_ptr,
-                           const int16_t *quant_shift_ptr,
-                           tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
-                           const int16_t *dequant_ptr, int zbin_oq_value,
-                           uint16_t *eob_ptr, const int16_t *scan,
-                           const int16_t *iscan) {
+void vp9_highbd_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+                             int skip_block, const int16_t *zbin_ptr,
+                             const int16_t *round_ptr, const int16_t *quant_ptr,
+                             const int16_t *quant_shift_ptr,
+                             tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+                             const int16_t *dequant_ptr, int zbin_oq_value,
+                             uint16_t *eob_ptr, const int16_t *scan,
+                             const int16_t *iscan) {
   int i, non_zero_count = (int)n_coeffs, eob = -1;
   const int zbins[2] = { zbin_ptr[0] + zbin_oq_value,
                          zbin_ptr[1] + zbin_oq_value };
@@ -452,17 +462,17 @@
 }
 
 #if CONFIG_VP9_HIGHBITDEPTH
-void vp9_high_quantize_b_32x32_c(const tran_low_t *coeff_ptr,
-                                 intptr_t n_coeffs, int skip_block,
-                                 const int16_t *zbin_ptr,
-                                 const int16_t *round_ptr,
-                                 const int16_t *quant_ptr,
-                                 const int16_t *quant_shift_ptr,
-                                 tran_low_t *qcoeff_ptr,
-                                 tran_low_t *dqcoeff_ptr,
-                                 const int16_t *dequant_ptr,
-                                 int zbin_oq_value, uint16_t *eob_ptr,
-                                 const int16_t *scan, const int16_t *iscan) {
+void vp9_highbd_quantize_b_32x32_c(const tran_low_t *coeff_ptr,
+                                   intptr_t n_coeffs, int skip_block,
+                                   const int16_t *zbin_ptr,
+                                   const int16_t *round_ptr,
+                                   const int16_t *quant_ptr,
+                                   const int16_t *quant_shift_ptr,
+                                   tran_low_t *qcoeff_ptr,
+                                   tran_low_t *dqcoeff_ptr,
+                                   const int16_t *dequant_ptr,
+                                   int zbin_oq_value, uint16_t *eob_ptr,
+                                   const int16_t *scan, const int16_t *iscan) {
   const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0] + zbin_oq_value, 1),
                          ROUND_POWER_OF_TWO(zbin_ptr[1] + zbin_oq_value, 1) };
   const int nzbins[2] = { zbins[0] * -1, zbins[1] * -1 };
@@ -519,7 +529,7 @@
 
 #if CONFIG_VP9_HIGHBITDEPTH
   if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
-    vp9_high_quantize_b(BLOCK_OFFSET(p->coeff, block),
-                        16, x->skip_block,
-                        p->zbin, p->round, p->quant, p->quant_shift,
-                        BLOCK_OFFSET(p->qcoeff, block),
+    vp9_highbd_quantize_b(BLOCK_OFFSET(p->coeff, block),
+                          16, x->skip_block,
+                          p->zbin, p->round, p->quant, p->quant_shift,
+                          BLOCK_OFFSET(p->qcoeff, block),
@@ -641,13 +651,14 @@
   x->plane[0].quant_shift = quants->y_quant_shift[qindex];
   x->plane[0].zbin = quants->y_zbin[qindex];
   x->plane[0].round = quants->y_round[qindex];
-  x->plane[0].quant_thred[0] = cm->y_dequant[qindex][0] *
-                                  cm->y_dequant[qindex][0];
-  x->plane[0].quant_thred[1] = cm->y_dequant[qindex][1] *
-                                  cm->y_dequant[qindex][1];
   x->plane[0].zbin_extra = (int16_t)((cm->y_dequant[qindex][1] * zbin) >> 7);
   xd->plane[0].dequant = cm->y_dequant[qindex];
 
+  x->plane[0].quant_thred[0] = (x->plane[0].zbin[0] + x->plane[0].zbin_extra) *
+      (x->plane[0].zbin[0] + x->plane[0].zbin_extra);
+  x->plane[0].quant_thred[1] = (x->plane[0].zbin[1] + x->plane[0].zbin_extra) *
+      (x->plane[0].zbin[1] + x->plane[0].zbin_extra);
+
   // UV
   for (i = 1; i < 3; i++) {
     x->plane[i].quant = quants->uv_quant[qindex];
@@ -656,12 +667,15 @@
     x->plane[i].quant_shift = quants->uv_quant_shift[qindex];
     x->plane[i].zbin = quants->uv_zbin[qindex];
     x->plane[i].round = quants->uv_round[qindex];
-    x->plane[i].quant_thred[0] = cm->y_dequant[qindex][0] *
-                                    cm->y_dequant[qindex][0];
-    x->plane[i].quant_thred[1] = cm->y_dequant[qindex][1] *
-                                    cm->y_dequant[qindex][1];
     x->plane[i].zbin_extra = (int16_t)((cm->uv_dequant[qindex][1] * zbin) >> 7);
     xd->plane[i].dequant = cm->uv_dequant[qindex];
+
+    x->plane[i].quant_thred[0] =
+        (x->plane[i].zbin[0] + x->plane[i].zbin_extra) *
+        (x->plane[i].zbin[0] + x->plane[i].zbin_extra);
+    x->plane[i].quant_thred[1] =
+        (x->plane[i].zbin[1] + x->plane[i].zbin_extra) *
+        (x->plane[i].zbin[1] + x->plane[i].zbin_extra);
   }
 
   x->skip_block = vp9_segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP);
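
Note: the quant_thred hunks above also fix a real bug: the UV loop previously
derived its thresholds from the luma dequant table (cm->y_dequant). The
thresholds are now the square of each plane's effective zero bin. A reduced
sketch of the new computation, assuming zbin index 0 is the DC band and 1 the
AC bands as in the usual vp9 layout; the helper name is ours.

/* Reduced sketch of the revised per-plane skip thresholds: the square of
 * (zbin + zbin_extra), computed per plane rather than from the shared
 * luma dequant values. */
#include <stdint.h>

static void set_quant_thred(const int16_t zbin[2], int16_t zbin_extra,
                            int quant_thred[2]) {
  const int z0 = zbin[0] + zbin_extra;  /* DC band */
  const int z1 = zbin[1] + zbin_extra;  /* AC bands */
  quant_thred[0] = z0 * z0;
  quant_thred[1] = z1 * z1;
}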
diff --git a/source/libvpx/vp9/encoder/vp9_quantize.h b/source/libvpx/vp9/encoder/vp9_quantize.h
index d7edb0b..cee46e7 100644
--- a/source/libvpx/vp9/encoder/vp9_quantize.h
+++ b/source/libvpx/vp9/encoder/vp9_quantize.h
@@ -49,15 +49,18 @@
                                 const int16_t *scan, const int16_t *iscan);
 
 #if CONFIG_VP9_HIGHBITDEPTH
-void vp9_high_quantize_dc(const tran_low_t *coeff_ptr, int skip_block,
-                          const int16_t *round_ptr, const int16_t quant_ptr,
-                          tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
-                          const int16_t dequant_ptr, uint16_t *eob_ptr);
-void vp9_high_quantize_dc_32x32(const tran_low_t *coeff_ptr, int skip_block,
-                                const int16_t *round_ptr,
-                                const int16_t quant_ptr, tran_low_t *qcoeff_ptr,
-                                tran_low_t *dqcoeff_ptr,
-                                const int16_t dequant_ptr, uint16_t *eob_ptr);
+void vp9_highbd_quantize_dc(const tran_low_t *coeff_ptr, int skip_block,
+                            const int16_t *round_ptr, const int16_t quant_ptr,
+                            tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+                            const int16_t dequant_ptr, uint16_t *eob_ptr);
+void vp9_highbd_quantize_dc_32x32(const tran_low_t *coeff_ptr,
+                                  int skip_block,
+                                  const int16_t *round_ptr,
+                                  const int16_t quant_ptr,
+                                  tran_low_t *qcoeff_ptr,
+                                  tran_low_t *dqcoeff_ptr,
+                                  const int16_t dequant_ptr,
+                                  uint16_t *eob_ptr);
 #endif
 
 struct VP9_COMP;
diff --git a/source/libvpx/vp9/encoder/vp9_ratectrl.c b/source/libvpx/vp9/encoder/vp9_ratectrl.c
index 9b6c773..65bca66 100644
--- a/source/libvpx/vp9/encoder/vp9_ratectrl.c
+++ b/source/libvpx/vp9/encoder/vp9_ratectrl.c
@@ -177,6 +177,9 @@
   const double q = vp9_convert_qindex_to_q(qindex, bit_depth);
   int enumerator = frame_type == KEY_FRAME ? 2700000 : 1800000;
 
+  assert(correction_factor <= MAX_BPB_FACTOR &&
+         correction_factor >= MIN_BPB_FACTOR);
+
   // q based adjustment to baseline enumerator
   enumerator += (int)(enumerator * q) >> 12;
   return (int)(enumerator * correction_factor / q);
@@ -187,7 +190,8 @@
                               vpx_bit_depth_t bit_depth) {
   const int bpm = (int)(vp9_rc_bits_per_mb(frame_type, q, correction_factor,
                                            bit_depth));
-  return ((uint64_t)bpm * mbs) >> BPER_MB_NORMBITS;
+  return MAX(FRAME_OVERHEAD_BITS,
+             (int)((uint64_t)bpm * mbs) >> BPER_MB_NORMBITS);
 }
 
 int vp9_rc_clamp_pframe_target_size(const VP9_COMP *const cpi, int target) {
@@ -276,7 +280,7 @@
   }
 
   rc->last_q[KEY_FRAME] = oxcf->best_allowed_q;
-  rc->last_q[INTER_FRAME] = oxcf->best_allowed_q;
+  rc->last_q[INTER_FRAME] = oxcf->worst_allowed_q;
 
   rc->buffer_level =    rc->starting_buffer_level;
   rc->bits_off_target = rc->starting_buffer_level;
@@ -298,7 +302,6 @@
   rc->source_alt_ref_active = 0;
 
   rc->frames_till_gf_update_due = 0;
-
   rc->ni_av_qi = oxcf->worst_allowed_q;
   rc->ni_tot_qi = 0;
   rc->ni_frames = 0;
@@ -410,7 +413,7 @@
                                                  rate_correction_factor,
                                                  cm->bit_depth);
   // Work out a size correction factor.
-  if (projected_size_based_on_q > 0)
+  if (projected_size_based_on_q > FRAME_OVERHEAD_BITS)
     correction_factor = (100 * cpi->rc.projected_frame_size) /
                             projected_size_based_on_q;
 
@@ -559,7 +562,7 @@
   int adjustment = 0;
   int active_worst_quality;
   if (cm->frame_type == KEY_FRAME)
-    return rc->worst_quality;
+    return rc->worst_quality * 4 / 5;
   if (cm->current_video_frame > 1)
     active_worst_quality = MIN(rc->worst_quality,
                                rc->avg_frame_qindex[INTER_FRAME] * 5 / 4);
@@ -988,6 +991,21 @@
     }
   }
 
+  // Extension to max or min Q if undershoot or overshoot is outside
+  // the permitted range.
+  if ((cpi->oxcf.rc_mode == VPX_VBR) &&
+      (cpi->twopass.gf_zeromotion_pct < VLOW_MOTION_THRESHOLD)) {
+    if (frame_is_intra_only(cm) ||
+        (!rc->is_src_frame_alt_ref &&
+         (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame))) {
+      active_best_quality -= cpi->twopass.extend_minq;
+      active_worst_quality += (cpi->twopass.extend_maxq / 2);
+    } else {
+      active_best_quality -= cpi->twopass.extend_minq / 2;
+      active_worst_quality += cpi->twopass.extend_maxq;
+    }
+  }
+
 #if LIMIT_QRANGE_FOR_ALTREF_AND_KEY
   vp9_clear_system_state();
   // Static forced key frames Q restrictions dealt with elsewhere.
@@ -1482,9 +1500,7 @@
   rc->max_gf_interval = 16;
 
   // Extended interval for genuinely static scenes
-  rc->static_scene_max_gf_interval = oxcf->key_freq >> 1;
-  if (rc->static_scene_max_gf_interval > (MAX_LAG_BUFFERS * 2))
-    rc->static_scene_max_gf_interval = MAX_LAG_BUFFERS * 2;
+  rc->static_scene_max_gf_interval = MAX_LAG_BUFFERS * 2;
 
   if (is_altref_enabled(cpi)) {
     if (rc->static_scene_max_gf_interval > oxcf->lag_in_frames - 1)
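
The vp9_ratectrl.c changes above tighten several estimates: correction_factor is now assert-bounded, projected frame sizes are floored at FRAME_OVERHEAD_BITS, and two-pass VBR encodes may extend the active quantizer range by extend_minq/extend_maxq when undershoot or overshoot leaves the permitted range. A hedged sketch of the new size floor; the constant values below are illustrative placeholders, not the libvpx definitions, and the sketch shifts before narrowing to int:

#include <stdint.h>

#define FRAME_OVERHEAD_BITS 200  /* illustrative placeholder */
#define BPER_MB_NORMBITS 9       /* illustrative placeholder */
#define MAX(a, b) ((a) > (b) ? (a) : (b))

/* Never project a frame smaller than its fixed per-frame overhead. */
static int estimate_bits_at_q(int bits_per_mb, int mbs) {
  return MAX(FRAME_OVERHEAD_BITS,
             (int)(((uint64_t)bits_per_mb * mbs) >> BPER_MB_NORMBITS));
}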
diff --git a/source/libvpx/vp9/encoder/vp9_ratectrl.h b/source/libvpx/vp9/encoder/vp9_ratectrl.h
index edfb9fc..bc74129 100644
--- a/source/libvpx/vp9/encoder/vp9_ratectrl.h
+++ b/source/libvpx/vp9/encoder/vp9_ratectrl.h
@@ -87,6 +87,8 @@
   int long_rolling_target_bits;
   int long_rolling_actual_bits;
 
+  int rate_error_estimate;
+
   int64_t total_actual_bits;
   int64_t total_target_bits;
   int64_t total_target_vs_actual;
diff --git a/source/libvpx/vp9/encoder/vp9_rd.c b/source/libvpx/vp9/encoder/vp9_rd.c
index 17369d4..7f526fc 100644
--- a/source/libvpx/vp9/encoder/vp9_rd.c
+++ b/source/libvpx/vp9/encoder/vp9_rd.c
@@ -44,6 +44,18 @@
 // Factor to weigh the rate for switchable interp filters.
 #define SWITCHABLE_INTERP_RATE_FACTOR 1
 
+void vp9_rd_cost_reset(RD_COST *rd_cost) {
+  rd_cost->rate = INT_MAX;
+  rd_cost->dist = INT64_MAX;
+  rd_cost->rdcost = INT64_MAX;
+}
+
+void vp9_rd_cost_init(RD_COST *rd_cost) {
+  rd_cost->rate = 0;
+  rd_cost->dist = 0;
+  rd_cost->rdcost = 0;
+}
+
 // The baseline rd thresholds for breaking out of the rd loop for
 // certain modes are assumed to be based on 8x8 blocks.
 // This table is used to correct for block size.
@@ -136,9 +148,9 @@
 };
 
 int vp9_compute_rd_mult(const VP9_COMP *cpi, int qindex) {
-  const int q = vp9_dc_quant(qindex, 0, cpi->common.bit_depth);
+  const int64_t q = vp9_dc_quant(qindex, 0, cpi->common.bit_depth);
 #if CONFIG_VP9_HIGHBITDEPTH
-  int rdmult = 0;
+  int64_t rdmult = 0;
   switch (cpi->common.bit_depth) {
     case VPX_BITS_8:
       rdmult = 88 * q * q / 24;
@@ -154,8 +166,8 @@
       return -1;
   }
 #else
-  int rdmult = 88 * q * q / 24;
-#endif
+  int64_t rdmult = 88 * q * q / 24;
+#endif  // CONFIG_VP9_HIGHBITDEPTH
   if (cpi->oxcf.pass == 2 && (cpi->common.frame_type != KEY_FRAME)) {
     const GF_GROUP *const gf_group = &cpi->twopass.gf_group;
     const FRAME_UPDATE_TYPE frame_type = gf_group->update_type[gf_group->index];
@@ -164,7 +176,7 @@
     rdmult = (rdmult * rd_frame_type_factor[frame_type]) >> 7;
     rdmult += ((rdmult * rd_boost_factor[boost_index]) >> 7);
   }
-  return rdmult;
+  return (int)rdmult;
 }
 
 static int compute_rd_thresh_factor(int qindex, vpx_bit_depth_t bit_depth) {
@@ -187,7 +199,7 @@
 #else
   (void) bit_depth;
   q = vp9_dc_quant(qindex, 0, VPX_BITS_8) / 4.0;
-#endif
+#endif  // CONFIG_VP9_HIGHBITDEPTH
   // TODO(debargha): Adjust the function below.
   return MAX((int)(pow(q, RD_THRESH_POW) * 5.12), 8);
 }
@@ -213,7 +225,7 @@
 #else
   cpi->mb.sadperbit16 = sad_per_bit16lut_8[qindex];
   cpi->mb.sadperbit4 = sad_per_bit4lut_8[qindex];
-#endif
+#endif  // CONFIG_VP9_HIGHBITDEPTH
 }
 
 static void set_block_thresholds(const VP9_COMMON *cm, RD_OPT *rd) {
@@ -598,3 +610,24 @@
     if (sf->disable_split_mask & (1 << i))
       rd->thresh_mult_sub8x8[i] = INT_MAX;
 }
+
+int vp9_get_intra_cost_penalty(int qindex, int qdelta,
+                               vpx_bit_depth_t bit_depth) {
+  const int q = vp9_dc_quant(qindex, qdelta, bit_depth);
+#if CONFIG_VP9_HIGHBITDEPTH
+  switch (bit_depth) {
+    case VPX_BITS_8:
+      return 20 * q;
+    case VPX_BITS_10:
+      return 5 * q;
+    case VPX_BITS_12:
+      return ROUND_POWER_OF_TWO(5 * q, 2);
+    default:
+      assert(0 && "bit_depth should be VPX_BITS_8, VPX_BITS_10 or VPX_BITS_12");
+      return -1;
+  }
+#else
+  return 20 * q;
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+}
+
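
vp9_get_intra_cost_penalty, added above, scales the classic 20 * q penalty down at higher bit depths because the dc quant step grows roughly 4x for every two extra bits. A quick consistency check under that assumption (illustrative values, not libvpx data):

#include <assert.h>

#define ROUND_POWER_OF_TWO(value, n) (((value) + (1 << ((n)-1))) >> (n))

int main(void) {
  const int q8 = 8;                   /* hypothetical 8-bit dc quant step */
  const int q10 = 4 * q8, q12 = 16 * q8;
  assert(20 * q8 == 5 * q10);                         /* 8-bit vs 10-bit */
  assert(20 * q8 == ROUND_POWER_OF_TWO(5 * q12, 2));  /* 8-bit vs 12-bit */
  return 0;
}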
diff --git a/source/libvpx/vp9/encoder/vp9_rd.h b/source/libvpx/vp9/encoder/vp9_rd.h
index 5dcb2f8..1aa5266 100644
--- a/source/libvpx/vp9/encoder/vp9_rd.h
+++ b/source/libvpx/vp9/encoder/vp9_rd.h
@@ -117,6 +117,17 @@
   int RDDIV;
 } RD_OPT;
 
+typedef struct RD_COST {
+  int rate;
+  int64_t dist;
+  int64_t rdcost;
+} RD_COST;
+
+// Reset the rate-distortion cost values to the maximum (invalid) value.
+void vp9_rd_cost_reset(RD_COST *rd_cost);
+// Initialize the rate-distortion cost values to zero.
+void vp9_rd_cost_init(RD_COST *rd_cost);
+
 struct TileInfo;
 struct VP9_COMP;
 struct macroblock;
@@ -162,6 +173,10 @@
                           int mi_row, int mi_col,
                           const struct scale_factors *scale,
                           const struct scale_factors *scale_uv);
+
+int vp9_get_intra_cost_penalty(int qindex, int qdelta,
+                               vpx_bit_depth_t bit_depth);
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif
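
The RD_COST struct and helpers declared above give callers a single sentinel convention: reset marks a cost invalid before a search, init zeroes it for accumulation, and rate == INT_MAX afterwards signals early termination. A compilable usage sketch, with the struct and helper restated locally for self-containment:

#include <limits.h>
#include <stdint.h>

typedef struct RD_COST {
  int rate;
  int64_t dist;
  int64_t rdcost;
} RD_COST;

/* Mirrors vp9_rd_cost_reset: mark a cost as "no valid mode found yet". */
static void rd_cost_reset(RD_COST *rd) {
  rd->rate = INT_MAX;
  rd->dist = INT64_MAX;
  rd->rdcost = INT64_MAX;
}

int main(void) {
  RD_COST best;
  rd_cost_reset(&best);
  /* ... a mode search would overwrite best on success ... */
  if (best.rate == INT_MAX) {
    /* early termination: no valid rate/distortion was produced */
  }
  return 0;
}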
diff --git a/source/libvpx/vp9/encoder/vp9_rdopt.c b/source/libvpx/vp9/encoder/vp9_rdopt.c
index f05351a..eca8e58 100644
--- a/source/libvpx/vp9/encoder/vp9_rdopt.c
+++ b/source/libvpx/vp9/encoder/vp9_rdopt.c
@@ -131,7 +131,7 @@
 
 static int raster_block_offset(BLOCK_SIZE plane_bsize,
                                int raster_block, int stride) {
-  const int bw = b_width_log2(plane_bsize);
+  const int bw = b_width_log2_lookup[plane_bsize];
   const int y = 4 * (raster_block >> bw);
   const int x = 4 * (raster_block & ((1 << bw) - 1));
   return y * stride + x;
@@ -169,7 +169,8 @@
 
 static void model_rd_for_sb(VP9_COMP *cpi, BLOCK_SIZE bsize,
                             MACROBLOCK *x, MACROBLOCKD *xd,
-                            int *out_rate_sum, int64_t *out_dist_sum) {
+                            int *out_rate_sum, int64_t *out_dist_sum,
+                            int *skip_txfm_sb, int64_t *skip_sse_sb) {
   // Note our transform coeffs are 8 times an orthogonal transform.
   // Hence quantizer step is also 8 times. To get effective quantizer
   // we need to divide by 8 before sending to modeling function.
@@ -180,7 +181,9 @@
   unsigned int sse;
   unsigned int var = 0;
   unsigned int sum_sse = 0;
-  const int shift = 8;
+  int64_t total_sse = 0;
+  int skip_flag = 1;
+  const int shift = 6;
   int rate;
   int64_t dist;
 
@@ -192,6 +195,12 @@
     const BLOCK_SIZE bs = get_plane_block_size(bsize, pd);
     const TX_SIZE max_tx_size = max_txsize_lookup[bs];
     const BLOCK_SIZE unit_size = txsize_to_bsize[max_tx_size];
+    const int64_t dc_thr = p->quant_thred[0] >> shift;
+    const int64_t ac_thr = p->quant_thred[1] >> shift;
+    // The low thresholds are used to measure whether the prediction errors
+    // are low enough that the mode search can be skipped.
+    const int64_t low_dc_thr = MIN(50, dc_thr >> 2);
+    const int64_t low_ac_thr = MIN(80, ac_thr >> 2);
     int bw = 1 << (b_width_log2_lookup[bs] - b_width_log2_lookup[unit_size]);
     int bh = 1 << (b_height_log2_lookup[bs] - b_width_log2_lookup[unit_size]);
     int idx, idy;
@@ -205,32 +214,49 @@
         uint8_t *src = p->src.buf + (idy * p->src.stride << lh) + (idx << lw);
         uint8_t *dst = pd->dst.buf + (idy * pd->dst.stride << lh) + (idx << lh);
         int block_idx = (idy << 1) + idx;
+        int low_err_skip = 0;
 
         var = cpi->fn_ptr[unit_size].vf(src, p->src.stride,
                                         dst, pd->dst.stride, &sse);
         x->bsse[(i << 2) + block_idx] = sse;
         sum_sse += sse;
 
+        x->skip_txfm[(i << 2) + block_idx] = 0;
         if (!x->select_tx_size) {
-          if (x->bsse[(i << 2) + block_idx] < p->quant_thred[0] >> shift)
-            x->skip_txfm[(i << 2) + block_idx] = 1;
-          else if (var < p->quant_thred[1] >> shift)
+          // Check if all ac coefficients can be quantized to zero.
+          if (var < ac_thr || var == 0) {
             x->skip_txfm[(i << 2) + block_idx] = 2;
-          else
-            x->skip_txfm[(i << 2) + block_idx] = 0;
+
+            // Check if dc coefficient can be quantized to zero.
+            if (sse - var < dc_thr || sse == var) {
+              x->skip_txfm[(i << 2) + block_idx] = 1;
+
+              if (!sse || (var < low_ac_thr && sse - var < low_dc_thr))
+                low_err_skip = 1;
+            }
+          }
         }
 
+        if (skip_flag && !low_err_skip)
+          skip_flag = 0;
+
         if (i == 0)
           x->pred_sse[ref] += sse;
       }
     }
 
+    total_sse += sum_sse;
+
     // Fast approximation of the modelling function.
     if (cpi->oxcf.speed > 4) {
       int64_t rate;
-      int64_t dist;
-      int64_t square_error = sse;
+      const int64_t square_error = sum_sse;
       int quantizer = (pd->dequant[1] >> 3);
+#if CONFIG_VP9_HIGHBITDEPTH
+      if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+        quantizer >>= (xd->bd - 8);
+      }
+#endif  // CONFIG_VP9_HIGHBITDEPTH
 
       if (quantizer < 120)
         rate = (square_error * (280 - quantizer)) >> 8;
@@ -240,13 +266,26 @@
       rate_sum += rate;
       dist_sum += dist;
     } else {
+#if CONFIG_VP9_HIGHBITDEPTH
+      if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+        vp9_model_rd_from_var_lapndz(sum_sse, 1 << num_pels_log2_lookup[bs],
+                                     pd->dequant[1] >> (xd->bd - 5),
+                                     &rate, &dist);
+      } else {
+        vp9_model_rd_from_var_lapndz(sum_sse, 1 << num_pels_log2_lookup[bs],
+                                     pd->dequant[1] >> 3, &rate, &dist);
+      }
+#else
       vp9_model_rd_from_var_lapndz(sum_sse, 1 << num_pels_log2_lookup[bs],
                                    pd->dequant[1] >> 3, &rate, &dist);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
       rate_sum += rate;
       dist_sum += dist;
     }
   }
 
+  *skip_txfm_sb = skip_flag;
+  *skip_sse_sb = total_sse << 4;
   *out_rate_sum = (int)rate_sum;
   *out_dist_sum = dist_sum << 4;
 }
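
The model_rd_for_sb rework above derives dc_thr/ac_thr from the new zbin-based quant_thred (shifted by 6 instead of 8) and only lets a block contribute to whole-superblock skipping when it also clears the much tighter low thresholds. A condensed sketch of the per-block classification; the constants 50 and 80 mirror the hunk:

#include <stdint.h>

#define MIN(a, b) ((a) < (b) ? (a) : (b))

/* Returns the skip_txfm level: 0 = full transform, 2 = DC-only coding,
 * 1 = transform skipped entirely. */
static int classify_skip(unsigned int sse, unsigned int var,
                         int64_t dc_thr, int64_t ac_thr, int *low_err_skip) {
  const int64_t low_dc_thr = MIN(50, dc_thr >> 2);
  const int64_t low_ac_thr = MIN(80, ac_thr >> 2);
  int skip_txfm = 0;
  *low_err_skip = 0;
  if (var < ac_thr || var == 0) {            /* all AC quantize to zero */
    skip_txfm = 2;
    if (sse - var < dc_thr || sse == var) {  /* DC quantizes to zero too */
      skip_txfm = 1;
      if (!sse || (var < low_ac_thr && sse - var < low_dc_thr))
        *low_err_skip = 1;                   /* counts toward SB-level skip */
    }
  }
  return skip_txfm;
}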
@@ -266,6 +305,31 @@
   return error;
 }
 
+
+#if CONFIG_VP9_HIGHBITDEPTH
+int64_t vp9_highbd_block_error_c(const tran_low_t *coeff,
+                                 const tran_low_t *dqcoeff,
+                                 intptr_t block_size,
+                                 int64_t *ssz, int bd) {
+  int i;
+  int64_t error = 0, sqcoeff = 0;
+  int shift = 2 * (bd - 8);
+  int rounding = shift > 0 ? 1 << (shift - 1) : 0;
+
+  for (i = 0; i < block_size; i++) {
+    const int64_t diff = coeff[i] - dqcoeff[i];
+    error +=  diff * diff;
+    sqcoeff += (int64_t)coeff[i] * (int64_t)coeff[i];
+  }
+  assert(error >= 0 && sqcoeff >= 0);
+  error = (error + rounding) >> shift;
+  sqcoeff = (sqcoeff + rounding) >> shift;
+
+  *ssz = sqcoeff;
+  return error;
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
 /* The trailing '0' is a terminator which is used inside cost_coeffs() to
  * decide whether to include cost of a trailing EOB node or not (i.e. we
  * can skip this if the last coefficient in this transform block, e.g. the
@@ -351,8 +415,14 @@
 
   return cost;
 }
+
+#if CONFIG_VP9_HIGHBITDEPTH
+static void dist_block(int plane, int block, TX_SIZE tx_size,
+                       struct rdcost_block_args* args, int bd) {
+#else
 static void dist_block(int plane, int block, TX_SIZE tx_size,
                        struct rdcost_block_args* args) {
+#endif  // CONFIG_VP9_HIGHBITDEPTH
   const int ss_txfrm_size = tx_size << 1;
   MACROBLOCK* const x = args->x;
   MACROBLOCKD* const xd = &x->e_mbd;
@@ -362,14 +432,24 @@
   int shift = tx_size == TX_32X32 ? 0 : 2;
   tran_low_t *const coeff = BLOCK_OFFSET(p->coeff, block);
   tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
+#if CONFIG_VP9_HIGHBITDEPTH
+  args->dist = vp9_highbd_block_error(coeff, dqcoeff, 16 << ss_txfrm_size,
+                                      &this_sse, bd) >> shift;
+#else
   args->dist = vp9_block_error(coeff, dqcoeff, 16 << ss_txfrm_size,
                                &this_sse) >> shift;
+#endif  // CONFIG_VP9_HIGHBITDEPTH
   args->sse  = this_sse >> shift;
 
   if (x->skip_encode && !is_inter_block(&xd->mi[0].src_mi->mbmi)) {
     // TODO(jingning): tune the model to better capture the distortion.
     int64_t p = (pd->dequant[1] * pd->dequant[1] *
                     (1 << ss_txfrm_size)) >> (shift + 2);
+#if CONFIG_VP9_HIGHBITDEPTH
+    if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+      p >>= ((xd->bd - 8) * 2);
+    }
+#endif  // CONFIG_VP9_HIGHBITDEPTH
     args->dist += (p >> 4);
     args->sse  += p;
   }
@@ -399,12 +479,28 @@
 
   if (!is_inter_block(mbmi)) {
     vp9_encode_block_intra(x, plane, block, plane_bsize, tx_size, &mbmi->skip);
+#if CONFIG_VP9_HIGHBITDEPTH
+    if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+      dist_block(plane, block, tx_size, args, xd->bd);
+    } else {
+      dist_block(plane, block, tx_size, args, 8);
+    }
+#else
     dist_block(plane, block, tx_size, args);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
   } else if (max_txsize_lookup[plane_bsize] == tx_size) {
     if (x->skip_txfm[(plane << 2) + (block >> (tx_size << 1))] == 0) {
       // full forward transform and quantization
       vp9_xform_quant(x, plane, block, plane_bsize, tx_size);
+#if CONFIG_VP9_HIGHBITDEPTH
+      if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+        dist_block(plane, block, tx_size, args, xd->bd);
+      } else {
+        dist_block(plane, block, tx_size, args, 8);
+      }
+#else
       dist_block(plane, block, tx_size, args);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
     } else if (x->skip_txfm[(plane << 2) + (block >> (tx_size << 1))] == 2) {
       // compute DC coefficient
       tran_low_t *const coeff   = BLOCK_OFFSET(x->plane[plane].coeff, block);
@@ -412,9 +508,17 @@
       vp9_xform_quant_dc(x, plane, block, plane_bsize, tx_size);
       args->sse  = x->bsse[(plane << 2) + (block >> (tx_size << 1))] << 4;
       args->dist = args->sse;
-      if (!x->plane[plane].eobs[block])
-        args->dist = args->sse - ((coeff[0] * coeff[0] -
-            (coeff[0] - dqcoeff[0]) * (coeff[0] - dqcoeff[0])) >> 2);
+      if (x->plane[plane].eobs[block]) {
+        int64_t dc_correct = coeff[0] * coeff[0] -
+            (coeff[0] - dqcoeff[0]) * (coeff[0] - dqcoeff[0]);
+#if CONFIG_VP9_HIGHBITDEPTH
+        dc_correct >>= ((xd->bd - 8) * 2);
+#endif
+        if (tx_size != TX_32X32)
+          dc_correct >>= 2;
+
+        args->dist = MAX(0, args->sse - dc_correct);
+      }
     } else {
       // skip forward transform
       x->plane[plane].eobs[block] = 0;
@@ -424,7 +528,15 @@
   } else {
     // full forward transform and quantization
     vp9_xform_quant(x, plane, block, plane_bsize, tx_size);
+#if CONFIG_VP9_HIGHBITDEPTH
+    if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+      dist_block(plane, block, tx_size, args, xd->bd);
+    } else {
+      dist_block(plane, block, tx_size, args, 8);
+    }
+#else
     dist_block(plane, block, tx_size, args);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
   }
 
   rate_block(plane, block, plane_bsize, tx_size, args);
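
The DC-only distortion hunk above fixes an inverted test: the old code applied the correction when eobs[block] was zero, i.e., when no DC coefficient was actually coded. Now the SSE-based distortion is credited with the energy the surviving DC reconstructs, rescaled for transform size (and bit depth in the high-bit-depth build) and clamped at zero. A condensed sketch with the bit-depth rescale omitted:

#include <stdint.h>

#define MAX(a, b) ((a) > (b) ? (a) : (b))

static int64_t dc_only_dist(int64_t sse, int32_t c0, int32_t dq0,
                            int eob, int tx_is_32x32) {
  int64_t dist = sse;
  if (eob) {  /* quantized DC survived: credit the energy it reconstructs */
    int64_t dc_correct = (int64_t)c0 * c0 - (int64_t)(c0 - dq0) * (c0 - dq0);
    if (!tx_is_32x32)
      dc_correct >>= 2;   /* match the non-32x32 transform scaling */
    dist = MAX(0, sse - dc_correct);
  }
  return dist;
}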
@@ -659,6 +771,9 @@
   const int num_4x4_blocks_high = num_4x4_blocks_high_lookup[bsize];
   int idx, idy;
   uint8_t best_dst[8 * 8];
+#if CONFIG_VP9_HIGHBITDEPTH
+  uint16_t best_dst16[8 * 8];
+#endif
 
   assert(ib < 4);
 
@@ -666,6 +781,108 @@
   vpx_memcpy(tl, l, sizeof(tl));
   xd->mi[0].src_mi->mbmi.tx_size = TX_4X4;
 
+#if CONFIG_VP9_HIGHBITDEPTH
+  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+    for (mode = DC_PRED; mode <= TM_PRED; ++mode) {
+      int64_t this_rd;
+      int ratey = 0;
+      int64_t distortion = 0;
+      int rate = bmode_costs[mode];
+
+      if (!(cpi->sf.intra_y_mode_mask[TX_4X4] & (1 << mode)))
+        continue;
+
+      // Only do the oblique modes if the best so far is
+      // one of the neighboring directional modes
+      if (cpi->sf.mode_search_skip_flags & FLAG_SKIP_INTRA_DIRMISMATCH) {
+        if (conditional_skipintra(mode, *best_mode))
+            continue;
+      }
+
+      vpx_memcpy(tempa, ta, sizeof(ta));
+      vpx_memcpy(templ, tl, sizeof(tl));
+
+      for (idy = 0; idy < num_4x4_blocks_high; ++idy) {
+        for (idx = 0; idx < num_4x4_blocks_wide; ++idx) {
+          const int block = ib + idy * 2 + idx;
+          const uint8_t *const src = &src_init[idx * 4 + idy * 4 * src_stride];
+          uint8_t *const dst = &dst_init[idx * 4 + idy * 4 * dst_stride];
+          int16_t *const src_diff = raster_block_offset_int16(BLOCK_8X8, block,
+                                                              p->src_diff);
+          tran_low_t *const coeff = BLOCK_OFFSET(x->plane[0].coeff, block);
+          xd->mi[0].src_mi->bmi[block].as_mode = mode;
+          vp9_predict_intra_block(xd, block, 1,
+                                  TX_4X4, mode,
+                                  x->skip_encode ? src : dst,
+                                  x->skip_encode ? src_stride : dst_stride,
+                                  dst, dst_stride, idx, idy, 0);
+          vp9_highbd_subtract_block(4, 4, src_diff, 8, src, src_stride,
+                                    dst, dst_stride, xd->bd);
+          if (xd->lossless) {
+            const scan_order *so = &vp9_default_scan_orders[TX_4X4];
+            vp9_highbd_fwht4x4(src_diff, coeff, 8);
+            vp9_regular_quantize_b_4x4(x, 0, block, so->scan, so->iscan);
+            ratey += cost_coeffs(x, 0, block, tempa + idx, templ + idy, TX_4X4,
+                                 so->scan, so->neighbors,
+                                 cpi->sf.use_fast_coef_costing);
+            if (RDCOST(x->rdmult, x->rddiv, ratey, distortion) >= best_rd)
+              goto next_highbd;
+            vp9_highbd_iwht4x4_add(BLOCK_OFFSET(pd->dqcoeff, block),
+                                   dst, dst_stride,
+                                   p->eobs[block], xd->bd);
+          } else {
+            int64_t unused;
+            const TX_TYPE tx_type = get_tx_type_4x4(PLANE_TYPE_Y, xd, block);
+            const scan_order *so = &vp9_scan_orders[TX_4X4][tx_type];
+            vp9_highbd_fht4x4(src_diff, coeff, 8, tx_type);
+            vp9_regular_quantize_b_4x4(x, 0, block, so->scan, so->iscan);
+            ratey += cost_coeffs(x, 0, block, tempa + idx, templ + idy, TX_4X4,
+                                 so->scan, so->neighbors,
+                                 cpi->sf.use_fast_coef_costing);
+            distortion += vp9_highbd_block_error(
+                coeff, BLOCK_OFFSET(pd->dqcoeff, block),
+                16, &unused, xd->bd) >> 2;
+            if (RDCOST(x->rdmult, x->rddiv, ratey, distortion) >= best_rd)
+              goto next_highbd;
+            vp9_highbd_iht4x4_add(tx_type, BLOCK_OFFSET(pd->dqcoeff, block),
+                                  dst, dst_stride, p->eobs[block], xd->bd);
+          }
+        }
+      }
+
+      rate += ratey;
+      this_rd = RDCOST(x->rdmult, x->rddiv, rate, distortion);
+
+      if (this_rd < best_rd) {
+        *bestrate = rate;
+        *bestratey = ratey;
+        *bestdistortion = distortion;
+        best_rd = this_rd;
+        *best_mode = mode;
+        vpx_memcpy(a, tempa, sizeof(tempa));
+        vpx_memcpy(l, templ, sizeof(templ));
+        for (idy = 0; idy < num_4x4_blocks_high * 4; ++idy) {
+          vpx_memcpy(best_dst16 + idy * 8,
+                     CONVERT_TO_SHORTPTR(dst_init + idy * dst_stride),
+                     num_4x4_blocks_wide * 4 * sizeof(uint16_t));
+        }
+      }
+    next_highbd:
+      {}
+    }
+    if (best_rd >= rd_thresh || x->skip_encode)
+      return best_rd;
+
+    for (idy = 0; idy < num_4x4_blocks_high * 4; ++idy) {
+      vpx_memcpy(CONVERT_TO_SHORTPTR(dst_init + idy * dst_stride),
+                 best_dst16 + idy * 8,
+                 num_4x4_blocks_wide * 4 * sizeof(uint16_t));
+    }
+
+    return best_rd;
+  }
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
   for (mode = DC_PRED; mode <= TM_PRED; ++mode) {
     int64_t this_rd;
     int ratey = 0;
@@ -827,6 +1044,7 @@
   return RDCOST(mb->rdmult, mb->rddiv, cost, total_distortion);
 }
 
+// This function is used only for intra_only frames.
 static int64_t rd_pick_intra_sby_mode(VP9_COMP *cpi, MACROBLOCK *x,
                                       int *rate, int *rate_tokenonly,
                                       int64_t *distortion, int *skippable,
@@ -841,24 +1059,21 @@
   int64_t this_distortion, this_rd;
   TX_SIZE best_tx = TX_4X4;
   int i;
-  int *bmode_costs = cpi->mbmode_cost;
+  int *bmode_costs;
+  const MODE_INFO *above_mi = xd->mi[-xd->mi_stride].src_mi;
+  const MODE_INFO *left_mi = xd->left_available ? xd->mi[-1].src_mi : NULL;
+  const PREDICTION_MODE A = vp9_above_block_mode(mic, above_mi, 0);
+  const PREDICTION_MODE L = vp9_left_block_mode(mic, left_mi, 0);
+  bmode_costs = cpi->y_mode_costs[A][L];
 
   if (cpi->sf.tx_size_search_method == USE_FULL_RD)
     for (i = 0; i < TX_MODES; i++)
       tx_cache[i] = INT64_MAX;
 
+  vpx_memset(x->skip_txfm, 0, sizeof(x->skip_txfm));
   /* Y Search for intra prediction mode */
   for (mode = DC_PRED; mode <= TM_PRED; mode++) {
     int64_t local_tx_cache[TX_MODES];
-    MODE_INFO *above_mi = xd->mi[-xd->mi_stride].src_mi;
-    MODE_INFO *left_mi = xd->left_available ? xd->mi[-1].src_mi : NULL;
-
-    if (cpi->common.frame_type == KEY_FRAME) {
-      const PREDICTION_MODE A = vp9_above_block_mode(mic, above_mi, 0);
-      const PREDICTION_MODE L = vp9_left_block_mode(mic, left_mi, 0);
-
-      bmode_costs = cpi->y_mode_costs[A][L];
-    }
     mic->mbmi.mode = mode;
 
     super_block_yrd(cpi, x, &this_rate_tokenonly, &this_distortion,
@@ -897,21 +1112,24 @@
   return best_rd;
 }
 
-static void super_block_uvrd(const VP9_COMP *cpi, MACROBLOCK *x,
-                             int *rate, int64_t *distortion, int *skippable,
-                             int64_t *sse, BLOCK_SIZE bsize,
-                             int64_t ref_best_rd) {
+// Return value 0: early termination triggered, no valid rd cost available;
+//              1: rd cost values are valid.
+static int super_block_uvrd(const VP9_COMP *cpi, MACROBLOCK *x,
+                            int *rate, int64_t *distortion, int *skippable,
+                            int64_t *sse, BLOCK_SIZE bsize,
+                            int64_t ref_best_rd) {
   MACROBLOCKD *const xd = &x->e_mbd;
   MB_MODE_INFO *const mbmi = &xd->mi[0].src_mi->mbmi;
   const TX_SIZE uv_tx_size = get_uv_tx_size(mbmi, &xd->plane[1]);
   int plane;
   int pnrate = 0, pnskip = 1;
   int64_t pndist = 0, pnsse = 0;
+  int is_cost_valid = 1;
 
   if (ref_best_rd < 0)
-    goto term;
+    is_cost_valid = 0;
 
-  if (is_inter_block(mbmi)) {
+  if (is_inter_block(mbmi) && is_cost_valid) {
     int plane;
     for (plane = 1; plane < MAX_MB_PLANE; ++plane)
       vp9_subtract_plane(x, bsize, plane);
@@ -926,21 +1144,25 @@
     txfm_rd_in_plane(x, &pnrate, &pndist, &pnskip, &pnsse,
                      ref_best_rd, plane, bsize, uv_tx_size,
                      cpi->sf.use_fast_coef_costing);
-    if (pnrate == INT_MAX)
-      goto term;
+    if (pnrate == INT_MAX) {
+      is_cost_valid = 0;
+      break;
+    }
     *rate += pnrate;
     *distortion += pndist;
     *sse += pnsse;
     *skippable &= pnskip;
   }
-  return;
 
-  term:
-  *rate = INT_MAX;
-  *distortion = INT64_MAX;
-  *sse = INT64_MAX;
-  *skippable = 0;
-  return;
+  if (!is_cost_valid) {
+    // Reset cost values.
+    *rate = INT_MAX;
+    *distortion = INT64_MAX;
+    *sse = INT64_MAX;
+    *skippable = 0;
+  }
+
+  return is_cost_valid;
 }
 
 static int64_t rd_pick_intra_sbuv_mode(VP9_COMP *cpi, MACROBLOCK *x,
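
super_block_uvrd now returns an explicit validity flag instead of forcing callers to probe for the INT_MAX sentinel after a goto-based reset, and the call sites below adopt it. A toy stand-in showing the contract; search_uv and its placeholder costs are hypothetical:

#include <limits.h>
#include <stdint.h>

/* Returns 0 on early termination (outputs hold invalid sentinels),
 * 1 when the rate/distortion outputs are valid. */
static int search_uv(int *rate, int64_t *dist, int64_t budget) {
  if (budget < 0) {
    *rate = INT_MAX;
    *dist = INT64_MAX;
    return 0;
  }
  *rate = 0;   /* real code accumulates per-plane costs here */
  *dist = 0;
  return 1;
}

int main(void) {
  int rate;
  int64_t dist;
  if (!search_uv(&rate, &dist, -1))
    return 0;  /* caller skips the mode instead of probing rate == INT_MAX */
  return 0;
}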
@@ -955,15 +1177,15 @@
   int this_rate_tokenonly, this_rate, s;
   int64_t this_distortion, this_sse;
 
+  vpx_memset(x->skip_txfm, 0, sizeof(x->skip_txfm));
   for (mode = DC_PRED; mode <= TM_PRED; ++mode) {
     if (!(cpi->sf.intra_uv_mode_mask[max_tx_size] & (1 << mode)))
       continue;
 
     xd->mi[0].src_mi->mbmi.uv_mode = mode;
 
-    super_block_uvrd(cpi, x, &this_rate_tokenonly,
-                     &this_distortion, &s, &this_sse, bsize, best_rd);
-    if (this_rate_tokenonly == INT_MAX)
+    if (!super_block_uvrd(cpi, x, &this_rate_tokenonly,
+                          &this_distortion, &s, &this_sse, bsize, best_rd))
       continue;
     this_rate = this_rate_tokenonly +
                 cpi->intra_uv_mode_cost[cpi->common.frame_type][mode];
@@ -993,6 +1215,7 @@
   int64_t unused;
 
   x->e_mbd.mi[0].src_mi->mbmi.uv_mode = DC_PRED;
+  vpx_memset(x->skip_txfm, 0, sizeof(x->skip_txfm));
   super_block_uvrd(cpi, x, rate_tokenonly, distortion,
                    skippable, &unused, bsize, INT64_MAX);
   *rate = *rate_tokenonly + cpi->intra_uv_mode_cost[cm->frame_type][DC_PRED];
@@ -1121,6 +1344,16 @@
   for (ref = 0; ref < 1 + is_compound; ++ref) {
     const uint8_t *pre = &pd->pre[ref].buf[raster_block_offset(BLOCK_8X8, i,
                                                pd->pre[ref].stride)];
+#if CONFIG_VP9_HIGHBITDEPTH
+  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+    vp9_highbd_build_inter_predictor(pre, pd->pre[ref].stride,
+                                     dst, pd->dst.stride,
+                                     &mi->bmi[i].as_mv[ref].as_mv,
+                                     &xd->block_refs[ref]->sf, width, height,
+                                     ref, kernel, MV_PRECISION_Q3,
+                                     mi_col * MI_SIZE + 4 * (i % 2),
+                                     mi_row * MI_SIZE + 4 * (i / 2), xd->bd);
+  } else {
     vp9_build_inter_predictor(pre, pd->pre[ref].stride,
                               dst, pd->dst.stride,
                               &mi->bmi[i].as_mv[ref].as_mv,
@@ -1129,11 +1362,32 @@
                               mi_col * MI_SIZE + 4 * (i % 2),
                               mi_row * MI_SIZE + 4 * (i / 2));
   }
+#else
+    vp9_build_inter_predictor(pre, pd->pre[ref].stride,
+                              dst, pd->dst.stride,
+                              &mi->bmi[i].as_mv[ref].as_mv,
+                              &xd->block_refs[ref]->sf, width, height, ref,
+                              kernel, MV_PRECISION_Q3,
+                              mi_col * MI_SIZE + 4 * (i % 2),
+                              mi_row * MI_SIZE + 4 * (i / 2));
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+  }
 
+#if CONFIG_VP9_HIGHBITDEPTH
+  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+    vp9_highbd_subtract_block(
+        height, width, raster_block_offset_int16(BLOCK_8X8, i, p->src_diff), 8,
+        src, p->src.stride, dst, pd->dst.stride, xd->bd);
+  } else {
+    vp9_subtract_block(
+        height, width, raster_block_offset_int16(BLOCK_8X8, i, p->src_diff), 8,
+        src, p->src.stride, dst, pd->dst.stride);
+  }
+#else
   vp9_subtract_block(height, width,
                      raster_block_offset_int16(BLOCK_8X8, i, p->src_diff), 8,
-                     src, p->src.stride,
-                     dst, pd->dst.stride);
+                     src, p->src.stride, dst, pd->dst.stride);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
 
   k = i;
   for (idy = 0; idy < height / 4; ++idy) {
@@ -1146,8 +1400,19 @@
       x->fwd_txm4x4(raster_block_offset_int16(BLOCK_8X8, k, p->src_diff),
                     coeff, 8);
       vp9_regular_quantize_b_4x4(x, 0, k, so->scan, so->iscan);
+#if CONFIG_VP9_HIGHBITDEPTH
+      if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+        thisdistortion += vp9_highbd_block_error(coeff,
+                                                 BLOCK_OFFSET(pd->dqcoeff, k),
+                                                 16, &ssz, xd->bd);
+      } else {
+        thisdistortion += vp9_block_error(coeff, BLOCK_OFFSET(pd->dqcoeff, k),
+                                          16, &ssz);
+      }
+#else
       thisdistortion += vp9_block_error(coeff, BLOCK_OFFSET(pd->dqcoeff, k),
                                         16, &ssz);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
       thissse += ssz;
       thisrate += cost_coeffs(x, 0, k, ta + (k & 1), tl + (k >> 1), TX_4X4,
                               so->scan, so->neighbors,
@@ -1369,7 +1634,7 @@
           int sadpb = x->sadperbit4;
           MV mvp_full;
           int max_mv;
-          int sad_list[5];
+          int cost_list[5];
 
           /* Is the best so far sufficiently good that we can't justify doing
            * a new motion search. */
@@ -1415,7 +1680,7 @@
 
           bestsme = vp9_full_pixel_search(
               cpi, x, bsize, &mvp_full, step_param, sadpb,
-              cpi->sf.mv.subpel_search_method != SUBPEL_TREE ? sad_list : NULL,
+              cpi->sf.mv.subpel_search_method != SUBPEL_TREE ? cost_list : NULL,
               &bsi->ref_mv[0]->as_mv, new_mv,
               INT_MAX, 1);
 
@@ -1429,7 +1694,7 @@
                                            sadpb, 16, &cpi->fn_ptr[bsize],
                                            &bsi->ref_mv[0]->as_mv,
                                            &best_mv->as_mv);
-            sad_list[1] = sad_list[2] = sad_list[3] = sad_list[4] = INT_MAX;
+            cost_list[1] = cost_list[2] = cost_list[3] = cost_list[4] = INT_MAX;
             if (thissme < bestsme) {
               bestsme = thissme;
               *new_mv = best_mv->as_mv;
@@ -1450,7 +1715,7 @@
                 x->errorperbit, &cpi->fn_ptr[bsize],
                 cpi->sf.mv.subpel_force_stop,
                 cpi->sf.mv.subpel_iters_per_step,
-                cond_sad_list(cpi, sad_list),
+                cond_cost_list(cpi, cost_list),
                 x->nmvjointcost, x->mvcost,
                 &distortion,
                 &x->pred_sse[mbmi->ref_frame[0]],
@@ -1784,7 +2049,7 @@
   int tmp_col_max = x->mv_col_max;
   int tmp_row_min = x->mv_row_min;
   int tmp_row_max = x->mv_row_max;
-  int sad_list[5];
+  int cost_list[5];
 
   const YV12_BUFFER_CONFIG *scaled_ref_frame = vp9_get_scaled_ref_frame(cpi,
                                                                         ref);
@@ -1820,14 +2085,14 @@
   }
 
   if (cpi->sf.adaptive_motion_search && bsize < BLOCK_64X64) {
-    int boffset = 2 * (b_width_log2(BLOCK_64X64) - MIN(b_height_log2(bsize),
-                                                       b_width_log2(bsize)));
+    int boffset = 2 * (b_width_log2_lookup[BLOCK_64X64] -
+          MIN(b_height_log2_lookup[bsize], b_width_log2_lookup[bsize]));
     step_param = MAX(step_param, boffset);
   }
 
   if (cpi->sf.adaptive_motion_search) {
-    int bwl = b_width_log2(bsize);
-    int bhl = b_height_log2(bsize);
+    int bwl = b_width_log2_lookup[bsize];
+    int bhl = b_height_log2_lookup[bsize];
     int i;
     int tlevel = x->pred_mv_sad[ref] >> (bwl + bhl + 4);
 
@@ -1856,7 +2121,7 @@
   mvp_full.row >>= 3;
 
   bestsme = vp9_full_pixel_search(cpi, x, bsize, &mvp_full, step_param, sadpb,
-                                  cond_sad_list(cpi, sad_list),
+                                  cond_cost_list(cpi, cost_list),
                                   &ref_mv, &tmp_mv->as_mv, INT_MAX, 1);
 
   x->mv_col_min = tmp_col_min;
@@ -1872,7 +2137,7 @@
                                  &cpi->fn_ptr[bsize],
                                  cpi->sf.mv.subpel_force_stop,
                                  cpi->sf.mv.subpel_iters_per_step,
-                                 cond_sad_list(cpi, sad_list),
+                                 cond_cost_list(cpi, cost_list),
                                  x->nmvjointcost, x->mvcost,
                                  &dis, &x->pred_sse[ref], NULL, 0, 0);
   }
@@ -1904,7 +2169,12 @@
   int_mv ref_mv[2];
   int ite, ref;
   // Prediction buffer from second frame.
+#if CONFIG_VP9_HIGHBITDEPTH
+  uint8_t *second_pred;
+  uint8_t *second_pred_alloc;
+#else
   uint8_t *second_pred = vpx_memalign(16, pw * ph * sizeof(uint8_t));
+#endif  // CONFIG_VP9_HIGHBITDEPTH
   const InterpKernel *kernel = vp9_get_interp_kernel(mbmi->interp_filter);
 
   // Do joint motion search in compound mode to get more accurate mv.
@@ -1915,6 +2185,15 @@
     vp9_get_scaled_ref_frame(cpi, mbmi->ref_frame[0]),
     vp9_get_scaled_ref_frame(cpi, mbmi->ref_frame[1])
   };
+#if CONFIG_VP9_HIGHBITDEPTH
+  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+    second_pred_alloc = vpx_memalign(16, pw * ph * sizeof(uint16_t));
+    second_pred = CONVERT_TO_BYTEPTR(second_pred_alloc);
+  } else {
+    second_pred_alloc = vpx_memalign(16, pw * ph * sizeof(uint8_t));
+    second_pred = second_pred_alloc;
+  }
+#endif  // CONFIG_VP9_HIGHBITDEPTH
 
   for (ref = 0; ref < 2; ++ref) {
     ref_mv[ref] = mbmi->ref_mvs[refs[ref]][0];
@@ -1953,6 +2232,28 @@
     ref_yv12[1] = xd->plane[0].pre[1];
 
     // Get pred block from second frame.
+#if CONFIG_VP9_HIGHBITDEPTH
+    if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+      vp9_highbd_build_inter_predictor(ref_yv12[!id].buf,
+                                       ref_yv12[!id].stride,
+                                       second_pred, pw,
+                                       &frame_mv[refs[!id]].as_mv,
+                                       &xd->block_refs[!id]->sf,
+                                       pw, ph, 0,
+                                       kernel, MV_PRECISION_Q3,
+                                       mi_col * MI_SIZE, mi_row * MI_SIZE,
+                                       xd->bd);
+    } else {
+      vp9_build_inter_predictor(ref_yv12[!id].buf,
+                                ref_yv12[!id].stride,
+                                second_pred, pw,
+                                &frame_mv[refs[!id]].as_mv,
+                                &xd->block_refs[!id]->sf,
+                                pw, ph, 0,
+                                kernel, MV_PRECISION_Q3,
+                                mi_col * MI_SIZE, mi_row * MI_SIZE);
+    }
+#else
     vp9_build_inter_predictor(ref_yv12[!id].buf,
                               ref_yv12[!id].stride,
                               second_pred, pw,
@@ -1961,6 +2262,7 @@
                               pw, ph, 0,
                               kernel, MV_PRECISION_Q3,
                               mi_col * MI_SIZE, mi_row * MI_SIZE);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
 
     // Compound motion search on first ref frame.
     if (id)
@@ -2029,7 +2331,11 @@
                                 x->nmvjointcost, x->mvcost, MV_COST_WEIGHT);
   }
 
+#if CONFIG_VP9_HIGHBITDEPTH
+  vpx_free(second_pred_alloc);
+#else
   vpx_free(second_pred);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
 }
 
 static INLINE void restore_dst_buf(MACROBLOCKD *xd,
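
The joint motion search above splits the second-prediction buffer into a raw allocation pointer (second_pred_alloc, the one that gets freed) and the pointer handed to the predictor, which in high-bit-depth mode is produced by CONVERT_TO_BYTEPTR from a uint16_t buffer. A sketch of that ownership pattern; convert_to_byteptr below is an illustrative stand-in, not the libvpx macro:

#include <stdint.h>
#include <stdlib.h>

static uint8_t *convert_to_byteptr(uint16_t *p) {  /* illustrative stand-in */
  return (uint8_t *)p;
}

static void joint_search(int pw, int ph, int use_highbd) {
  void *alloc;           /* owns the memory: this is what gets freed */
  uint8_t *second_pred;  /* what the prediction code consumes */
  if (use_highbd) {
    uint16_t *buf16 = malloc((size_t)pw * ph * sizeof(*buf16));
    alloc = buf16;
    second_pred = convert_to_byteptr(buf16);
  } else {
    alloc = malloc((size_t)pw * ph * sizeof(uint8_t));
    second_pred = alloc;
  }
  (void)second_pred;     /* ... motion search would use second_pred ... */
  free(alloc);           /* free the raw pointer, not second_pred */
}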
@@ -2042,93 +2348,12 @@
   }
 }
 
-static void rd_encode_breakout_test(VP9_COMP *cpi, MACROBLOCK *x,
-                                    BLOCK_SIZE bsize, int *rate2,
-                                    int64_t *distortion, int64_t *distortion_uv,
-                                    int *disable_skip) {
-  VP9_COMMON *cm = &cpi->common;
-  MACROBLOCKD *xd = &x->e_mbd;
-  const BLOCK_SIZE y_size = get_plane_block_size(bsize, &xd->plane[0]);
-  const BLOCK_SIZE uv_size = get_plane_block_size(bsize, &xd->plane[1]);
-  unsigned int var, sse;
-  // Skipping threshold for ac.
-  unsigned int thresh_ac;
-  // Skipping threshold for dc
-  unsigned int thresh_dc;
-
-  var = cpi->fn_ptr[y_size].vf(x->plane[0].src.buf, x->plane[0].src.stride,
-                               xd->plane[0].dst.buf,
-                               xd->plane[0].dst.stride, &sse);
-
-  if (x->encode_breakout > 0) {
-    // Set a maximum for threshold to avoid big PSNR loss in low bitrate
-    // case. Use extreme low threshold for static frames to limit skipping.
-    const unsigned int max_thresh = (cpi->allow_encode_breakout ==
-                                     ENCODE_BREAKOUT_LIMITED) ? 128 : 36000;
-    // The encode_breakout input
-    const unsigned int min_thresh =
-        MIN(((unsigned int)x->encode_breakout << 4), max_thresh);
-
-    // Calculate threshold according to dequant value.
-    thresh_ac = (xd->plane[0].dequant[1] * xd->plane[0].dequant[1]) / 9;
-    thresh_ac = clamp(thresh_ac, min_thresh, max_thresh);
-
-    // Adjust threshold according to partition size.
-    thresh_ac >>= 8 - (b_width_log2(bsize) +
-        b_height_log2(bsize));
-    thresh_dc = (xd->plane[0].dequant[0] * xd->plane[0].dequant[0] >> 6);
-  } else {
-    thresh_ac = 0;
-    thresh_dc = 0;
-  }
-
-  // Y skipping condition checking
-  if (sse < thresh_ac || sse == 0) {
-    // dc skipping checking
-    if ((sse - var) < thresh_dc || sse == var) {
-      unsigned int sse_u, sse_v;
-      unsigned int var_u, var_v;
-
-      var_u = cpi->fn_ptr[uv_size].vf(x->plane[1].src.buf,
-                                      x->plane[1].src.stride,
-                                      xd->plane[1].dst.buf,
-                                      xd->plane[1].dst.stride, &sse_u);
-
-      // U skipping condition checking
-      if ((sse_u * 4 < thresh_ac || sse_u == 0) &&
-          (sse_u - var_u < thresh_dc || sse_u == var_u)) {
-        var_v = cpi->fn_ptr[uv_size].vf(x->plane[2].src.buf,
-                                        x->plane[2].src.stride,
-                                        xd->plane[2].dst.buf,
-                                        xd->plane[2].dst.stride, &sse_v);
-
-        // V skipping condition checking
-        if ((sse_v * 4 < thresh_ac || sse_v == 0) &&
-            (sse_v - var_v < thresh_dc || sse_v == var_v)) {
-          x->skip = 1;
-
-          // The cost of skip bit needs to be added.
-          *rate2 += vp9_cost_bit(vp9_get_skip_prob(cm, xd), 1);
-
-          // Scaling factor for SSE from spatial domain to frequency domain
-          // is 16. Adjust distortion accordingly.
-          *distortion_uv = (sse_u + sse_v) << 4;
-          *distortion = (sse << 4) + *distortion_uv;
-
-          *disable_skip = 1;
-        }
-      }
-    }
-  }
-}
-
 static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
                                  BLOCK_SIZE bsize,
                                  int64_t txfm_cache[],
                                  int *rate2, int64_t *distortion,
                                  int *skippable,
-                                 int *rate_y, int64_t *distortion_y,
-                                 int *rate_uv, int64_t *distortion_uv,
+                                 int *rate_y, int *rate_uv,
                                  int *disable_skip,
                                  int_mv (*mode_mv)[MAX_REF_FRAMES],
                                  int mi_row, int mi_col,
@@ -2148,8 +2373,13 @@
   int refs[2] = { mbmi->ref_frame[0],
     (mbmi->ref_frame[1] < 0 ? 0 : mbmi->ref_frame[1]) };
   int_mv cur_mv[2];
-  int64_t this_rd = 0;
+#if CONFIG_VP9_HIGHBITDEPTH
+  DECLARE_ALIGNED_ARRAY(16, uint16_t, tmp_buf16, MAX_MB_PLANE * 64 * 64);
+  DECLARE_ALIGNED_ARRAY(16, uint8_t, tmp_buf8, MAX_MB_PLANE * 64 * 64);
+  uint8_t *tmp_buf;
+#else
   DECLARE_ALIGNED_ARRAY(16, uint8_t, tmp_buf, MAX_MB_PLANE * 64 * 64);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
   int pred_exists = 0;
   int intpel_mv;
   int64_t rd, tmp_rd, best_rd = INT64_MAX;
@@ -2166,6 +2396,18 @@
       (((mi_row + mi_col) >> bsl) +
        get_chessboard_index(cm->current_video_frame)) & 0x1 : 0;
 
+  int skip_txfm_sb = 0;
+  int64_t skip_sse_sb = INT64_MAX;
+  int64_t distortion_y = 0, distortion_uv = 0;
+
+#if CONFIG_VP9_HIGHBITDEPTH
+  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+    tmp_buf = CONVERT_TO_BYTEPTR(tmp_buf16);
+  } else {
+    tmp_buf = tmp_buf8;
+  }
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
   if (pred_filter_search) {
     INTERP_FILTER af = SWITCHABLE, lf = SWITCHABLE;
     if (xd->up_available)
@@ -2275,6 +2517,9 @@
       for (i = 0; i < SWITCHABLE_FILTERS; ++i) {
         int j;
         int64_t rs_rd;
+        int tmp_skip_sb = 0;
+        int64_t tmp_skip_sse = INT64_MAX;
+
         mbmi->interp_filter = i;
         rs = vp9_get_switchable_rate(cpi);
         rs_rd = RDCOST(x->rdmult, x->rddiv, rs, 0);
@@ -2310,7 +2555,8 @@
             }
           }
           vp9_build_inter_predictors_sb(xd, mi_row, mi_col, bsize);
-          model_rd_for_sb(cpi, bsize, x, xd, &rate_sum, &dist_sum);
+          model_rd_for_sb(cpi, bsize, x, xd, &rate_sum, &dist_sum,
+                          &tmp_skip_sb, &tmp_skip_sse);
 
           rd = RDCOST(x->rdmult, x->rddiv, rate_sum, dist_sum);
           rd_opt->filter_cache[i] = rd;
@@ -2339,8 +2585,6 @@
           best_filter = mbmi->interp_filter;
           if (cm->interp_filter == SWITCHABLE && i && !intpel_mv)
             best_needs_copy = !best_needs_copy;
-          vpx_memcpy(skip_txfm, x->skip_txfm, sizeof(skip_txfm));
-          vpx_memcpy(bsse, x->bsse, sizeof(bsse));
         }
 
         if ((cm->interp_filter == SWITCHABLE && newbest) ||
@@ -2348,6 +2592,11 @@
              cm->interp_filter == mbmi->interp_filter)) {
           pred_exists = 1;
           tmp_rd = best_rd;
+
+          skip_txfm_sb = tmp_skip_sb;
+          skip_sse_sb = tmp_skip_sse;
+          vpx_memcpy(skip_txfm, x->skip_txfm, sizeof(skip_txfm));
+          vpx_memcpy(bsse, x->bsse, sizeof(bsse));
         }
       }
       restore_dst_buf(xd, orig_dst, orig_dst_stride);
@@ -2374,7 +2623,8 @@
     // switchable list (ex. bilinear) is indicated at the frame level, or
     // skip condition holds.
     vp9_build_inter_predictors_sb(xd, mi_row, mi_col, bsize);
-    model_rd_for_sb(cpi, bsize, x, xd, &tmp_rate, &tmp_dist);
+    model_rd_for_sb(cpi, bsize, x, xd, &tmp_rate, &tmp_dist,
+                    &skip_txfm_sb, &skip_sse_sb);
     rd = RDCOST(x->rdmult, x->rddiv, rs + tmp_rate, tmp_dist);
     vpx_memcpy(skip_txfm, x->skip_txfm, sizeof(skip_txfm));
     vpx_memcpy(bsse, x->bsse, sizeof(bsse));
@@ -2401,23 +2651,17 @@
   if (cm->interp_filter == SWITCHABLE)
     *rate2 += rs;
 
-  if (!is_comp_pred) {
-    if (cpi->allow_encode_breakout)
-      rd_encode_breakout_test(cpi, x, bsize, rate2, distortion, distortion_uv,
-                              disable_skip);
-  }
-
   vpx_memcpy(x->skip_txfm, skip_txfm, sizeof(skip_txfm));
   vpx_memcpy(x->bsse, bsse, sizeof(bsse));
 
-  if (!x->skip) {
+  if (!skip_txfm_sb) {
     int skippable_y, skippable_uv;
     int64_t sseuv = INT64_MAX;
     int64_t rdcosty = INT64_MAX;
 
     // Y cost and distortion
     vp9_subtract_plane(x, bsize, 0);
-    super_block_yrd(cpi, x, rate_y, distortion_y, &skippable_y, psse,
+    super_block_yrd(cpi, x, rate_y, &distortion_y, &skippable_y, psse,
                     bsize, txfm_cache, ref_best_rd);
 
     if (*rate_y == INT_MAX) {
@@ -2428,14 +2672,13 @@
     }
 
     *rate2 += *rate_y;
-    *distortion += *distortion_y;
+    *distortion += distortion_y;
 
     rdcosty = RDCOST(x->rdmult, x->rddiv, *rate2, *distortion);
     rdcosty = MIN(rdcosty, RDCOST(x->rdmult, x->rddiv, 0, *psse));
 
-    super_block_uvrd(cpi, x, rate_uv, distortion_uv, &skippable_uv, &sseuv,
-                     bsize, ref_best_rd - rdcosty);
-    if (*rate_uv == INT_MAX) {
+    if (!super_block_uvrd(cpi, x, rate_uv, &distortion_uv, &skippable_uv,
+                          &sseuv, bsize, ref_best_rd - rdcosty)) {
       *rate2 = INT_MAX;
       *distortion = INT64_MAX;
       restore_dst_buf(xd, orig_dst, orig_dst_stride);
@@ -2444,20 +2687,27 @@
 
     *psse += sseuv;
     *rate2 += *rate_uv;
-    *distortion += *distortion_uv;
+    *distortion += distortion_uv;
     *skippable = skippable_y && skippable_uv;
+  } else {
+    x->skip = 1;
+    *disable_skip = 1;
+
+    // The cost of the skip bit needs to be added.
+    *rate2 += vp9_cost_bit(vp9_get_skip_prob(cm, xd), 1);
+
+    *distortion = skip_sse_sb;
   }
 
   if (!is_comp_pred)
     single_skippable[this_mode][refs[0]] = *skippable;
 
   restore_dst_buf(xd, orig_dst, orig_dst_stride);
-  return this_rd;  // if 0, this will be re-calculated by caller
+  return 0;  // The rate-distortion cost will be re-calculated by the caller.
 }
 
 void vp9_rd_pick_intra_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
-                               int *returnrate, int64_t *returndist,
-                               BLOCK_SIZE bsize,
+                               RD_COST *rd_cost, BLOCK_SIZE bsize,
                                PICK_MODE_CONTEXT *ctx, int64_t best_rd) {
   VP9_COMMON *const cm = &cpi->common;
   MACROBLOCKD *const xd = &x->e_mbd;
@@ -2474,37 +2724,34 @@
     if (rd_pick_intra_sby_mode(cpi, x, &rate_y, &rate_y_tokenonly,
                                &dist_y, &y_skip, bsize, tx_cache,
                                best_rd) >= best_rd) {
-      *returnrate = INT_MAX;
+      rd_cost->rate = INT_MAX;
       return;
     }
-    max_uv_tx_size = get_uv_tx_size_impl(xd->mi[0].src_mi->mbmi.tx_size, bsize,
-                                         pd[1].subsampling_x,
-                                         pd[1].subsampling_y);
-    rd_pick_intra_sbuv_mode(cpi, x, ctx, &rate_uv, &rate_uv_tokenonly,
-                            &dist_uv, &uv_skip, bsize, max_uv_tx_size);
   } else {
     y_skip = 0;
     if (rd_pick_intra_sub_8x8_y_mode(cpi, x, &rate_y, &rate_y_tokenonly,
                                      &dist_y, best_rd) >= best_rd) {
-      *returnrate = INT_MAX;
+      rd_cost->rate = INT_MAX;
       return;
     }
-    max_uv_tx_size = get_uv_tx_size_impl(xd->mi[0].src_mi->mbmi.tx_size, bsize,
-                                         pd[1].subsampling_x,
-                                         pd[1].subsampling_y);
-    rd_pick_intra_sbuv_mode(cpi, x, ctx, &rate_uv, &rate_uv_tokenonly,
-                            &dist_uv, &uv_skip, BLOCK_8X8, max_uv_tx_size);
   }
+  max_uv_tx_size = get_uv_tx_size_impl(xd->mi[0].src_mi->mbmi.tx_size, bsize,
+                                       pd[1].subsampling_x,
+                                       pd[1].subsampling_y);
+  rd_pick_intra_sbuv_mode(cpi, x, ctx, &rate_uv, &rate_uv_tokenonly,
+                          &dist_uv, &uv_skip, MAX(BLOCK_8X8, bsize),
+                          max_uv_tx_size);
 
   if (y_skip && uv_skip) {
-    *returnrate = rate_y + rate_uv - rate_y_tokenonly - rate_uv_tokenonly +
-                  vp9_cost_bit(vp9_get_skip_prob(cm, xd), 1);
-    *returndist = dist_y + dist_uv;
+    rd_cost->rate = rate_y + rate_uv - rate_y_tokenonly - rate_uv_tokenonly +
+                    vp9_cost_bit(vp9_get_skip_prob(cm, xd), 1);
+    rd_cost->dist = dist_y + dist_uv;
     vp9_zero(ctx->tx_rd_diff);
   } else {
     int i;
-    *returnrate = rate_y + rate_uv + vp9_cost_bit(vp9_get_skip_prob(cm, xd), 0);
-    *returndist = dist_y + dist_uv;
+    rd_cost->rate = rate_y + rate_uv +
+                      vp9_cost_bit(vp9_get_skip_prob(cm, xd), 0);
+    rd_cost->dist = dist_y + dist_uv;
     if (cpi->sf.tx_size_search_method == USE_FULL_RD)
       for (i = 0; i < TX_MODES; i++) {
         if (tx_cache[i] < INT64_MAX && tx_cache[cm->tx_mode] < INT64_MAX)
@@ -2515,45 +2762,43 @@
   }
 
   ctx->mic = *xd->mi[0].src_mi;
+  rd_cost->rdcost = RDCOST(x->rdmult, x->rddiv, rd_cost->rate, rd_cost->dist);
 }
 
-// Updating rd_thresh_freq_fact[] here means that the different
-// partition/block sizes are handled independently based on the best
-// choice for the current partition. It may well be better to keep a scaled
-// best rd so far value and update rd_thresh_freq_fact based on the mode/size
-// combination that wins out.
 static void update_rd_thresh_fact(VP9_COMP *cpi, int bsize,
                                   int best_mode_index) {
   if (cpi->sf.adaptive_rd_thresh > 0) {
     const int top_mode = bsize < BLOCK_8X8 ? MAX_REFS : MAX_MODES;
     int mode;
     for (mode = 0; mode < top_mode; ++mode) {
-      int *const fact = &cpi->rd.thresh_freq_fact[bsize][mode];
-
-      if (mode == best_mode_index) {
-        *fact -= (*fact >> 3);
-      } else {
-        *fact = MIN(*fact + RD_THRESH_INC,
-                    cpi->sf.adaptive_rd_thresh * RD_THRESH_MAX_FACT);
+      const BLOCK_SIZE min_size = MAX(bsize - 1, BLOCK_4X4);
+      const BLOCK_SIZE max_size = MIN(bsize + 2, BLOCK_64X64);
+      BLOCK_SIZE bs;
+      for (bs = min_size; bs <= max_size; ++bs) {
+        int *const fact = &cpi->rd.thresh_freq_fact[bs][mode];
+        if (mode == best_mode_index) {
+          *fact -= (*fact >> 4);
+        } else {
+          *fact = MIN(*fact + RD_THRESH_INC,
+                      cpi->sf.adaptive_rd_thresh * RD_THRESH_MAX_FACT);
+        }
       }
     }
   }
 }
 
-int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
-                                  const TileInfo *const tile,
-                                  int mi_row, int mi_col,
-                                  int *returnrate,
-                                  int64_t *returndistortion,
-                                  BLOCK_SIZE bsize,
-                                  PICK_MODE_CONTEXT *ctx,
-                                  int64_t best_rd_so_far) {
+void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
+                               const TileInfo *const tile,
+                               int mi_row, int mi_col,
+                               RD_COST *rd_cost, BLOCK_SIZE bsize,
+                               PICK_MODE_CONTEXT *ctx,
+                               int64_t best_rd_so_far) {
   VP9_COMMON *const cm = &cpi->common;
   RD_OPT *const rd_opt = &cpi->rd;
+  SPEED_FEATURES *const sf = &cpi->sf;
   MACROBLOCKD *const xd = &x->e_mbd;
   MB_MODE_INFO *const mbmi = &xd->mi[0].src_mi->mbmi;
   const struct segmentation *const seg = &cm->seg;
-  struct macroblockd_plane *const pd = xd->plane;
   PREDICTION_MODE this_mode;
   MV_REFERENCE_FRAME ref_frame, second_ref_frame;
   unsigned char segment_id = mbmi->segment_id;
@@ -2584,20 +2829,20 @@
   int64_t dist_uv[TX_SIZES];
   int skip_uv[TX_SIZES];
   PREDICTION_MODE mode_uv[TX_SIZES];
-  const int intra_cost_penalty =
-      20 * vp9_dc_quant(cm->base_qindex, cm->y_dc_delta_q, cm->bit_depth);
+  const int intra_cost_penalty = vp9_get_intra_cost_penalty(
+      cm->base_qindex, cm->y_dc_delta_q, cm->bit_depth);
   int best_skip2 = 0;
   uint8_t ref_frame_skip_mask[2] = { 0 };
   uint16_t mode_skip_mask[MAX_REF_FRAMES] = { 0 };
-  int mode_skip_start = cpi->sf.mode_skip_start + 1;
+  int mode_skip_start = sf->mode_skip_start + 1;
   const int *const rd_threshes = rd_opt->threshes[segment_id][bsize];
   const int *const rd_thresh_freq_fact = rd_opt->thresh_freq_fact[bsize];
-  int mode_threshold[MAX_MODES];
+  int64_t mode_threshold[MAX_MODES];
   int *mode_map = rd_opt->mode_map[bsize];
-  const int mode_search_skip_flags = cpi->sf.mode_search_skip_flags;
+  const int mode_search_skip_flags = sf->mode_search_skip_flags;
   vp9_zero(best_mbmode);
 
-  x->skip_encode = cpi->sf.skip_encode_frame && x->q_index < QIDX_SKIP_THRESH;
+  x->skip_encode = sf->skip_encode_frame && x->q_index < QIDX_SKIP_THRESH;
 
   estimate_ref_frame_costs(cm, xd, segment_id, ref_costs_single, ref_costs_comp,
                            &comp_mode_p);
@@ -2619,7 +2864,7 @@
     }
   }
 
-  *returnrate = INT_MAX;
+  rd_cost->rate = INT_MAX;
 
   for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
     x->pred_mv_sad[ref_frame] = INT_MAX;
@@ -2638,7 +2883,7 @@
       // are masked out.
       ref_frame_skip_mask[0] |= (1 << ref_frame);
       ref_frame_skip_mask[1] |= SECOND_REF_FRAME_MASK;
-    } else if (cpi->sf.reference_masking) {
+    } else if (sf->reference_masking) {
       for (i = LAST_FRAME; i <= ALTREF_FRAME; ++i) {
         // Skip fixed mv modes for poor references
         if ((x->pred_mv_sad[ref_frame] >> 2) > x->pred_mv_sad[i]) {
@@ -2676,25 +2921,37 @@
   }
 
   if (cpi->rc.is_src_frame_alt_ref) {
-    if (cpi->sf.alt_ref_search_fp) {
+    if (sf->alt_ref_search_fp) {
       mode_skip_mask[ALTREF_FRAME] = 0;
       ref_frame_skip_mask[0] = ~(1 << ALTREF_FRAME);
       ref_frame_skip_mask[1] = SECOND_REF_FRAME_MASK;
     }
   }
 
-  if (bsize > cpi->sf.max_intra_bsize) {
+  if (sf->alt_ref_search_fp)
+    if (!cm->show_frame && x->pred_mv_sad[GOLDEN_FRAME] < INT_MAX)
+      if (x->pred_mv_sad[ALTREF_FRAME] > (x->pred_mv_sad[GOLDEN_FRAME] << 1))
+        mode_skip_mask[ALTREF_FRAME] |= INTER_ALL;
+
+  if (sf->adaptive_mode_search) {
+    if (cm->show_frame && !cpi->rc.is_src_frame_alt_ref &&
+        cpi->rc.frames_since_golden >= 3)
+      if (x->pred_mv_sad[GOLDEN_FRAME] > (x->pred_mv_sad[LAST_FRAME] << 1))
+        mode_skip_mask[GOLDEN_FRAME] |= INTER_ALL;
+  }
+
+  if (bsize > sf->max_intra_bsize) {
     ref_frame_skip_mask[0] |= (1 << INTRA_FRAME);
     ref_frame_skip_mask[1] |= (1 << INTRA_FRAME);
   }
 
   mode_skip_mask[INTRA_FRAME] |=
-      ~(cpi->sf.intra_y_mode_mask[max_txsize_lookup[bsize]]);
+      ~(sf->intra_y_mode_mask[max_txsize_lookup[bsize]]);
 
   for (i = 0; i < MAX_MODES; ++i)
     mode_threshold[i] = ((int64_t)rd_threshes[i] * rd_thresh_freq_fact[i]) >> 5;
 
-  midx =  cpi->sf.schedule_mode_search ? mode_skip_start : 0;
+  midx = sf->schedule_mode_search ? mode_skip_start : 0;
   while (midx > 4) {
     uint8_t end_pos = 0;
     for (i = 5; i < midx; ++i) {
@@ -2758,18 +3015,18 @@
       continue;
 
     // Test best rd so far against threshold for trying this mode.
-    if (best_mode_skippable && cpi->sf.schedule_mode_search)
+    if (best_mode_skippable && sf->schedule_mode_search)
       mode_threshold[mode_index] <<= 1;
 
     if (best_rd < mode_threshold[mode_index])
       continue;
 
-    if (cpi->sf.motion_field_mode_search) {
+    if (sf->motion_field_mode_search) {
       const int mi_width  = MIN(num_8x8_blocks_wide_lookup[bsize],
                                 tile->mi_col_end - mi_col);
       const int mi_height = MIN(num_8x8_blocks_high_lookup[bsize],
                                 tile->mi_row_end - mi_row);
-      const int bsl = mi_width_log2(bsize);
+      const int bsl = mi_width_log2_lookup[bsize];
       int cb_partition_search_ctrl = (((mi_row + mi_col) >> bsl)
           + get_chessboard_index(cm->current_video_frame)) & 0x1;
       MB_MODE_INFO *ref_mbmi;
@@ -2838,7 +3095,7 @@
     }
 
     if (ref_frame == INTRA_FRAME) {
-      if (cpi->sf.adaptive_mode_search)
+      if (sf->adaptive_mode_search)
         if ((x->source_variance << num_pels_log2_lookup[bsize]) > best_pred_sse)
           continue;
 
@@ -2895,14 +3152,15 @@
 
     if (ref_frame == INTRA_FRAME) {
       TX_SIZE uv_tx;
+      struct macroblockd_plane *const pd = &xd->plane[1];
+      vpx_memset(x->skip_txfm, 0, sizeof(x->skip_txfm));
       super_block_yrd(cpi, x, &rate_y, &distortion_y, &skippable,
                       NULL, bsize, tx_cache, best_rd);
-
       if (rate_y == INT_MAX)
         continue;
 
-      uv_tx = get_uv_tx_size_impl(mbmi->tx_size, bsize, pd[1].subsampling_x,
-                                  pd[1].subsampling_y);
+      uv_tx = get_uv_tx_size_impl(mbmi->tx_size, bsize, pd->subsampling_x,
+                                  pd->subsampling_y);
       if (rate_uv_intra[uv_tx] == INT_MAX) {
         choose_intra_uv_mode(cpi, ctx, bsize, uv_tx,
                              &rate_uv_intra[uv_tx], &rate_uv_tokenonly[uv_tx],
@@ -2922,8 +3180,7 @@
       this_rd = handle_inter_mode(cpi, x, bsize,
                                   tx_cache,
                                   &rate2, &distortion2, &skippable,
-                                  &rate_y, &distortion_y,
-                                  &rate_uv, &distortion_uv,
+                                  &rate_y, &rate_uv,
                                   &disable_skip, frame_mv,
                                   mi_row, mi_col,
                                   single_newmv, single_inter_filter,
@@ -3004,8 +3261,9 @@
           best_pred_sse = x->pred_sse[ref_frame];
         }
 
-        *returnrate = rate2;
-        *returndistortion = distortion2;
+        rd_cost->rate = rate2;
+        rd_cost->dist = distortion2;
+        rd_cost->rdcost = this_rd;
         best_rd = this_rd;
         best_mbmode = *mbmi;
         best_skip2 = this_skip2;
@@ -3020,9 +3278,14 @@
         // based on qp, activity mask and history
         if ((mode_search_skip_flags & FLAG_EARLY_TERMINATE) &&
             (mode_index > MIN_EARLY_TERM_INDEX)) {
-          const int qstep = xd->plane[0].dequant[1];
+          int qstep = xd->plane[0].dequant[1];
           // TODO(debargha): Enhance this by specializing for each mode_index
           int scale = 4;
+#if CONFIG_VP9_HIGHBITDEPTH
+          if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+            qstep >>= (xd->bd - 8);
+          }
+#endif  // CONFIG_VP9_HIGHBITDEPTH
           if (x->source_variance < UINT_MAX) {
             const int var_adjust = (x->source_variance < 16);
             scale -= var_adjust;
@@ -3130,11 +3393,14 @@
       best_mbmode.mode = ZEROMV;
   }
 
-  if (best_mode_index < 0 || best_rd >= best_rd_so_far)
-    return INT64_MAX;
+  if (best_mode_index < 0 || best_rd >= best_rd_so_far) {
+    rd_cost->rate = INT_MAX;
+    rd_cost->rdcost = INT64_MAX;
+    return;
+  }
 
   // If we used an estimate for the uv intra rd in the loop above...
-  if (cpi->sf.use_uv_intra_rd_estimate) {
+  if (sf->use_uv_intra_rd_estimate) {
     // Do Intra UV best rd mode selection if best mode choice above was intra.
     if (best_mbmode.ref_frame[0] == INTRA_FRAME) {
       TX_SIZE uv_tx_size;
@@ -3191,18 +3457,33 @@
   // updating code causes PSNR loss. Need to figure out the conflict.
   x->skip |= best_mode_skippable;
 
+  if (!x->skip && !x->select_tx_size) {
+    int has_high_freq_coeff = 0;
+    int plane;
+    int max_plane = is_inter_block(&xd->mi[0].src_mi->mbmi)
+                        ? MAX_MB_PLANE : 1;
+    for (plane = 0; plane < max_plane; ++plane) {
+      x->plane[plane].eobs = ctx->eobs_pbuf[plane][1];
+      has_high_freq_coeff |= vp9_has_high_freq_in_plane(x, bsize, plane);
+    }
+
+    for (plane = max_plane; plane < MAX_MB_PLANE; ++plane) {
+      x->plane[plane].eobs = ctx->eobs_pbuf[plane][2];
+      has_high_freq_coeff |= vp9_has_high_freq_in_plane(x, bsize, plane);
+    }
+
+    best_mode_skippable |= !has_high_freq_coeff;
+  }
+
   store_coding_context(x, ctx, best_mode_index, best_pred_diff,
                        best_tx_diff, best_filter_diff, best_mode_skippable);
-
-  return best_rd;
 }
 
-int64_t vp9_rd_pick_inter_mode_sb_seg_skip(VP9_COMP *cpi, MACROBLOCK *x,
-                                           int *returnrate,
-                                           int64_t *returndistortion,
-                                           BLOCK_SIZE bsize,
-                                           PICK_MODE_CONTEXT *ctx,
-                                           int64_t best_rd_so_far) {
+void vp9_rd_pick_inter_mode_sb_seg_skip(VP9_COMP *cpi, MACROBLOCK *x,
+                                        RD_COST *rd_cost,
+                                        BLOCK_SIZE bsize,
+                                        PICK_MODE_CONTEXT *ctx,
+                                        int64_t best_rd_so_far) {
   VP9_COMMON *const cm = &cpi->common;
   RD_OPT *const rd_opt = &cpi->rd;
   MACROBLOCKD *const xd = &x->e_mbd;
@@ -3230,7 +3511,7 @@
   for (i = LAST_FRAME; i < MAX_REF_FRAMES; ++i)
     x->pred_mv_sad[i] = INT_MAX;
 
-  *returnrate = INT_MAX;
+  rd_cost->rate = INT_MAX;
 
   assert(vp9_segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP));
 
@@ -3279,11 +3560,15 @@
   rate2 += ref_costs_single[LAST_FRAME];
   this_rd = RDCOST(x->rdmult, x->rddiv, rate2, distortion2);
 
-  *returnrate = rate2;
-  *returndistortion = distortion2;
+  rd_cost->rate = rate2;
+  rd_cost->dist = distortion2;
+  rd_cost->rdcost = this_rd;
 
-  if (this_rd >= best_rd_so_far)
-    return INT64_MAX;
+  if (this_rd >= best_rd_so_far) {
+    rd_cost->rate = INT_MAX;
+    rd_cost->rdcost = INT64_MAX;
+    return;
+  }
 
   assert((cm->interp_filter == SWITCHABLE) ||
          (cm->interp_filter == mbmi->interp_filter));
@@ -3298,20 +3583,18 @@
     swap_block_ptr(x, ctx, 1, 0, 0, MAX_MB_PLANE);
   store_coding_context(x, ctx, THR_ZEROMV,
                        best_pred_diff, best_tx_diff, best_filter_diff, 0);
-
-  return this_rd;
 }
 
-int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x,
-                                      const TileInfo *const tile,
-                                      int mi_row, int mi_col,
-                                      int *returnrate,
-                                      int64_t *returndistortion,
-                                      BLOCK_SIZE bsize,
-                                      PICK_MODE_CONTEXT *ctx,
-                                      int64_t best_rd_so_far) {
+void vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x,
+                                   const TileInfo *const tile,
+                                   int mi_row, int mi_col,
+                                   RD_COST *rd_cost,
+                                   BLOCK_SIZE bsize,
+                                   PICK_MODE_CONTEXT *ctx,
+                                   int64_t best_rd_so_far) {
   VP9_COMMON *const cm = &cpi->common;
   RD_OPT *const rd_opt = &cpi->rd;
+  SPEED_FEATURES *const sf = &cpi->sf;
   MACROBLOCKD *const xd = &x->e_mbd;
   MB_MODE_INFO *const mbmi = &xd->mi[0].src_mi->mbmi;
   const struct segmentation *const seg = &cm->seg;
@@ -3338,14 +3621,14 @@
   int64_t dist_uv;
   int skip_uv;
   PREDICTION_MODE mode_uv = DC_PRED;
-  const int intra_cost_penalty =
-      20 * vp9_dc_quant(cm->base_qindex, cm->y_dc_delta_q, cm->bit_depth);
+  const int intra_cost_penalty = vp9_get_intra_cost_penalty(
+      cm->base_qindex, cm->y_dc_delta_q, cm->bit_depth);
   int_mv seg_mvs[4][MAX_REF_FRAMES];
   b_mode_info best_bmodes[4];
   int best_skip2 = 0;
   int ref_frame_skip_mask[2] = { 0 };
 
-  x->skip_encode = cpi->sf.skip_encode_frame && x->q_index < QIDX_SKIP_THRESH;
+  x->skip_encode = sf->skip_encode_frame && x->q_index < QIDX_SKIP_THRESH;
   vpx_memset(x->zcoeff_blk[TX_4X4], 0, 4);
   vp9_zero(best_mbmode);
 
@@ -3364,7 +3647,7 @@
     best_filter_rd[i] = INT64_MAX;
   rate_uv_intra = INT_MAX;
 
-  *returnrate = INT_MAX;
+  rd_cost->rate = INT_MAX;
 
   for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ref_frame++) {
     if (cpi->ref_frame_flags & flag_list[ref_frame]) {
@@ -3398,7 +3681,7 @@
 
     // Look at the reference frame of the best mode so far and set the
     // skip mask to look at a subset of the remaining modes.
-    if (ref_index > 2 && cpi->sf.mode_skip_start < MAX_MODES) {
+    if (ref_index > 2 && sf->mode_skip_start < MAX_MODES) {
       if (ref_index == 3) {
         switch (best_mbmode.ref_frame[0]) {
           case INTRA_FRAME:
@@ -3443,7 +3726,7 @@
       if (vp9_segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME))
         continue;
 
-      if ((cpi->sf.mode_search_skip_flags & FLAG_SKIP_COMP_BESTINTRA) &&
+      if ((sf->mode_search_skip_flags & FLAG_SKIP_COMP_BESTINTRA) &&
           best_mbmode.ref_frame[0] == INTRA_FRAME)
         continue;
     }
@@ -3547,12 +3830,12 @@
 
       if (cm->interp_filter != BILINEAR) {
         tmp_best_filter = EIGHTTAP;
-        if (x->source_variance < cpi->sf.disable_filter_search_var_thresh) {
+        if (x->source_variance < sf->disable_filter_search_var_thresh) {
           tmp_best_filter = EIGHTTAP;
-        } else if (cpi->sf.adaptive_pred_interp_filter == 1 &&
+        } else if (sf->adaptive_pred_interp_filter == 1 &&
                    ctx->pred_interp_filter < SWITCHABLE) {
           tmp_best_filter = ctx->pred_interp_filter;
-        } else if (cpi->sf.adaptive_pred_interp_filter == 2) {
+        } else if (sf->adaptive_pred_interp_filter == 2) {
           tmp_best_filter = ctx->pred_interp_filter < SWITCHABLE ?
                               ctx->pred_interp_filter : 0;
         } else {
@@ -3605,7 +3888,7 @@
               }
               pred_exists = 1;
               if (switchable_filter_index == 0 &&
-                  cpi->sf.use_rd_breakout &&
+                  sf->use_rd_breakout &&
                   best_rd < INT64_MAX) {
                 if (tmp_best_rdu / 2 > best_rd) {
                   // skip searching the other filters if the first is
@@ -3668,10 +3951,11 @@
         // then don't bother looking at UV
         vp9_build_inter_predictors_sbuv(&x->e_mbd, mi_row, mi_col,
                                         BLOCK_8X8);
-        super_block_uvrd(cpi, x, &rate_uv, &distortion_uv, &uv_skippable,
-                         &uv_sse, BLOCK_8X8, tmp_best_rdu);
-        if (rate_uv == INT_MAX)
+        vpx_memset(x->skip_txfm, 0, sizeof(x->skip_txfm));
+        if (!super_block_uvrd(cpi, x, &rate_uv, &distortion_uv, &uv_skippable,
+                              &uv_sse, BLOCK_8X8, tmp_best_rdu))
           continue;
+
         rate2 += rate_uv;
         distortion2 += distortion_uv;
         skippable = skippable && uv_skippable;
@@ -3738,8 +4022,9 @@
           max_plane = 1;
         }
 
-        *returnrate = rate2;
-        *returndistortion = distortion2;
+        rd_cost->rate = rate2;
+        rd_cost->dist = distortion2;
+        rd_cost->rdcost = this_rd;
         best_rd = this_rd;
         best_yrd = best_rd -
                    RDCOST(x->rdmult, x->rddiv, rate_uv, distortion_uv);
@@ -3755,11 +4040,16 @@
 
         // TODO(debargha): enhance this test with a better distortion prediction
         // based on qp, activity mask and history
-        if ((cpi->sf.mode_search_skip_flags & FLAG_EARLY_TERMINATE) &&
+        if ((sf->mode_search_skip_flags & FLAG_EARLY_TERMINATE) &&
             (ref_index > MIN_EARLY_TERM_INDEX)) {
-          const int qstep = xd->plane[0].dequant[1];
+          int qstep = xd->plane[0].dequant[1];
           // TODO(debargha): Enhance this by specializing for each mode_index
           int scale = 4;
+#if CONFIG_VP9_HIGHBITDEPTH
+          if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+            qstep >>= (xd->bd - 8);
+          }
+#endif  // CONFIG_VP9_HIGHBITDEPTH
           if (x->source_variance < UINT_MAX) {
             const int var_adjust = (x->source_variance < 16);
             scale -= var_adjust;
@@ -3826,11 +4116,14 @@
       break;
   }
 
-  if (best_rd >= best_rd_so_far)
-    return INT64_MAX;
+  if (best_rd >= best_rd_so_far) {
+    rd_cost->rate = INT_MAX;
+    rd_cost->rdcost = INT64_MAX;
+    return;
+  }
 
   // If we used an estimate for the uv intra rd in the loop above...
-  if (cpi->sf.use_uv_intra_rd_estimate) {
+  if (sf->use_uv_intra_rd_estimate) {
     // Do Intra UV best rd mode selection if best mode choice above was intra.
     if (best_mbmode.ref_frame[0] == INTRA_FRAME) {
       *mbmi = best_mbmode;
@@ -3843,9 +4136,10 @@
   }
 
   if (best_rd == INT64_MAX) {
-    *returnrate = INT_MAX;
-    *returndistortion = INT64_MAX;
-    return best_rd;
+    rd_cost->rate = INT_MAX;
+    rd_cost->dist = INT64_MAX;
+    rd_cost->rdcost = INT64_MAX;
+    return;
   }
 
   assert((cm->interp_filter == SWITCHABLE) ||
@@ -3891,7 +4185,5 @@
 
   store_coding_context(x, ctx, best_ref_index,
                        best_pred_diff, best_tx_diff, best_filter_diff, 0);
-
-  return best_rd;
 }
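The hunks above change the three mode pickers from returning an int64_t RD
score to filling an RD_COST out-parameter. A minimal sketch of the new calling
convention, assuming only the rate/dist/rdcost fields used in this patch (the
surrounding variables are illustrative, from a typical encoder loop, and not
part of this diff):

    RD_COST rd_cost;
    int64_t this_rd;
    vp9_rd_pick_inter_mode_sb(cpi, x, tile, mi_row, mi_col, &rd_cost,
                              bsize, ctx, best_rd_so_far);
    if (rd_cost.rate == INT_MAX) {
      /* No mode beat best_rd_so_far; rd_cost.rdcost was set to INT64_MAX. */
    } else {
      /* rdcost was computed as RDCOST(x->rdmult, x->rddiv, rate, dist). */
      this_rd = rd_cost.rdcost;
    }

Callers that previously compared the int64_t return value against INT64_MAX
now test rd_cost.rate against INT_MAX, which both failure paths above set.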
 
diff --git a/source/libvpx/vp9/encoder/vp9_rdopt.h b/source/libvpx/vp9/encoder/vp9_rdopt.h
index 52c603f..ed38ce8 100644
--- a/source/libvpx/vp9/encoder/vp9_rdopt.h
+++ b/source/libvpx/vp9/encoder/vp9_rdopt.h
@@ -23,38 +23,33 @@
 struct TileInfo;
 struct VP9_COMP;
 struct macroblock;
+struct RD_COST;
 
 void vp9_rd_pick_intra_mode_sb(struct VP9_COMP *cpi, struct macroblock *x,
-                               int *r, int64_t *d, BLOCK_SIZE bsize,
+                               struct RD_COST *rd_cost, BLOCK_SIZE bsize,
                                PICK_MODE_CONTEXT *ctx, int64_t best_rd);
 
-int64_t vp9_rd_pick_inter_mode_sb(struct VP9_COMP *cpi, struct macroblock *x,
-                                  const struct TileInfo *const tile,
-                                  int mi_row, int mi_col,
-                                  int *returnrate,
-                                  int64_t *returndistortion,
-                                  BLOCK_SIZE bsize,
-                                  PICK_MODE_CONTEXT *ctx,
-                                  int64_t best_rd_so_far);
+void vp9_rd_pick_inter_mode_sb(struct VP9_COMP *cpi, struct macroblock *x,
+                               const struct TileInfo *const tile,
+                               int mi_row, int mi_col,
+                               struct RD_COST *rd_cost,
+                               BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx,
+                               int64_t best_rd_so_far);
 
-int64_t vp9_rd_pick_inter_mode_sb_seg_skip(struct VP9_COMP *cpi,
-                                           struct macroblock *x,
-                                           int *returnrate,
-                                           int64_t *returndistortion,
-                                           BLOCK_SIZE bsize,
-                                           PICK_MODE_CONTEXT *ctx,
-                                           int64_t best_rd_so_far);
+void vp9_rd_pick_inter_mode_sb_seg_skip(struct VP9_COMP *cpi,
+                                        struct macroblock *x,
+                                        struct RD_COST *rd_cost,
+                                        BLOCK_SIZE bsize,
+                                        PICK_MODE_CONTEXT *ctx,
+                                        int64_t best_rd_so_far);
 
-int64_t vp9_rd_pick_inter_mode_sub8x8(struct VP9_COMP *cpi,
-                                      struct macroblock *x,
-                                      const struct TileInfo *const tile,
-                                      int mi_row, int mi_col,
-                                      int *returnrate,
-                                      int64_t *returndistortion,
-                                      BLOCK_SIZE bsize,
-                                      PICK_MODE_CONTEXT *ctx,
-                                      int64_t best_rd_so_far);
-
+void vp9_rd_pick_inter_mode_sub8x8(struct VP9_COMP *cpi,
+                                   struct macroblock *x,
+                                   const struct TileInfo *const tile,
+                                   int mi_row, int mi_col,
+                                   struct RD_COST *rd_cost,
+                                   BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx,
+                                   int64_t best_rd_so_far);
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/source/libvpx/vp9/encoder/vp9_resize.c b/source/libvpx/vp9/encoder/vp9_resize.c
index 4e6efae..3d361d4 100644
--- a/source/libvpx/vp9/encoder/vp9_resize.c
+++ b/source/libvpx/vp9/encoder/vp9_resize.c
@@ -312,7 +312,7 @@
 static void down2_symeven(const uint8_t *const input, int length,
                           uint8_t *output) {
   // Actual filter len = 2 * filter_len_half.
-  static const int16_t *filter = vp9_down2_symeven_half_filter;
+  const int16_t *filter = vp9_down2_symeven_half_filter;
   const int filter_len_half = sizeof(vp9_down2_symeven_half_filter) / 2;
   int i, j;
   uint8_t *optr = output;
@@ -368,7 +368,7 @@
 static void down2_symodd(const uint8_t *const input, int length,
                          uint8_t *output) {
   // Actual filter len = 2 * filter_len_half - 1.
-  static const int16_t *filter = vp9_down2_symodd_half_filter;
+  const int16_t *filter = vp9_down2_symodd_half_filter;
   const int filter_len_half = sizeof(vp9_down2_symodd_half_filter) / 2;
   int i, j;
   uint8_t *optr = output;
@@ -529,6 +529,302 @@
   free(arrbuf);
 }
 
+#if CONFIG_VP9_HIGHBITDEPTH
+static void highbd_interpolate(const uint16_t *const input, int inlength,
+                               uint16_t *output, int outlength, int bd) {
+  const int64_t delta =
+      (((uint64_t)inlength << 32) + outlength / 2) / outlength;
+  const int64_t offset = inlength > outlength ?
+      (((int64_t)(inlength - outlength) << 31) + outlength / 2) / outlength :
+      -(((int64_t)(outlength - inlength) << 31) + outlength / 2) / outlength;
+  uint16_t *optr = output;
+  int x, x1, x2, sum, k, int_pel, sub_pel;
+  int64_t y;
+
+  const interp_kernel *interp_filters =
+      choose_interp_filter(inlength, outlength);
+
+  x = 0;
+  y = offset;
+  while ((y >> INTERP_PRECISION_BITS) < (INTERP_TAPS / 2 - 1)) {
+    x++;
+    y += delta;
+  }
+  x1 = x;
+  x = outlength - 1;
+  y = delta * x + offset;
+  while ((y >> INTERP_PRECISION_BITS) +
+         (int64_t)(INTERP_TAPS / 2) >= inlength) {
+    x--;
+    y -= delta;
+  }
+  x2 = x;
+  if (x1 > x2) {
+    for (x = 0, y = offset; x < outlength; ++x, y += delta) {
+      const int16_t *filter;
+      int_pel = y >> INTERP_PRECISION_BITS;
+      sub_pel = (y >> (INTERP_PRECISION_BITS - SUBPEL_BITS)) & SUBPEL_MASK;
+      filter = interp_filters[sub_pel];
+      sum = 0;
+      for (k = 0; k < INTERP_TAPS; ++k) {
+        const int pk = int_pel - INTERP_TAPS / 2 + 1 + k;
+        sum += filter[k] *
+            input[(pk < 0 ? 0 : (pk >= inlength ? inlength - 1 : pk))];
+      }
+      *optr++ = clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd);
+    }
+  } else {
+    // Initial part.
+    for (x = 0, y = offset; x < x1; ++x, y += delta) {
+      const int16_t *filter;
+      int_pel = y >> INTERP_PRECISION_BITS;
+      sub_pel = (y >> (INTERP_PRECISION_BITS - SUBPEL_BITS)) & SUBPEL_MASK;
+      filter = interp_filters[sub_pel];
+      sum = 0;
+      for (k = 0; k < INTERP_TAPS; ++k)
+        sum += filter[k] *
+            input[(int_pel - INTERP_TAPS / 2 + 1 + k < 0 ?
+                   0 : int_pel - INTERP_TAPS / 2 + 1 + k)];
+      *optr++ = clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd);
+    }
+    // Middle part.
+    for (; x <= x2; ++x, y += delta) {
+      const int16_t *filter;
+      int_pel = y >> INTERP_PRECISION_BITS;
+      sub_pel = (y >> (INTERP_PRECISION_BITS - SUBPEL_BITS)) & SUBPEL_MASK;
+      filter = interp_filters[sub_pel];
+      sum = 0;
+      for (k = 0; k < INTERP_TAPS; ++k)
+        sum += filter[k] * input[int_pel - INTERP_TAPS / 2 + 1 + k];
+      *optr++ = clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd);
+    }
+    // End part.
+    for (; x < outlength; ++x, y += delta) {
+      const int16_t *filter;
+      int_pel = y >> INTERP_PRECISION_BITS;
+      sub_pel = (y >> (INTERP_PRECISION_BITS - SUBPEL_BITS)) & SUBPEL_MASK;
+      filter = interp_filters[sub_pel];
+      sum = 0;
+      for (k = 0; k < INTERP_TAPS; ++k)
+        sum += filter[k] * input[(int_pel - INTERP_TAPS / 2 + 1 + k >=
+                                  inlength ? inlength - 1 :
+                                  int_pel - INTERP_TAPS / 2 + 1 + k)];
+      *optr++ = clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd);
+    }
+  }
+}
+
+static void highbd_down2_symeven(const uint16_t *const input, int length,
+                                 uint16_t *output, int bd) {
+  // Actual filter len = 2 * filter_len_half.
+  const int16_t *filter = vp9_down2_symeven_half_filter;
+  const int filter_len_half = sizeof(vp9_down2_symeven_half_filter) / 2;
+  int i, j;
+  uint16_t *optr = output;
+  int l1 = filter_len_half;
+  int l2 = (length - filter_len_half);
+  l1 += (l1 & 1);
+  l2 += (l2 & 1);
+  if (l1 > l2) {
+    // Short input length.
+    for (i = 0; i < length; i += 2) {
+      int sum = (1 << (FILTER_BITS - 1));
+      for (j = 0; j < filter_len_half; ++j) {
+        sum += (input[(i - j < 0 ? 0 : i - j)] +
+                input[(i + 1 + j >= length ? length - 1 : i + 1 + j)]) *
+            filter[j];
+      }
+      sum >>= FILTER_BITS;
+      *optr++ = clip_pixel_highbd(sum, bd);
+    }
+  } else {
+    // Initial part.
+    for (i = 0; i < l1; i += 2) {
+      int sum = (1 << (FILTER_BITS - 1));
+      for (j = 0; j < filter_len_half; ++j) {
+        sum += (input[(i - j < 0 ? 0 : i - j)] + input[i + 1 + j]) * filter[j];
+      }
+      sum >>= FILTER_BITS;
+      *optr++ = clip_pixel_highbd(sum, bd);
+    }
+    // Middle part.
+    for (; i < l2; i += 2) {
+      int sum = (1 << (FILTER_BITS - 1));
+      for (j = 0; j < filter_len_half; ++j) {
+        sum += (input[i - j] + input[i + 1 + j]) * filter[j];
+      }
+      sum >>= FILTER_BITS;
+      *optr++ = clip_pixel_highbd(sum, bd);
+    }
+    // End part.
+    for (; i < length; i += 2) {
+      int sum = (1 << (FILTER_BITS - 1));
+      for (j = 0; j < filter_len_half; ++j) {
+        sum += (input[i - j] +
+                input[(i + 1 + j >= length ? length - 1 : i + 1 + j)]) *
+            filter[j];
+      }
+      sum >>= FILTER_BITS;
+      *optr++ = clip_pixel_highbd(sum, bd);
+    }
+  }
+}
+
+static void highbd_down2_symodd(const uint16_t *const input, int length,
+                                uint16_t *output, int bd) {
+  // Actual filter len = 2 * filter_len_half - 1.
+  const int16_t *filter = vp9_down2_symodd_half_filter;
+  const int filter_len_half = sizeof(vp9_down2_symodd_half_filter) / 2;
+  int i, j;
+  uint16_t *optr = output;
+  int l1 = filter_len_half - 1;
+  int l2 = (length - filter_len_half + 1);
+  l1 += (l1 & 1);
+  l2 += (l2 & 1);
+  if (l1 > l2) {
+    // Short input length.
+    for (i = 0; i < length; i += 2) {
+      int sum = (1 << (FILTER_BITS - 1)) + input[i] * filter[0];
+      for (j = 1; j < filter_len_half; ++j) {
+        sum += (input[(i - j < 0 ? 0 : i - j)] +
+                input[(i + j >= length ? length - 1 : i + j)]) *
+            filter[j];
+      }
+      sum >>= FILTER_BITS;
+      *optr++ = clip_pixel_highbd(sum, bd);
+    }
+  } else {
+    // Initial part.
+    for (i = 0; i < l1; i += 2) {
+      int sum = (1 << (FILTER_BITS - 1)) + input[i] * filter[0];
+      for (j = 1; j < filter_len_half; ++j) {
+        sum += (input[(i - j < 0 ? 0 : i - j)] + input[i + j]) * filter[j];
+      }
+      sum >>= FILTER_BITS;
+      *optr++ = clip_pixel_highbd(sum, bd);
+    }
+    // Middle part.
+    for (; i < l2; i += 2) {
+      int sum = (1 << (FILTER_BITS - 1)) + input[i] * filter[0];
+      for (j = 1; j < filter_len_half; ++j) {
+        sum += (input[i - j] + input[i + j]) * filter[j];
+      }
+      sum >>= FILTER_BITS;
+      *optr++ = clip_pixel_highbd(sum, bd);
+    }
+    // End part.
+    for (; i < length; i += 2) {
+      int sum = (1 << (FILTER_BITS - 1)) + input[i] * filter[0];
+      for (j = 1; j < filter_len_half; ++j) {
+        sum += (input[i - j] + input[(i + j >= length ? length - 1 : i + j)]) *
+            filter[j];
+      }
+      sum >>= FILTER_BITS;
+      *optr++ = clip_pixel_highbd(sum, bd);
+    }
+  }
+}
+
+static void highbd_resize_multistep(const uint16_t *const input,
+                                    int length,
+                                    uint16_t *output,
+                                    int olength,
+                                    uint16_t *buf,
+                                    int bd) {
+  int steps;
+  if (length == olength) {
+    memcpy(output, input, sizeof(uint16_t) * length);
+    return;
+  }
+  steps = get_down2_steps(length, olength);
+
+  if (steps > 0) {
+    int s;
+    uint16_t *out = NULL;
+    uint16_t *tmpbuf = NULL;
+    uint16_t *otmp, *otmp2;
+    int filteredlength = length;
+    if (buf == NULL) {
+      tmpbuf = (uint16_t *)malloc(sizeof(uint16_t) * length);
+      otmp = tmpbuf;
+    } else {
+      otmp = buf;
+    }
+    otmp2 = otmp + get_down2_length(length, 1);
+    for (s = 0; s < steps; ++s) {
+      const int proj_filteredlength = get_down2_length(filteredlength, 1);
+      const uint16_t *const in = (s == 0 ? input : out);
+      if (s == steps - 1 && proj_filteredlength == olength)
+        out = output;
+      else
+        out = (s & 1 ? otmp2 : otmp);
+      if (filteredlength & 1)
+        highbd_down2_symodd(in, filteredlength, out, bd);
+      else
+        highbd_down2_symeven(in, filteredlength, out, bd);
+      filteredlength = proj_filteredlength;
+    }
+    if (filteredlength != olength) {
+      highbd_interpolate(out, filteredlength, output, olength, bd);
+    }
+    if (tmpbuf)
+      free(tmpbuf);
+  } else {
+    highbd_interpolate(input, length, output, olength, bd);
+  }
+}
+
+static void highbd_fill_col_to_arr(uint16_t *img, int stride, int len,
+                                   uint16_t *arr) {
+  int i;
+  uint16_t *iptr = img;
+  uint16_t *aptr = arr;
+  for (i = 0; i < len; ++i, iptr += stride) {
+    *aptr++ = *iptr;
+  }
+}
+
+static void highbd_fill_arr_to_col(uint16_t *img, int stride, int len,
+                                   uint16_t *arr) {
+  int i;
+  uint16_t *iptr = img;
+  uint16_t *aptr = arr;
+  for (i = 0; i < len; ++i, iptr += stride) {
+    *iptr = *aptr++;
+  }
+}
+
+void vp9_highbd_resize_plane(const uint8_t *const input,
+                             int height,
+                             int width,
+                             int in_stride,
+                             uint8_t *output,
+                             int height2,
+                             int width2,
+                             int out_stride,
+                             int bd) {
+  int i;
+  uint16_t *intbuf = (uint16_t *)malloc(sizeof(uint16_t) * width2 * height);
+  uint16_t *tmpbuf = (uint16_t *)malloc(sizeof(uint16_t) *
+                                        (width < height ? height : width));
+  uint16_t *arrbuf = (uint16_t *)malloc(sizeof(uint16_t) * (height + height2));
+  for (i = 0; i < height; ++i) {
+    highbd_resize_multistep(CONVERT_TO_SHORTPTR(input + in_stride * i), width,
+                            intbuf + width2 * i, width2, tmpbuf, bd);
+  }
+  for (i = 0; i < width2; ++i) {
+    highbd_fill_col_to_arr(intbuf + i, width2, height, arrbuf);
+    highbd_resize_multistep(arrbuf, height, arrbuf + height, height2, tmpbuf,
+                            bd);
+    highbd_fill_arr_to_col(CONVERT_TO_SHORTPTR(output + i), out_stride, height2,
+                           arrbuf + height);
+  }
+  free(intbuf);
+  free(tmpbuf);
+  free(arrbuf);
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
 void vp9_resize_frame420(const uint8_t *const y,
                          int y_stride,
                          const uint8_t *const u, const uint8_t *const v,
@@ -574,3 +870,51 @@
   vp9_resize_plane(v, height, width, uv_stride,
                    ov, oheight, owidth, ouv_stride);
 }
+
+#if CONFIG_VP9_HIGHBITDEPTH
+void vp9_highbd_resize_frame420(const uint8_t *const y,
+                                int y_stride,
+                                const uint8_t *const u, const uint8_t *const v,
+                                int uv_stride,
+                                int height, int width,
+                                uint8_t *oy, int oy_stride,
+                                uint8_t *ou, uint8_t *ov, int ouv_stride,
+                                int oheight, int owidth, int bd) {
+  vp9_highbd_resize_plane(y, height, width, y_stride,
+                          oy, oheight, owidth, oy_stride, bd);
+  vp9_highbd_resize_plane(u, height / 2, width / 2, uv_stride,
+                          ou, oheight / 2, owidth / 2, ouv_stride, bd);
+  vp9_highbd_resize_plane(v, height / 2, width / 2, uv_stride,
+                          ov, oheight / 2, owidth / 2, ouv_stride, bd);
+}
+
+void vp9_highbd_resize_frame422(const uint8_t *const y, int y_stride,
+                                const uint8_t *const u, const uint8_t *const v,
+                                int uv_stride,
+                                int height, int width,
+                                uint8_t *oy, int oy_stride,
+                                uint8_t *ou, uint8_t *ov, int ouv_stride,
+                                int oheight, int owidth, int bd) {
+  vp9_highbd_resize_plane(y, height, width, y_stride,
+                          oy, oheight, owidth, oy_stride, bd);
+  vp9_highbd_resize_plane(u, height, width / 2, uv_stride,
+                          ou, oheight, owidth / 2, ouv_stride, bd);
+  vp9_highbd_resize_plane(v, height, width / 2, uv_stride,
+                          ov, oheight, owidth / 2, ouv_stride, bd);
+}
+
+void vp9_highbd_resize_frame444(const uint8_t *const y, int y_stride,
+                                const uint8_t *const u, const uint8_t *const v,
+                                int uv_stride,
+                                int height, int width,
+                                uint8_t *oy, int oy_stride,
+                                uint8_t *ou, uint8_t *ov, int ouv_stride,
+                                int oheight, int owidth, int bd) {
+  vp9_highbd_resize_plane(y, height, width, y_stride,
+                          oy, oheight, owidth, oy_stride, bd);
+  vp9_highbd_resize_plane(u, height, width, uv_stride,
+                          ou, oheight, owidth, ouv_stride, bd);
+  vp9_highbd_resize_plane(v, height, width, uv_stride,
+                          ov, oheight, owidth, ouv_stride, bd);
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
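highbd_resize_multistep follows the same schedule as the existing 8-bit path:
halve the signal with the symmetric down2 filters for as long as the halved
length still reaches the target, then finish with the generic interpolator
(skipped when a down2 pass lands exactly on the target). A self-contained
sketch of that schedule, assuming get_down2_length(n, 1) == (n + 1) >> 1 and
a >= comparison in get_down2_steps, as in the 8-bit code (helper names here
are hypothetical):

    #include <stdio.h>

    static int down2_length(int n) { return (n + 1) >> 1; }

    static int down2_steps(int in_len, int out_len) {
      int steps = 0;
      while (down2_length(in_len) >= out_len) {
        in_len = down2_length(in_len);
        ++steps;
      }
      return steps;
    }

    int main(void) {
      printf("%d\n", down2_steps(1920, 540));  /* 1: 1920->960, then 960->540 */
      printf("%d\n", down2_steps(1920, 480));  /* 2: 1920->960->480, exact */
      return 0;
    }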
diff --git a/source/libvpx/vp9/encoder/vp9_resize.h b/source/libvpx/vp9/encoder/vp9_resize.h
index 1818cd4..067af53 100644
--- a/source/libvpx/vp9/encoder/vp9_resize.h
+++ b/source/libvpx/vp9/encoder/vp9_resize.h
@@ -65,4 +65,60 @@
                          int oheight,
                          int owidth);
 
+#if CONFIG_VP9_HIGHBITDEPTH
+void vp9_highbd_resize_plane(const uint8_t *const input,
+                             int height,
+                             int width,
+                             int in_stride,
+                             uint8_t *output,
+                             int height2,
+                             int width2,
+                             int out_stride,
+                             int bd);
+void vp9_highbd_resize_frame420(const uint8_t *const y,
+                                int y_stride,
+                                const uint8_t *const u,
+                                const uint8_t *const v,
+                                int uv_stride,
+                                int height,
+                                int width,
+                                uint8_t *oy,
+                                int oy_stride,
+                                uint8_t *ou,
+                                uint8_t *ov,
+                                int ouv_stride,
+                                int oheight,
+                                int owidth,
+                                int bd);
+void vp9_highbd_resize_frame422(const uint8_t *const y,
+                                int y_stride,
+                                const uint8_t *const u,
+                                const uint8_t *const v,
+                                int uv_stride,
+                                int height,
+                                int width,
+                                uint8_t *oy,
+                                int oy_stride,
+                                uint8_t *ou,
+                                uint8_t *ov,
+                                int ouv_stride,
+                                int oheight,
+                                int owidth,
+                                int bd);
+void vp9_highbd_resize_frame444(const uint8_t *const y,
+                                int y_stride,
+                                const uint8_t *const u,
+                                const uint8_t *const v,
+                                int uv_stride,
+                                int height,
+                                int width,
+                                uint8_t *oy,
+                                int oy_stride,
+                                uint8_t *ou,
+                                uint8_t *ov,
+                                int ouv_stride,
+                                int oheight,
+                                int owidth,
+                                int bd);
+#endif    // CONFIG_VP9_HIGHBITDEPTH
 #endif    // VP9_ENCODER_VP9_RESIZE_H_
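A hedged usage sketch for the new plane-level entry point (not part of this
patch): the uint8_t pointers follow libvpx's CONVERT_TO_BYTEPTR convention for
high-bit-depth buffers, i.e. they alias uint16_t sample arrays, and strides
are counted in samples, as the CONVERT_TO_SHORTPTR calls in vp9_resize.c show.

    /* Halve a 10-bit 1920x1080 luma plane; src/dst are CONVERT_TO_BYTEPTR'd
     * uint16_t buffers, strides in samples. */
    vp9_highbd_resize_plane(src, 1080, 1920, src_stride,
                            dst, 540, 960, dst_stride, 10);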
diff --git a/source/libvpx/vp9/encoder/vp9_sad.c b/source/libvpx/vp9/encoder/vp9_sad.c
index cee6ce1..73134f2 100644
--- a/source/libvpx/vp9/encoder/vp9_sad.c
+++ b/source/libvpx/vp9/encoder/vp9_sad.c
@@ -32,7 +32,6 @@
     a += a_stride;
     b += b_stride;
   }
-
   return sad;
 }
 
@@ -136,9 +135,9 @@
 sadMxNx4D(4, 4)
 
 #if CONFIG_VP9_HIGHBITDEPTH
-static INLINE unsigned int high_sad(const uint8_t *a8, int a_stride,
-                                    const uint8_t *b8, int b_stride,
-                                    int width, int height) {
+static INLINE unsigned int highbd_sad(const uint8_t *a8, int a_stride,
+                                      const uint8_t *b8, int b_stride,
+                                      int width, int height) {
   int y, x;
   unsigned int sad = 0;
   const uint16_t *a = CONVERT_TO_SHORTPTR(a8);
@@ -153,9 +152,9 @@
   return sad;
 }
 
-static INLINE unsigned int high_sadb(const uint8_t *a8, int a_stride,
-                                     const uint16_t *b, int b_stride,
-                                     int width, int height) {
+static INLINE unsigned int highbd_sadb(const uint8_t *a8, int a_stride,
+                                       const uint16_t *b, int b_stride,
+                                       int width, int height) {
   int y, x;
   unsigned int sad = 0;
   const uint16_t *a = CONVERT_TO_SHORTPTR(a8);
@@ -169,103 +168,109 @@
   return sad;
 }
 
-#define high_sadMxN(m, n) \
-unsigned int vp9_high_sad##m##x##n##_c(const uint8_t *src, int src_stride, \
-                                       const uint8_t *ref, int ref_stride) { \
-  return high_sad(src, src_stride, ref, ref_stride, m, n); \
+#define highbd_sadMxN(m, n) \
+unsigned int vp9_highbd_sad##m##x##n##_c(const uint8_t *src, int src_stride, \
+                                         const uint8_t *ref, int ref_stride) { \
+  return highbd_sad(src, src_stride, ref, ref_stride, m, n); \
 } \
-unsigned int vp9_high_sad##m##x##n##_avg_c(const uint8_t *src, int src_stride, \
-                                           const uint8_t *ref, int ref_stride, \
-                                           const uint8_t *second_pred) { \
+unsigned int vp9_highbd_sad##m##x##n##_avg_c(const uint8_t *src, \
+                                             int src_stride, \
+                                             const uint8_t *ref, \
+                                             int ref_stride, \
+                                             const uint8_t *second_pred) { \
   uint16_t comp_pred[m * n]; \
-  vp9_high_comp_avg_pred(comp_pred, second_pred, m, n, ref, ref_stride); \
-  return high_sadb(src, src_stride, comp_pred, m, m, n); \
+  vp9_highbd_comp_avg_pred(comp_pred, second_pred, m, n, ref, ref_stride); \
+  return highbd_sadb(src, src_stride, comp_pred, m, m, n); \
 }
 
-#define high_sadMxNxK(m, n, k) \
-void vp9_high_sad##m##x##n##x##k##_c(const uint8_t *src, int src_stride, \
-                                     const uint8_t *ref, int ref_stride, \
-                                     unsigned int *sads) { \
+#define highbd_sadMxNxK(m, n, k) \
+void vp9_highbd_sad##m##x##n##x##k##_c(const uint8_t *src, int src_stride, \
+                                       const uint8_t *ref, int ref_stride, \
+                                       unsigned int *sads) { \
   int i; \
-  for (i = 0; i < k; ++i) \
-    sads[i] = vp9_high_sad##m##x##n##_c(src, src_stride, &ref[i], ref_stride); \
+  for (i = 0; i < k; ++i) { \
+    sads[i] = vp9_highbd_sad##m##x##n##_c(src, src_stride, &ref[i], \
+                                          ref_stride); \
+  } \
 }
 
-#define high_sadMxNx4D(m, n) \
-void vp9_high_sad##m##x##n##x4d_c(const uint8_t *src, int src_stride, \
-                                  const uint8_t *const refs[], \
-                                  int ref_stride, unsigned int *sads) { \
+#define highbd_sadMxNx4D(m, n) \
+void vp9_highbd_sad##m##x##n##x4d_c(const uint8_t *src, int src_stride, \
+                                    const uint8_t *const refs[], \
+                                    int ref_stride, unsigned int *sads) { \
   int i; \
-  for (i = 0; i < 4; ++i) \
-    sads[i] = vp9_high_sad##m##x##n##_c(src, src_stride, refs[i], ref_stride); \
+  for (i = 0; i < 4; ++i) { \
+    sads[i] = vp9_highbd_sad##m##x##n##_c(src, src_stride, refs[i], \
+                                          ref_stride); \
+  } \
 }
 
 // 64x64
-high_sadMxN(64, 64)
-high_sadMxNxK(64, 64, 3)
-high_sadMxNxK(64, 64, 8)
-high_sadMxNx4D(64, 64)
+highbd_sadMxN(64, 64)
+highbd_sadMxNxK(64, 64, 3)
+highbd_sadMxNxK(64, 64, 8)
+highbd_sadMxNx4D(64, 64)
 
 // 64x32
-high_sadMxN(64, 32)
-high_sadMxNx4D(64, 32)
+highbd_sadMxN(64, 32)
+highbd_sadMxNx4D(64, 32)
 
 // 32x64
-high_sadMxN(32, 64)
-high_sadMxNx4D(32, 64)
+highbd_sadMxN(32, 64)
+highbd_sadMxNx4D(32, 64)
 
 // 32x32
-high_sadMxN(32, 32)
-high_sadMxNxK(32, 32, 3)
-high_sadMxNxK(32, 32, 8)
-high_sadMxNx4D(32, 32)
+highbd_sadMxN(32, 32)
+highbd_sadMxNxK(32, 32, 3)
+highbd_sadMxNxK(32, 32, 8)
+highbd_sadMxNx4D(32, 32)
 
 // 32x16
-high_sadMxN(32, 16)
-high_sadMxNx4D(32, 16)
+highbd_sadMxN(32, 16)
+highbd_sadMxNx4D(32, 16)
 
 // 16x32
-high_sadMxN(16, 32)
-high_sadMxNx4D(16, 32)
+highbd_sadMxN(16, 32)
+highbd_sadMxNx4D(16, 32)
 
 // 16x16
-high_sadMxN(16, 16)
-high_sadMxNxK(16, 16, 3)
-high_sadMxNxK(16, 16, 8)
-high_sadMxNx4D(16, 16)
+highbd_sadMxN(16, 16)
+highbd_sadMxNxK(16, 16, 3)
+highbd_sadMxNxK(16, 16, 8)
+highbd_sadMxNx4D(16, 16)
 
 // 16x8
-high_sadMxN(16, 8)
-high_sadMxNxK(16, 8, 3)
-high_sadMxNxK(16, 8, 8)
-high_sadMxNx4D(16, 8)
+highbd_sadMxN(16, 8)
+highbd_sadMxNxK(16, 8, 3)
+highbd_sadMxNxK(16, 8, 8)
+highbd_sadMxNx4D(16, 8)
 
 // 8x16
-high_sadMxN(8, 16)
-high_sadMxNxK(8, 16, 3)
-high_sadMxNxK(8, 16, 8)
-high_sadMxNx4D(8, 16)
+highbd_sadMxN(8, 16)
+highbd_sadMxNxK(8, 16, 3)
+highbd_sadMxNxK(8, 16, 8)
+highbd_sadMxNx4D(8, 16)
 
 // 8x8
-high_sadMxN(8, 8)
-high_sadMxNxK(8, 8, 3)
-high_sadMxNxK(8, 8, 8)
-high_sadMxNx4D(8, 8)
+highbd_sadMxN(8, 8)
+highbd_sadMxNxK(8, 8, 3)
+highbd_sadMxNxK(8, 8, 8)
+highbd_sadMxNx4D(8, 8)
 
 // 8x4
-high_sadMxN(8, 4)
-high_sadMxNxK(8, 4, 8)
-high_sadMxNx4D(8, 4)
+highbd_sadMxN(8, 4)
+highbd_sadMxNxK(8, 4, 8)
+highbd_sadMxNx4D(8, 4)
 
 // 4x8
-high_sadMxN(4, 8)
-high_sadMxNxK(4, 8, 8)
-high_sadMxNx4D(4, 8)
+highbd_sadMxN(4, 8)
+highbd_sadMxNxK(4, 8, 8)
+highbd_sadMxNx4D(4, 8)
 
 // 4x4
-high_sadMxN(4, 4)
-high_sadMxNxK(4, 4, 3)
-high_sadMxNxK(4, 4, 8)
-high_sadMxNx4D(4, 4)
+highbd_sadMxN(4, 4)
+highbd_sadMxNxK(4, 4, 3)
+highbd_sadMxNxK(4, 4, 8)
+highbd_sadMxNx4D(4, 4)
 
 #endif  // CONFIG_VP9_HIGHBITDEPTH
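Since the token-pasting macros can be hard to read after the rename, this is
what highbd_sadMxN(8, 8) expands to (written out by hand, line continuations
removed):

    unsigned int vp9_highbd_sad8x8_c(const uint8_t *src, int src_stride,
                                     const uint8_t *ref, int ref_stride) {
      return highbd_sad(src, src_stride, ref, ref_stride, 8, 8);
    }
    unsigned int vp9_highbd_sad8x8_avg_c(const uint8_t *src, int src_stride,
                                         const uint8_t *ref, int ref_stride,
                                         const uint8_t *second_pred) {
      uint16_t comp_pred[8 * 8];
      vp9_highbd_comp_avg_pred(comp_pred, second_pred, 8, 8, ref, ref_stride);
      return highbd_sadb(src, src_stride, comp_pred, 8, 8, 8);
    }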
diff --git a/source/libvpx/vp9/encoder/vp9_speed_features.c b/source/libvpx/vp9/encoder/vp9_speed_features.c
index 52e9a8e..9e3ee2c 100644
--- a/source/libvpx/vp9/encoder/vp9_speed_features.c
+++ b/source/libvpx/vp9/encoder/vp9_speed_features.c
@@ -59,7 +59,7 @@
       sf->partition_search_breakout_dist_thr = (1 << 23);
     else
       sf->partition_search_breakout_dist_thr = (1 << 21);
-    sf->partition_search_breakout_rate_thr = 500;
+    sf->partition_search_breakout_rate_thr = 80;
   }
 
   if (speed >= 2) {
@@ -70,8 +70,12 @@
       sf->disable_split_mask = cm->show_frame ? DISABLE_ALL_SPLIT
                                               : DISABLE_ALL_INTER_SPLIT;
       sf->adaptive_pred_interp_filter = 0;
+      sf->partition_search_breakout_dist_thr = (1 << 24);
+      sf->partition_search_breakout_rate_thr = 120;
     } else {
       sf->disable_split_mask = LAST_AND_INTRA_SPLIT_ONLY;
+      sf->partition_search_breakout_dist_thr = (1 << 22);
+      sf->partition_search_breakout_rate_thr = 100;
     }
 
     sf->reference_masking = 1;
@@ -83,11 +87,7 @@
     sf->comp_inter_joint_search_thresh = BLOCK_SIZES;
     sf->auto_min_max_partition_size = CONSTRAIN_NEIGHBORING_MIN_MAX;
 
-    if (MIN(cm->width, cm->height) >= 720)
-      sf->partition_search_breakout_dist_thr = (1 << 24);
-    else
-      sf->partition_search_breakout_dist_thr = (1 << 22);
-    sf->partition_search_breakout_rate_thr = 700;
+    sf->allow_partition_search_skip = 1;
   }
 
   if (speed >= 3) {
@@ -96,29 +96,27 @@
     if (MIN(cm->width, cm->height) >= 720) {
       sf->disable_split_mask = DISABLE_ALL_SPLIT;
       sf->schedule_mode_search = cm->base_qindex < 220 ? 1 : 0;
+      sf->partition_search_breakout_dist_thr = (1 << 25);
+      sf->partition_search_breakout_rate_thr = 200;
     } else {
       sf->max_intra_bsize = BLOCK_32X32;
       sf->disable_split_mask = DISABLE_ALL_INTER_SPLIT;
       sf->schedule_mode_search = cm->base_qindex < 175 ? 1 : 0;
+      sf->partition_search_breakout_dist_thr = (1 << 23);
+      sf->partition_search_breakout_rate_thr = 120;
     }
+    sf->mv.subpel_search_method = SUBPEL_TREE_PRUNED;
     sf->adaptive_pred_interp_filter = 0;
     sf->adaptive_mode_search = 1;
     sf->cb_partition_search = !boosted;
     sf->cb_pred_filter_search = 1;
     sf->alt_ref_search_fp = 1;
-    sf->motion_field_mode_search = !boosted;
     sf->recode_loop = ALLOW_RECODE_KFMAXBW;
     sf->adaptive_rd_thresh = 3;
     sf->mode_skip_start = 6;
     sf->intra_y_mode_mask[TX_32X32] = INTRA_DC;
     sf->intra_uv_mode_mask[TX_32X32] = INTRA_DC;
     sf->adaptive_interp_filter_search = 1;
-
-    if (MIN(cm->width, cm->height) >= 720)
-      sf->partition_search_breakout_dist_thr = (1 << 25);
-    else
-      sf->partition_search_breakout_dist_thr = (1 << 23);
-    sf->partition_search_breakout_rate_thr = 1000;
   }
 
   if (speed >= 4) {
@@ -126,19 +124,20 @@
     sf->tx_size_search_method = USE_LARGESTALL;
     sf->disable_split_mask = DISABLE_ALL_SPLIT;
     sf->mv.search_method = BIGDIA;
-    sf->mv.subpel_search_method = SUBPEL_TREE_PRUNED;
+    sf->mv.subpel_search_method = SUBPEL_TREE_PRUNED_MORE;
     sf->adaptive_rd_thresh = 4;
     sf->mode_search_skip_flags |= FLAG_EARLY_TERMINATE;
     sf->disable_filter_search_var_thresh = 200;
     sf->use_lp32x32fdct = 1;
     sf->use_fast_coef_updates = ONE_LOOP_REDUCED;
     sf->use_fast_coef_costing = 1;
+    sf->motion_field_mode_search = !boosted;
 
     if (MIN(cm->width, cm->height) >= 720)
       sf->partition_search_breakout_dist_thr = (1 << 26);
     else
       sf->partition_search_breakout_dist_thr = (1 << 24);
-    sf->partition_search_breakout_rate_thr = 1500;
+    sf->partition_search_breakout_rate_thr = 300;
   }
 
   if (speed >= 5) {
@@ -152,7 +151,6 @@
       sf->intra_y_mode_mask[i] = INTRA_DC;
       sf->intra_uv_mode_mask[i] = INTRA_DC;
     }
-    cpi->allow_encode_breakout = ENCODE_BREAKOUT_ENABLED;
   }
   if (speed >= 6) {
     sf->mv.reduce_first_step_size = 1;
@@ -250,6 +248,7 @@
     sf->intra_y_mode_mask[TX_32X32] = INTRA_DC;
     sf->frame_parameter_update = 0;
     sf->mv.search_method = FAST_HEX;
+
     sf->inter_mode_mask[BLOCK_32X32] = INTER_NEAREST_NEAR_NEW;
     sf->inter_mode_mask[BLOCK_32X64] = INTER_NEAREST;
     sf->inter_mode_mask[BLOCK_64X32] = INTER_NEAREST;
@@ -272,6 +271,16 @@
     sf->partition_search_type = REFERENCE_PARTITION;
     sf->use_nonrd_pick_mode = 1;
     sf->allow_skip_recode = 0;
+    sf->inter_mode_mask[BLOCK_32X32] = INTER_NEAREST_NEW_ZERO;
+    sf->inter_mode_mask[BLOCK_32X64] = INTER_NEAREST_NEW_ZERO;
+    sf->inter_mode_mask[BLOCK_64X32] = INTER_NEAREST_NEW_ZERO;
+    sf->inter_mode_mask[BLOCK_64X64] = INTER_NEAREST_NEW_ZERO;
+
+    if (MIN(cm->width, cm->height) >= 720)
+      sf->partition_search_breakout_dist_thr = (1 << 25);
+    else
+      sf->partition_search_breakout_dist_thr = (1 << 23);
+    sf->partition_search_breakout_rate_thr = 200;
   }
 
   if (speed >= 6) {
@@ -279,12 +288,13 @@
       int i;
       // Allow fancy modes at all sizes since SOURCE_VAR_BASED_PARTITION is used
       for (i = 0; i < BLOCK_SIZES; ++i)
-        sf->inter_mode_mask[i] = INTER_ALL;
+        sf->inter_mode_mask[i] = INTER_NEAREST_NEAR_NEW;
     }
 
     // Adaptively switch between SOURCE_VAR_BASED_PARTITION and FIXED_PARTITION.
-    sf->partition_search_type = SOURCE_VAR_BASED_PARTITION;
+    sf->partition_search_type = VAR_BASED_PARTITION;
     sf->search_type_check_frequency = 50;
+    sf->mv.search_method = NSTEP;
 
     sf->tx_size_search_method = is_keyframe ? USE_LARGESTALL : USE_TX_8X8;
 
@@ -292,7 +302,7 @@
     sf->reuse_inter_pred_sby = 1;
 
     // Increase mode checking threshold for NEWMV.
-    sf->elevate_newmv_thresh = 2000;
+    sf->elevate_newmv_thresh = 1000;
 
     sf->mv.reduce_first_step_size = 1;
   }
@@ -365,6 +375,7 @@
   sf->max_delta_qindex = 0;
   sf->disable_filter_search_var_thresh = 0;
   sf->adaptive_interp_filter_search = 0;
+  sf->allow_partition_search_skip = 0;
 
   for (i = 0; i < TX_SIZES; i++) {
     sf->intra_y_mode_mask[i] = INTRA_ALL;
@@ -423,6 +434,10 @@
     cpi->find_fractional_mv_step = vp9_find_best_sub_pixel_tree;
   } else if (sf->mv.subpel_search_method == SUBPEL_TREE_PRUNED) {
     cpi->find_fractional_mv_step = vp9_find_best_sub_pixel_tree_pruned;
+  } else if (sf->mv.subpel_search_method == SUBPEL_TREE_PRUNED_MORE) {
+    cpi->find_fractional_mv_step = vp9_find_best_sub_pixel_tree_pruned_more;
+  } else if (sf->mv.subpel_search_method == SUBPEL_TREE_PRUNED_EVENMORE) {
+    cpi->find_fractional_mv_step = vp9_find_best_sub_pixel_tree_pruned_evenmore;
   }
 
   cpi->mb.optimize = sf->optimize_coefficients == 1 && oxcf->pass != 1;
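The partition-search breakout thresholds now sit inside each speed level's
resolution branch, with much lower rate thresholds than the old
500/700/1000/1500 sequence. Summarizing the new values (first figure for
MIN(cm->width, cm->height) >= 720, second otherwise):

    speed >= 1:  dist_thr 1<<23 / 1<<21   rate_thr  80
    speed >= 2:  dist_thr 1<<24 / 1<<22   rate_thr 120 / 100
    speed >= 3:  dist_thr 1<<25 / 1<<23   rate_thr 200 / 120
    speed >= 4:  dist_thr 1<<26 / 1<<24   rate_thr 300

The real-time REFERENCE_PARTITION block added above likewise sets
1<<25 / 1<<23 with a rate threshold of 200.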
diff --git a/source/libvpx/vp9/encoder/vp9_speed_features.h b/source/libvpx/vp9/encoder/vp9_speed_features.h
index ed84008..951b4af 100644
--- a/source/libvpx/vp9/encoder/vp9_speed_features.h
+++ b/source/libvpx/vp9/encoder/vp9_speed_features.h
@@ -34,6 +34,9 @@
 enum {
   INTER_ALL = (1 << NEARESTMV) | (1 << NEARMV) | (1 << ZEROMV) | (1 << NEWMV),
   INTER_NEAREST = (1 << NEARESTMV),
+  INTER_NEAREST_NEW = (1 << NEARESTMV) | (1 << NEWMV),
+  INTER_NEAREST_ZERO = (1 << NEARESTMV) | (1 << ZEROMV),
+  INTER_NEAREST_NEW_ZERO = (1 << NEARESTMV) | (1 << ZEROMV) | (1 << NEWMV),
   INTER_NEAREST_NEAR_NEW = (1 << NEARESTMV) | (1 << NEARMV) | (1 << NEWMV),
   INTER_NEAREST_NEAR_ZERO = (1 << NEARESTMV) | (1 << NEARMV) | (1 << ZEROMV),
 };
@@ -78,7 +81,9 @@
 
 typedef enum {
   SUBPEL_TREE = 0,
-  SUBPEL_TREE_PRUNED = 1,
+  SUBPEL_TREE_PRUNED = 1,           // Prunes 1/2-pel searches
+  SUBPEL_TREE_PRUNED_MORE = 2,      // Prunes 1/2-pel searches more aggressively
+  SUBPEL_TREE_PRUNED_EVENMORE = 3,  // Prunes 1/2- and 1/4-pel searches
   // Other methods to come
 } SUBPEL_SEARCH_METHODS;
 
@@ -144,16 +149,12 @@
 
 typedef enum {
   // Search partitions using RD/NONRD criterion
-  SEARCH_PARTITION = 0,
+  SEARCH_PARTITION,
 
   // Always use a fixed size partition
-  FIXED_PARTITION = 1,
+  FIXED_PARTITION,
 
-  // Use a fixed size partition in every 64X64 SB, where the size is
-  // determined based on source variance
-  VAR_BASED_FIXED_PARTITION = 2,
-
-  REFERENCE_PARTITION = 3,
+  REFERENCE_PARTITION,
 
   // Use an arbitrary partitioning scheme based on source variance within
   // a 64X64 SB
@@ -435,6 +436,9 @@
   // Partition search early breakout thresholds.
   int64_t partition_search_breakout_dist_thr;
   int partition_search_breakout_rate_thr;
+
+  // Allow skipping partition search for still-image frames
+  int allow_partition_search_skip;
 } SPEED_FEATURES;
 
 struct VP9_COMP;
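The new INTER_NEAREST_* values are per-block-size bitmasks over
PREDICTION_MODE; a hedged sketch of the test applied wherever the mask is
consulted (illustrative, not a line from this patch):

    /* Skip any inter mode whose bit is cleared for this block size. */
    if (!((1 << this_mode) & sf->inter_mode_mask[bsize]))
      continue;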
diff --git a/source/libvpx/vp9/encoder/vp9_ssim.c b/source/libvpx/vp9/encoder/vp9_ssim.c
index 8435640..5dbfbf5 100644
--- a/source/libvpx/vp9/encoder/vp9_ssim.c
+++ b/source/libvpx/vp9/encoder/vp9_ssim.c
@@ -43,6 +43,24 @@
   }
 }
 
+#if CONFIG_VP9_HIGHBITDEPTH
+void vp9_highbd_ssim_parms_8x8_c(uint16_t *s, int sp, uint16_t *r, int rp,
+                                 uint32_t *sum_s, uint32_t *sum_r,
+                                 uint32_t *sum_sq_s, uint32_t *sum_sq_r,
+                                 uint32_t *sum_sxr) {
+  int i, j;
+  for (i = 0; i < 8; i++, s += sp, r += rp) {
+    for (j = 0; j < 8; j++) {
+      *sum_s += s[j];
+      *sum_r += r[j];
+      *sum_sq_s += s[j] * s[j];
+      *sum_sq_r += r[j] * r[j];
+      *sum_sxr += s[j] * r[j];
+    }
+  }
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
 static const int64_t cc1 =  26634;  // 64^2*(.01*255)^2
 static const int64_t cc2 = 239708;  // 64^2*(.03*255)^2
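For reference, both constants check out numerically as 64^2-window scalings of
the usual SSIM stabilizers (K1*L)^2 and (K2*L)^2 with L = 255:

    0.01 * 255 = 2.55,  2.55^2 =  6.5025,  4096 *  6.5025 =  26634.24 -> 26634
    0.03 * 255 = 7.65,  7.65^2 = 58.5225,  4096 * 58.5225 = 239708.16 -> 239708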
 
@@ -73,6 +91,22 @@
   return similarity(sum_s, sum_r, sum_sq_s, sum_sq_r, sum_sxr, 64);
 }
 
+#if CONFIG_VP9_HIGHBITDEPTH
+static double highbd_ssim_8x8(uint16_t *s, int sp, uint16_t *r, int rp,
+                              unsigned int bd) {
+  uint32_t sum_s = 0, sum_r = 0, sum_sq_s = 0, sum_sq_r = 0, sum_sxr = 0;
+  const int oshift = bd - 8;
+  vp9_highbd_ssim_parms_8x8(s, sp, r, rp, &sum_s, &sum_r, &sum_sq_s, &sum_sq_r,
+                            &sum_sxr);
+  return similarity(sum_s >> oshift,
+                    sum_r >> oshift,
+                    sum_sq_s >> (2 * oshift),
+                    sum_sq_r >> (2 * oshift),
+                    sum_sxr >> (2 * oshift),
+                    64);
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
 // We are using an 8x8 moving window with the starting location of each
 // 8x8 window on the 4x4 pixel grid. Such an arrangement allows the
 // windows to overlap block boundaries to penalize blocking artifacts.
@@ -94,6 +128,31 @@
   ssim_total /= samples;
   return ssim_total;
 }
+
+#if CONFIG_VP9_HIGHBITDEPTH
+double vp9_highbd_ssim2(uint8_t *img1, uint8_t *img2, int stride_img1,
+                        int stride_img2, int width, int height,
+                        unsigned int bd) {
+  int i, j;
+  int samples = 0;
+  double ssim_total = 0;
+
+  // Sample points start at each 4x4 location.
+  for (i = 0; i <= height - 8;
+       i += 4, img1 += stride_img1 * 4, img2 += stride_img2 * 4) {
+    for (j = 0; j <= width - 8; j += 4) {
+      double v = highbd_ssim_8x8(CONVERT_TO_SHORTPTR(img1 + j), stride_img1,
+                                 CONVERT_TO_SHORTPTR(img2 + j), stride_img2,
+                                 bd);
+      ssim_total += v;
+      samples++;
+    }
+  }
+  ssim_total /= samples;
+  return ssim_total;
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
 double vp9_calc_ssim(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *dest,
                      double *weight) {
   double a, b, c;
@@ -141,3 +200,55 @@
 
   return ssim_all;
 }
+
+#if CONFIG_VP9_HIGHBITDEPTH
+double vp9_highbd_calc_ssim(YV12_BUFFER_CONFIG *source,
+                            YV12_BUFFER_CONFIG *dest,
+                            double *weight, unsigned int bd) {
+  double a, b, c;
+  double ssimv;
+
+  a = vp9_highbd_ssim2(source->y_buffer, dest->y_buffer,
+                       source->y_stride, dest->y_stride,
+                       source->y_crop_width, source->y_crop_height, bd);
+
+  b = vp9_highbd_ssim2(source->u_buffer, dest->u_buffer,
+                       source->uv_stride, dest->uv_stride,
+                       source->uv_crop_width, source->uv_crop_height, bd);
+
+  c = vp9_highbd_ssim2(source->v_buffer, dest->v_buffer,
+                       source->uv_stride, dest->uv_stride,
+                       source->uv_crop_width, source->uv_crop_height, bd);
+
+  ssimv = a * .8 + .1 * (b + c);
+
+  *weight = 1;
+
+  return ssimv;
+}
+
+double vp9_highbd_calc_ssimg(YV12_BUFFER_CONFIG *source,
+                             YV12_BUFFER_CONFIG *dest, double *ssim_y,
+                             double *ssim_u, double *ssim_v, unsigned int bd) {
+  double ssim_all = 0;
+  double a, b, c;
+
+  a = vp9_highbd_ssim2(source->y_buffer, dest->y_buffer,
+                       source->y_stride, dest->y_stride,
+                       source->y_crop_width, source->y_crop_height, bd);
+
+  b = vp9_highbd_ssim2(source->u_buffer, dest->u_buffer,
+                       source->uv_stride, dest->uv_stride,
+                       source->uv_crop_width, source->uv_crop_height, bd);
+
+  c = vp9_highbd_ssim2(source->v_buffer, dest->v_buffer,
+                       source->uv_stride, dest->uv_stride,
+                       source->uv_crop_width, source->uv_crop_height, bd);
+  *ssim_y = a;
+  *ssim_u = b;
+  *ssim_v = c;
+  ssim_all = (a * 4 + b + c) / 6;
+
+  return ssim_all;
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
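One subtlety in highbd_ssim_8x8 above: relative to 8-bit data, the linear sums
grow by 2^(bd-8) and the squared sums by 2^(2*(bd-8)), so shifting by oshift
and 2*oshift respectively brings every moment back to 8-bit range before the
existing cc1/cc2 constants are reused. A tiny standalone check of that scaling
(it assumes a sample that is an exact left-shift of an 8-bit value; real
content only approximates this):

    #include <assert.h>

    int main(void) {
      const unsigned bd = 10, oshift = bd - 8;
      const unsigned v8 = 200, v10 = v8 << oshift;       /* same sample, 10 bits */
      assert((v10 >> oshift) == v8);                     /* linear moments */
      assert(((v10 * v10) >> (2 * oshift)) == v8 * v8);  /* squared moments */
      return 0;
    }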
diff --git a/source/libvpx/vp9/encoder/vp9_ssim.h b/source/libvpx/vp9/encoder/vp9_ssim.h
index d1dd1b7..e75623b 100644
--- a/source/libvpx/vp9/encoder/vp9_ssim.h
+++ b/source/libvpx/vp9/encoder/vp9_ssim.h
@@ -23,6 +23,20 @@
 double vp9_calc_ssimg(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *dest,
                       double *ssim_y, double *ssim_u, double *ssim_v);
 
+#if CONFIG_VP9_HIGHBITDEPTH
+double vp9_highbd_calc_ssim(YV12_BUFFER_CONFIG *source,
+                            YV12_BUFFER_CONFIG *dest,
+                            double *weight,
+                            unsigned int bd);
+
+double vp9_highbd_calc_ssimg(YV12_BUFFER_CONFIG *source,
+                             YV12_BUFFER_CONFIG *dest,
+                             double *ssim_y,
+                             double *ssim_u,
+                             double *ssim_v,
+                             unsigned int bd);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/source/libvpx/vp9/encoder/vp9_svc_layercontext.c b/source/libvpx/vp9/encoder/vp9_svc_layercontext.c
index eed681c..1573557 100644
--- a/source/libvpx/vp9/encoder/vp9_svc_layercontext.c
+++ b/source/libvpx/vp9/encoder/vp9_svc_layercontext.c
@@ -14,6 +14,8 @@
 #include "vp9/encoder/vp9_svc_layercontext.h"
 #include "vp9/encoder/vp9_extend.h"
 
+#define SMALL_FRAME_FB_IDX 7
+
 void vp9_init_layer_context(VP9_COMP *const cpi) {
   SVC *const svc = &cpi->svc;
   const VP9EncoderConfig *const oxcf = &cpi->oxcf;
@@ -28,6 +30,25 @@
     layer_end = svc->number_temporal_layers;
   } else {
     layer_end = svc->number_spatial_layers;
+
+    if (cpi->oxcf.error_resilient_mode == 0 && cpi->oxcf.pass == 2) {
+      if (vp9_realloc_frame_buffer(&cpi->svc.empty_frame.img,
+                                   cpi->common.width, cpi->common.height,
+                                   cpi->common.subsampling_x,
+                                   cpi->common.subsampling_y,
+#if CONFIG_VP9_HIGHBITDEPTH
+                                   cpi->common.use_highbitdepth,
+#endif
+                                   VP9_ENC_BORDER_IN_PIXELS, NULL, NULL, NULL))
+        vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR,
+                           "Failed to allocate empty frame for multiple frame "
+                           "contexts");
+
+      vpx_memset(cpi->svc.empty_frame.img.buffer_alloc, 0x80,
+                 cpi->svc.empty_frame.img.buffer_alloc_sz);
+      cpi->svc.empty_frame_width = cpi->common.width;
+      cpi->svc.empty_frame_height = cpi->common.height;
+    }
   }
 
   for (layer = 0; layer < layer_end; ++layer) {
@@ -310,6 +331,47 @@
   get_layer_resolution(cpi->oxcf.width, cpi->oxcf.height,
                        lc->scaling_factor_num, lc->scaling_factor_den,
                        &width, &height);
+
+  // Workaround for multiple frame contexts. In some frames we can't use
+  // prev_mi because its previous frame may be changed during decoding time.
+  // The idea is to put an empty invisible frame in front of them; then we
+  // will not use prev_mi when encoding these frames.
+  if (cpi->oxcf.error_resilient_mode == 0 && cpi->oxcf.pass == 2 &&
+      cpi->svc.encode_empty_frame_state == NEED_TO_ENCODE) {
+    if ((cpi->svc.number_temporal_layers > 1 &&
+         cpi->svc.temporal_layer_id < cpi->svc.number_temporal_layers - 1) ||
+        (cpi->svc.number_spatial_layers > 1 &&
+         cpi->svc.spatial_layer_id == 0)) {
+      struct lookahead_entry *buf = vp9_lookahead_peek(cpi->lookahead, 0);
+
+      if (buf != NULL) {
+        cpi->svc.empty_frame.ts_start = buf->ts_start;
+        cpi->svc.empty_frame.ts_end = buf->ts_end;
+        cpi->svc.encode_empty_frame_state = ENCODING;
+        cpi->common.show_frame = 0;
+        cpi->ref_frame_flags = 0;
+        cpi->common.frame_type = INTER_FRAME;
+        cpi->lst_fb_idx =
+            cpi->gld_fb_idx = cpi->alt_fb_idx = SMALL_FRAME_FB_IDX;
+
+        // Gradually make the empty frame smaller to save bits. Make it half of
+        // its previous size because of the scaling factor restriction.
+        cpi->svc.empty_frame_width >>= 1;
+        cpi->svc.empty_frame_width = (cpi->svc.empty_frame_width + 1) & ~1;
+        if (cpi->svc.empty_frame_width < 16)
+          cpi->svc.empty_frame_width = 16;
+
+        cpi->svc.empty_frame_height >>= 1;
+        cpi->svc.empty_frame_height = (cpi->svc.empty_frame_height + 1) & ~1;
+        if (cpi->svc.empty_frame_height < 16)
+          cpi->svc.empty_frame_height = 16;
+
+        width = cpi->svc.empty_frame_width;
+        height = cpi->svc.empty_frame_height;
+      }
+    }
+  }
+
   if (vp9_set_size_literal(cpi, width, height) != 0)
     return VPX_CODEC_INVALID_PARAM;
 
@@ -317,7 +379,6 @@
   cpi->oxcf.best_allowed_q = vp9_quantizer_to_qindex(lc->min_q);
 
   vp9_change_config(cpi, &cpi->oxcf);
-
   vp9_set_high_precision_mv(cpi, 1);
 
   cpi->alt_ref_source = get_layer_context(cpi)->alt_ref_source;
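
The shrink step above halves the empty frame each time, keeps the dimensions
even, and never goes below 16 pixels. A standalone sketch of that update (the
helper name is ours, not libvpx's):

// Halve one dimension of the empty frame: round up to an even value and
// clamp to a 16-pixel minimum, mirroring the logic in the hunk above.
static int shrink_dimension(int dim) {
  dim >>= 1;             // half of its previous size
  dim = (dim + 1) & ~1;  // keep it even
  return dim < 16 ? 16 : dim;
}
// e.g. 1280 -> 640 -> 320 -> 160 -> 80 -> 40 -> 20 -> 16 -> 16 ...
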
diff --git a/source/libvpx/vp9/encoder/vp9_svc_layercontext.h b/source/libvpx/vp9/encoder/vp9_svc_layercontext.h
index 47a5456..e9645ce 100644
--- a/source/libvpx/vp9/encoder/vp9_svc_layercontext.h
+++ b/source/libvpx/vp9/encoder/vp9_svc_layercontext.h
@@ -50,6 +50,16 @@
 
   int spatial_layer_to_encode;
 
+  // Workaround for multiple frame contexts
+  enum {
+    ENCODED = 0,
+    ENCODING,
+    NEED_TO_ENCODE
+  } encode_empty_frame_state;
+  struct lookahead_entry empty_frame;
+  int empty_frame_width;
+  int empty_frame_height;
+
   // Store scaled source frames to be used for temporal filter to generate
  // an alt ref frame.
   YV12_BUFFER_CONFIG scaled_frames[MAX_LAG_BUFFERS];
diff --git a/source/libvpx/vp9/encoder/vp9_temporal_filter.c b/source/libvpx/vp9/encoder/vp9_temporal_filter.c
index 6fd796d..5599227 100644
--- a/source/libvpx/vp9/encoder/vp9_temporal_filter.c
+++ b/source/libvpx/vp9/encoder/vp9_temporal_filter.c
@@ -56,6 +56,34 @@
     mv_precision_uv = MV_PRECISION_Q3;
   }
 
+#if CONFIG_VP9_HIGHBITDEPTH
+  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+    vp9_highbd_build_inter_predictor(y_mb_ptr, stride,
+                                     &pred[0], 16,
+                                     &mv,
+                                     scale,
+                                     16, 16,
+                                     which_mv,
+                                     kernel, MV_PRECISION_Q3, x, y, xd->bd);
+
+    vp9_highbd_build_inter_predictor(u_mb_ptr, uv_stride,
+                                     &pred[256], uv_block_width,
+                                     &mv,
+                                     scale,
+                                     uv_block_width, uv_block_height,
+                                     which_mv,
+                                     kernel, mv_precision_uv, x, y, xd->bd);
+
+    vp9_highbd_build_inter_predictor(v_mb_ptr, uv_stride,
+                                     &pred[512], uv_block_width,
+                                     &mv,
+                                     scale,
+                                     uv_block_width, uv_block_height,
+                                     which_mv,
+                                     kernel, mv_precision_uv, x, y, xd->bd);
+    return;
+  }
+#endif  // CONFIG_VP9_HIGHBITDEPTH
   vp9_build_inter_predictor(y_mb_ptr, stride,
                             &pred[0], 16,
                             &mv,
@@ -133,6 +161,54 @@
   }
 }
 
+#if CONFIG_VP9_HIGHBITDEPTH
+void vp9_highbd_temporal_filter_apply_c(uint8_t *frame1_8,
+                                        unsigned int stride,
+                                        uint8_t *frame2_8,
+                                        unsigned int block_width,
+                                        unsigned int block_height,
+                                        int strength,
+                                        int filter_weight,
+                                        unsigned int *accumulator,
+                                        uint16_t *count) {
+  uint16_t *frame1 = CONVERT_TO_SHORTPTR(frame1_8);
+  uint16_t *frame2 = CONVERT_TO_SHORTPTR(frame2_8);
+  unsigned int i, j, k;
+  int modifier;
+  int byte = 0;
+  const int rounding = strength > 0 ? 1 << (strength - 1) : 0;
+
+  for (i = 0, k = 0; i < block_height; i++) {
+    for (j = 0; j < block_width; j++, k++) {
+      int src_byte = frame1[byte];
+      int pixel_value = *frame2++;
+
+      modifier   = src_byte - pixel_value;
+      // This is an integer approximation of:
+      // float coeff = (3.0 * modifier * modifier) / pow(2, strength);
+      // modifier = (int)roundf(coeff > 16 ? 0 : 16 - coeff);
+      modifier *= modifier;
+      modifier *= 3;
+      modifier += rounding;
+      modifier >>= strength;
+
+      if (modifier > 16)
+        modifier = 16;
+
+      modifier = 16 - modifier;
+      modifier *= filter_weight;
+
+      count[k] += modifier;
+      accumulator[k] += modifier * pixel_value;
+
+      byte++;
+    }
+
+    byte += stride - block_width;
+  }
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
 static int temporal_filter_find_matching_mb_c(VP9_COMP *cpi,
                                               uint8_t *arf_frame_buf,
                                               uint8_t *frame_ptr_buf,
@@ -145,7 +221,7 @@
   int bestsme = INT_MAX;
   int distortion;
   unsigned int sse;
-  int sad_list[5];
+  int cost_list[5];
 
   MV best_ref_mv1 = {0, 0};
   MV best_ref_mv1_full; /* full-pixel value of best_ref_mv1 */
@@ -169,7 +245,7 @@
 
   // Ignore mv costing by sending NULL pointer instead of cost arrays
   vp9_hex_search(x, &best_ref_mv1_full, step_param, sadpb, 1,
-                 cond_sad_list(cpi, sad_list),
+                 cond_cost_list(cpi, cost_list),
                  &cpi->fn_ptr[BLOCK_16X16], 0, &best_ref_mv1, ref_mv);
 
   // Ignore mv costing by sending NULL pointer instead of cost array
@@ -179,7 +255,7 @@
                                          x->errorperbit,
                                          &cpi->fn_ptr[BLOCK_16X16],
                                          0, mv_sf->subpel_iters_per_step,
-                                         cond_sad_list(cpi, sad_list),
+                                         cond_cost_list(cpi, cost_list),
                                          NULL, NULL,
                                          &distortion, &sse, NULL, 0, 0);
 
@@ -209,13 +285,26 @@
   MACROBLOCKD *mbd = &cpi->mb.e_mbd;
   YV12_BUFFER_CONFIG *f = frames[alt_ref_index];
   uint8_t *dst1, *dst2;
+#if CONFIG_VP9_HIGHBITDEPTH
+  DECLARE_ALIGNED_ARRAY(16, uint16_t,  predictor16, 16 * 16 * 3);
+  DECLARE_ALIGNED_ARRAY(16, uint8_t,  predictor8, 16 * 16 * 3);
+  uint8_t *predictor;
+#else
   DECLARE_ALIGNED_ARRAY(16, uint8_t,  predictor, 16 * 16 * 3);
+#endif
   const int mb_uv_height = 16 >> mbd->plane[1].subsampling_y;
   const int mb_uv_width  = 16 >> mbd->plane[1].subsampling_x;
 
   // Save input state
   uint8_t* input_buffer[MAX_MB_PLANE];
   int i;
+#if CONFIG_VP9_HIGHBITDEPTH
+  if (mbd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+    predictor = CONVERT_TO_BYTEPTR(predictor16);
+  } else {
+    predictor = predictor8;
+  }
+#endif
 
   for (i = 0; i < MAX_MB_PLANE; i++)
     input_buffer[i] = mbd->plane[i].pre[0].buf;
@@ -286,6 +375,44 @@
               predictor, scale,
               mb_col * 16, mb_row * 16);
 
+#if CONFIG_VP9_HIGHBITDEPTH
+          if (mbd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+            int adj_strength = strength + 2 * (mbd->bd - 8);
+            // Apply the filter (YUV)
+            vp9_highbd_temporal_filter_apply(f->y_buffer + mb_y_offset,
+                                             f->y_stride,
+                                             predictor, 16, 16, adj_strength,
+                                             filter_weight,
+                                             accumulator, count);
+            vp9_highbd_temporal_filter_apply(f->u_buffer + mb_uv_offset,
+                                             f->uv_stride, predictor + 256,
+                                             mb_uv_width, mb_uv_height,
+                                             adj_strength,
+                                             filter_weight, accumulator + 256,
+                                             count + 256);
+            vp9_highbd_temporal_filter_apply(f->v_buffer + mb_uv_offset,
+                                             f->uv_stride, predictor + 512,
+                                             mb_uv_width, mb_uv_height,
+                                             adj_strength, filter_weight,
+                                             accumulator + 512, count + 512);
+          } else {
+            // Apply the filter (YUV)
+            vp9_temporal_filter_apply(f->y_buffer + mb_y_offset, f->y_stride,
+                                      predictor, 16, 16,
+                                      strength, filter_weight,
+                                      accumulator, count);
+            vp9_temporal_filter_apply(f->u_buffer + mb_uv_offset, f->uv_stride,
+                                      predictor + 256,
+                                      mb_uv_width, mb_uv_height, strength,
+                                      filter_weight, accumulator + 256,
+                                      count + 256);
+            vp9_temporal_filter_apply(f->v_buffer + mb_uv_offset, f->uv_stride,
+                                      predictor + 512,
+                                      mb_uv_width, mb_uv_height, strength,
+                                      filter_weight, accumulator + 512,
+                                      count + 512);
+          }
+#else
           // Apply the filter (YUV)
           vp9_temporal_filter_apply(f->y_buffer + mb_y_offset, f->y_stride,
                                     predictor, 16, 16,
@@ -301,9 +428,108 @@
                                     mb_uv_width, mb_uv_height, strength,
                                     filter_weight, accumulator + 512,
                                     count + 512);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
         }
       }
 
+#if CONFIG_VP9_HIGHBITDEPTH
+      if (mbd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+        uint16_t *dst1_16;
+        uint16_t *dst2_16;
+        // Normalize filter output to produce AltRef frame
+        dst1 = cpi->alt_ref_buffer.y_buffer;
+        dst1_16 = CONVERT_TO_SHORTPTR(dst1);
+        stride = cpi->alt_ref_buffer.y_stride;
+        byte = mb_y_offset;
+        for (i = 0, k = 0; i < 16; i++) {
+          for (j = 0; j < 16; j++, k++) {
+            unsigned int pval = accumulator[k] + (count[k] >> 1);
+            pval *= fixed_divide[count[k]];
+            pval >>= 19;
+
+            dst1_16[byte] = (uint16_t)pval;
+
+            // move to next pixel
+            byte++;
+          }
+
+          byte += stride - 16;
+        }
+
+        dst1 = cpi->alt_ref_buffer.u_buffer;
+        dst2 = cpi->alt_ref_buffer.v_buffer;
+        dst1_16 = CONVERT_TO_SHORTPTR(dst1);
+        dst2_16 = CONVERT_TO_SHORTPTR(dst2);
+        stride = cpi->alt_ref_buffer.uv_stride;
+        byte = mb_uv_offset;
+        for (i = 0, k = 256; i < mb_uv_height; i++) {
+          for (j = 0; j < mb_uv_width; j++, k++) {
+            int m = k + 256;
+
+            // U
+            unsigned int pval = accumulator[k] + (count[k] >> 1);
+            pval *= fixed_divide[count[k]];
+            pval >>= 19;
+            dst1_16[byte] = (uint16_t)pval;
+
+            // V
+            pval = accumulator[m] + (count[m] >> 1);
+            pval *= fixed_divide[count[m]];
+            pval >>= 19;
+            dst2_16[byte] = (uint16_t)pval;
+
+            // move to next pixel
+            byte++;
+          }
+
+          byte += stride - mb_uv_width;
+        }
+      } else {
+        // Normalize filter output to produce AltRef frame
+        dst1 = cpi->alt_ref_buffer.y_buffer;
+        stride = cpi->alt_ref_buffer.y_stride;
+        byte = mb_y_offset;
+        for (i = 0, k = 0; i < 16; i++) {
+          for (j = 0; j < 16; j++, k++) {
+            unsigned int pval = accumulator[k] + (count[k] >> 1);
+            pval *= fixed_divide[count[k]];
+            pval >>= 19;
+
+            dst1[byte] = (uint8_t)pval;
+
+            // move to next pixel
+            byte++;
+          }
+          byte += stride - 16;
+        }
+
+        dst1 = cpi->alt_ref_buffer.u_buffer;
+        dst2 = cpi->alt_ref_buffer.v_buffer;
+        stride = cpi->alt_ref_buffer.uv_stride;
+        byte = mb_uv_offset;
+        for (i = 0, k = 256; i < mb_uv_height; i++) {
+          for (j = 0; j < mb_uv_width; j++, k++) {
+            int m = k + 256;
+
+            // U
+            unsigned int pval = accumulator[k] + (count[k] >> 1);
+            pval *= fixed_divide[count[k]];
+            pval >>= 19;
+            dst1[byte] = (uint8_t)pval;
+
+            // V
+            pval = accumulator[m] + (count[m] >> 1);
+            pval *= fixed_divide[count[m]];
+            pval >>= 19;
+            dst2[byte] = (uint8_t)pval;
+
+            // move to next pixel
+            byte++;
+          }
+          byte += stride - mb_uv_width;
+        }
+      }
+#else
       // Normalize filter output to produce AltRef frame
       dst1 = cpi->alt_ref_buffer.y_buffer;
       stride = cpi->alt_ref_buffer.y_stride;
@@ -347,6 +573,7 @@
         }
         byte += stride - mb_uv_width;
       }
+#endif  // CONFIG_VP9_HIGHBITDEPTH
       mb_y_offset += 16;
       mb_uv_offset += mb_uv_width;
     }
@@ -449,60 +676,69 @@
     frames[frames_to_blur - 1 - frame] = &buf->img;
   }
 
-  // Setup scaling factors. Scaling on each of the arnr frames is not supported
-  if (is_two_pass_svc(cpi)) {
-    // In spatial svc the scaling factors might be less then 1/2. So we will use
-    // non-normative scaling.
-    int frame_used = 0;
+  if (frames_to_blur > 0) {
+    // Setup scaling factors. Scaling on each of the arnr frames is not
+    // supported.
+    if (is_two_pass_svc(cpi)) {
+      // In spatial svc the scaling factors might be less than 1/2.
+      // So we will use non-normative scaling.
+      int frame_used = 0;
 #if CONFIG_VP9_HIGHBITDEPTH
-    vp9_setup_scale_factors_for_frame(&sf,
-                                      get_frame_new_buffer(cm)->y_crop_width,
-                                      get_frame_new_buffer(cm)->y_crop_height,
-                                      get_frame_new_buffer(cm)->y_crop_width,
-                                      get_frame_new_buffer(cm)->y_crop_height,
-                                      cm->use_highbitdepth);
+      vp9_setup_scale_factors_for_frame(
+          &sf,
+          get_frame_new_buffer(cm)->y_crop_width,
+          get_frame_new_buffer(cm)->y_crop_height,
+          get_frame_new_buffer(cm)->y_crop_width,
+          get_frame_new_buffer(cm)->y_crop_height,
+          cm->use_highbitdepth);
 #else
-    vp9_setup_scale_factors_for_frame(&sf,
-                                      get_frame_new_buffer(cm)->y_crop_width,
-                                      get_frame_new_buffer(cm)->y_crop_height,
-                                      get_frame_new_buffer(cm)->y_crop_width,
-                                      get_frame_new_buffer(cm)->y_crop_height);
-#endif
-    for (frame = 0; frame < frames_to_blur; ++frame) {
-      if (cm->mi_cols * MI_SIZE != frames[frame]->y_width ||
-          cm->mi_rows * MI_SIZE != frames[frame]->y_height) {
-        if (vp9_realloc_frame_buffer(&cpi->svc.scaled_frames[frame_used],
-                                     cm->width, cm->height,
-                                     cm->subsampling_x, cm->subsampling_y,
-#if CONFIG_VP9_HIGHBITDEPTH
-                                     cm->use_highbitdepth,
-#endif
-                                     VP9_ENC_BORDER_IN_PIXELS, NULL, NULL,
-                                     NULL))
-          vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
-                             "Failed to reallocate alt_ref_buffer");
+      vp9_setup_scale_factors_for_frame(
+          &sf,
+          get_frame_new_buffer(cm)->y_crop_width,
+          get_frame_new_buffer(cm)->y_crop_height,
+          get_frame_new_buffer(cm)->y_crop_width,
+          get_frame_new_buffer(cm)->y_crop_height);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
 
-        frames[frame] = vp9_scale_if_required(cm, frames[frame],
-                            &cpi->svc.scaled_frames[frame_used]);
-        ++frame_used;
-      }
-    }
-  } else {
-    // ARF is produced at the native frame size and resized when coded.
+      for (frame = 0; frame < frames_to_blur; ++frame) {
+        if (cm->mi_cols * MI_SIZE != frames[frame]->y_width ||
+            cm->mi_rows * MI_SIZE != frames[frame]->y_height) {
+          if (vp9_realloc_frame_buffer(&cpi->svc.scaled_frames[frame_used],
+                                       cm->width, cm->height,
+                                       cm->subsampling_x, cm->subsampling_y,
 #if CONFIG_VP9_HIGHBITDEPTH
-    vp9_setup_scale_factors_for_frame(&sf,
-                                      frames[0]->y_crop_width,
-                                      frames[0]->y_crop_height,
-                                      frames[0]->y_crop_width,
-                                      frames[0]->y_crop_height,
-                                      cm->use_highbitdepth);
-#else
-    vp9_setup_scale_factors_for_frame(&sf,
-                                      frames[0]->y_crop_width,
-                                      frames[0]->y_crop_height,
-                                      frames[0]->y_crop_width,
-                                      frames[0]->y_crop_height);
+                                       cm->use_highbitdepth,
 #endif
+                                       VP9_ENC_BORDER_IN_PIXELS, NULL, NULL,
+                                       NULL)) {
+            vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
+                               "Failed to reallocate alt_ref_buffer");
+          }
+          frames[frame] = vp9_scale_if_required(
+              cm, frames[frame], &cpi->svc.scaled_frames[frame_used]);
+          ++frame_used;
+        }
+      }
+      cm->mi = cm->mip + cm->mi_stride + 1;
+      cpi->mb.e_mbd.mi = cm->mi;
+      cpi->mb.e_mbd.mi[0].src_mi = &cpi->mb.e_mbd.mi[0];
+    } else {
+      // ARF is produced at the native frame size and resized when coded.
+#if CONFIG_VP9_HIGHBITDEPTH
+      vp9_setup_scale_factors_for_frame(&sf,
+                                        frames[0]->y_crop_width,
+                                        frames[0]->y_crop_height,
+                                        frames[0]->y_crop_width,
+                                        frames[0]->y_crop_height,
+                                        cm->use_highbitdepth);
+#else
+      vp9_setup_scale_factors_for_frame(&sf,
+                                        frames[0]->y_crop_width,
+                                        frames[0]->y_crop_height,
+                                        frames[0]->y_crop_width,
+                                        frames[0]->y_crop_height);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+    }
   }
 
   temporal_filter_iterate_c(cpi, frames, frames_to_blur,
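
Two pieces of arithmetic recur in this file. The apply kernels map each pixel
difference d to a weight 16 - min(16, (3*d*d + r) >> strength) and accumulate
weight and weight*pixel per position; the normalization loops then divide
accumulator by count via the fixed_divide table. A scalar sketch of both
steps, under the assumption that fixed_divide[n] approximates (1 << 19) / n:

// Weight for one pixel difference, as in the *_filter_apply kernels above.
static int filter_weight_for_diff(int d, int strength, int filter_weight) {
  const int rounding = strength > 0 ? 1 << (strength - 1) : 0;
  int m = (3 * d * d + rounding) >> strength;
  if (m > 16) m = 16;
  return (16 - m) * filter_weight;  // large differences contribute little
}

// Normalization of one output pixel, as in the loops above; assumes
// fixed_divide[count] ~= (1 << 19) / count.
static unsigned int normalize(unsigned int acc, unsigned int count,
                              const unsigned int *fixed_divide) {
  unsigned int pval = acc + (count >> 1);  // round to nearest
  pval *= fixed_divide[count];
  return pval >> 19;
}
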
diff --git a/source/libvpx/vp9/encoder/vp9_tokenize.c b/source/libvpx/vp9/encoder/vp9_tokenize.c
index 8b9aa91..adf01bf 100644
--- a/source/libvpx/vp9/encoder/vp9_tokenize.c
+++ b/source/libvpx/vp9/encoder/vp9_tokenize.c
@@ -261,7 +261,7 @@
 }
 
 static INLINE void add_token(TOKENEXTRA **t, const vp9_prob *context_tree,
-                             int16_t extra, uint8_t token,
+                             int32_t extra, uint8_t token,
                              uint8_t skip_eob_node,
                              unsigned int *counts) {
   (*t)->token = token;
@@ -329,7 +329,7 @@
   scan = so->scan;
   nb = so->neighbors;
   c = 0;
-#if CONFIG_VP9_HIGH && CONFIG_HIGH_QUANT
+#if CONFIG_VP9_HIGHBITDEPTH
   if (cpi->common.profile >= PROFILE_2) {
     dct_value_tokens = (cpi->common.bit_depth == VPX_BITS_10 ?
                         vp9_dct_value_tokens_high10_ptr :
@@ -403,6 +403,24 @@
   return result;
 }
 
+static void has_high_freq_coeff(int plane, int block,
+                                BLOCK_SIZE plane_bsize, TX_SIZE tx_size,
+                                void *argv) {
+  struct is_skippable_args *args = argv;
+  int eobs = (tx_size == TX_4X4) ? 3 : 10;
+  (void) plane_bsize;
+
+  *(args->skippable) |= (args->x->plane[plane].eobs[block] > eobs);
+}
+
+int vp9_has_high_freq_in_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane) {
+  int result = 0;
+  struct is_skippable_args args = {x, &result};
+  vp9_foreach_transformed_block_in_plane(&x->e_mbd, bsize, plane,
+                                         has_high_freq_coeff, &args);
+  return result;
+}
+
 void vp9_tokenize_sb(VP9_COMP *cpi, TOKENEXTRA **t, int dry_run,
                      BLOCK_SIZE bsize) {
   VP9_COMMON *const cm = &cpi->common;
diff --git a/source/libvpx/vp9/encoder/vp9_tokenize.h b/source/libvpx/vp9/encoder/vp9_tokenize.h
index 063c0ba..825252b 100644
--- a/source/libvpx/vp9/encoder/vp9_tokenize.h
+++ b/source/libvpx/vp9/encoder/vp9_tokenize.h
@@ -26,12 +26,20 @@
 
 typedef struct {
   int16_t token;
+#if CONFIG_VP9_HIGHBITDEPTH
+  int32_t extra;
+#else
   int16_t extra;
+#endif
 } TOKENVALUE;
 
 typedef struct {
   const vp9_prob *context_tree;
+#if CONFIG_VP9_HIGHBITDEPTH
+  int32_t extra;
+#else
   int16_t         extra;
+#endif
   uint8_t         token;
   uint8_t         skip_eob_node;
 } TOKENEXTRA;
@@ -41,6 +49,7 @@
 extern struct vp9_token vp9_coef_encodings[];
 
 int vp9_is_skippable_in_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane);
+int vp9_has_high_freq_in_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane);
 
 struct VP9_COMP;
 
@@ -53,6 +62,12 @@
  *  fields are not.
  */
 extern const TOKENVALUE *vp9_dct_value_tokens_ptr;
+#if CONFIG_VP9_HIGHBITDEPTH
+extern const int16_t *vp9_dct_value_cost_high10_ptr;
+extern const TOKENVALUE *vp9_dct_value_tokens_high10_ptr;
+extern const int16_t *vp9_dct_value_cost_high12_ptr;
+extern const TOKENVALUE *vp9_dct_value_tokens_high12_ptr;
+#endif  // CONFIG_VP9_HIGHBITDEPTH
 
 #ifdef __cplusplus
 }  // extern "C"
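
The widened 'extra' field above tracks high-bitdepth coefficient ranges; the
assumption behind this sketch is that 12-bit residuals scale transform
coefficients past what int16_t holds, which is why the token's extra-bits
value moves to int32_t:

// Rough range check (our assumption about the scaling, not a statement
// from the source): coefficient magnitude grows with bit depth, and at
// 12 bits it can exceed INT16_MAX.
#include <stdint.h>
#include <stdio.h>

int main(void) {
  const int bd = 12;
  const long max_coeff = 1L << (bd + 6);  // assumed magnitude bound
  printf("~%ld vs INT16_MAX = %d\n", max_coeff, INT16_MAX);
  return 0;
}
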
diff --git a/source/libvpx/vp9/encoder/vp9_variance.c b/source/libvpx/vp9/encoder/vp9_variance.c
index c97f93f..4555bde 100644
--- a/source/libvpx/vp9/encoder/vp9_variance.c
+++ b/source/libvpx/vp9/encoder/vp9_variance.c
@@ -269,10 +269,10 @@
 }
 
 #if CONFIG_VP9_HIGHBITDEPTH
-void high_variance64(const uint8_t *a8, int  a_stride,
-                     const uint8_t *b8, int  b_stride,
-                     int w, int h, uint64_t *sse,
-                     uint64_t *sum) {
+void highbd_variance64(const uint8_t *a8, int  a_stride,
+                       const uint8_t *b8, int  b_stride,
+                       int w, int h, uint64_t *sse,
+                       uint64_t *sum) {
   int i, j;
 
   uint16_t *a = CONVERT_TO_SHORTPTR(a8);
@@ -291,40 +291,40 @@
   }
 }
 
-void high_variance(const uint8_t *a8, int  a_stride,
-                   const uint8_t *b8, int  b_stride,
-                   int w, int h, unsigned int *sse,
-                   int *sum) {
+void highbd_variance(const uint8_t *a8, int  a_stride,
+                     const uint8_t *b8, int  b_stride,
+                     int w, int h, unsigned int *sse,
+                     int *sum) {
   uint64_t sse_long = 0;
   uint64_t sum_long = 0;
-  high_variance64(a8, a_stride, b8, b_stride, w, h, &sse_long, &sum_long);
+  highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse_long, &sum_long);
   *sse = sse_long;
   *sum = sum_long;
 }
 
-void high_10_variance(const uint8_t *a8, int  a_stride,
-                      const uint8_t *b8, int  b_stride,
-                      int w, int h, unsigned int *sse,
-                      int *sum) {
+void highbd_10_variance(const uint8_t *a8, int  a_stride,
+                        const uint8_t *b8, int  b_stride,
+                        int w, int h, unsigned int *sse,
+                        int *sum) {
   uint64_t sse_long = 0;
   uint64_t sum_long = 0;
-  high_variance64(a8, a_stride, b8, b_stride, w, h, &sse_long, &sum_long);
+  highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse_long, &sum_long);
   *sum = ROUND_POWER_OF_TWO(sum_long, 2);
   *sse = ROUND_POWER_OF_TWO(sse_long, 4);
 }
 
-void high_12_variance(const uint8_t *a8, int  a_stride,
-                      const uint8_t *b8, int  b_stride,
-                      int w, int h, unsigned int *sse,
-                      int *sum) {
+void highbd_12_variance(const uint8_t *a8, int  a_stride,
+                        const uint8_t *b8, int  b_stride,
+                        int w, int h, unsigned int *sse,
+                        int *sum) {
   uint64_t sse_long = 0;
   uint64_t sum_long = 0;
-  high_variance64(a8, a_stride, b8, b_stride, w, h, &sse_long, &sum_long);
+  highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse_long, &sum_long);
   *sum = ROUND_POWER_OF_TWO(sum_long, 4);
   *sse = ROUND_POWER_OF_TWO(sse_long, 8);
 }
 
-static void high_var_filter_block2d_bil_first_pass(
+static void highbd_var_filter_block2d_bil_first_pass(
     const uint8_t *src_ptr8,
     uint16_t *output_ptr,
     unsigned int src_pixels_per_line,
@@ -350,7 +350,7 @@
   }
 }
 
-static void high_var_filter_block2d_bil_second_pass(
+static void highbd_var_filter_block2d_bil_second_pass(
     const uint16_t *src_ptr,
     uint16_t *output_ptr,
     unsigned int src_pixels_per_line,
@@ -374,257 +374,267 @@
   }
 }
 
-#define HIGH_VAR(W, H) \
-unsigned int vp9_high_variance##W##x##H##_c(const uint8_t *a, int a_stride, \
-                                            const uint8_t *b, int b_stride, \
+#define HIGHBD_VAR(W, H) \
+unsigned int vp9_highbd_variance##W##x##H##_c(const uint8_t *a, int a_stride, \
+                                              const uint8_t *b, int b_stride, \
+                                              unsigned int *sse) { \
+  int sum; \
+  highbd_variance(a, a_stride, b, b_stride, W, H, sse, &sum); \
+  return *sse - (((int64_t)sum * sum) / (W * H)); \
+} \
+\
+unsigned int vp9_highbd_10_variance##W##x##H##_c(const uint8_t *a, \
+                                                 int a_stride, \
+                                                 const uint8_t *b, \
+                                                 int b_stride, \
+                                                 unsigned int *sse) { \
+  int sum; \
+  highbd_10_variance(a, a_stride, b, b_stride, W, H, sse, &sum); \
+  return *sse - (((int64_t)sum * sum) / (W * H)); \
+} \
+\
+unsigned int vp9_highbd_12_variance##W##x##H##_c(const uint8_t *a, \
+                                                 int a_stride, \
+                                                 const uint8_t *b, \
+                                                 int b_stride, \
+                                                 unsigned int *sse) { \
+  int sum; \
+  highbd_12_variance(a, a_stride, b, b_stride, W, H, sse, &sum); \
+  return *sse - (((int64_t)sum * sum) / (W * H)); \
+}
+
+#define HIGHBD_SUBPIX_VAR(W, H) \
+unsigned int vp9_highbd_sub_pixel_variance##W##x##H##_c( \
+  const uint8_t *src, int  src_stride, \
+  int xoffset, int  yoffset, \
+  const uint8_t *dst, int dst_stride, \
+  unsigned int *sse) { \
+  uint16_t fdata3[(H + 1) * W]; \
+  uint16_t temp2[H * W]; \
+\
+  highbd_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, H + 1, \
+                                           W, BILINEAR_FILTERS_2TAP(xoffset)); \
+  highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
+                                            BILINEAR_FILTERS_2TAP(yoffset)); \
+\
+  return vp9_highbd_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), W, dst, \
+                                          dst_stride, sse); \
+} \
+\
+unsigned int vp9_highbd_10_sub_pixel_variance##W##x##H##_c( \
+  const uint8_t *src, int  src_stride, \
+  int xoffset, int  yoffset, \
+  const uint8_t *dst, int dst_stride, \
+  unsigned int *sse) { \
+  uint16_t fdata3[(H + 1) * W]; \
+  uint16_t temp2[H * W]; \
+\
+  highbd_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, H + 1, \
+                                           W, BILINEAR_FILTERS_2TAP(xoffset)); \
+  highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
+                                            BILINEAR_FILTERS_2TAP(yoffset)); \
+\
+  return vp9_highbd_10_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), \
+                                             W, dst, dst_stride, sse); \
+} \
+\
+unsigned int vp9_highbd_12_sub_pixel_variance##W##x##H##_c( \
+  const uint8_t *src, int  src_stride, \
+  int xoffset, int  yoffset, \
+  const uint8_t *dst, int dst_stride, \
+  unsigned int *sse) { \
+  uint16_t fdata3[(H + 1) * W]; \
+  uint16_t temp2[H * W]; \
+\
+  highbd_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, H + 1, \
+                                           W, BILINEAR_FILTERS_2TAP(xoffset)); \
+  highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
+                                            BILINEAR_FILTERS_2TAP(yoffset)); \
+\
+  return vp9_highbd_12_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), \
+                                             W, dst, dst_stride, sse); \
+}
+
+#define HIGHBD_SUBPIX_AVG_VAR(W, H) \
+unsigned int vp9_highbd_sub_pixel_avg_variance##W##x##H##_c( \
+  const uint8_t *src, int  src_stride, \
+  int xoffset, int  yoffset, \
+  const uint8_t *dst, int dst_stride, \
+  unsigned int *sse, \
+  const uint8_t *second_pred) { \
+  uint16_t fdata3[(H + 1) * W]; \
+  uint16_t temp2[H * W]; \
+  DECLARE_ALIGNED_ARRAY(16, uint16_t, temp3, H * W); \
+\
+  highbd_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, H + 1, \
+                                           W, BILINEAR_FILTERS_2TAP(xoffset)); \
+  highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
+                                            BILINEAR_FILTERS_2TAP(yoffset)); \
+\
+  vp9_highbd_comp_avg_pred(temp3, second_pred, W, H, \
+                           CONVERT_TO_BYTEPTR(temp2), W); \
+\
+  return vp9_highbd_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W, dst, \
+                                          dst_stride, sse); \
+} \
+\
+unsigned int vp9_highbd_10_sub_pixel_avg_variance##W##x##H##_c( \
+  const uint8_t *src, int  src_stride, \
+  int xoffset, int  yoffset, \
+  const uint8_t *dst, int dst_stride, \
+  unsigned int *sse, \
+  const uint8_t *second_pred) { \
+  uint16_t fdata3[(H + 1) * W]; \
+  uint16_t temp2[H * W]; \
+  DECLARE_ALIGNED_ARRAY(16, uint16_t, temp3, H * W); \
+\
+  highbd_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, H + 1, \
+                                           W, BILINEAR_FILTERS_2TAP(xoffset)); \
+  highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
+                                            BILINEAR_FILTERS_2TAP(yoffset)); \
+\
+  vp9_highbd_comp_avg_pred(temp3, second_pred, W, H, \
+                           CONVERT_TO_BYTEPTR(temp2), W); \
+\
+  return vp9_highbd_10_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), \
+                                             W, dst, dst_stride, sse); \
+} \
+\
+unsigned int vp9_highbd_12_sub_pixel_avg_variance##W##x##H##_c( \
+  const uint8_t *src, int  src_stride, \
+  int xoffset, int  yoffset, \
+  const uint8_t *dst, int dst_stride, \
+  unsigned int *sse, \
+  const uint8_t *second_pred) { \
+  uint16_t fdata3[(H + 1) * W]; \
+  uint16_t temp2[H * W]; \
+  DECLARE_ALIGNED_ARRAY(16, uint16_t, temp3, H * W); \
+\
+  highbd_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, H + 1, \
+                                           W, BILINEAR_FILTERS_2TAP(xoffset)); \
+  highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
+                                            BILINEAR_FILTERS_2TAP(yoffset)); \
+\
+  vp9_highbd_comp_avg_pred(temp3, second_pred, W, H, \
+                           CONVERT_TO_BYTEPTR(temp2), W); \
+\
+  return vp9_highbd_12_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), \
+                                             W, dst, dst_stride, sse); \
+}
+
+#define HIGHBD_GET_VAR(S) \
+void vp9_highbd_get##S##x##S##var_c(const uint8_t *src, int src_stride, \
+                                    const uint8_t *ref, int ref_stride, \
+                                    unsigned int *sse, int *sum) { \
+  highbd_variance(src, src_stride, ref, ref_stride, S, S, sse, sum); \
+} \
+\
+void vp9_highbd_10_get##S##x##S##var_c(const uint8_t *src, int src_stride, \
+                                       const uint8_t *ref, int ref_stride, \
+                                       unsigned int *sse, int *sum) { \
+  highbd_10_variance(src, src_stride, ref, ref_stride, S, S, sse, sum); \
+} \
+\
+void vp9_highbd_12_get##S##x##S##var_c(const uint8_t *src, int src_stride, \
+                                       const uint8_t *ref, int ref_stride, \
+                                       unsigned int *sse, int *sum) { \
+  highbd_12_variance(src, src_stride, ref, ref_stride, S, S, sse, sum); \
+}
+
+#define HIGHBD_MSE(W, H) \
+unsigned int vp9_highbd_mse##W##x##H##_c(const uint8_t *src, \
+                                         int src_stride, \
+                                         const uint8_t *ref, \
+                                         int ref_stride, \
+                                         unsigned int *sse) { \
+  int sum; \
+  highbd_variance(src, src_stride, ref, ref_stride, W, H, sse, &sum); \
+  return *sse; \
+} \
+\
+unsigned int vp9_highbd_10_mse##W##x##H##_c(const uint8_t *src, \
+                                            int src_stride, \
+                                            const uint8_t *ref, \
+                                            int ref_stride, \
                                             unsigned int *sse) { \
   int sum; \
-  high_variance(a, a_stride, b, b_stride, W, H, sse, &sum); \
-  return *sse - (((int64_t)sum * sum) / (W * H)); \
-} \
-\
-unsigned int vp9_high_10_variance##W##x##H##_c(const uint8_t *a, int a_stride, \
-                                               const uint8_t *b, int b_stride, \
-                                                unsigned int *sse) { \
-  int sum; \
-  high_10_variance(a, a_stride, b, b_stride, W, H, sse, &sum); \
-  return *sse - (((int64_t)sum * sum) / (W * H)); \
-} \
-\
-unsigned int vp9_high_12_variance##W##x##H##_c(const uint8_t *a, int a_stride, \
-                                               const uint8_t *b, int b_stride, \
-                                               unsigned int *sse) { \
-  int sum; \
-  high_12_variance(a, a_stride, b, b_stride, W, H, sse, &sum); \
-  return *sse - (((int64_t)sum * sum) / (W * H)); \
-}
-
-#define HIGH_SUBPIX_VAR(W, H) \
-unsigned int vp9_high_sub_pixel_variance##W##x##H##_c( \
-  const uint8_t *src, int  src_stride, \
-  int xoffset, int  yoffset, \
-  const uint8_t *dst, int dst_stride, \
-  unsigned int *sse) { \
-  uint16_t fdata3[(H + 1) * W]; \
-  uint16_t temp2[H * W]; \
-\
-  high_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, H + 1, \
-                                         W, BILINEAR_FILTERS_2TAP(xoffset)); \
-  high_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
-                                          BILINEAR_FILTERS_2TAP(yoffset)); \
-\
-  return vp9_high_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), W, dst, \
-                                        dst_stride, sse); \
-} \
-\
-unsigned int vp9_high_10_sub_pixel_variance##W##x##H##_c( \
-  const uint8_t *src, int  src_stride, \
-  int xoffset, int  yoffset, \
-  const uint8_t *dst, int dst_stride, \
-  unsigned int *sse) { \
-  uint16_t fdata3[(H + 1) * W]; \
-  uint16_t temp2[H * W]; \
-\
-  high_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, H + 1, \
-                                         W, BILINEAR_FILTERS_2TAP(xoffset)); \
-  high_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
-                                          BILINEAR_FILTERS_2TAP(yoffset)); \
-\
-  return vp9_high_10_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), W, dst, \
-                                           dst_stride, sse); \
-} \
-\
-unsigned int vp9_high_12_sub_pixel_variance##W##x##H##_c( \
-  const uint8_t *src, int  src_stride, \
-  int xoffset, int  yoffset, \
-  const uint8_t *dst, int dst_stride, \
-  unsigned int *sse) { \
-  uint16_t fdata3[(H + 1) * W]; \
-  uint16_t temp2[H * W]; \
-\
-  high_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, H + 1, \
-                                         W, BILINEAR_FILTERS_2TAP(xoffset)); \
-  high_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
-                                          BILINEAR_FILTERS_2TAP(yoffset)); \
-\
-  return vp9_high_12_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), W, dst, \
-                                           dst_stride, sse); \
-}
-
-#define HIGH_SUBPIX_AVG_VAR(W, H) \
-unsigned int vp9_high_sub_pixel_avg_variance##W##x##H##_c( \
-  const uint8_t *src, int  src_stride, \
-  int xoffset, int  yoffset, \
-  const uint8_t *dst, int dst_stride, \
-  unsigned int *sse, \
-  const uint8_t *second_pred) { \
-  uint16_t fdata3[(H + 1) * W]; \
-  uint16_t temp2[H * W]; \
-  DECLARE_ALIGNED_ARRAY(16, uint16_t, temp3, H * W); \
-\
-  high_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, H + 1, \
-                                         W, BILINEAR_FILTERS_2TAP(xoffset)); \
-  high_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
-                                          BILINEAR_FILTERS_2TAP(yoffset)); \
-\
-  vp9_high_comp_avg_pred(temp3, second_pred, W, H, CONVERT_TO_BYTEPTR(temp2), \
-                         W); \
-\
-  return vp9_high_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W, dst, \
-                                        dst_stride, sse); \
-} \
-\
-unsigned int vp9_high_10_sub_pixel_avg_variance##W##x##H##_c( \
-  const uint8_t *src, int  src_stride, \
-  int xoffset, int  yoffset, \
-  const uint8_t *dst, int dst_stride, \
-  unsigned int *sse, \
-  const uint8_t *second_pred) { \
-  uint16_t fdata3[(H + 1) * W]; \
-  uint16_t temp2[H * W]; \
-  DECLARE_ALIGNED_ARRAY(16, uint16_t, temp3, H * W); \
-\
-  high_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, H + 1, \
-                                         W, BILINEAR_FILTERS_2TAP(xoffset)); \
-  high_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
-                                          BILINEAR_FILTERS_2TAP(yoffset)); \
-\
-  vp9_high_comp_avg_pred(temp3, second_pred, W, H, CONVERT_TO_BYTEPTR(temp2), \
-                         W); \
-\
-  return vp9_high_10_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W, dst, \
-                                        dst_stride, sse); \
-} \
-\
-unsigned int vp9_high_12_sub_pixel_avg_variance##W##x##H##_c( \
-  const uint8_t *src, int  src_stride, \
-  int xoffset, int  yoffset, \
-  const uint8_t *dst, int dst_stride, \
-  unsigned int *sse, \
-  const uint8_t *second_pred) { \
-  uint16_t fdata3[(H + 1) * W]; \
-  uint16_t temp2[H * W]; \
-  DECLARE_ALIGNED_ARRAY(16, uint16_t, temp3, H * W); \
-\
-  high_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, H + 1, \
-                                         W, BILINEAR_FILTERS_2TAP(xoffset)); \
-  high_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
-                                          BILINEAR_FILTERS_2TAP(yoffset)); \
-\
-  vp9_high_comp_avg_pred(temp3, second_pred, W, H, CONVERT_TO_BYTEPTR(temp2), \
-                         W); \
-\
-  return vp9_high_12_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W, dst, \
-                                        dst_stride, sse); \
-}
-
-#define HIGH_GET_VAR(S) \
-void vp9_high_get##S##x##S##var_c(const uint8_t *src, int src_stride, \
-                                  const uint8_t *ref, int ref_stride, \
-                                  unsigned int *sse, int *sum) { \
-  high_variance(src, src_stride, ref, ref_stride, S, S, sse, sum); \
-} \
-\
-void vp9_high_10_get##S##x##S##var_c(const uint8_t *src, int src_stride, \
-                                     const uint8_t *ref, int ref_stride, \
-                                     unsigned int *sse, int *sum) { \
-  high_10_variance(src, src_stride, ref, ref_stride, S, S, sse, sum); \
-} \
-\
-void vp9_high_12_get##S##x##S##var_c(const uint8_t *src, int src_stride, \
-                                     const uint8_t *ref, int ref_stride, \
-                                     unsigned int *sse, int *sum) { \
-  high_12_variance(src, src_stride, ref, ref_stride, S, S, sse, sum); \
-}
-
-#define HIGH_MSE(W, H) \
-unsigned int vp9_high_mse##W##x##H##_c(const uint8_t *src, int src_stride, \
-                                       const uint8_t *ref, int ref_stride, \
-                                       unsigned int *sse) { \
-  int sum; \
-  high_variance(src, src_stride, ref, ref_stride, W, H, sse, &sum); \
+  highbd_10_variance(src, src_stride, ref, ref_stride, W, H, sse, &sum); \
   return *sse; \
 } \
 \
-unsigned int vp9_high_10_mse##W##x##H##_c(const uint8_t *src, int src_stride, \
-                                          const uint8_t *ref, int ref_stride, \
-                                          unsigned int *sse) { \
+unsigned int vp9_highbd_12_mse##W##x##H##_c(const uint8_t *src, \
+                                            int src_stride, \
+                                            const uint8_t *ref, \
+                                            int ref_stride, \
+                                            unsigned int *sse) { \
   int sum; \
-  high_10_variance(src, src_stride, ref, ref_stride, W, H, sse, &sum); \
-  return *sse; \
-} \
-\
-unsigned int vp9_high_12_mse##W##x##H##_c(const uint8_t *src, int src_stride, \
-                                          const uint8_t *ref, int ref_stride, \
-                                          unsigned int *sse) { \
-  int sum; \
-  high_12_variance(src, src_stride, ref, ref_stride, W, H, sse, &sum); \
+  highbd_12_variance(src, src_stride, ref, ref_stride, W, H, sse, &sum); \
   return *sse; \
 }
 
-HIGH_GET_VAR(8)
-HIGH_GET_VAR(16)
+HIGHBD_GET_VAR(8)
+HIGHBD_GET_VAR(16)
 
-HIGH_MSE(16, 16)
-HIGH_MSE(16, 8)
-HIGH_MSE(8, 16)
-HIGH_MSE(8, 8)
+HIGHBD_MSE(16, 16)
+HIGHBD_MSE(16, 8)
+HIGHBD_MSE(8, 16)
+HIGHBD_MSE(8, 8)
 
-HIGH_VAR(4, 4)
-HIGH_SUBPIX_VAR(4, 4)
-HIGH_SUBPIX_AVG_VAR(4, 4)
+HIGHBD_VAR(4, 4)
+HIGHBD_SUBPIX_VAR(4, 4)
+HIGHBD_SUBPIX_AVG_VAR(4, 4)
 
-HIGH_VAR(4, 8)
-HIGH_SUBPIX_VAR(4, 8)
-HIGH_SUBPIX_AVG_VAR(4, 8)
+HIGHBD_VAR(4, 8)
+HIGHBD_SUBPIX_VAR(4, 8)
+HIGHBD_SUBPIX_AVG_VAR(4, 8)
 
-HIGH_VAR(8, 4)
-HIGH_SUBPIX_VAR(8, 4)
-HIGH_SUBPIX_AVG_VAR(8, 4)
+HIGHBD_VAR(8, 4)
+HIGHBD_SUBPIX_VAR(8, 4)
+HIGHBD_SUBPIX_AVG_VAR(8, 4)
 
-HIGH_VAR(8, 8)
-HIGH_SUBPIX_VAR(8, 8)
-HIGH_SUBPIX_AVG_VAR(8, 8)
+HIGHBD_VAR(8, 8)
+HIGHBD_SUBPIX_VAR(8, 8)
+HIGHBD_SUBPIX_AVG_VAR(8, 8)
 
-HIGH_VAR(8, 16)
-HIGH_SUBPIX_VAR(8, 16)
-HIGH_SUBPIX_AVG_VAR(8, 16)
+HIGHBD_VAR(8, 16)
+HIGHBD_SUBPIX_VAR(8, 16)
+HIGHBD_SUBPIX_AVG_VAR(8, 16)
 
-HIGH_VAR(16, 8)
-HIGH_SUBPIX_VAR(16, 8)
-HIGH_SUBPIX_AVG_VAR(16, 8)
+HIGHBD_VAR(16, 8)
+HIGHBD_SUBPIX_VAR(16, 8)
+HIGHBD_SUBPIX_AVG_VAR(16, 8)
 
-HIGH_VAR(16, 16)
-HIGH_SUBPIX_VAR(16, 16)
-HIGH_SUBPIX_AVG_VAR(16, 16)
+HIGHBD_VAR(16, 16)
+HIGHBD_SUBPIX_VAR(16, 16)
+HIGHBD_SUBPIX_AVG_VAR(16, 16)
 
-HIGH_VAR(16, 32)
-HIGH_SUBPIX_VAR(16, 32)
-HIGH_SUBPIX_AVG_VAR(16, 32)
+HIGHBD_VAR(16, 32)
+HIGHBD_SUBPIX_VAR(16, 32)
+HIGHBD_SUBPIX_AVG_VAR(16, 32)
 
-HIGH_VAR(32, 16)
-HIGH_SUBPIX_VAR(32, 16)
-HIGH_SUBPIX_AVG_VAR(32, 16)
+HIGHBD_VAR(32, 16)
+HIGHBD_SUBPIX_VAR(32, 16)
+HIGHBD_SUBPIX_AVG_VAR(32, 16)
 
-HIGH_VAR(32, 32)
-HIGH_SUBPIX_VAR(32, 32)
-HIGH_SUBPIX_AVG_VAR(32, 32)
+HIGHBD_VAR(32, 32)
+HIGHBD_SUBPIX_VAR(32, 32)
+HIGHBD_SUBPIX_AVG_VAR(32, 32)
 
-HIGH_VAR(32, 64)
-HIGH_SUBPIX_VAR(32, 64)
-HIGH_SUBPIX_AVG_VAR(32, 64)
+HIGHBD_VAR(32, 64)
+HIGHBD_SUBPIX_VAR(32, 64)
+HIGHBD_SUBPIX_AVG_VAR(32, 64)
 
-HIGH_VAR(64, 32)
-HIGH_SUBPIX_VAR(64, 32)
-HIGH_SUBPIX_AVG_VAR(64, 32)
+HIGHBD_VAR(64, 32)
+HIGHBD_SUBPIX_VAR(64, 32)
+HIGHBD_SUBPIX_AVG_VAR(64, 32)
 
-HIGH_VAR(64, 64)
-HIGH_SUBPIX_VAR(64, 64)
-HIGH_SUBPIX_AVG_VAR(64, 64)
+HIGHBD_VAR(64, 64)
+HIGHBD_SUBPIX_VAR(64, 64)
+HIGHBD_SUBPIX_AVG_VAR(64, 64)
 
-void vp9_high_comp_avg_pred(uint16_t *comp_pred, const uint8_t *pred8,
-                            int width, int height, const uint8_t *ref8,
-                            int ref_stride) {
+void vp9_highbd_comp_avg_pred(uint16_t *comp_pred, const uint8_t *pred8,
+                              int width, int height, const uint8_t *ref8,
+                              int ref_stride) {
   int i, j;
   uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
   uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
diff --git a/source/libvpx/vp9/encoder/vp9_variance.h b/source/libvpx/vp9/encoder/vp9_variance.h
index c51d08d..53148f2 100644
--- a/source/libvpx/vp9/encoder/vp9_variance.h
+++ b/source/libvpx/vp9/encoder/vp9_variance.h
@@ -23,20 +23,20 @@
               unsigned int *sse, int *sum);
 
 #if CONFIG_VP9_HIGHBITDEPTH
-void high_variance(const uint8_t *a8, int a_stride,
-                   const uint8_t *b8, int b_stride,
-                   int w, int h,
-                   unsigned int *sse, int *sum);
+void highbd_variance(const uint8_t *a8, int a_stride,
+                     const uint8_t *b8, int b_stride,
+                     int w, int h,
+                     unsigned int *sse, int *sum);
 
-void high_10_variance(const uint8_t *a8, int a_stride,
-                      const uint8_t *b8, int b_stride,
-                      int w, int h,
-                      unsigned int *sse, int *sum);
+void highbd_10_variance(const uint8_t *a8, int a_stride,
+                        const uint8_t *b8, int b_stride,
+                        int w, int h,
+                        unsigned int *sse, int *sum);
 
-void high_12_variance(const uint8_t *a8, int a_stride,
-                      const uint8_t *b8, int b_stride,
-                      int w, int h,
-                      unsigned int *sse, int *sum);
+void highbd_12_variance(const uint8_t *a8, int a_stride,
+                        const uint8_t *b8, int b_stride,
+                        int w, int h,
+                        unsigned int *sse, int *sum);
 #endif
 
 typedef unsigned int(*vp9_sad_fn_t)(const uint8_t *src_ptr,
@@ -99,8 +99,9 @@
                        int height, const uint8_t *ref, int ref_stride);
 
 #if CONFIG_VP9_HIGHBITDEPTH
-void vp9_high_comp_avg_pred(uint16_t *comp_pred, const uint8_t *pred, int width,
-                            int height, const uint8_t *ref, int ref_stride);
+void vp9_highbd_comp_avg_pred(uint16_t *comp_pred, const uint8_t *pred,
+                              int width, int height,
+                              const uint8_t *ref, int ref_stride);
 #endif
 
 #ifdef __cplusplus
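
All of the renamed variance helpers reduce to the same identity: variance over
a WxH block is sse - sum*sum/(W*H), with the 10- and 12-bit variants first
rounding sse and sum back toward the 8-bit scale. A scalar sketch, assuming
both buffers share one stride for brevity:

// var = E[d^2] - E[d]^2 over a WxH block, d = a - b, as in HIGHBD_VAR.
#include <stdint.h>

static uint32_t block_variance(const uint16_t *a, const uint16_t *b,
                               int stride, int w, int h) {
  int64_t sum = 0;
  uint64_t sse = 0;
  for (int i = 0; i < h; ++i) {
    for (int j = 0; j < w; ++j) {
      const int d = a[i * stride + j] - b[i * stride + j];
      sum += d;
      sse += (int64_t)d * d;
    }
  }
  return (uint32_t)(sse - (uint64_t)((sum * sum) / (w * h)));
}
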
diff --git a/source/libvpx/vp9/encoder/x86/vp9_avg_intrin_sse2.c b/source/libvpx/vp9/encoder/x86/vp9_avg_intrin_sse2.c
new file mode 100644
index 0000000..ca6cf1a
--- /dev/null
+++ b/source/libvpx/vp9/encoder/x86/vp9_avg_intrin_sse2.c
@@ -0,0 +1,40 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <emmintrin.h>
+#include "vpx_ports/mem.h"
+
+
+unsigned int vp9_avg_8x8_sse2(const uint8_t *s, int p) {
+  __m128i s0, s1, u0;
+  unsigned int avg = 0;
+  u0  = _mm_setzero_si128();
+  s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s)), u0);
+  s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + p)), u0);
+  s0 = _mm_adds_epu16(s0, s1);
+  s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 2 * p)), u0);
+  s0 = _mm_adds_epu16(s0, s1);
+  s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 3 * p)), u0);
+  s0 = _mm_adds_epu16(s0, s1);
+  s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 4 * p)), u0);
+  s0 = _mm_adds_epu16(s0, s1);
+  s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 5 * p)), u0);
+  s0 = _mm_adds_epu16(s0, s1);
+  s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 6 * p)), u0);
+  s0 = _mm_adds_epu16(s0, s1);
+  s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 7 * p)), u0);
+  s0 = _mm_adds_epu16(s0, s1);
+
+  s0 = _mm_adds_epu16(s0, _mm_srli_si128(s0, 8));
+  s0 = _mm_adds_epu16(s0, _mm_srli_epi64(s0, 32));
+  s0 = _mm_adds_epu16(s0, _mm_srli_epi64(s0, 16));
+  avg = _mm_extract_epi16(s0, 0);
+  return (avg + 32) >> 6;
+}
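
A scalar counterpart makes the intent of the kernel above plain: sum the 8x8
block and divide by 64 with rounding, matching the (avg + 32) >> 6 at the end.
Sketch for illustration only; the SSE2 path is what ships:

// Reference version of the 8x8 average: (sum + 32) >> 6 rounds to nearest
// when dividing the 64-pixel sum by 64.
#include <stdint.h>

static unsigned int avg_8x8_c_sketch(const uint8_t *s, int p) {
  unsigned int sum = 0;
  for (int i = 0; i < 8; ++i)
    for (int j = 0; j < 8; ++j)
      sum += s[i * p + j];
  return (sum + 32) >> 6;
}
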
diff --git a/source/libvpx/vp9/encoder/x86/vp9_denoiser_sse2.c b/source/libvpx/vp9/encoder/x86/vp9_denoiser_sse2.c
new file mode 100644
index 0000000..bf5fa88
--- /dev/null
+++ b/source/libvpx/vp9/encoder/x86/vp9_denoiser_sse2.c
@@ -0,0 +1,474 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <emmintrin.h>
+
+#include "./vpx_config.h"
+#include "./vp9_rtcd.h"
+
+#include "vpx_ports/emmintrin_compat.h"
+#include "vpx/vpx_integer.h"
+#include "vp9/common/vp9_reconinter.h"
+#include "vp9/encoder/vp9_context_tree.h"
+#include "vp9/encoder/vp9_denoiser.h"
+#include "vpx_mem/vpx_mem.h"
+
+// Compute the sum of all pixel differences of this MB.
+static INLINE int sum_diff_16x1(__m128i acc_diff) {
+  const __m128i k_1 = _mm_set1_epi16(1);
+  const __m128i acc_diff_lo = _mm_srai_epi16(
+        _mm_unpacklo_epi8(acc_diff, acc_diff), 8);
+  const __m128i acc_diff_hi = _mm_srai_epi16(
+        _mm_unpackhi_epi8(acc_diff, acc_diff), 8);
+  const __m128i acc_diff_16 = _mm_add_epi16(acc_diff_lo, acc_diff_hi);
+  const __m128i hg_fe_dc_ba = _mm_madd_epi16(acc_diff_16, k_1);
+  const __m128i hgfe_dcba = _mm_add_epi32(hg_fe_dc_ba,
+                                          _mm_srli_si128(hg_fe_dc_ba, 8));
+  const __m128i hgfedcba = _mm_add_epi32(hgfe_dcba,
+                                         _mm_srli_si128(hgfe_dcba, 4));
+  int sum_diff = _mm_cvtsi128_si32(hgfedcba);
+  return sum_diff;
+}
+
+// Denoise a 16x1 vector.
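+// In scalar terms: with d = |mc_running_avg_y - sig| clamped to 16, the
+// output moves sig toward mc_running_avg_y by d itself when d < k_4, by
+// l3 - 3 when k_4 <= d < 8, by l3 - 2 when 8 <= d < 16, and by l3 when
+// d >= 16; l3 is 7 (+shift) for small motion magnitudes, else 6.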
+static INLINE __m128i vp9_denoiser_16x1_sse2(const uint8_t *sig,
+                                             const uint8_t *mc_running_avg_y,
+                                             uint8_t *running_avg_y,
+                                             const __m128i *k_0,
+                                             const __m128i *k_4,
+                                             const __m128i *k_8,
+                                             const __m128i *k_16,
+                                             const __m128i *l3,
+                                             const __m128i *l32,
+                                             const __m128i *l21,
+                                             __m128i acc_diff) {
+  // Calculate differences
+  const __m128i v_sig = _mm_loadu_si128((const __m128i *)(&sig[0]));
+  const __m128i v_mc_running_avg_y = _mm_loadu_si128(
+                                     (const __m128i *)(&mc_running_avg_y[0]));
+  __m128i v_running_avg_y;
+  const __m128i pdiff = _mm_subs_epu8(v_mc_running_avg_y, v_sig);
+  const __m128i ndiff = _mm_subs_epu8(v_sig, v_mc_running_avg_y);
+  // Obtain the sign. FF if diff is negative.
+  const __m128i diff_sign = _mm_cmpeq_epi8(pdiff, *k_0);
+  // Clamp the absolute difference to 16 so it can be used to build the
+  // masks. Doing this allows us to use _mm_cmpgt_epi8, which operates on
+  // signed bytes.
+  const __m128i clamped_absdiff = _mm_min_epu8(
+                                  _mm_or_si128(pdiff, ndiff), *k_16);
+  // Get masks for l2 l1 and l0 adjustments.
+  const __m128i mask2 = _mm_cmpgt_epi8(*k_16, clamped_absdiff);
+  const __m128i mask1 = _mm_cmpgt_epi8(*k_8, clamped_absdiff);
+  const __m128i mask0 = _mm_cmpgt_epi8(*k_4, clamped_absdiff);
+  // Get adjustments for l2, l1, and l0.
+  __m128i adj2 = _mm_and_si128(mask2, *l32);
+  const __m128i adj1 = _mm_and_si128(mask1, *l21);
+  const __m128i adj0 = _mm_and_si128(mask0, clamped_absdiff);
+  __m128i adj,  padj, nadj;
+
+  // Combine the adjustments and get absolute adjustments.
+  adj2 = _mm_add_epi8(adj2, adj1);
+  adj = _mm_sub_epi8(*l3, adj2);
+  adj = _mm_andnot_si128(mask0, adj);
+  adj = _mm_or_si128(adj, adj0);
+
+  // Restore the sign and get positive and negative adjustments.
+  padj = _mm_andnot_si128(diff_sign, adj);
+  nadj = _mm_and_si128(diff_sign, adj);
+
+  // Calculate filtered value.
+  v_running_avg_y = _mm_adds_epu8(v_sig, padj);
+  v_running_avg_y = _mm_subs_epu8(v_running_avg_y, nadj);
+  _mm_storeu_si128((__m128i *)running_avg_y, v_running_avg_y);
+
+  // Adjustments <=7, and each element in acc_diff can fit in signed
+  // char.
+  acc_diff = _mm_adds_epi8(acc_diff, padj);
+  acc_diff = _mm_subs_epi8(acc_diff, nadj);
+  return acc_diff;
+}
+
+// Denoise a 16x1 vector with a weaker filter.
+static INLINE __m128i vp9_denoiser_adj_16x1_sse2(const uint8_t *sig,
+                                              const uint8_t *mc_running_avg_y,
+                                              uint8_t *running_avg_y,
+                                              const __m128i k_0,
+                                              const __m128i k_delta,
+                                              __m128i acc_diff) {
+  __m128i v_running_avg_y = _mm_loadu_si128((__m128i *)(&running_avg_y[0]));
+  // Calculate differences.
+  const __m128i v_sig = _mm_loadu_si128((const __m128i *)(&sig[0]));
+  const __m128i v_mc_running_avg_y =
+                _mm_loadu_si128((const __m128i *)(&mc_running_avg_y[0]));
+  const __m128i pdiff = _mm_subs_epu8(v_mc_running_avg_y, v_sig);
+  const __m128i ndiff = _mm_subs_epu8(v_sig, v_mc_running_avg_y);
+  // Obtain the sign. FF if diff is negative.
+  const __m128i diff_sign = _mm_cmpeq_epi8(pdiff, k_0);
+  // Clamp absolute difference to delta to get the adjustment.
+  const __m128i adj =
+                _mm_min_epu8(_mm_or_si128(pdiff, ndiff), k_delta);
+  // Restore the sign and get positive and negative adjustments.
+  __m128i padj, nadj;
+  padj = _mm_andnot_si128(diff_sign, adj);
+  nadj = _mm_and_si128(diff_sign, adj);
+  // Calculate filtered value.
+  v_running_avg_y = _mm_subs_epu8(v_running_avg_y, padj);
+  v_running_avg_y = _mm_adds_epu8(v_running_avg_y, nadj);
+  _mm_storeu_si128((__m128i *)running_avg_y, v_running_avg_y);
+
+  // Accumulate the adjustments.
+  acc_diff = _mm_subs_epi8(acc_diff, padj);
+  acc_diff = _mm_adds_epi8(acc_diff, nadj);
+  return acc_diff;
+}
+
+static int vp9_denoiser_4xM_sse2(const uint8_t *sig, int sig_stride,
+                                 const uint8_t *mc_running_avg_y,
+                                 int mc_avg_y_stride,
+                                 uint8_t *running_avg_y, int avg_y_stride,
+                                 int increase_denoising,
+                                 BLOCK_SIZE bs,
+                                 int motion_magnitude) {
+  int sum_diff_thresh;
+  int r;
+  int shift_inc = (increase_denoising &&
+                   motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) ? 1 : 0;
+  unsigned char sig_buffer[2][16], mc_running_buffer[2][16],
+                running_buffer[2][16];
+  __m128i acc_diff = _mm_setzero_si128();
+  const __m128i k_0 = _mm_setzero_si128();
+  const __m128i k_4 = _mm_set1_epi8(4 + shift_inc);
+  const __m128i k_8 = _mm_set1_epi8(8);
+  const __m128i k_16 = _mm_set1_epi8(16);
+  // Modify each level's adjustment according to motion_magnitude.
+  const __m128i l3 = _mm_set1_epi8(
+                     (motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) ?
+                     7 + shift_inc : 6);
+  // Difference between level 3 and level 2 is 2.
+  const __m128i l32 = _mm_set1_epi8(2);
+  // Difference between level 2 and level 1 is 1.
+  const __m128i l21 = _mm_set1_epi8(1);
+  int sum_diff = 0;
+
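+  // Pack four 4-pixel rows into one 16-byte buffer per iteration so the
+  // 16x1 kernel above can be reused for 4-pixel-wide blocks.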
+  for (r = 0; r < ((4 << b_height_log2_lookup[bs]) >> 2); ++r) {
+    vpx_memcpy(sig_buffer[r], sig, 4);
+    vpx_memcpy(sig_buffer[r] + 4, sig + sig_stride, 4);
+    vpx_memcpy(sig_buffer[r] + 8, sig + sig_stride * 2, 4);
+    vpx_memcpy(sig_buffer[r] + 12, sig + sig_stride * 3, 4);
+    vpx_memcpy(mc_running_buffer[r], mc_running_avg_y, 4);
+    vpx_memcpy(mc_running_buffer[r] + 4, mc_running_avg_y +
+               mc_avg_y_stride, 4);
+    vpx_memcpy(mc_running_buffer[r] + 8, mc_running_avg_y +
+               mc_avg_y_stride * 2, 4);
+    vpx_memcpy(mc_running_buffer[r] + 12, mc_running_avg_y +
+               mc_avg_y_stride * 3, 4);
+    vpx_memcpy(running_buffer[r], running_avg_y, 4);
+    vpx_memcpy(running_buffer[r] + 4, running_avg_y +
+               avg_y_stride, 4);
+    vpx_memcpy(running_buffer[r] + 8, running_avg_y +
+               avg_y_stride * 2, 4);
+    vpx_memcpy(running_buffer[r] + 12, running_avg_y +
+               avg_y_stride * 3, 4);
+    acc_diff = vp9_denoiser_16x1_sse2(sig_buffer[r],
+                                      mc_running_buffer[r],
+                                      running_buffer[r],
+                                      &k_0, &k_4, &k_8, &k_16,
+                                      &l3, &l32, &l21, acc_diff);
+    vpx_memcpy(running_avg_y, running_buffer[r], 4);
+    vpx_memcpy(running_avg_y + avg_y_stride, running_buffer[r] + 4, 4);
+    vpx_memcpy(running_avg_y + avg_y_stride * 2,
+               running_buffer[r] + 8, 4);
+    vpx_memcpy(running_avg_y + avg_y_stride * 3,
+               running_buffer[r] + 12, 4);
+    // Update pointers for next iteration.
+    sig += (sig_stride << 2);
+    mc_running_avg_y += (mc_avg_y_stride << 2);
+    running_avg_y += (avg_y_stride << 2);
+  }
+
+  {
+    sum_diff = sum_diff_16x1(acc_diff);
+    sum_diff_thresh = total_adj_strong_thresh(bs, increase_denoising);
+    if (abs(sum_diff) > sum_diff_thresh) {
+      // Before returning to copy the block (i.e., apply no denoising),
+      // check if we can still apply some (weaker) temporal filtering to
+      // this block, which would otherwise not be denoised at all. The
+      // simplest approach is to apply an additional adjustment to
+      // running_avg_y to bring it closer to sig. The adjustment is capped
+      // by a maximum delta and chosen such that in most cases the
+      // resulting sum_diff will be within the acceptable range given by
+      // sum_diff_thresh.
+
+      // The delta is set by the excess of absolute pixel diff over the
+      // threshold.
+      int delta = ((abs(sum_diff) - sum_diff_thresh)
+                  >> num_pels_log2_lookup[bs]) + 1;
+      // Only apply the adjustment for max delta up to 3.
+      if (delta < 4) {
+        const __m128i k_delta = _mm_set1_epi8(delta);
+        running_avg_y -= avg_y_stride * (4 << b_height_log2_lookup[bs]);
+        sum_diff = 0;
+        for (r = 0; r < ((4 << b_height_log2_lookup[bs]) >> 2); ++r) {
+          acc_diff = vp9_denoiser_adj_16x1_sse2(
+                             sig_buffer[r], mc_running_buffer[r],
+                             running_buffer[r], k_0, k_delta,
+                             acc_diff);
+          vpx_memcpy(running_avg_y, running_buffer[r], 4);
+          vpx_memcpy(running_avg_y + avg_y_stride, running_buffer[r] + 4, 4);
+          vpx_memcpy(running_avg_y + avg_y_stride * 2,
+                     running_buffer[r] + 8, 4);
+          vpx_memcpy(running_avg_y + avg_y_stride * 3,
+                     running_buffer[r] + 12, 4);
+          // Update pointers for next iteration.
+          running_avg_y += (avg_y_stride << 2);
+        }
+        sum_diff = sum_diff_16x1(acc_diff);
+        if (abs(sum_diff) > sum_diff_thresh) {
+          return COPY_BLOCK;
+        }
+      } else {
+        return COPY_BLOCK;
+      }
+    }
+  }
+  return FILTER_BLOCK;
+}
+
+static int vp9_denoiser_8xM_sse2(const uint8_t *sig, int sig_stride,
+                                 const uint8_t *mc_running_avg_y,
+                                 int mc_avg_y_stride,
+                                 uint8_t *running_avg_y, int avg_y_stride,
+                                 int increase_denoising,
+                                 BLOCK_SIZE bs,
+                                 int motion_magnitude) {
+  int sum_diff_thresh;
+  int r;
+  int shift_inc = (increase_denoising &&
+                  motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) ? 1 : 0;
+  unsigned char sig_buffer[8][16], mc_running_buffer[8][16],
+                running_buffer[8][16];
+  __m128i acc_diff = _mm_setzero_si128();
+  const __m128i k_0 = _mm_setzero_si128();
+  const __m128i k_4 = _mm_set1_epi8(4 + shift_inc);
+  const __m128i k_8 = _mm_set1_epi8(8);
+  const __m128i k_16 = _mm_set1_epi8(16);
+  // Modify each level's adjustment according to motion_magnitude.
+  const __m128i l3 = _mm_set1_epi8(
+                     (motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) ?
+                     7 + shift_inc : 6);
+  // Difference between level 3 and level 2 is 2.
+  const __m128i l32 = _mm_set1_epi8(2);
+  // Difference between level 2 and level 1 is 1.
+  const __m128i l21 = _mm_set1_epi8(1);
+  int sum_diff = 0;
+
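+  // Pack two 8-pixel rows into one 16-byte buffer per iteration so the
+  // 16x1 kernel above can be reused for 8-pixel-wide blocks.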
+  for (r = 0; r < ((4 << b_height_log2_lookup[bs]) >> 1); ++r) {
+    vpx_memcpy(sig_buffer[r], sig, 8);
+    vpx_memcpy(sig_buffer[r] + 8, sig + sig_stride, 8);
+    vpx_memcpy(mc_running_buffer[r], mc_running_avg_y, 8);
+    vpx_memcpy(mc_running_buffer[r] + 8, mc_running_avg_y +
+               mc_avg_y_stride, 8);
+    vpx_memcpy(running_buffer[r], running_avg_y, 8);
+    vpx_memcpy(running_buffer[r] + 8, running_avg_y +
+               avg_y_stride, 8);
+    acc_diff = vp9_denoiser_16x1_sse2(sig_buffer[r],
+                                      mc_running_buffer[r],
+                                      running_buffer[r],
+                                      &k_0, &k_4, &k_8, &k_16,
+                                      &l3, &l32, &l21, acc_diff);
+    vpx_memcpy(running_avg_y, running_buffer[r], 8);
+    vpx_memcpy(running_avg_y + avg_y_stride, running_buffer[r] + 8, 8);
+    // Update pointers for next iteration.
+    sig += (sig_stride << 1);
+    mc_running_avg_y += (mc_avg_y_stride << 1);
+    running_avg_y += (avg_y_stride << 1);
+  }
+
+  {
+    sum_diff = sum_diff_16x1(acc_diff);
+    sum_diff_thresh = total_adj_strong_thresh(bs, increase_denoising);
+    if (abs(sum_diff) > sum_diff_thresh) {
+      // Before returning to copy the block (i.e., apply no denoising),
+      // check if we can still apply some (weaker) temporal filtering to
+      // this block, which would otherwise not be denoised at all. The
+      // simplest approach is to apply an additional adjustment to
+      // running_avg_y to bring it closer to sig. The adjustment is capped
+      // by a maximum delta and chosen such that in most cases the
+      // resulting sum_diff will be within the acceptable range given by
+      // sum_diff_thresh.
+
+      // The delta is set by the excess of absolute pixel diff over the
+      // threshold.
+      int delta = ((abs(sum_diff) - sum_diff_thresh)
+                  >> num_pels_log2_lookup[bs]) + 1;
+      // Only apply the adjustment for max delta up to 3.
+      if (delta < 4) {
+        const __m128i k_delta = _mm_set1_epi8(delta);
+        running_avg_y -= avg_y_stride * (4 << b_height_log2_lookup[bs]);
+        for (r = 0; r < ((4 << b_height_log2_lookup[bs]) >> 1); ++r) {
+          acc_diff = vp9_denoiser_adj_16x1_sse2(
+                           sig_buffer[r], mc_running_buffer[r],
+                           running_buffer[r], k_0, k_delta,
+                           acc_diff);
+          vpx_memcpy(running_avg_y, running_buffer[r], 8);
+          vpx_memcpy(running_avg_y + avg_y_stride, running_buffer[r] + 8, 8);
+          // Update pointers for next iteration.
+          running_avg_y += (avg_y_stride << 1);
+        }
+        sum_diff = sum_diff_16x1(acc_diff);
+        if (abs(sum_diff) > sum_diff_thresh) {
+          return COPY_BLOCK;
+        }
+      } else {
+        return COPY_BLOCK;
+      }
+    }
+  }
+  return FILTER_BLOCK;
+}
+
+static int vp9_denoiser_64_32_16xM_sse2(const uint8_t *sig, int sig_stride,
+                                        const uint8_t *mc_running_avg_y,
+                                        int mc_avg_y_stride,
+                                        uint8_t *running_avg_y,
+                                        int avg_y_stride,
+                                        int increase_denoising, BLOCK_SIZE bs,
+                                        int motion_magnitude) {
+  int sum_diff_thresh;
+  int r, c;
+  int shift_inc = (increase_denoising &&
+                   motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) ? 1 : 0;
+  __m128i acc_diff[4][4];
+  const __m128i k_0 = _mm_setzero_si128();
+  const __m128i k_4 = _mm_set1_epi8(4 + shift_inc);
+  const __m128i k_8 = _mm_set1_epi8(8);
+  const __m128i k_16 = _mm_set1_epi8(16);
+  // Modify each level's adjustment according to motion_magnitude.
+  const __m128i l3 = _mm_set1_epi8(
+                     (motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) ?
+                     7 + shift_inc : 6);
+  // Difference between level 3 and level 2 is 2.
+  const __m128i l32 = _mm_set1_epi8(2);
+  // Difference between level 2 and level 1 is 1.
+  const __m128i l21 = _mm_set1_epi8(1);
+  int sum_diff = 0;
+
+  for (c = 0; c < 4; ++c) {
+    for (r = 0; r < 4; ++r) {
+      acc_diff[c][r] = _mm_setzero_si128();
+    }
+  }
+
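+  // Wider blocks are processed 16 pixels at a time with one accumulator per
+  // 16x16 tile: each signed-byte lane holds at most 16 rows of +/-7, so the
+  // accumulators are summed into sum_diff every 16 rows, well before they
+  // could saturate.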
+  for (r = 0; r < (4 << b_height_log2_lookup[bs]); r++) {
+    for (c = 0; c < (4 << b_width_log2_lookup[bs]); c += 16) {
+      acc_diff[c>>4][r>>4] = vp9_denoiser_16x1_sse2(
+                               sig, mc_running_avg_y,
+                               running_avg_y,
+                               &k_0, &k_4, &k_8, &k_16,
+                               &l3, &l32, &l21, acc_diff[c>>4][r>>4]);
+      // Update pointers for next iteration.
+      sig += 16;
+      mc_running_avg_y += 16;
+      running_avg_y += 16;
+    }
+
+    if ((r + 1) % 16 == 0 || (bs == BLOCK_16X8 && r == 7)) {
+      for (c = 0; c < (4 << b_width_log2_lookup[bs]); c += 16) {
+        sum_diff += sum_diff_16x1(acc_diff[c>>4][r>>4]);
+      }
+    }
+
+    // Update pointers for next iteration.
+    sig = sig - 16 * ((4 << b_width_log2_lookup[bs]) >> 4) + sig_stride;
+    mc_running_avg_y = mc_running_avg_y -
+                      16 * ((4 << b_width_log2_lookup[bs]) >> 4) +
+                      mc_avg_y_stride;
+    running_avg_y = running_avg_y -
+                    16 * ((4 << b_width_log2_lookup[bs]) >> 4) +
+                    avg_y_stride;
+  }
+
+  {
+    sum_diff_thresh = total_adj_strong_thresh(bs, increase_denoising);
+    if (abs(sum_diff) > sum_diff_thresh) {
+      int delta = ((abs(sum_diff) - sum_diff_thresh)
+                  >> num_pels_log2_lookup[bs]) + 1;
+
+      // Only apply the adjustment for max delta up to 3.
+      if (delta < 4) {
+        const __m128i k_delta = _mm_set1_epi8(delta);
+        sig -= sig_stride * (4 << b_height_log2_lookup[bs]);
+        mc_running_avg_y -= mc_avg_y_stride * (4 << b_height_log2_lookup[bs]);
+        running_avg_y -= avg_y_stride * (4 << b_height_log2_lookup[bs]);
+        sum_diff = 0;
+        for (r = 0; r < (4 << b_height_log2_lookup[bs]); ++r) {
+          for (c = 0; c < (4 << b_width_log2_lookup[bs]); c += 16) {
+            acc_diff[c>>4][r>>4] = vp9_denoiser_adj_16x1_sse2(
+                                             sig, mc_running_avg_y,
+                                             running_avg_y, k_0,
+                                             k_delta, acc_diff[c>>4][r>>4]);
+            // Update pointers for next iteration.
+            sig += 16;
+            mc_running_avg_y += 16;
+            running_avg_y += 16;
+          }
+
+          if ((r + 1) % 16 == 0 || (bs == BLOCK_16X8 && r == 7)) {
+            for (c = 0; c < (4 << b_width_log2_lookup[bs]); c += 16) {
+              sum_diff += sum_diff_16x1(acc_diff[c>>4][r>>4]);
+            }
+          }
+          sig = sig - 16 * ((4 << b_width_log2_lookup[bs]) >> 4) + sig_stride;
+          mc_running_avg_y = mc_running_avg_y -
+                             16 * ((4 << b_width_log2_lookup[bs]) >> 4) +
+                             mc_avg_y_stride;
+          running_avg_y = running_avg_y -
+                          16 * ((4 << b_width_log2_lookup[bs]) >> 4) +
+                          avg_y_stride;
+        }
+        if (abs(sum_diff) > sum_diff_thresh) {
+          return COPY_BLOCK;
+        }
+      } else {
+        return COPY_BLOCK;
+      }
+    }
+  }
+  return FILTER_BLOCK;
+}
+
+int vp9_denoiser_filter_sse2(const uint8_t *sig, int sig_stride,
+                             const uint8_t *mc_avg,
+                             int mc_avg_stride,
+                             uint8_t *avg, int avg_stride,
+                             int increase_denoising,
+                             BLOCK_SIZE bs,
+                             int motion_magnitude) {
+  if (bs == BLOCK_4X4 || bs == BLOCK_4X8) {
+    return vp9_denoiser_4xM_sse2(sig, sig_stride,
+                                 mc_avg, mc_avg_stride,
+                                 avg, avg_stride,
+                                 increase_denoising,
+                                 bs, motion_magnitude);
+  } else if (bs == BLOCK_8X4 || bs == BLOCK_8X8 || bs == BLOCK_8X16) {
+    return vp9_denoiser_8xM_sse2(sig, sig_stride,
+                                 mc_avg, mc_avg_stride,
+                                 avg, avg_stride,
+                                 increase_denoising,
+                                 bs, motion_magnitude);
+  } else if (bs == BLOCK_16X8 || bs == BLOCK_16X16 || bs == BLOCK_16X32 ||
+             bs == BLOCK_32X16 || bs == BLOCK_32X32 || bs == BLOCK_32X64 ||
+             bs == BLOCK_64X32 || bs == BLOCK_64X64) {
+    return vp9_denoiser_64_32_16xM_sse2(sig, sig_stride,
+                                        mc_avg, mc_avg_stride,
+                                        avg, avg_stride,
+                                        increase_denoising,
+                                        bs, motion_magnitude);
+  } else {
+    return COPY_BLOCK;
+  }
+}
diff --git a/source/libvpx/vp9/encoder/x86/vp9_quantize_sse2.c b/source/libvpx/vp9/encoder/x86/vp9_quantize_sse2.c
new file mode 100644
index 0000000..7c1c884
--- /dev/null
+++ b/source/libvpx/vp9/encoder/x86/vp9_quantize_sse2.c
@@ -0,0 +1,225 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <emmintrin.h>
+#include <xmmintrin.h>
+
+#include "vpx/vpx_integer.h"
+
+void vp9_quantize_b_sse2(const int16_t* coeff_ptr, intptr_t n_coeffs,
+                         int skip_block, const int16_t* zbin_ptr,
+                         const int16_t* round_ptr, const int16_t* quant_ptr,
+                         const int16_t* quant_shift_ptr, int16_t* qcoeff_ptr,
+                         int16_t* dqcoeff_ptr, const int16_t* dequant_ptr,
+                         int zbin_oq_value, uint16_t* eob_ptr,
+                         const int16_t* scan_ptr,
+                         const int16_t* iscan_ptr) {
+  __m128i zero;
+  (void)scan_ptr;
+
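+  // Advance every pointer to the end of its buffer and negate n_coeffs so
+  // the loops below can index with a negative offset that counts up to zero.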
+  coeff_ptr += n_coeffs;
+  iscan_ptr += n_coeffs;
+  qcoeff_ptr += n_coeffs;
+  dqcoeff_ptr += n_coeffs;
+  n_coeffs = -n_coeffs;
+  zero = _mm_setzero_si128();
+  if (!skip_block) {
+    __m128i eob;
+    __m128i zbin;
+    __m128i round, quant, dequant, shift;
+    {
+      __m128i coeff0, coeff1;
+
+      // Set up global values.
+      {
+        __m128i zbin_oq;
+        __m128i pw_1;
+        zbin_oq = _mm_set1_epi16(zbin_oq_value);
+        zbin = _mm_load_si128((const __m128i*)zbin_ptr);
+        round = _mm_load_si128((const __m128i*)round_ptr);
+        quant = _mm_load_si128((const __m128i*)quant_ptr);
+        zbin = _mm_add_epi16(zbin, zbin_oq);
+        pw_1 = _mm_set1_epi16(1);
+        zbin = _mm_sub_epi16(zbin, pw_1);
+        dequant = _mm_load_si128((const __m128i*)dequant_ptr);
+        shift = _mm_load_si128((const __m128i*)quant_shift_ptr);
+      }
+
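+      // Each *_ptr table holds {DC, AC, AC, ...}; the first pass below uses
+      // the DC lane, then _mm_unpackhi_epi64 broadcasts the AC half for all
+      // remaining coefficients.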
+      {
+        __m128i coeff0_sign, coeff1_sign;
+        __m128i qcoeff0, qcoeff1;
+        __m128i qtmp0, qtmp1;
+        __m128i cmp_mask0, cmp_mask1;
+        // Do DC and first 15 AC
+        coeff0 = _mm_load_si128((const __m128i*)(coeff_ptr + n_coeffs));
+        coeff1 = _mm_load_si128((const __m128i*)(coeff_ptr + n_coeffs) + 1);
+
+        // Poor man's sign extract
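+        // abs(x) == (x ^ (x >> 15)) - (x >> 15) for 16-bit lanes.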
+        coeff0_sign = _mm_srai_epi16(coeff0, 15);
+        coeff1_sign = _mm_srai_epi16(coeff1, 15);
+        qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign);
+        qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign);
+        qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
+        qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
+
+        cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin);
+        zbin = _mm_unpackhi_epi64(zbin, zbin);  // Switch DC to AC
+        cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin);
+        qcoeff0 = _mm_adds_epi16(qcoeff0, round);
+        round = _mm_unpackhi_epi64(round, round);
+        qcoeff1 = _mm_adds_epi16(qcoeff1, round);
+        qtmp0 = _mm_mulhi_epi16(qcoeff0, quant);
+        quant = _mm_unpackhi_epi64(quant, quant);
+        qtmp1 = _mm_mulhi_epi16(qcoeff1, quant);
+        qtmp0 = _mm_add_epi16(qtmp0, qcoeff0);
+        qtmp1 = _mm_add_epi16(qtmp1, qcoeff1);
+        qcoeff0 = _mm_mulhi_epi16(qtmp0, shift);
+        shift = _mm_unpackhi_epi64(shift, shift);
+        qcoeff1 = _mm_mulhi_epi16(qtmp1, shift);
+
+        // Reinsert signs
+        qcoeff0 = _mm_xor_si128(qcoeff0, coeff0_sign);
+        qcoeff1 = _mm_xor_si128(qcoeff1, coeff1_sign);
+        qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
+        qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
+
+        // Mask out zbin threshold coeffs
+        qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
+        qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);
+
+        _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs), qcoeff0);
+        _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs) + 1, qcoeff1);
+
+        coeff0 = _mm_mullo_epi16(qcoeff0, dequant);
+        dequant = _mm_unpackhi_epi64(dequant, dequant);
+        coeff1 = _mm_mullo_epi16(qcoeff1, dequant);
+
+        _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs), coeff0);
+        _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs) + 1, coeff1);
+      }
+
+      {
+        // Scan for eob
+        __m128i zero_coeff0, zero_coeff1;
+        __m128i nzero_coeff0, nzero_coeff1;
+        __m128i iscan0, iscan1;
+        __m128i eob1;
+        zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero);
+        zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero);
+        nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero);
+        nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero);
+        iscan0 = _mm_load_si128((const __m128i*)(iscan_ptr + n_coeffs));
+        iscan1 = _mm_load_si128((const __m128i*)(iscan_ptr + n_coeffs) + 1);
+        // Add one to convert from indices to counts
+        iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0);
+        iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1);
+        eob = _mm_and_si128(iscan0, nzero_coeff0);
+        eob1 = _mm_and_si128(iscan1, nzero_coeff1);
+        eob = _mm_max_epi16(eob, eob1);
+      }
+      n_coeffs += 8 * 2;
+    }
+
+    // AC only loop
+    while (n_coeffs < 0) {
+      __m128i coeff0, coeff1;
+      {
+        __m128i coeff0_sign, coeff1_sign;
+        __m128i qcoeff0, qcoeff1;
+        __m128i qtmp0, qtmp1;
+        __m128i cmp_mask0, cmp_mask1;
+
+        coeff0 = _mm_load_si128((const __m128i*)(coeff_ptr + n_coeffs));
+        coeff1 = _mm_load_si128((const __m128i*)(coeff_ptr + n_coeffs) + 1);
+
+        // Poor man's sign extract
+        coeff0_sign = _mm_srai_epi16(coeff0, 15);
+        coeff1_sign = _mm_srai_epi16(coeff1, 15);
+        qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign);
+        qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign);
+        qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
+        qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
+
+        cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin);
+        cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin);
+        qcoeff0 = _mm_adds_epi16(qcoeff0, round);
+        qcoeff1 = _mm_adds_epi16(qcoeff1, round);
+        qtmp0 = _mm_mulhi_epi16(qcoeff0, quant);
+        qtmp1 = _mm_mulhi_epi16(qcoeff1, quant);
+        qtmp0 = _mm_add_epi16(qtmp0, qcoeff0);
+        qtmp1 = _mm_add_epi16(qtmp1, qcoeff1);
+        qcoeff0 = _mm_mulhi_epi16(qtmp0, shift);
+        qcoeff1 = _mm_mulhi_epi16(qtmp1, shift);
+
+        // Reinsert signs
+        qcoeff0 = _mm_xor_si128(qcoeff0, coeff0_sign);
+        qcoeff1 = _mm_xor_si128(qcoeff1, coeff1_sign);
+        qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
+        qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
+
+        // Mask out zbin threshold coeffs
+        qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
+        qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);
+
+        _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs), qcoeff0);
+        _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs) + 1, qcoeff1);
+
+        coeff0 = _mm_mullo_epi16(qcoeff0, dequant);
+        coeff1 = _mm_mullo_epi16(qcoeff1, dequant);
+
+        _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs), coeff0);
+        _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs) + 1, coeff1);
+      }
+
+      {
+        // Scan for eob
+        __m128i zero_coeff0, zero_coeff1;
+        __m128i nzero_coeff0, nzero_coeff1;
+        __m128i iscan0, iscan1;
+        __m128i eob0, eob1;
+        zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero);
+        zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero);
+        nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero);
+        nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero);
+        iscan0 = _mm_load_si128((const __m128i*)(iscan_ptr + n_coeffs));
+        iscan1 = _mm_load_si128((const __m128i*)(iscan_ptr + n_coeffs) + 1);
+        // Add one to convert from indices to counts
+        iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0);
+        iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1);
+        eob0 = _mm_and_si128(iscan0, nzero_coeff0);
+        eob1 = _mm_and_si128(iscan1, nzero_coeff1);
+        eob0 = _mm_max_epi16(eob0, eob1);
+        eob = _mm_max_epi16(eob, eob0);
+      }
+      n_coeffs += 8 * 2;
+    }
+
+    // Accumulate EOB
+    {
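+      // Horizontal max across the eight 16-bit lanes of eob.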
+      __m128i eob_shuffled;
+      eob_shuffled = _mm_shuffle_epi32(eob, 0xe);
+      eob = _mm_max_epi16(eob, eob_shuffled);
+      eob_shuffled = _mm_shufflelo_epi16(eob, 0xe);
+      eob = _mm_max_epi16(eob, eob_shuffled);
+      eob_shuffled = _mm_shufflelo_epi16(eob, 0x1);
+      eob = _mm_max_epi16(eob, eob_shuffled);
+      *eob_ptr = _mm_extract_epi16(eob, 1);
+    }
+  } else {
+    do {
+      _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs), zero);
+      _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs) + 1, zero);
+      _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs), zero);
+      _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs) + 1, zero);
+      n_coeffs += 8 * 2;
+    } while (n_coeffs < 0);
+    *eob_ptr = 0;
+  }
+}
diff --git a/source/libvpx/vp9/encoder/x86/vp9_sad_intrin_avx2.c b/source/libvpx/vp9/encoder/x86/vp9_sad_intrin_avx2.c
new file mode 100644
index 0000000..1131930
--- /dev/null
+++ b/source/libvpx/vp9/encoder/x86/vp9_sad_intrin_avx2.c
@@ -0,0 +1,180 @@
+/*
+ *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+#include <immintrin.h>
+#include "vpx_ports/mem.h"
+
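+// The FSAD*_H macros below expand to one full vp9_sad<W>x<h>_avx2 function
+// per block height; FSAD64/FSAD32 then instantiate them for each size.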
+#define FSAD64_H(h) \
+unsigned int vp9_sad64x##h##_avx2(const uint8_t *src_ptr, \
+                                  int src_stride, \
+                                  const uint8_t *ref_ptr, \
+                                  int ref_stride) { \
+  int i, res; \
+  __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg; \
+  __m256i sum_sad = _mm256_setzero_si256(); \
+  __m256i sum_sad_h; \
+  __m128i sum_sad128; \
+  for (i = 0; i < h; i++) { \
+    ref1_reg = _mm256_loadu_si256((__m256i const *)ref_ptr); \
+    ref2_reg = _mm256_loadu_si256((__m256i const *)(ref_ptr + 32)); \
+    sad1_reg = _mm256_sad_epu8(ref1_reg, \
+               _mm256_loadu_si256((__m256i const *)src_ptr)); \
+    sad2_reg = _mm256_sad_epu8(ref2_reg, \
+               _mm256_loadu_si256((__m256i const *)(src_ptr + 32))); \
+    sum_sad = _mm256_add_epi32(sum_sad, _mm256_add_epi32(sad1_reg, sad2_reg)); \
+    ref_ptr += ref_stride; \
+    src_ptr += src_stride; \
+  } \
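+  /* Reduce the 64-bit partial SADs in sum_sad to a single 32-bit total. */ \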
+  sum_sad_h = _mm256_srli_si256(sum_sad, 8); \
+  sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h); \
+  sum_sad128 = _mm256_extracti128_si256(sum_sad, 1); \
+  sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128); \
+  res = _mm_cvtsi128_si32(sum_sad128); \
+  return res; \
+}
+
+#define FSAD32_H(h) \
+unsigned int vp9_sad32x##h##_avx2(const uint8_t *src_ptr, \
+                                  int src_stride, \
+                                  const uint8_t *ref_ptr, \
+                                  int ref_stride) { \
+  int i, res; \
+  __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg; \
+  __m256i sum_sad = _mm256_setzero_si256(); \
+  __m256i sum_sad_h; \
+  __m128i sum_sad128; \
+  int ref2_stride = ref_stride << 1; \
+  int src2_stride = src_stride << 1; \
+  int max = h >> 1; \
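+  /* Each iteration processes two 32-byte rows, hence h / 2 iterations. */ \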
+  for (i = 0; i < max; i++) { \
+    ref1_reg = _mm256_loadu_si256((__m256i const *)ref_ptr); \
+    ref2_reg = _mm256_loadu_si256((__m256i const *)(ref_ptr + ref_stride)); \
+    sad1_reg = _mm256_sad_epu8(ref1_reg, \
+               _mm256_loadu_si256((__m256i const *)src_ptr)); \
+    sad2_reg = _mm256_sad_epu8(ref2_reg, \
+               _mm256_loadu_si256((__m256i const *)(src_ptr + src_stride))); \
+    sum_sad = _mm256_add_epi32(sum_sad, _mm256_add_epi32(sad1_reg, sad2_reg)); \
+    ref_ptr += ref2_stride; \
+    src_ptr += src2_stride; \
+  } \
+  sum_sad_h = _mm256_srli_si256(sum_sad, 8); \
+  sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h); \
+  sum_sad128 = _mm256_extracti128_si256(sum_sad, 1); \
+  sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128); \
+  res = _mm_cvtsi128_si32(sum_sad128); \
+  return res; \
+}
+
+#define FSAD64 \
+FSAD64_H(64); \
+FSAD64_H(32);
+
+#define FSAD32 \
+FSAD32_H(64); \
+FSAD32_H(32); \
+FSAD32_H(16);
+
+FSAD64;
+FSAD32;
+
+#undef FSAD64
+#undef FSAD32
+#undef FSAD64_H
+#undef FSAD32_H
+
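+// The FSADAVG*_H macros emit the compound-prediction variants: the reference
+// rows are first averaged with second_pred before the SAD is taken.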
+#define FSADAVG64_H(h) \
+unsigned int vp9_sad64x##h##_avg_avx2(const uint8_t *src_ptr, \
+                                      int src_stride, \
+                                      const uint8_t *ref_ptr, \
+                                      int ref_stride, \
+                                      const uint8_t *second_pred) { \
+  int i, res; \
+  __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg; \
+  __m256i sum_sad = _mm256_setzero_si256(); \
+  __m256i sum_sad_h; \
+  __m128i sum_sad128; \
+  for (i = 0; i < h; i++) { \
+    ref1_reg = _mm256_loadu_si256((__m256i const *)ref_ptr); \
+    ref2_reg = _mm256_loadu_si256((__m256i const *)(ref_ptr + 32)); \
+    ref1_reg = _mm256_avg_epu8(ref1_reg, \
+               _mm256_loadu_si256((__m256i const *)second_pred)); \
+    ref2_reg = _mm256_avg_epu8(ref2_reg, \
+               _mm256_loadu_si256((__m256i const *)(second_pred + 32))); \
+    sad1_reg = _mm256_sad_epu8(ref1_reg, \
+               _mm256_loadu_si256((__m256i const *)src_ptr)); \
+    sad2_reg = _mm256_sad_epu8(ref2_reg, \
+               _mm256_loadu_si256((__m256i const *)(src_ptr + 32))); \
+    sum_sad = _mm256_add_epi32(sum_sad, _mm256_add_epi32(sad1_reg, sad2_reg)); \
+    ref_ptr += ref_stride; \
+    src_ptr += src_stride; \
+    second_pred += 64; \
+  } \
+  sum_sad_h = _mm256_srli_si256(sum_sad, 8); \
+  sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h); \
+  sum_sad128 = _mm256_extracti128_si256(sum_sad, 1); \
+  sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128); \
+  res = _mm_cvtsi128_si32(sum_sad128); \
+  return res; \
+}
+
+#define FSADAVG32_H(h) \
+unsigned int vp9_sad32x##h##_avg_avx2(const uint8_t *src_ptr, \
+                                      int src_stride, \
+                                      const uint8_t *ref_ptr, \
+                                      int ref_stride, \
+                                      const uint8_t *second_pred) { \
+  int i, res; \
+  __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg; \
+  __m256i sum_sad = _mm256_setzero_si256(); \
+  __m256i sum_sad_h; \
+  __m128i sum_sad128; \
+  int ref2_stride = ref_stride << 1; \
+  int src2_stride = src_stride << 1; \
+  int max = h >> 1; \
+  for (i = 0; i < max; i++) { \
+    ref1_reg = _mm256_loadu_si256((__m256i const *)ref_ptr); \
+    ref2_reg = _mm256_loadu_si256((__m256i const *)(ref_ptr + ref_stride)); \
+    ref1_reg = _mm256_avg_epu8(ref1_reg, \
+               _mm256_loadu_si256((__m256i const *)second_pred)); \
+    ref2_reg = _mm256_avg_epu8(ref2_reg, \
+               _mm256_loadu_si256((__m256i const *)(second_pred + 32))); \
+    sad1_reg = _mm256_sad_epu8(ref1_reg, \
+               _mm256_loadu_si256((__m256i const *)src_ptr)); \
+    sad2_reg = _mm256_sad_epu8(ref2_reg, \
+               _mm256_loadu_si256((__m256i const *)(src_ptr + src_stride))); \
+    sum_sad = _mm256_add_epi32(sum_sad, \
+              _mm256_add_epi32(sad1_reg, sad2_reg)); \
+    ref_ptr += ref2_stride; \
+    src_ptr += src2_stride; \
+    second_pred += 64; \
+  } \
+  sum_sad_h = _mm256_srli_si256(sum_sad, 8); \
+  sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h); \
+  sum_sad128 = _mm256_extracti128_si256(sum_sad, 1); \
+  sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128); \
+  res = _mm_cvtsi128_si32(sum_sad128); \
+  return res; \
+}
+
+#define FSADAVG64 \
+FSADAVG64_H(64); \
+FSADAVG64_H(32);
+
+#define FSADAVG32 \
+FSADAVG32_H(64); \
+FSADAVG32_H(32); \
+FSADAVG32_H(16);
+
+FSADAVG64;
+FSADAVG32;
+
+#undef FSADAVG64
+#undef FSADAVG32
+#undef FSADAVG64_H
+#undef FSADAVG32_H
diff --git a/source/libvpx/vp9/vp9_common.mk b/source/libvpx/vp9/vp9_common.mk
index 07a3be8..9414120 100644
--- a/source/libvpx/vp9/vp9_common.mk
+++ b/source/libvpx/vp9/vp9_common.mk
@@ -93,6 +93,7 @@
 VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_high_intrapred_sse2.asm
 VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_high_subpixel_8t_sse2.asm
 VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_high_subpixel_bilinear_sse2.asm
+VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_high_loopfilter_intrin_sse2.c
 endif
 
 # common (c)
diff --git a/source/libvpx/vp9/vp9_cx_iface.c b/source/libvpx/vp9/vp9_cx_iface.c
index fbf4aa2..d0ca524 100644
--- a/source/libvpx/vp9/vp9_cx_iface.c
+++ b/source/libvpx/vp9/vp9_cx_iface.c
@@ -188,11 +188,9 @@
     }
     if (alt_ref_sum > REF_FRAMES - cfg->ss_number_layers)
       ERROR("Not enough ref buffers for svc alt ref frames");
-    if ((cfg->ss_number_layers > 3 ||
-         cfg->ss_number_layers * cfg->ts_number_layers > 4) &&
+    if (cfg->ss_number_layers * cfg->ts_number_layers > 3 &&
         cfg->g_error_resilient == 0)
-    ERROR("Multiple frame context are not supported for more than 3 spatial "
-          "layers or more than 4 spatial x temporal layers");
+    ERROR("Multiple frame context are not supported for more than 3 layers");
   }
 #endif
 
@@ -274,27 +272,49 @@
   }
 
 #if !CONFIG_VP9_HIGHBITDEPTH
-  if (cfg->g_profile > (unsigned int)PROFILE_1)
+  if (cfg->g_profile > (unsigned int)PROFILE_1) {
     ERROR("Profile > 1 not supported in this build configuration");
+  }
 #endif
   if (cfg->g_profile <= (unsigned int)PROFILE_1 &&
-      extra_cfg->bit_depth > VPX_BITS_8)
+      cfg->g_bit_depth > VPX_BITS_8) {
     ERROR("Codec high bit-depth not supported in profile < 2");
+  }
+  if (cfg->g_profile <= (unsigned int)PROFILE_1 &&
+      cfg->g_input_bit_depth > 8) {
+    ERROR("Source high bit-depth not supported in profile < 2");
+  }
   if (cfg->g_profile > (unsigned int)PROFILE_1 &&
-      extra_cfg->bit_depth == VPX_BITS_8)
+      cfg->g_bit_depth == VPX_BITS_8) {
     ERROR("Codec bit-depth 8 not supported in profile > 1");
+  }
 
   return VPX_CODEC_OK;
 }
 
-
 static vpx_codec_err_t validate_img(vpx_codec_alg_priv_t *ctx,
                                     const vpx_image_t *img) {
   switch (img->fmt) {
     case VPX_IMG_FMT_YV12:
     case VPX_IMG_FMT_I420:
+    case VPX_IMG_FMT_I42016:
+      break;
     case VPX_IMG_FMT_I422:
     case VPX_IMG_FMT_I444:
+    case VPX_IMG_FMT_I440:
+      if (ctx->cfg.g_profile != (unsigned int)PROFILE_1) {
+        ERROR("Invalid image format. I422, I444, I440 images are "
+              "not supported in profile.");
+      }
+      break;
+    case VPX_IMG_FMT_I42216:
+    case VPX_IMG_FMT_I44416:
+    case VPX_IMG_FMT_I44016:
+      if (ctx->cfg.g_profile != (unsigned int)PROFILE_1 &&
+          ctx->cfg.g_profile != (unsigned int)PROFILE_3) {
+        ERROR("Invalid image format. 16-bit I422, I444, I440 images are "
+              "not supported in profile.");
+      }
       break;
     default:
       ERROR("Invalid image format. Only YV12, I420, I422, I444 images are "
@@ -314,9 +334,11 @@
     case VPX_IMG_FMT_I420: return 12;
     case VPX_IMG_FMT_I422: return 16;
     case VPX_IMG_FMT_I444: return 24;
+    case VPX_IMG_FMT_I440: return 16;
     case VPX_IMG_FMT_I42016: return 24;
     case VPX_IMG_FMT_I42216: return 32;
     case VPX_IMG_FMT_I44416: return 48;
+    case VPX_IMG_FMT_I44016: return 32;
     default: assert(0 && "Invalid image format"); break;
   }
   return 0;
@@ -330,7 +352,7 @@
   oxcf->profile = cfg->g_profile;
   oxcf->width   = cfg->g_w;
   oxcf->height  = cfg->g_h;
-  oxcf->bit_depth = extra_cfg->bit_depth;
+  oxcf->bit_depth = cfg->g_bit_depth;
   oxcf->input_bit_depth = cfg->g_input_bit_depth;
   // guess a frame rate if out of whack, use 30
   oxcf->init_framerate = (double)cfg->g_timebase.den / cfg->g_timebase.num;
diff --git a/source/libvpx/vp9/vp9_dx_iface.c b/source/libvpx/vp9/vp9_dx_iface.c
index 393c66e..85e32d3 100644
--- a/source/libvpx/vp9/vp9_dx_iface.c
+++ b/source/libvpx/vp9/vp9_dx_iface.c
@@ -437,7 +437,6 @@
     // call to get_frame.
     if (!(*iter)) {
       img = &ctx->img;
-      img->bit_depth = (int)ctx->pbi->common.bit_depth;
       *iter = img;
     }
   }
diff --git a/source/libvpx/vp9/vp9_iface_common.h b/source/libvpx/vp9/vp9_iface_common.h
index fc98b62..00fbfdd 100644
--- a/source/libvpx/vp9/vp9_iface_common.h
+++ b/source/libvpx/vp9/vp9_iface_common.h
@@ -16,11 +16,9 @@
     * the Y, U, and V planes, nor other alignment adjustments that
     * might be representable by a YV12_BUFFER_CONFIG, so we just
     * initialize all the fields.*/
-  const int ss_x = yv12->uv_crop_width < yv12->y_crop_width;
-  const int ss_y = yv12->uv_crop_height < yv12->y_crop_height;
   int bps;
-  if (!ss_y) {
-    if (!ss_x) {
+  if (!yv12->subsampling_y) {
+    if (!yv12->subsampling_x) {
       img->fmt = VPX_IMG_FMT_I444;
       bps = 24;
     } else {
@@ -28,16 +26,21 @@
       bps = 16;
     }
   } else {
-    img->fmt = VPX_IMG_FMT_I420;
-    bps = 12;
+    if (!yv12->subsampling_x) {
+      img->fmt = VPX_IMG_FMT_I440;
+      bps = 16;
+    } else {
+      img->fmt = VPX_IMG_FMT_I420;
+      bps = 12;
+    }
   }
   img->bit_depth = 8;
   img->w = yv12->y_stride;
   img->h = ALIGN_POWER_OF_TWO(yv12->y_height + 2 * VP9_ENC_BORDER_IN_PIXELS, 3);
   img->d_w = yv12->y_crop_width;
   img->d_h = yv12->y_crop_height;
-  img->x_chroma_shift = ss_x;
-  img->y_chroma_shift = ss_y;
+  img->x_chroma_shift = yv12->subsampling_x;
+  img->y_chroma_shift = yv12->subsampling_y;
   img->planes[VPX_PLANE_Y] = yv12->y_buffer;
   img->planes[VPX_PLANE_U] = yv12->u_buffer;
   img->planes[VPX_PLANE_V] = yv12->v_buffer;
@@ -46,6 +49,22 @@
   img->stride[VPX_PLANE_U] = yv12->uv_stride;
   img->stride[VPX_PLANE_V] = yv12->uv_stride;
   img->stride[VPX_PLANE_ALPHA] = yv12->y_stride;
+#if CONFIG_VP9_HIGHBITDEPTH
+  if (yv12->flags & YV12_FLAG_HIGHBITDEPTH) {
+    // vpx_image_t uses byte strides and a pointer to the first byte
+    // of the image.
+    img->fmt |= VPX_IMG_FMT_HIGHBITDEPTH;
+    img->bit_depth = yv12->bit_depth;
+    img->planes[VPX_PLANE_Y] = (uint8_t*)CONVERT_TO_SHORTPTR(yv12->y_buffer);
+    img->planes[VPX_PLANE_U] = (uint8_t*)CONVERT_TO_SHORTPTR(yv12->u_buffer);
+    img->planes[VPX_PLANE_V] = (uint8_t*)CONVERT_TO_SHORTPTR(yv12->v_buffer);
+    img->planes[VPX_PLANE_ALPHA] = NULL;
+    img->stride[VPX_PLANE_Y] = 2 * yv12->y_stride;
+    img->stride[VPX_PLANE_U] = 2 * yv12->uv_stride;
+    img->stride[VPX_PLANE_V] = 2 * yv12->uv_stride;
+    img->stride[VPX_PLANE_ALPHA] = 2 * yv12->y_stride;
+  }
+#endif  // CONFIG_VP9_HIGHBITDEPTH
   img->bps = bps;
   img->user_priv = user_priv;
   img->img_data = yv12->buffer_alloc;
@@ -68,11 +87,39 @@
                                             : yv12->y_width;
   yv12->uv_height = img->y_chroma_shift == 1 ? (1 + yv12->y_height) / 2
                                              : yv12->y_height;
+  yv12->uv_crop_width = yv12->uv_width;
+  yv12->uv_crop_height = yv12->uv_height;
 
   yv12->y_stride = img->stride[VPX_PLANE_Y];
   yv12->uv_stride = img->stride[VPX_PLANE_U];
 
+#if CONFIG_VP9_HIGHBITDEPTH
+  if (img->fmt & VPX_IMG_FMT_HIGHBITDEPTH) {
+    // In vpx_image_t
+    //     planes point to uint8 address of start of data
+    //     stride counts uint8s to reach next row
+    // In YV12_BUFFER_CONFIG
+    //     y_buffer, u_buffer, v_buffer point to uint16 address of data
+    //     stride and border are counted in uint16s
+    // This means that all the address calculations in the main body of code
+    // should work correctly.
+    // However, before we do any pixel operations we need to cast the address
+    // to a uint16 pointer and double its value.
+    yv12->y_buffer = CONVERT_TO_BYTEPTR(yv12->y_buffer);
+    yv12->u_buffer = CONVERT_TO_BYTEPTR(yv12->u_buffer);
+    yv12->v_buffer = CONVERT_TO_BYTEPTR(yv12->v_buffer);
+    yv12->y_stride >>= 1;
+    yv12->uv_stride >>= 1;
+    yv12->flags = YV12_FLAG_HIGHBITDEPTH;
+  } else {
+    yv12->flags = 0;
+  }
+  yv12->border  = (yv12->y_stride - img->w) / 2;
+#else
   yv12->border  = (img->stride[VPX_PLANE_Y] - img->w) / 2;
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+  yv12->subsampling_x = img->x_chroma_shift;
+  yv12->subsampling_y = img->y_chroma_shift;
   return VPX_CODEC_OK;
 }
 
diff --git a/source/libvpx/vp9/vp9cx.mk b/source/libvpx/vp9/vp9cx.mk
index e450f7b..e72cb00 100644
--- a/source/libvpx/vp9/vp9cx.mk
+++ b/source/libvpx/vp9/vp9cx.mk
@@ -17,6 +17,7 @@
 
 VP9_CX_SRCS-yes += vp9_cx_iface.c
 
+VP9_CX_SRCS-yes += encoder/vp9_avg.c
 VP9_CX_SRCS-yes += encoder/vp9_bitstream.c
 VP9_CX_SRCS-yes += encoder/vp9_context_tree.c
 VP9_CX_SRCS-yes += encoder/vp9_context_tree.h
@@ -95,10 +96,12 @@
 
 VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_variance_impl_intrin_avx2.c
 VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_sad4d_sse2.asm
+VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_avg_intrin_sse2.c
 VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_sad4d_intrin_avx2.c
 VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_subpel_variance_impl_intrin_avx2.c
 VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_temporal_filter_apply_sse2.asm
 VP9_CX_SRCS-$(HAVE_SSE3) += encoder/x86/vp9_sad_sse3.asm
+VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_quantize_sse2.c
 
 ifeq ($(CONFIG_USE_X86INC),yes)
 VP9_CX_SRCS-$(HAVE_MMX) += encoder/x86/vp9_dct_mmx.asm
@@ -115,11 +118,16 @@
 endif
 VP9_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/vp9_sad_ssse3.asm
 VP9_CX_SRCS-$(HAVE_SSE4_1) += encoder/x86/vp9_sad_sse4.asm
+VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_sad_intrin_avx2.c
 VP9_CX_SRCS-$(ARCH_X86_64) += encoder/x86/vp9_ssim_opt_x86_64.asm
 
 VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_dct_sse2.c
 VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_dct32x32_sse2.c
 
+ifeq ($(CONFIG_VP9_TEMPORAL_DENOISING),yes)
+VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_denoiser_sse2.c
+endif
+
 VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_dct32x32_avx2.c
 VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_dct_avx2.c
 VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_error_intrin_avx2.c
diff --git a/source/libvpx/vpx/src/svc_encodeframe.c b/source/libvpx/vpx/src/svc_encodeframe.c
index 773087d..fa3409c 100644
--- a/source/libvpx/vpx/src/svc_encodeframe.c
+++ b/source/libvpx/vpx/src/svc_encodeframe.c
@@ -350,7 +350,7 @@
       }
     }
 
-    for (i = 0; i < svc_ctx->spatial_layers; ++i) {
+    for (i = 0; i < VPX_SS_MAX_LAYERS; ++i) {
       if (total > 0) {
         enc_cfg->ss_target_bitrate[i] = (unsigned int)
             (enc_cfg->rc_target_bitrate * alloc_ratio[i] / total);
diff --git a/source/libvpx/vpx/src/vpx_image.c b/source/libvpx/vpx/src/vpx_image.c
index e58b61e..9aae12c 100644
--- a/source/libvpx/vpx/src/vpx_image.c
+++ b/source/libvpx/vpx/src/vpx_image.c
@@ -15,16 +15,16 @@
 #include "vpx/vpx_integer.h"
 #include "vpx_mem/vpx_mem.h"
 
-static vpx_image_t *img_alloc_helper(vpx_image_t   *img,
-                                     vpx_img_fmt_t  fmt,
-                                     unsigned int   d_w,
-                                     unsigned int   d_h,
-                                     unsigned int   buf_align,
-                                     unsigned int   stride_align,
+static vpx_image_t *img_alloc_helper(vpx_image_t *img,
+                                     vpx_img_fmt_t fmt,
+                                     unsigned int d_w,
+                                     unsigned int d_h,
+                                     unsigned int buf_align,
+                                     unsigned int stride_align,
                                      unsigned char *img_data) {
-
-  unsigned int  h, w, s, xcs, ycs, bps;
-  int           align;
+  unsigned int h, w, s, xcs, ycs, bps;
+  unsigned int stride_in_bytes;
+  int align;
 
   /* Treat align==0 like align==1 */
   if (!buf_align)
@@ -70,6 +70,7 @@
       bps = 12;
       break;
     case VPX_IMG_FMT_I422:
+    case VPX_IMG_FMT_I440:
       bps = 16;
       break;
     case VPX_IMG_FMT_I444:
@@ -79,6 +80,7 @@
       bps = 24;
       break;
     case VPX_IMG_FMT_I42216:
+    case VPX_IMG_FMT_I44016:
       bps = 32;
       break;
     case VPX_IMG_FMT_I44416:
@@ -107,9 +109,12 @@
 
   switch (fmt) {
     case VPX_IMG_FMT_I420:
+    case VPX_IMG_FMT_I440:
     case VPX_IMG_FMT_YV12:
     case VPX_IMG_FMT_VPXI420:
     case VPX_IMG_FMT_VPXYV12:
+    case VPX_IMG_FMT_I42016:
+    case VPX_IMG_FMT_I44016:
       ycs = 1;
       break;
     default:
@@ -124,6 +129,7 @@
   h = (d_h + align) & ~align;
   s = (fmt & VPX_IMG_FMT_PLANAR) ? w : bps * w / 8;
   s = (s + stride_align - 1) & ~(stride_align - 1);
+  stride_in_bytes = (fmt & VPX_IMG_FMT_HIGHBITDEPTH) ? s * 2 : s;
 
   /* Allocate the new image */
   if (!img) {
@@ -162,8 +168,8 @@
   img->bps = bps;
 
   /* Calculate strides */
-  img->stride[VPX_PLANE_Y] = img->stride[VPX_PLANE_ALPHA] = s;
-  img->stride[VPX_PLANE_U] = img->stride[VPX_PLANE_V] = s >> xcs;
+  img->stride[VPX_PLANE_Y] = img->stride[VPX_PLANE_ALPHA] = stride_in_bytes;
+  img->stride[VPX_PLANE_U] = img->stride[VPX_PLANE_V] = stride_in_bytes >> xcs;
 
   /* Default viewport to entire image */
   if (!vpx_img_set_rect(img, 0, 0, d_w, d_h))
@@ -209,39 +215,40 @@
       img->planes[VPX_PLANE_PACKED] =
         img->img_data + x * img->bps / 8 + y * img->stride[VPX_PLANE_PACKED];
     } else {
+      const int bytes_per_sample =
+          (img->fmt & VPX_IMG_FMT_HIGHBITDEPTH) ? 2 : 1;
       data = img->img_data;
 
       if (img->fmt & VPX_IMG_FMT_HAS_ALPHA) {
         img->planes[VPX_PLANE_ALPHA] =
-          data + x + y * img->stride[VPX_PLANE_ALPHA];
+            data + x * bytes_per_sample + y * img->stride[VPX_PLANE_ALPHA];
         data += img->h * img->stride[VPX_PLANE_ALPHA];
       }
 
-      img->planes[VPX_PLANE_Y] = data + x + y * img->stride[VPX_PLANE_Y];
+      img->planes[VPX_PLANE_Y] = data + x * bytes_per_sample +
+          y * img->stride[VPX_PLANE_Y];
       data += img->h * img->stride[VPX_PLANE_Y];
 
       if (!(img->fmt & VPX_IMG_FMT_UV_FLIP)) {
-        img->planes[VPX_PLANE_U] = data
-                                   + (x >> img->x_chroma_shift)
-                                   + (y >> img->y_chroma_shift) * img->stride[VPX_PLANE_U];
+        img->planes[VPX_PLANE_U] =
+            data + (x >> img->x_chroma_shift) * bytes_per_sample +
+            (y >> img->y_chroma_shift) * img->stride[VPX_PLANE_U];
         data += (img->h >> img->y_chroma_shift) * img->stride[VPX_PLANE_U];
-        img->planes[VPX_PLANE_V] = data
-                                   + (x >> img->x_chroma_shift)
-                                   + (y >> img->y_chroma_shift) * img->stride[VPX_PLANE_V];
+        img->planes[VPX_PLANE_V] =
+            data + (x >> img->x_chroma_shift) * bytes_per_sample +
+            (y >> img->y_chroma_shift) * img->stride[VPX_PLANE_V];
       } else {
-        img->planes[VPX_PLANE_V] = data
-                                   + (x >> img->x_chroma_shift)
-                                   + (y >> img->y_chroma_shift) * img->stride[VPX_PLANE_V];
+        img->planes[VPX_PLANE_V] =
+            data + (x >> img->x_chroma_shift) * bytes_per_sample +
+            (y >> img->y_chroma_shift) * img->stride[VPX_PLANE_V];
         data += (img->h >> img->y_chroma_shift) * img->stride[VPX_PLANE_V];
-        img->planes[VPX_PLANE_U] = data
-                                   + (x >> img->x_chroma_shift)
-                                   + (y >> img->y_chroma_shift) * img->stride[VPX_PLANE_U];
+        img->planes[VPX_PLANE_U] =
+            data + (x >> img->x_chroma_shift) * bytes_per_sample +
+            (y >> img->y_chroma_shift) * img->stride[VPX_PLANE_U];
       }
     }
-
     return 0;
   }
-
   return -1;
 }
 
diff --git a/source/libvpx/vpx/vpx_encoder.h b/source/libvpx/vpx/vpx_encoder.h
index c6c7d08..044243d 100644
--- a/source/libvpx/vpx/vpx_encoder.h
+++ b/source/libvpx/vpx/vpx_encoder.h
@@ -161,7 +161,9 @@
     VPX_CODEC_STATS_PKT,       /**< Two-pass statistics for this frame */
     VPX_CODEC_FPMB_STATS_PKT,  /**< first pass mb statistics for this frame */
     VPX_CODEC_PSNR_PKT,        /**< PSNR statistics for this frame */
-#if CONFIG_SPATIAL_SVC
+    // TODO(minghai): This is for testing purposes. The released library can't
+    // depend on vpx_config.h.
+#if defined(CONFIG_SPATIAL_SVC) && CONFIG_SPATIAL_SVC
     VPX_CODEC_SPATIAL_SVC_LAYER_SIZES, /**< Sizes for each layer in this frame*/
     VPX_CODEC_SPATIAL_SVC_LAYER_PSNR, /**< PSNR for each layer in this frame*/
 #endif
@@ -201,7 +203,9 @@
         double       psnr[4];     /**< PSNR, total/y/u/v */
       } psnr;                       /**< data for PSNR packet */
       vpx_fixed_buf_t raw;     /**< data for arbitrary packets */
-#if CONFIG_SPATIAL_SVC
+      // TODO(minghai): This is for testing purposes. The released library
+      // can't depend on vpx_config.h.
+#if defined(CONFIG_SPATIAL_SVC) && CONFIG_SPATIAL_SVC
       size_t layer_sizes[VPX_SS_MAX_LAYERS];
       struct vpx_psnr_pkt layer_psnr[VPX_SS_MAX_LAYERS];
 #endif
diff --git a/source/libvpx/vpx/vpx_image.h b/source/libvpx/vpx/vpx_image.h
index 0b7bb90..337e4c4 100644
--- a/source/libvpx/vpx/vpx_image.h
+++ b/source/libvpx/vpx/vpx_image.h
@@ -58,46 +58,14 @@
     VPX_IMG_FMT_VPXI420 = VPX_IMG_FMT_PLANAR | 4,
     VPX_IMG_FMT_I422    = VPX_IMG_FMT_PLANAR | 5,
     VPX_IMG_FMT_I444    = VPX_IMG_FMT_PLANAR | 6,
-    VPX_IMG_FMT_444A    = VPX_IMG_FMT_PLANAR | VPX_IMG_FMT_HAS_ALPHA | 7,
+    VPX_IMG_FMT_I440    = VPX_IMG_FMT_PLANAR | 7,
+    VPX_IMG_FMT_444A    = VPX_IMG_FMT_PLANAR | VPX_IMG_FMT_HAS_ALPHA | 6,
     VPX_IMG_FMT_I42016    = VPX_IMG_FMT_I420 | VPX_IMG_FMT_HIGHBITDEPTH,
     VPX_IMG_FMT_I42216    = VPX_IMG_FMT_I422 | VPX_IMG_FMT_HIGHBITDEPTH,
-    VPX_IMG_FMT_I44416    = VPX_IMG_FMT_I444 | VPX_IMG_FMT_HIGHBITDEPTH
+    VPX_IMG_FMT_I44416    = VPX_IMG_FMT_I444 | VPX_IMG_FMT_HIGHBITDEPTH,
+    VPX_IMG_FMT_I44016    = VPX_IMG_FMT_I440 | VPX_IMG_FMT_HIGHBITDEPTH
   } vpx_img_fmt_t; /**< alias for enum vpx_img_fmt */
 
-#if !defined(VPX_CODEC_DISABLE_COMPAT) || !VPX_CODEC_DISABLE_COMPAT
-#define IMG_FMT_PLANAR         VPX_IMG_FMT_PLANAR     /**< \deprecated Use #VPX_IMG_FMT_PLANAR */
-#define IMG_FMT_UV_FLIP        VPX_IMG_FMT_UV_FLIP    /**< \deprecated Use #VPX_IMG_FMT_UV_FLIP */
-#define IMG_FMT_HAS_ALPHA      VPX_IMG_FMT_HAS_ALPHA  /**< \deprecated Use #VPX_IMG_FMT_HAS_ALPHA */
-
-  /*!\brief Deprecated list of supported image formats
-   * \deprecated New code should use #vpx_img_fmt
-   */
-#define img_fmt   vpx_img_fmt
-  /*!\brief alias for enum img_fmt.
-   * \deprecated New code should use #vpx_img_fmt_t
-   */
-#define img_fmt_t vpx_img_fmt_t
-
-#define IMG_FMT_NONE       VPX_IMG_FMT_NONE       /**< \deprecated Use #VPX_IMG_FMT_NONE */
-#define IMG_FMT_RGB24      VPX_IMG_FMT_RGB24      /**< \deprecated Use #VPX_IMG_FMT_RGB24 */
-#define IMG_FMT_RGB32      VPX_IMG_FMT_RGB32      /**< \deprecated Use #VPX_IMG_FMT_RGB32 */
-#define IMG_FMT_RGB565     VPX_IMG_FMT_RGB565     /**< \deprecated Use #VPX_IMG_FMT_RGB565 */
-#define IMG_FMT_RGB555     VPX_IMG_FMT_RGB555     /**< \deprecated Use #VPX_IMG_FMT_RGB555 */
-#define IMG_FMT_UYVY       VPX_IMG_FMT_UYVY       /**< \deprecated Use #VPX_IMG_FMT_UYVY */
-#define IMG_FMT_YUY2       VPX_IMG_FMT_YUY2       /**< \deprecated Use #VPX_IMG_FMT_YUY2 */
-#define IMG_FMT_YVYU       VPX_IMG_FMT_YVYU       /**< \deprecated Use #VPX_IMG_FMT_YVYU */
-#define IMG_FMT_BGR24      VPX_IMG_FMT_BGR24      /**< \deprecated Use #VPX_IMG_FMT_BGR24 */
-#define IMG_FMT_RGB32_LE   VPX_IMG_FMT_RGB32_LE   /**< \deprecated Use #VPX_IMG_FMT_RGB32_LE */
-#define IMG_FMT_ARGB       VPX_IMG_FMT_ARGB       /**< \deprecated Use #VPX_IMG_FMT_ARGB */
-#define IMG_FMT_ARGB_LE    VPX_IMG_FMT_ARGB_LE    /**< \deprecated Use #VPX_IMG_FMT_ARGB_LE */
-#define IMG_FMT_RGB565_LE  VPX_IMG_FMT_RGB565_LE  /**< \deprecated Use #VPX_IMG_FMT_RGB565_LE */
-#define IMG_FMT_RGB555_LE  VPX_IMG_FMT_RGB555_LE  /**< \deprecated Use #VPX_IMG_FMT_RGB555_LE */
-#define IMG_FMT_YV12       VPX_IMG_FMT_YV12       /**< \deprecated Use #VPX_IMG_FMT_YV12 */
-#define IMG_FMT_I420       VPX_IMG_FMT_I420       /**< \deprecated Use #VPX_IMG_FMT_I420 */
-#define IMG_FMT_VPXYV12    VPX_IMG_FMT_VPXYV12    /**< \deprecated Use #VPX_IMG_FMT_VPXYV12 */
-#define IMG_FMT_VPXI420    VPX_IMG_FMT_VPXI420    /**< \deprecated Use #VPX_IMG_FMT_VPXI420 */
-#endif /* VPX_CODEC_DISABLE_COMPAT */
-
   /**\brief Image Descriptor */
   typedef struct vpx_image {
     vpx_img_fmt_t fmt; /**< Image Format */
@@ -121,13 +89,6 @@
 #define VPX_PLANE_U      1   /**< U (Chroma) plane */
 #define VPX_PLANE_V      2   /**< V (Chroma) plane */
 #define VPX_PLANE_ALPHA  3   /**< A (Transparency) plane */
-#if !defined(VPX_CODEC_DISABLE_COMPAT) || !VPX_CODEC_DISABLE_COMPAT
-#define PLANE_PACKED     VPX_PLANE_PACKED
-#define PLANE_Y          VPX_PLANE_Y
-#define PLANE_U          VPX_PLANE_U
-#define PLANE_V          VPX_PLANE_V
-#define PLANE_ALPHA      VPX_PLANE_ALPHA
-#endif
     unsigned char *planes[4];  /**< pointer to the top left pixel for each plane */
     int      stride[4];  /**< stride between rows for each plane */
 
diff --git a/source/libvpx/vpx/vpx_integer.h b/source/libvpx/vpx/vpx_integer.h
index ffeefb8..500f9b9 100644
--- a/source/libvpx/vpx/vpx_integer.h
+++ b/source/libvpx/vpx/vpx_integer.h
@@ -49,9 +49,15 @@
 
 /* Most platforms have the C99 standard integer types. */
 
-#if defined(__cplusplus) && !defined(__STDC_FORMAT_MACROS)
-#define __STDC_FORMAT_MACROS
-#endif
+#if defined(__cplusplus)
+# if !defined(__STDC_FORMAT_MACROS)
+#  define __STDC_FORMAT_MACROS
+# endif
+# if !defined(__STDC_LIMIT_MACROS)
+#  define __STDC_LIMIT_MACROS
+# endif
+#endif  // __cplusplus
+
 #include <stdint.h>
 
 #endif
diff --git a/source/libvpx/vpx_ports/arm.h b/source/libvpx/vpx_ports/arm.h
index 1e4a8e2..42c98f5 100644
--- a/source/libvpx/vpx_ports/arm.h
+++ b/source/libvpx/vpx_ports/arm.h
@@ -27,6 +27,12 @@
 
 int arm_cpu_caps(void);
 
+// Earlier GCC versions have issues with some NEON intrinsics.
+#if !defined(__clang__) && defined(__GNUC__) && \
+    __GNUC__ == 4 && __GNUC_MINOR__ <= 6
+#define VPX_INCOMPATIBLE_GCC
+#endif
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/source/libvpx/vpx_scale/generic/yv12config.c b/source/libvpx/vpx_scale/generic/yv12config.c
index 475d231..00a8c16 100644
--- a/source/libvpx/vpx_scale/generic/yv12config.c
+++ b/source/libvpx/vpx_scale/generic/yv12config.c
@@ -10,7 +10,6 @@
 
 #include <assert.h>
 
-#include "./vpx_config.h"
 #include "vpx_scale/yv12config.h"
 #include "vpx_mem/vpx_mem.h"
 #if CONFIG_VP9 && CONFIG_VP9_HIGHBITDEPTH
@@ -242,6 +241,8 @@
 
     ybf->border = border;
     ybf->frame_size = (int)frame_size;
+    ybf->subsampling_x = ss_x;
+    ybf->subsampling_y = ss_y;
 
 #if CONFIG_VP9_HIGHBITDEPTH
     if (use_highbitdepth) {
diff --git a/source/libvpx/vpx_scale/yv12config.h b/source/libvpx/vpx_scale/yv12config.h
index 9ff764c..b9f13fd 100644
--- a/source/libvpx/vpx_scale/yv12config.h
+++ b/source/libvpx/vpx_scale/yv12config.h
@@ -15,6 +15,7 @@
 extern "C" {
 #endif
 
+#include "./vpx_config.h"
 #include "vpx/vpx_codec.h"
 #include "vpx/vpx_frame_buffer.h"
 #include "vpx/vpx_integer.h"
@@ -51,6 +52,8 @@
   int buffer_alloc_sz;
   int border;
   int frame_size;
+  int subsampling_x;
+  int subsampling_y;
   unsigned int bit_depth;
 
   int corrupted;
diff --git a/source/libvpx/vpxdec.c b/source/libvpx/vpxdec.c
index cf23c29..2afdb71 100644
--- a/source/libvpx/vpxdec.c
+++ b/source/libvpx/vpxdec.c
@@ -47,52 +47,49 @@
   struct WebmInputContext *webm_ctx;
 };
 
-static const arg_def_t looparg = ARG_DEF(NULL, "loops", 1,
-                                          "Number of times to decode the file");
-static const arg_def_t codecarg = ARG_DEF(NULL, "codec", 1,
-                                          "Codec to use");
-static const arg_def_t use_yv12 = ARG_DEF(NULL, "yv12", 0,
-                                          "Output raw YV12 frames");
-static const arg_def_t use_i420 = ARG_DEF(NULL, "i420", 0,
-                                          "Output raw I420 frames");
-static const arg_def_t flipuvarg = ARG_DEF(NULL, "flipuv", 0,
-                                           "Flip the chroma planes in the output");
-static const arg_def_t rawvideo = ARG_DEF(NULL, "rawvideo", 0,
-                                          "Output raw YUV frames");
-static const arg_def_t noblitarg = ARG_DEF(NULL, "noblit", 0,
-                                           "Don't process the decoded frames");
-static const arg_def_t progressarg = ARG_DEF(NULL, "progress", 0,
-                                             "Show progress after each frame decodes");
-static const arg_def_t limitarg = ARG_DEF(NULL, "limit", 1,
-                                          "Stop decoding after n frames");
-static const arg_def_t skiparg = ARG_DEF(NULL, "skip", 1,
-                                         "Skip the first n input frames");
-static const arg_def_t postprocarg = ARG_DEF(NULL, "postproc", 0,
-                                             "Postprocess decoded frames");
-static const arg_def_t summaryarg = ARG_DEF(NULL, "summary", 0,
-                                            "Show timing summary");
-static const arg_def_t outputfile = ARG_DEF("o", "output", 1,
-                                            "Output file name pattern (see below)");
-static const arg_def_t threadsarg = ARG_DEF("t", "threads", 1,
-                                            "Max threads to use");
-static const arg_def_t verbosearg = ARG_DEF("v", "verbose", 0,
-                                            "Show version string");
-static const arg_def_t error_concealment = ARG_DEF(NULL, "error-concealment", 0,
-                                                   "Enable decoder error-concealment");
-static const arg_def_t scalearg = ARG_DEF("S", "scale", 0,
-                                            "Scale output frames uniformly");
-static const arg_def_t continuearg =
-    ARG_DEF("k", "keep-going", 0, "(debug) Continue decoding after error");
-
-static const arg_def_t fb_arg =
-    ARG_DEF(NULL, "frame-buffers", 1, "Number of frame buffers to use");
-
-static const arg_def_t md5arg = ARG_DEF(NULL, "md5", 0,
-                                        "Compute the MD5 sum of the decoded frame");
+static const arg_def_t looparg = ARG_DEF(
+    NULL, "loops", 1, "Number of times to decode the file");
+static const arg_def_t codecarg = ARG_DEF(
+    NULL, "codec", 1, "Codec to use");
+static const arg_def_t use_yv12 = ARG_DEF(
+    NULL, "yv12", 0, "Output raw YV12 frames");
+static const arg_def_t use_i420 = ARG_DEF(
+    NULL, "i420", 0, "Output raw I420 frames");
+static const arg_def_t flipuvarg = ARG_DEF(
+    NULL, "flipuv", 0, "Flip the chroma planes in the output");
+static const arg_def_t rawvideo = ARG_DEF(
+    NULL, "rawvideo", 0, "Output raw YUV frames");
+static const arg_def_t noblitarg = ARG_DEF(
+    NULL, "noblit", 0, "Don't process the decoded frames");
+static const arg_def_t progressarg = ARG_DEF(
+    NULL, "progress", 0, "Show progress after each frame decodes");
+static const arg_def_t limitarg = ARG_DEF(
+    NULL, "limit", 1, "Stop decoding after n frames");
+static const arg_def_t skiparg = ARG_DEF(
+    NULL, "skip", 1, "Skip the first n input frames");
+static const arg_def_t postprocarg = ARG_DEF(
+    NULL, "postproc", 0, "Postprocess decoded frames");
+static const arg_def_t summaryarg = ARG_DEF(
+    NULL, "summary", 0, "Show timing summary");
+static const arg_def_t outputfile = ARG_DEF(
+    "o", "output", 1, "Output file name pattern (see below)");
+static const arg_def_t threadsarg = ARG_DEF(
+    "t", "threads", 1, "Max threads to use");
+static const arg_def_t verbosearg = ARG_DEF(
+    "v", "verbose", 0, "Show version string");
+static const arg_def_t error_concealment = ARG_DEF(
+    NULL, "error-concealment", 0, "Enable decoder error-concealment");
+static const arg_def_t scalearg = ARG_DEF(
+    "S", "scale", 0, "Scale output frames uniformly");
+static const arg_def_t continuearg = ARG_DEF(
+    "k", "keep-going", 0, "(debug) Continue decoding after error");
+static const arg_def_t fb_arg = ARG_DEF(
+    NULL, "frame-buffers", 1, "Number of frame buffers to use");
+static const arg_def_t md5arg = ARG_DEF(
+    NULL, "md5", 0, "Compute the MD5 sum of the decoded frame");
 #if CONFIG_VP9 && CONFIG_VP9_HIGHBITDEPTH
 static const arg_def_t outbitdeptharg = ARG_DEF(
-    NULL, "output-bit-depth", 1,
-    "Output bit-depth for decoded frames");
+    NULL, "output-bit-depth", 1, "Output bit-depth for decoded frames");
 #endif
 
 static const arg_def_t *all_args[] = {
@@ -527,174 +524,12 @@
 }
 
 #if CONFIG_VP9 && CONFIG_VP9_HIGHBITDEPTH
-static void high_img_upshift(vpx_image_t *dst, vpx_image_t *src,
-                             int input_shift) {
-  const int offset = input_shift > 0 ? (1 << (input_shift - 1)) : 0;
-  int plane;
-  if (dst->d_w != src->d_w || dst->d_h != src->d_h ||
-      dst->x_chroma_shift != src->x_chroma_shift ||
-      dst->y_chroma_shift != src->y_chroma_shift ||
-      dst->fmt != src->fmt || input_shift < 0) {
-    fatal("Unsupported image conversion");
-  }
-  switch (src->fmt) {
-    case VPX_IMG_FMT_I42016:
-    case VPX_IMG_FMT_I42216:
-    case VPX_IMG_FMT_I44416:
-      break;
-    default:
-      fatal("Unsupported image conversion");
-      break;
-  }
-  for (plane = 0; plane < 3; plane++) {
-    int w = src->d_w;
-    int h = src->d_h;
-    int x, y;
-    if (plane) {
-      w >>= src->x_chroma_shift;
-      h >>= src->y_chroma_shift;
-    }
-    for (y = 0; y < h; y++) {
-      uint16_t *p_src = (uint16_t *)(src->planes[plane] +
-                                     y * src->stride[plane]);
-      uint16_t *p_dst = (uint16_t *)(dst->planes[plane] +
-                                     y * dst->stride[plane]);
-      for (x = 0; x < w; x++)
-        *p_dst++ = (*p_src++ << input_shift) + offset;
-    }
-  }
-}
-
-static void low_img_upshift(vpx_image_t *dst, vpx_image_t *src,
-                            int input_shift) {
-  const int offset = input_shift > 0 ? (1 << (input_shift - 1)) : 0;
-  int plane;
-  if (dst->d_w != src->d_w || dst->d_h != src->d_h ||
-      dst->x_chroma_shift != src->x_chroma_shift ||
-      dst->y_chroma_shift != src->y_chroma_shift ||
-      dst->fmt != src->fmt + VPX_IMG_FMT_HIGHBITDEPTH ||
-      input_shift < 0) {
-    fatal("Unsupported image conversion");
-  }
-  switch (src->fmt) {
-    case VPX_IMG_FMT_I420:
-    case VPX_IMG_FMT_I422:
-    case VPX_IMG_FMT_I444:
-      break;
-    default:
-      fatal("Unsupported image conversion");
-      break;
-  }
-  for (plane = 0; plane < 3; plane++) {
-    int w = src->d_w;
-    int h = src->d_h;
-    int x, y;
-    if (plane) {
-      w >>= src->x_chroma_shift;
-      h >>= src->y_chroma_shift;
-    }
-    for (y = 0; y < h; y++) {
-      uint8_t *p_src = src->planes[plane] + y * src->stride[plane];
-      uint16_t *p_dst = (uint16_t *)(dst->planes[plane] +
-                                     y * dst->stride[plane]);
-      for (x = 0; x < w; x++) {
-        *p_dst++ = (*p_src++ << input_shift) + offset;
-      }
-    }
-  }
-}
-
-static void img_upshift(vpx_image_t *dst, vpx_image_t *src,
-                        int input_shift) {
-  if (src->fmt & VPX_IMG_FMT_HIGHBITDEPTH) {
-    high_img_upshift(dst, src, input_shift);
-  } else {
-    low_img_upshift(dst, src, input_shift);
-  }
-}
-
-static void high_img_downshift(vpx_image_t *dst, vpx_image_t *src,
-                               int down_shift) {
-  int plane;
-  if (dst->d_w != src->d_w || dst->d_h != src->d_h ||
-      dst->x_chroma_shift != src->x_chroma_shift ||
-      dst->y_chroma_shift != src->y_chroma_shift ||
-      dst->fmt != src->fmt || down_shift < 0) {
-    fatal("Unsupported image conversion");
-  }
-  switch (src->fmt) {
-    case VPX_IMG_FMT_I42016:
-    case VPX_IMG_FMT_I42216:
-    case VPX_IMG_FMT_I44416:
-      break;
-    default:
-      fatal("Unsupported image conversion");
-      break;
-  }
-  for (plane = 0; plane < 3; plane++) {
-    int w = src->d_w;
-    int h = src->d_h;
-    int x, y;
-    if (plane) {
-      w >>= src->x_chroma_shift;
-      h >>= src->y_chroma_shift;
-    }
-    for (y = 0; y < h; y++) {
-      uint16_t *p_src = (uint16_t *)(src->planes[plane] +
-                                     y * src->stride[plane]);
-      uint16_t *p_dst = (uint16_t *)(dst->planes[plane] +
-                                     y * dst->stride[plane]);
-      for (x = 0; x < w; x++)
-        *p_dst++ = *p_src++ >> down_shift;
-    }
-  }
-}
-
-static void low_img_downshift(vpx_image_t *dst, vpx_image_t *src,
-                            int down_shift) {
-  int plane;
-  if (dst->d_w != src->d_w || dst->d_h != src->d_h ||
-      dst->x_chroma_shift != src->x_chroma_shift ||
-      dst->y_chroma_shift != src->y_chroma_shift ||
-      src->fmt != dst->fmt + VPX_IMG_FMT_HIGHBITDEPTH ||
-      down_shift < 0) {
-    fatal("Unsupported image conversion");
-  }
-  switch (dst->fmt) {
-    case VPX_IMG_FMT_I420:
-    case VPX_IMG_FMT_I422:
-    case VPX_IMG_FMT_I444:
-      break;
-    default:
-      fatal("Unsupported image conversion");
-      break;
-  }
-  for (plane = 0; plane < 3; plane++) {
-    int w = src->d_w;
-    int h = src->d_h;
-    int x, y;
-    if (plane) {
-      w >>= src->x_chroma_shift;
-      h >>= src->y_chroma_shift;
-    }
-    for (y = 0; y < h; y++) {
-      uint16_t *p_src = (uint16_t *)(src->planes[plane] +
-                                     y * src->stride[plane]);
-      uint8_t *p_dst = dst->planes[plane] + y * dst->stride[plane];
-      for (x = 0; x < w; x++) {
-        *p_dst++ = *p_src++ >> down_shift;
-      }
-    }
-  }
-}
-
-static void img_downshift(vpx_image_t *dst, vpx_image_t *src,
-                          int down_shift) {
-  if (dst->fmt & VPX_IMG_FMT_HIGHBITDEPTH) {
-    high_img_downshift(dst, src, down_shift);
-  } else {
-    low_img_downshift(dst, src, down_shift);
-  }
+static int img_shifted_realloc_required(const vpx_image_t *img,
+                                        const vpx_image_t *shifted,
+                                        vpx_img_fmt_t required_fmt) {
+  return img->d_w != shifted->d_w ||
+         img->d_h != shifted->d_h ||
+         required_fmt != shifted->fmt;
 }
 #endif
 
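Note on the hunk above: the four hand-rolled shift routines move out of vpxdec.c (they reappear below as shared vpx_img_upshift/vpx_img_downshift calls), and the new img_shifted_realloc_required helper makes the scratch image persistent across frames, torn down only when geometry or format actually changes. A sketch of that lazy-reallocation pattern, using the real vpx_img_alloc/vpx_img_free API (get_shifted is illustrative, not part of vpxdec.c):

    #include "vpx/vpx_image.h"

    /* Reuse one scratch image across frames; rebuild only when the decoded
     * size or the required output format changes. */
    static vpx_image_t *get_shifted(vpx_image_t *cached, const vpx_image_t *img,
                                    vpx_img_fmt_t required_fmt) {
      if (cached && (cached->d_w != img->d_w || cached->d_h != img->d_h ||
                     cached->fmt != required_fmt)) {
        vpx_img_free(cached);
        cached = NULL;
      }
      if (!cached)
        cached = vpx_img_alloc(NULL, required_fmt, img->d_w, img->d_h, 16);
      return cached;
    }
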
@@ -933,7 +768,7 @@
   if (use_y4m && !noblit) {
     if (!single_file) {
       fprintf(stderr, "YUV4MPEG2 not supported with output patterns,"
-              " try --i420 or --yv12.\n");
+              " try --i420 or --yv12 or --rawvideo.\n");
       return EXIT_FAILURE;
     }
 
@@ -1130,22 +965,25 @@
       }
       // Shift up or down if necessary
       if (output_bit_depth != img->bit_depth) {
+        const vpx_img_fmt_t shifted_fmt = output_bit_depth == 8 ?
+            img->fmt ^ (img->fmt & VPX_IMG_FMT_HIGHBITDEPTH) :
+            img->fmt | VPX_IMG_FMT_HIGHBITDEPTH;
+        if (img_shifted &&
+            img_shifted_realloc_required(img, img_shifted, shifted_fmt)) {
+          vpx_img_free(img_shifted);
+          img_shifted = NULL;
+        }
         if (!img_shifted) {
-          if (output_bit_depth == 8) {
-            img_shifted = vpx_img_alloc(
-                NULL, img->fmt - VPX_IMG_FMT_HIGHBITDEPTH,
-                img->d_w, img->d_h, 16);
-          } else {
-            img_shifted = vpx_img_alloc(
-                NULL, img->fmt | VPX_IMG_FMT_HIGHBITDEPTH,
-                img->d_w, img->d_h, 16);
-          }
+          img_shifted = vpx_img_alloc(NULL, shifted_fmt,
+                                      img->d_w, img->d_h, 16);
           img_shifted->bit_depth = output_bit_depth;
         }
         if (output_bit_depth > img->bit_depth) {
-          img_upshift(img_shifted, img, output_bit_depth - img->bit_depth);
+          vpx_img_upshift(img_shifted, img,
+                          output_bit_depth - img->bit_depth);
         } else {
-          img_downshift(img_shifted, img, img->bit_depth - output_bit_depth);
+          vpx_img_downshift(img_shifted, img,
+                            img->bit_depth - output_bit_depth);
         }
         img = img_shifted;
       }
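
Note on the shifted_fmt expression above: it relies on VPX_IMG_FMT_HIGHBITDEPTH being a single flag bit OR'd into the base format value, so fmt ^ (fmt & FLAG) clears the flag whether or not it was set, and one expression serves both 8- and 16-bit inputs. A self-contained check of that identity (the constants are illustrative stand-ins, not the header's values):

    #include <assert.h>

    #define HBD_FLAG 0x800  /* stand-in for VPX_IMG_FMT_HIGHBITDEPTH */

    int main(void) {
      const int fmt8 = 0x102;             /* illustrative 8-bit format value */
      const int fmt16 = fmt8 | HBD_FLAG;  /* its high-bit-depth variant */
      /* Clearing via xor works from either starting point... */
      assert((fmt16 ^ (fmt16 & HBD_FLAG)) == fmt8);
      assert((fmt8 ^ (fmt8 & HBD_FLAG)) == fmt8);
      /* ...and is equivalent to the more familiar fmt & ~FLAG. */
      assert((fmt16 ^ (fmt16 & HBD_FLAG)) == (fmt16 & ~HBD_FLAG));
      return 0;
    }
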
@@ -1155,6 +993,10 @@
         if (use_y4m) {
           char buf[Y4M_BUFFER_SIZE] = {0};
           size_t len = 0;
+          if (img->fmt == VPX_IMG_FMT_I440 || img->fmt == VPX_IMG_FMT_I44016) {
+            fprintf(stderr, "Cannot produce y4m output for 440 sampling.\n");
+            goto fail;
+          }
           if (frame_out == 1) {
             // Y4M file header
             len = y4m_write_file_header(buf, sizeof(buf),
diff --git a/source/libvpx/vpxenc.c b/source/libvpx/vpxenc.c
index 1b0b632..0a0c071 100644
--- a/source/libvpx/vpxenc.c
+++ b/source/libvpx/vpxenc.c
@@ -128,48 +128,50 @@
   return 0;
 }
 
-static const arg_def_t debugmode = ARG_DEF("D", "debug", 0,
-                                           "Debug mode (makes output deterministic)");
-static const arg_def_t outputfile = ARG_DEF("o", "output", 1,
-                                            "Output filename");
-static const arg_def_t use_yv12 = ARG_DEF(NULL, "yv12", 0,
-                                          "Input file is YV12 ");
-static const arg_def_t use_i420 = ARG_DEF(NULL, "i420", 0,
-                                          "Input file is I420 (default)");
-static const arg_def_t use_i422 = ARG_DEF(NULL, "i422", 0,
-                                          "Input file is I422");
-static const arg_def_t use_i444 = ARG_DEF(NULL, "i444", 0,
-                                          "Input file is I444");
-static const arg_def_t codecarg = ARG_DEF(NULL, "codec", 1,
-                                          "Codec to use");
-static const arg_def_t passes           = ARG_DEF("p", "passes", 1,
-                                                  "Number of passes (1/2)");
-static const arg_def_t pass_arg         = ARG_DEF(NULL, "pass", 1,
-                                                  "Pass to execute (1/2)");
-static const arg_def_t fpf_name         = ARG_DEF(NULL, "fpf", 1,
-                                                  "First pass statistics file name");
+static const arg_def_t debugmode = ARG_DEF(
+    "D", "debug", 0, "Debug mode (makes output deterministic)");
+static const arg_def_t outputfile = ARG_DEF(
+    "o", "output", 1, "Output filename");
+static const arg_def_t use_yv12 = ARG_DEF(
+    NULL, "yv12", 0, "Input file is YV12 ");
+static const arg_def_t use_i420 = ARG_DEF(
+    NULL, "i420", 0, "Input file is I420 (default)");
+static const arg_def_t use_i422 = ARG_DEF(
+    NULL, "i422", 0, "Input file is I422");
+static const arg_def_t use_i444 = ARG_DEF(
+    NULL, "i444", 0, "Input file is I444");
+static const arg_def_t use_i440 = ARG_DEF(
+    NULL, "i440", 0, "Input file is I440");
+static const arg_def_t codecarg = ARG_DEF(
+    NULL, "codec", 1, "Codec to use");
+static const arg_def_t passes = ARG_DEF(
+    "p", "passes", 1, "Number of passes (1/2)");
+static const arg_def_t pass_arg = ARG_DEF(
+    NULL, "pass", 1, "Pass to execute (1/2)");
+static const arg_def_t fpf_name = ARG_DEF(
+    NULL, "fpf", 1, "First pass statistics file name");
 #if CONFIG_FP_MB_STATS
-static const arg_def_t fpmbf_name         = ARG_DEF(NULL, "fpmbf", 1,
-                                      "First pass block statistics file name");
+static const arg_def_t fpmbf_name = ARG_DEF(
+    NULL, "fpmbf", 1, "First pass block statistics file name");
 #endif
-static const arg_def_t limit = ARG_DEF(NULL, "limit", 1,
-                                       "Stop encoding after n input frames");
-static const arg_def_t skip = ARG_DEF(NULL, "skip", 1,
-                                      "Skip the first n input frames");
-static const arg_def_t deadline         = ARG_DEF("d", "deadline", 1,
-                                                  "Deadline per frame (usec)");
-static const arg_def_t best_dl          = ARG_DEF(NULL, "best", 0,
-                                                  "Use Best Quality Deadline");
-static const arg_def_t good_dl          = ARG_DEF(NULL, "good", 0,
-                                                  "Use Good Quality Deadline");
-static const arg_def_t rt_dl            = ARG_DEF(NULL, "rt", 0,
-                                                  "Use Realtime Quality Deadline");
-static const arg_def_t quietarg         = ARG_DEF("q", "quiet", 0,
-                                                  "Do not print encode progress");
-static const arg_def_t verbosearg       = ARG_DEF("v", "verbose", 0,
-                                                  "Show encoder parameters");
-static const arg_def_t psnrarg          = ARG_DEF(NULL, "psnr", 0,
-                                                  "Show PSNR in status line");
+static const arg_def_t limit = ARG_DEF(
+    NULL, "limit", 1, "Stop encoding after n input frames");
+static const arg_def_t skip = ARG_DEF(
+    NULL, "skip", 1, "Skip the first n input frames");
+static const arg_def_t deadline = ARG_DEF(
+    "d", "deadline", 1, "Deadline per frame (usec)");
+static const arg_def_t best_dl = ARG_DEF(
+    NULL, "best", 0, "Use Best Quality Deadline");
+static const arg_def_t good_dl = ARG_DEF(
+    NULL, "good", 0, "Use Good Quality Deadline");
+static const arg_def_t rt_dl = ARG_DEF(
+    NULL, "rt", 0, "Use Realtime Quality Deadline");
+static const arg_def_t quietarg = ARG_DEF(
+    "q", "quiet", 0, "Do not print encode progress");
+static const arg_def_t verbosearg = ARG_DEF(
+    "v", "verbose", 0, "Show encoder parameters");
+static const arg_def_t psnrarg = ARG_DEF(
+    NULL, "psnr", 0, "Show PSNR in status line");
 
 static const struct arg_enum_list test_decode_enum[] = {
   {"off",   TEST_DECODE_OFF},
@@ -177,28 +179,25 @@
   {"warn",  TEST_DECODE_WARN},
   {NULL, 0}
 };
-static const arg_def_t recontest = ARG_DEF_ENUM(NULL, "test-decode", 1,
-                                                "Test encode/decode mismatch",
-                                                test_decode_enum);
-static const arg_def_t framerate        = ARG_DEF(NULL, "fps", 1,
-                                                  "Stream frame rate (rate/scale)");
-static const arg_def_t use_ivf          = ARG_DEF(NULL, "ivf", 0,
-                                                  "Output IVF (default is WebM if WebM IO is enabled)");
-static const arg_def_t out_part = ARG_DEF("P", "output-partitions", 0,
-                                          "Makes encoder output partitions. Requires IVF output!");
-static const arg_def_t q_hist_n         = ARG_DEF(NULL, "q-hist", 1,
-                                                  "Show quantizer histogram (n-buckets)");
-static const arg_def_t rate_hist_n         = ARG_DEF(NULL, "rate-hist", 1,
-                                                     "Show rate histogram (n-buckets)");
-static const arg_def_t disable_warnings =
-    ARG_DEF(NULL, "disable-warnings", 0,
-            "Disable warnings about potentially incorrect encode settings.");
-static const arg_def_t disable_warning_prompt =
-    ARG_DEF("y", "disable-warning-prompt", 0,
-            "Display warnings, but do not prompt user to continue.");
-static const arg_def_t experimental_bitstream =
-    ARG_DEF(NULL, "experimental-bitstream", 0,
-            "Allow experimental bitstream features.");
+static const arg_def_t recontest = ARG_DEF_ENUM(
+    NULL, "test-decode", 1, "Test encode/decode mismatch", test_decode_enum);
+static const arg_def_t framerate = ARG_DEF(
+    NULL, "fps", 1, "Stream frame rate (rate/scale)");
+static const arg_def_t use_ivf = ARG_DEF(
+    NULL, "ivf", 0, "Output IVF (default is WebM if WebM IO is enabled)");
+static const arg_def_t out_part = ARG_DEF(
+    "P", "output-partitions", 0,
+    "Makes encoder output partitions. Requires IVF output!");
+static const arg_def_t q_hist_n = ARG_DEF(
+    NULL, "q-hist", 1, "Show quantizer histogram (n-buckets)");
+static const arg_def_t rate_hist_n = ARG_DEF(
+    NULL, "rate-hist", 1, "Show rate histogram (n-buckets)");
+static const arg_def_t disable_warnings = ARG_DEF(
+    NULL, "disable-warnings", 0,
+    "Disable warnings about potentially incorrect encode settings.");
+static const arg_def_t disable_warning_prompt = ARG_DEF(
+    "y", "disable-warning-prompt", 0,
+    "Display warnings, but do not prompt user to continue.");
 
 #if CONFIG_VP9 && CONFIG_VP9_HIGHBITDEPTH
 static const arg_def_t test16bitinternalarg = ARG_DEF(
@@ -214,16 +213,14 @@
   NULL
 };
 
-static const arg_def_t usage            = ARG_DEF("u", "usage", 1,
-                                                  "Usage profile number to use");
-static const arg_def_t threads          = ARG_DEF("t", "threads", 1,
-                                                  "Max number of threads to use");
-static const arg_def_t profile          = ARG_DEF(NULL, "profile", 1,
-                                                  "Bitstream profile number to use");
-static const arg_def_t width            = ARG_DEF("w", "width", 1,
-                                                  "Frame width");
-static const arg_def_t height           = ARG_DEF("h", "height", 1,
-                                                  "Frame height");
+static const arg_def_t usage = ARG_DEF(
+    "u", "usage", 1, "Usage profile number to use");
+static const arg_def_t threads = ARG_DEF(
+    "t", "threads", 1, "Max number of threads to use");
+static const arg_def_t profile = ARG_DEF(
+    NULL, "profile", 1, "Bitstream profile number to use");
+static const arg_def_t width = ARG_DEF("w", "width", 1, "Frame width");
+static const arg_def_t height = ARG_DEF("h", "height", 1, "Frame height");
 #if CONFIG_WEBM_IO
 static const struct arg_enum_list stereo_mode_enum[] = {
   {"mono", STEREO_FORMAT_MONO},
@@ -233,18 +230,18 @@
   {"right-left", STEREO_FORMAT_RIGHT_LEFT},
   {NULL, 0}
 };
-static const arg_def_t stereo_mode      = ARG_DEF_ENUM(NULL, "stereo-mode", 1,
-                                                       "Stereo 3D video format", stereo_mode_enum);
+static const arg_def_t stereo_mode = ARG_DEF_ENUM(
+    NULL, "stereo-mode", 1, "Stereo 3D video format", stereo_mode_enum);
 #endif
-static const arg_def_t timebase         = ARG_DEF(NULL, "timebase", 1,
-                                                  "Output timestamp precision (fractional seconds)");
-static const arg_def_t error_resilient  = ARG_DEF(NULL, "error-resilient", 1,
-                                                  "Enable error resiliency features");
-static const arg_def_t lag_in_frames    = ARG_DEF(NULL, "lag-in-frames", 1,
-                                                  "Max number of frames to lag");
+static const arg_def_t timebase = ARG_DEF(
+    NULL, "timebase", 1, "Output timestamp precision (fractional seconds)");
+static const arg_def_t error_resilient = ARG_DEF(
+    NULL, "error-resilient", 1, "Enable error resiliency features");
+static const arg_def_t lag_in_frames = ARG_DEF(
+    NULL, "lag-in-frames", 1, "Max number of frames to lag");
 
 static const arg_def_t *global_args[] = {
-  &use_yv12, &use_i420, &use_i422, &use_i444,
+  &use_yv12, &use_i420, &use_i422, &use_i444, &use_i440,
   &usage, &threads, &profile,
   &width, &height,
 #if CONFIG_WEBM_IO
@@ -258,18 +255,18 @@
   &lag_in_frames, NULL
 };
 
-static const arg_def_t dropframe_thresh   = ARG_DEF(NULL, "drop-frame", 1,
-                                                    "Temporal resampling threshold (buf %)");
-static const arg_def_t resize_allowed     = ARG_DEF(NULL, "resize-allowed", 1,
-                                                    "Spatial resampling enabled (bool)");
-static const arg_def_t resize_width       = ARG_DEF(NULL, "resize-width", 1,
-                                                    "Width of encoded frame");
-static const arg_def_t resize_height      = ARG_DEF(NULL, "resize-height", 1,
-                                                    "Height of encoded frame");
-static const arg_def_t resize_up_thresh   = ARG_DEF(NULL, "resize-up", 1,
-                                                    "Upscale threshold (buf %)");
-static const arg_def_t resize_down_thresh = ARG_DEF(NULL, "resize-down", 1,
-                                                    "Downscale threshold (buf %)");
+static const arg_def_t dropframe_thresh = ARG_DEF(
+    NULL, "drop-frame", 1, "Temporal resampling threshold (buf %)");
+static const arg_def_t resize_allowed = ARG_DEF(
+    NULL, "resize-allowed", 1, "Spatial resampling enabled (bool)");
+static const arg_def_t resize_width = ARG_DEF(
+    NULL, "resize-width", 1, "Width of encoded frame");
+static const arg_def_t resize_height = ARG_DEF(
+    NULL, "resize-height", 1, "Height of encoded frame");
+static const arg_def_t resize_up_thresh = ARG_DEF(
+    NULL, "resize-up", 1, "Upscale threshold (buf %)");
+static const arg_def_t resize_down_thresh = ARG_DEF(
+    NULL, "resize-down", 1, "Downscale threshold (buf %)");
 static const struct arg_enum_list end_usage_enum[] = {
   {"vbr", VPX_VBR},
   {"cbr", VPX_CBR},
@@ -277,24 +274,24 @@
   {"q",   VPX_Q},
   {NULL, 0}
 };
-static const arg_def_t end_usage          = ARG_DEF_ENUM(NULL, "end-usage", 1,
-                                                         "Rate control mode", end_usage_enum);
-static const arg_def_t target_bitrate     = ARG_DEF(NULL, "target-bitrate", 1,
-                                                    "Bitrate (kbps)");
-static const arg_def_t min_quantizer      = ARG_DEF(NULL, "min-q", 1,
-                                                    "Minimum (best) quantizer");
-static const arg_def_t max_quantizer      = ARG_DEF(NULL, "max-q", 1,
-                                                    "Maximum (worst) quantizer");
-static const arg_def_t undershoot_pct     = ARG_DEF(NULL, "undershoot-pct", 1,
-                                                    "Datarate undershoot (min) target (%)");
-static const arg_def_t overshoot_pct      = ARG_DEF(NULL, "overshoot-pct", 1,
-                                                    "Datarate overshoot (max) target (%)");
-static const arg_def_t buf_sz             = ARG_DEF(NULL, "buf-sz", 1,
-                                                    "Client buffer size (ms)");
-static const arg_def_t buf_initial_sz     = ARG_DEF(NULL, "buf-initial-sz", 1,
-                                                    "Client initial buffer size (ms)");
-static const arg_def_t buf_optimal_sz     = ARG_DEF(NULL, "buf-optimal-sz", 1,
-                                                    "Client optimal buffer size (ms)");
+static const arg_def_t end_usage = ARG_DEF_ENUM(
+    NULL, "end-usage", 1, "Rate control mode", end_usage_enum);
+static const arg_def_t target_bitrate = ARG_DEF(
+    NULL, "target-bitrate", 1, "Bitrate (kbps)");
+static const arg_def_t min_quantizer = ARG_DEF(
+    NULL, "min-q", 1, "Minimum (best) quantizer");
+static const arg_def_t max_quantizer = ARG_DEF(
+    NULL, "max-q", 1, "Maximum (worst) quantizer");
+static const arg_def_t undershoot_pct = ARG_DEF(
+    NULL, "undershoot-pct", 1, "Datarate undershoot (min) target (%)");
+static const arg_def_t overshoot_pct = ARG_DEF(
+    NULL, "overshoot-pct", 1, "Datarate overshoot (max) target (%)");
+static const arg_def_t buf_sz = ARG_DEF(
+    NULL, "buf-sz", 1, "Client buffer size (ms)");
+static const arg_def_t buf_initial_sz = ARG_DEF(
+    NULL, "buf-initial-sz", 1, "Client initial buffer size (ms)");
+static const arg_def_t buf_optimal_sz = ARG_DEF(
+    NULL, "buf-optimal-sz", 1, "Client optimal buffer size (ms)");
 static const arg_def_t *rc_args[] = {
   &dropframe_thresh, &resize_allowed, &resize_width, &resize_height,
   &resize_up_thresh, &resize_down_thresh, &end_usage, &target_bitrate,
@@ -303,59 +300,59 @@
 };
 
 
-static const arg_def_t bias_pct = ARG_DEF(NULL, "bias-pct", 1,
-                                          "CBR/VBR bias (0=CBR, 100=VBR)");
-static const arg_def_t minsection_pct = ARG_DEF(NULL, "minsection-pct", 1,
-                                                "GOP min bitrate (% of target)");
-static const arg_def_t maxsection_pct = ARG_DEF(NULL, "maxsection-pct", 1,
-                                                "GOP max bitrate (% of target)");
+static const arg_def_t bias_pct = ARG_DEF(
+    NULL, "bias-pct", 1, "CBR/VBR bias (0=CBR, 100=VBR)");
+static const arg_def_t minsection_pct = ARG_DEF(
+    NULL, "minsection-pct", 1, "GOP min bitrate (% of target)");
+static const arg_def_t maxsection_pct = ARG_DEF(
+    NULL, "maxsection-pct", 1, "GOP max bitrate (% of target)");
 static const arg_def_t *rc_twopass_args[] = {
   &bias_pct, &minsection_pct, &maxsection_pct, NULL
 };
 
 
-static const arg_def_t kf_min_dist = ARG_DEF(NULL, "kf-min-dist", 1,
-                                             "Minimum keyframe interval (frames)");
-static const arg_def_t kf_max_dist = ARG_DEF(NULL, "kf-max-dist", 1,
-                                             "Maximum keyframe interval (frames)");
-static const arg_def_t kf_disabled = ARG_DEF(NULL, "disable-kf", 0,
-                                             "Disable keyframe placement");
+static const arg_def_t kf_min_dist = ARG_DEF(
+    NULL, "kf-min-dist", 1, "Minimum keyframe interval (frames)");
+static const arg_def_t kf_max_dist = ARG_DEF(
+    NULL, "kf-max-dist", 1, "Maximum keyframe interval (frames)");
+static const arg_def_t kf_disabled = ARG_DEF(
+    NULL, "disable-kf", 0, "Disable keyframe placement");
 static const arg_def_t *kf_args[] = {
   &kf_min_dist, &kf_max_dist, &kf_disabled, NULL
 };
 
 
-static const arg_def_t noise_sens = ARG_DEF(NULL, "noise-sensitivity", 1,
-                                            "Noise sensitivity (frames to blur)");
-static const arg_def_t sharpness = ARG_DEF(NULL, "sharpness", 1,
-                                           "Loop filter sharpness (0..7)");
-static const arg_def_t static_thresh = ARG_DEF(NULL, "static-thresh", 1,
-                                               "Motion detection threshold");
-static const arg_def_t cpu_used = ARG_DEF(NULL, "cpu-used", 1,
-                                          "CPU Used (-16..16)");
-static const arg_def_t auto_altref = ARG_DEF(NULL, "auto-alt-ref", 1,
-                                             "Enable automatic alt reference frames");
-static const arg_def_t arnr_maxframes = ARG_DEF(NULL, "arnr-maxframes", 1,
-                                                "AltRef max frames (0..15)");
-static const arg_def_t arnr_strength = ARG_DEF(NULL, "arnr-strength", 1,
-                                               "AltRef filter strength (0..6)");
-static const arg_def_t arnr_type = ARG_DEF(NULL, "arnr-type", 1,
-                                           "AltRef type");
+static const arg_def_t noise_sens = ARG_DEF(
+    NULL, "noise-sensitivity", 1, "Noise sensitivity (frames to blur)");
+static const arg_def_t sharpness = ARG_DEF(
+    NULL, "sharpness", 1, "Loop filter sharpness (0..7)");
+static const arg_def_t static_thresh = ARG_DEF(
+    NULL, "static-thresh", 1, "Motion detection threshold");
+static const arg_def_t cpu_used = ARG_DEF(
+    NULL, "cpu-used", 1, "CPU Used (-16..16)");
+static const arg_def_t auto_altref = ARG_DEF(
+    NULL, "auto-alt-ref", 1, "Enable automatic alt reference frames");
+static const arg_def_t arnr_maxframes = ARG_DEF(
+    NULL, "arnr-maxframes", 1, "AltRef max frames (0..15)");
+static const arg_def_t arnr_strength = ARG_DEF(
+    NULL, "arnr-strength", 1, "AltRef filter strength (0..6)");
+static const arg_def_t arnr_type = ARG_DEF(
+    NULL, "arnr-type", 1, "AltRef type");
 static const struct arg_enum_list tuning_enum[] = {
   {"psnr", VP8_TUNE_PSNR},
   {"ssim", VP8_TUNE_SSIM},
   {NULL, 0}
 };
-static const arg_def_t tune_ssim = ARG_DEF_ENUM(NULL, "tune", 1,
-                                                "Material to favor", tuning_enum);
-static const arg_def_t cq_level = ARG_DEF(NULL, "cq-level", 1,
-                                          "Constant/Constrained Quality level");
-static const arg_def_t max_intra_rate_pct = ARG_DEF(NULL, "max-intra-rate", 1,
-                                                    "Max I-frame bitrate (pct)");
+static const arg_def_t tune_ssim = ARG_DEF_ENUM(
+    NULL, "tune", 1, "Material to favor", tuning_enum);
+static const arg_def_t cq_level = ARG_DEF(
+    NULL, "cq-level", 1, "Constant/Constrained Quality level");
+static const arg_def_t max_intra_rate_pct = ARG_DEF(
+    NULL, "max-intra-rate", 1, "Max I-frame bitrate (pct)");
 
 #if CONFIG_VP8_ENCODER
-static const arg_def_t token_parts =
-    ARG_DEF(NULL, "token-parts", 1, "Number of token partitions to use, log2");
+static const arg_def_t token_parts = ARG_DEF(
+    NULL, "token-parts", 1, "Number of token partitions to use, log2");
 static const arg_def_t *vp8_args[] = {
   &cpu_used, &auto_altref, &noise_sens, &sharpness, &static_thresh,
   &token_parts, &arnr_maxframes, &arnr_strength, &arnr_type,
@@ -373,11 +370,12 @@
 #endif
 
 #if CONFIG_VP9_ENCODER
-static const arg_def_t tile_cols =
-    ARG_DEF(NULL, "tile-columns", 1, "Number of tile columns to use, log2");
-static const arg_def_t tile_rows =
-    ARG_DEF(NULL, "tile-rows", 1, "Number of tile rows to use, log2");
-static const arg_def_t lossless = ARG_DEF(NULL, "lossless", 1, "Lossless mode");
+static const arg_def_t tile_cols = ARG_DEF(
+    NULL, "tile-columns", 1, "Number of tile columns to use, log2");
+static const arg_def_t tile_rows = ARG_DEF(
+    NULL, "tile-rows", 1, "Number of tile rows to use, log2");
+static const arg_def_t lossless = ARG_DEF(
+    NULL, "lossless", 1, "Lossless mode");
 static const arg_def_t frame_parallel_decoding = ARG_DEF(
     NULL, "frame-parallel", 1, "Enable frame parallel decodability features");
 static const arg_def_t aq_mode = ARG_DEF(
@@ -396,13 +394,12 @@
   {NULL, 0}
 };
 
-static const arg_def_t bitdeptharg   = ARG_DEF_ENUM("b", "bit-depth", 1,
-                                                    "Bit depth for codec "
-                                                    "(8 for version <=1, "
-                                                    "10 or 12 for version 2)",
-                                                    bitdepth_enum);
-static const arg_def_t inbitdeptharg = ARG_DEF(NULL, "input-bit-depth", 1,
-                                               "Bit depth of input");
+static const arg_def_t bitdeptharg = ARG_DEF_ENUM(
+    "b", "bit-depth", 1,
+    "Bit depth for codec (8 for version <=1, 10 or 12 for version 2)",
+    bitdepth_enum);
+static const arg_def_t inbitdeptharg = ARG_DEF(
+    NULL, "input-bit-depth", 1, "Bit depth of input");
 #endif
 
 static const struct arg_enum_list tune_content_enum[] = {
@@ -829,6 +826,8 @@
       global->color_type = I422;
     else if (arg_match(&arg, &use_i444, argi))
       global->color_type = I444;
+    else if (arg_match(&arg, &use_i440, argi))
+      global->color_type = I440;
     else if (arg_match(&arg, &quietarg, argi))
       global->quiet = 1;
     else if (arg_match(&arg, &verbosearg, argi))
@@ -857,8 +856,6 @@
       global->disable_warnings = 1;
     else if (arg_match(&arg, &disable_warning_prompt, argi))
       global->disable_warning_prompt = 1;
-    else if (arg_match(&arg, &experimental_bitstream, argi))
-      global->experimental_bitstream = 1;
     else
       argj++;
   }
@@ -1162,7 +1159,6 @@
             if (j == config->arg_ctrl_cnt)
               config->arg_ctrl_cnt++;
           }
-
         }
       }
       if (!match)
@@ -1196,12 +1192,6 @@
     fatal("Stream %d: Specify stream dimensions with --width (-w) "
           " and --height (-h)", stream->index);
 
-  if (stream->config.cfg.g_profile != 0 && !global->experimental_bitstream) {
-    fatal("Stream %d: profile %d is experimental and requires the --%s flag",
-          stream->index, stream->config.cfg.g_profile,
-          experimental_bitstream.long_name);
-  }
-
   // Check that the codec bit depth is greater than the input bit depth.
   if (stream->config.cfg.g_input_bit_depth >
       (unsigned int)stream->config.cfg.g_bit_depth) {
@@ -1288,7 +1278,12 @@
     case VPX_IMG_FMT_I420: return "I420";
     case VPX_IMG_FMT_I422: return "I422";
     case VPX_IMG_FMT_I444: return "I444";
+    case VPX_IMG_FMT_I440: return "I440";
     case VPX_IMG_FMT_YV12: return "YV12";
+    case VPX_IMG_FMT_I42016: return "I42016";
+    case VPX_IMG_FMT_I42216: return "I42216";
+    case VPX_IMG_FMT_I44416: return "I44416";
+    case VPX_IMG_FMT_I44016: return "I44016";
     default: return "Other";
   }
 }
@@ -1695,7 +1690,7 @@
 }
 
 
-static void show_psnr(struct stream_state  *stream) {
+static void show_psnr(struct stream_state  *stream, double peak) {
   int i;
   double ovpsnr;
 
@@ -1703,7 +1698,7 @@
     return;
 
   fprintf(stderr, "Stream %d PSNR (Overall/Avg/Y/U/V)", stream->index);
-  ovpsnr = sse_to_psnr((double)stream->psnr_samples_total, 255.0,
+  ovpsnr = sse_to_psnr((double)stream->psnr_samples_total, peak,
                        (double)stream->psnr_sse_total);
   fprintf(stderr, " %.3f", ovpsnr);
 
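Note on the show_psnr change: parameterizing the peak lets high-bit-depth streams report correct PSNR; the call site further down passes (1 << g_input_bit_depth) - 1 for VP9 instead of the hard-coded 255.0. A sketch of the underlying formula, assuming the usual definition and cap (the real sse_to_psnr helper lives elsewhere in the tools sources):

    #include <math.h>
    #include <stdio.h>

    /* PSNR = 10 * log10(samples * peak^2 / sse), conventionally capped. */
    static double sse_to_psnr_sketch(double samples, double peak, double sse) {
      if (sse <= 0.0) return 100.0;
      const double psnr = 10.0 * log10(samples * peak * peak / sse);
      return psnr > 100.0 ? 100.0 : psnr;
    }

    int main(void) {
      const double samples = 1920.0 * 1080.0, sse = 12345.0;
      printf("8-bit:  %.3f dB\n", sse_to_psnr_sketch(samples, 255.0, sse));
      /* A 10-bit stream peaks at (1 << 10) - 1 = 1023. */
      printf("10-bit: %.3f dB\n", sse_to_psnr_sketch(samples, 1023.0, sse));
      return 0;
    }
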
@@ -1718,132 +1713,6 @@
   return (float)(usec > 0 ? frames * 1000000.0 / (float)usec : 0);
 }
 
-#if CONFIG_VP9 && CONFIG_VP9_HIGHBITDEPTH
-static void high_img_upshift(vpx_image_t *dst, vpx_image_t *src,
-                             int input_shift) {
-  // Note the offset is 1 less than half
-  const int offset = input_shift > 0 ? (1 << (input_shift - 1)) - 1 : 0;
-  int plane;
-  if (dst->w != src->w || dst->h != src->h ||
-      dst->x_chroma_shift != src->x_chroma_shift ||
-      dst->y_chroma_shift != src->y_chroma_shift ||
-      dst->fmt != src->fmt || input_shift < 0) {
-    fatal("Unsupported image conversion");
-  }
-  switch (src->fmt) {
-    case VPX_IMG_FMT_I42016:
-    case VPX_IMG_FMT_I42216:
-    case VPX_IMG_FMT_I44416:
-      break;
-    default:
-      fatal("Unsupported image conversion");
-      break;
-  }
-  for (plane = 0; plane < 3; plane++) {
-    int w = src->w;
-    int h = src->h;
-    int x, y;
-    if (plane) {
-      w >>= src->x_chroma_shift;
-      h >>= src->y_chroma_shift;
-    }
-    for (y = 0; y < h; y++) {
-      uint16_t *p_src = (uint16_t *)(src->planes[plane] +
-                                     y * src->stride[plane]);
-      uint16_t *p_dst = (uint16_t *)(dst->planes[plane] +
-                                     y * dst->stride[plane]);
-      for (x = 0; x < w; x++)
-        *p_dst++ = (*p_src++ << input_shift) + offset;
-    }
-  }
-}
-
-static void low_img_upshift(vpx_image_t *dst, vpx_image_t *src,
-                            int input_shift) {
-  // Note the offset is 1 less than half
-  const int offset = input_shift > 0 ? (1 << (input_shift - 1)) - 1 : 0;
-  int plane;
-  if (dst->w != src->w || dst->h != src->h ||
-      dst->x_chroma_shift != src->x_chroma_shift ||
-      dst->y_chroma_shift != src->y_chroma_shift ||
-      dst->fmt != src->fmt + VPX_IMG_FMT_HIGHBITDEPTH ||
-      input_shift < 0) {
-    fatal("Unsupported image conversion");
-  }
-  switch (src->fmt) {
-    case VPX_IMG_FMT_I420:
-    case VPX_IMG_FMT_I422:
-    case VPX_IMG_FMT_I444:
-      break;
-    default:
-      fatal("Unsupported image conversion");
-      break;
-  }
-  for (plane = 0; plane < 3; plane++) {
-    int w = src->w;
-    int h = src->h;
-    int x, y;
-    if (plane) {
-      w >>= src->x_chroma_shift;
-      h >>= src->y_chroma_shift;
-    }
-    for (y = 0; y < h; y++) {
-      uint8_t *p_src = src->planes[plane] + y * src->stride[plane];
-      uint16_t *p_dst = (uint16_t *)(dst->planes[plane] +
-                                     y * dst->stride[plane]);
-      for (x = 0; x < w; x++) {
-        *p_dst++ = (*p_src++ << input_shift) + offset;
-      }
-    }
-  }
-}
-
-static void img_upshift(vpx_image_t *dst, vpx_image_t *src,
-                        int input_shift) {
-  if (src->fmt & VPX_IMG_FMT_HIGHBITDEPTH) {
-    high_img_upshift(dst, src, input_shift);
-  } else {
-    low_img_upshift(dst, src, input_shift);
-  }
-}
-
-static void img_cast_16_to_8(vpx_image_t *dst, vpx_image_t *src) {
-  int plane;
-  if (dst->fmt + VPX_IMG_FMT_HIGHBITDEPTH != src->fmt ||
-      dst->d_w != src->d_w || dst->d_h != src->d_h ||
-      dst->x_chroma_shift != src->x_chroma_shift ||
-      dst->y_chroma_shift != src->y_chroma_shift) {
-    fatal("Unsupported image conversion");
-  }
-  switch (dst->fmt) {
-    case VPX_IMG_FMT_I420:
-    case VPX_IMG_FMT_I422:
-    case VPX_IMG_FMT_I444:
-      break;
-    default:
-      fatal("Unsupported image conversion");
-      break;
-  }
-  for (plane = 0; plane < 3; plane++) {
-    int w = src->d_w;
-    int h = src->d_h;
-    int x, y;
-    if (plane) {
-      w >>= src->x_chroma_shift;
-      h >>= src->y_chroma_shift;
-    }
-    for (y = 0; y < h; y++) {
-      uint16_t *p_src = (uint16_t *)(src->planes[plane] +
-                                     y * src->stride[plane]);
-      uint8_t *p_dst = dst->planes[plane] + y * dst->stride[plane];
-      for (x = 0; x < w; x++) {
-        *p_dst++ = *p_src++;
-      }
-    }
-  }
-}
-#endif
-
 static void test_decode(struct stream_state  *stream,
                         enum TestDecodeFatality fatal,
                         const VpxInterface *codec) {
@@ -1883,12 +1752,12 @@
       if (enc_img.fmt & VPX_IMG_FMT_HIGHBITDEPTH) {
         vpx_img_alloc(&enc_img, enc_img.fmt - VPX_IMG_FMT_HIGHBITDEPTH,
                       enc_img.d_w, enc_img.d_h, 16);
-        img_cast_16_to_8(&enc_img, &ref_enc.img);
+        vpx_img_truncate_16_to_8(&enc_img, &ref_enc.img);
       }
       if (dec_img.fmt & VPX_IMG_FMT_HIGHBITDEPTH) {
         vpx_img_alloc(&dec_img, dec_img.fmt - VPX_IMG_FMT_HIGHBITDEPTH,
                       dec_img.d_w, dec_img.d_h, 16);
-        img_cast_16_to_8(&dec_img, &ref_dec.img);
+        vpx_img_truncate_16_to_8(&dec_img, &ref_dec.img);
       }
     }
 #endif
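
Note on the two hunks above: the removed img_cast_16_to_8 narrowed each 16-bit sample to 8 bits with a plain assignment, assuming the values already fit; the shared vpx_img_truncate_16_to_8 now takes over that job in the encode/decode mismatch check. The per-row core, reduced to a standalone sketch (cast_row_16_to_8 is illustrative):

    #include <stdint.h>
    #include <stdio.h>

    /* Narrow 16-bit containers holding 8-bit data back to 8 bits. */
    static void cast_row_16_to_8(uint8_t *dst, const uint16_t *src, int w) {
      int x;
      for (x = 0; x < w; ++x) dst[x] = (uint8_t)src[x];
    }

    int main(void) {
      const uint16_t src[4] = { 0, 127, 200, 255 };
      uint8_t dst[4];
      cast_row_16_to_8(dst, src, 4);
      printf("%u %u %u %u\n", dst[0], dst[1], dst[2], dst[3]);
      return 0;
    }
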
@@ -1993,6 +1862,9 @@
     case I444:
       input.fmt = VPX_IMG_FMT_I444;
       break;
+    case I440:
+      input.fmt = VPX_IMG_FMT_I440;
+      break;
     case YV12:
       input.fmt = VPX_IMG_FMT_YV12;
       break;
@@ -2042,14 +1914,15 @@
     /* If the input file doesn't specify its w/h (raw files), try to get
      * the data from the first stream's configuration.
      */
-    if (!input.width || !input.height)
-      FOREACH_STREAM( {
-      if (stream->config.cfg.g_w && stream->config.cfg.g_h) {
-        input.width = stream->config.cfg.g_w;
-        input.height = stream->config.cfg.g_h;
-        break;
-      }
-    });
+    if (!input.width || !input.height) {
+      FOREACH_STREAM({
+        if (stream->config.cfg.g_w && stream->config.cfg.g_h) {
+          input.width = stream->config.cfg.g_w;
+          input.height = stream->config.cfg.g_h;
+          break;
+        }
+      });
+    }
 
     /* Update stream configurations from the input file's parameters */
     if (!input.width || !input.height)
@@ -2196,7 +2069,7 @@
                           input.width, input.height, 32);
             allocated_raw_shift = 1;
           }
-          img_upshift(&raw_shift, &raw, input_shift);
+          vpx_img_upshift(&raw_shift, &raw, input_shift);
           frame_to_encode = &raw_shift;
         } else {
           frame_to_encode = &raw;
@@ -2272,24 +2145,29 @@
     if (stream_cnt > 1)
       fprintf(stderr, "\n");
 
-    if (!global.quiet)
-      FOREACH_STREAM(fprintf(
-                       stderr,
-                       "\rPass %d/%d frame %4d/%-4d %7"PRId64"B %7lub/f %7"PRId64"b/s"
-                       " %7"PRId64" %s (%.2f fps)\033[K\n", pass + 1,
-                       global.passes, frames_in, stream->frames_out, (int64_t)stream->nbytes,
-                       seen_frames ? (unsigned long)(stream->nbytes * 8 / seen_frames) : 0,
-                       seen_frames ? (int64_t)stream->nbytes * 8
-                       * (int64_t)global.framerate.num / global.framerate.den
-                       / seen_frames
-                       : 0,
-                       stream->cx_time > 9999999 ? stream->cx_time / 1000 : stream->cx_time,
-                       stream->cx_time > 9999999 ? "ms" : "us",
-                       usec_to_fps(stream->cx_time, seen_frames));
-                    );
+    if (!global.quiet) {
+      FOREACH_STREAM(fprintf(stderr,
+          "\rPass %d/%d frame %4d/%-4d %7"PRId64"B %7"PRId64"b/f %7"PRId64"b/s"
+          " %7"PRId64" %s (%.2f fps)\033[K\n",
+          pass + 1,
+          global.passes, frames_in, stream->frames_out, (int64_t)stream->nbytes,
+          seen_frames ? (int64_t)(stream->nbytes * 8 / seen_frames) : 0,
+          seen_frames ? (int64_t)stream->nbytes * 8 *
+              (int64_t)global.framerate.num / global.framerate.den /
+              seen_frames : 0,
+          stream->cx_time > 9999999 ? stream->cx_time / 1000 : stream->cx_time,
+          stream->cx_time > 9999999 ? "ms" : "us",
+          usec_to_fps(stream->cx_time, seen_frames)));
+    }
 
-    if (global.show_psnr)
-      FOREACH_STREAM(show_psnr(stream));
+    if (global.show_psnr) {
+      if (global.codec->fourcc == VP9_FOURCC) {
+        FOREACH_STREAM(
+            show_psnr(stream, (1 << stream->config.cfg.g_input_bit_depth) - 1));
+      } else {
+        FOREACH_STREAM(show_psnr(stream, 255.0));
+      }
+    }
 
     FOREACH_STREAM(vpx_codec_destroy(&stream->encoder));
 
diff --git a/source/libvpx/vpxenc.h b/source/libvpx/vpxenc.h
index 3d6728e..d867e9d 100644
--- a/source/libvpx/vpxenc.h
+++ b/source/libvpx/vpxenc.h
@@ -26,6 +26,7 @@
   I420,  // 4:2:0 8+ bit-depth
   I422,  // 4:2:2 8+ bit-depth
   I444,  // 4:4:4 8+ bit-depth
+  I440,  // 4:4:0 8+ bit-depth
   YV12,  // 4:2:0 with uv flipped, only 8-bit depth
 } ColorInputType;
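
Note on the new I440 entry: for reference, the chroma subsampling each input type implies, mirroring the comments in the enum above (a sketch; the mapping to the VPX_IMG_FMT_* values happens in the vpxenc.c switch shown earlier in this patch):

    #include <stdio.h>

    typedef enum { I420, I422, I444, I440, YV12 } ColorInputType;

    /* Subsampling shifts per input type (YV12 is 4:2:0 with U/V swapped). */
    static void chroma_shifts(ColorInputType t, int *ss_x, int *ss_y) {
      switch (t) {
        case I422: *ss_x = 1; *ss_y = 0; break;  /* half width, full height */
        case I444: *ss_x = 0; *ss_y = 0; break;  /* full-resolution chroma */
        case I440: *ss_x = 0; *ss_y = 1; break;  /* full width, half height */
        case I420:
        case YV12:
        default:   *ss_x = 1; *ss_y = 1; break;  /* half width, half height */
      }
    }

    int main(void) {
      int ss_x, ss_y;
      chroma_shifts(I440, &ss_x, &ss_y);
      printf("I440: ss_x=%d ss_y=%d\n", ss_x, ss_y);
      return 0;
    }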